You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by le...@apache.org on 2016/10/26 02:37:06 UTC
[1/7] tika git commit: TIKA-1343 Create a Tika Translator
implementation that uses JoshuaDecoder
Repository: tika
Updated Branches:
refs/heads/master 7ca105ef5 -> dadbf55c5
TIKA-1343 Create a Tika Translator implementation that uses JoshuaDecoder
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d4fb28f9
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d4fb28f9
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d4fb28f9
Branch: refs/heads/master
Commit: d4fb28f91d77458b15557942438f874b9f564e88
Parents: 19ed261
Author: Lewis John McGibbney <le...@jpl.nasa.gov>
Authored: Wed Apr 27 15:06:42 2016 -0700
Committer: Lewis John McGibbney <le...@jpl.nasa.gov>
Committed: Wed Apr 27 15:06:42 2016 -0700
----------------------------------------------------------------------
.../tika/language/detect/LanguageResult.java | 6 +-
.../tika/language/translate/Translator.java | 20 +-
tika-parsers/pom.xml | 14 ++
.../language/translate/AbstractTranslator.java | 18 +-
.../language/translate/GoogleTranslator.java | 5 -
.../translate/JoshuaNetworkTranslator.java | 189 +++++++++++++++++++
.../language/translate/MosesTranslator.java | 2 +
.../translate/translator.google.properties | 6 +-
.../translate/translator.joshua.properties | 22 +++
.../translate/translator.lingo24.properties | 5 -
.../translate/translator.moses.properties | 5 -
.../translate/translator.yandex.properties | 15 ++
.../translate/JoshuaNetworkTranslatorTest.java | 41 ++++
.../translate/MicrosoftTranslatorTest.java | 1 -
.../translate/YandexTranslatorTest.java | 2 -
15 files changed, 309 insertions(+), 42 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java
index aaa503b..63e1f8c 100644
--- a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java
+++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java
@@ -33,7 +33,7 @@ public class LanguageResult {
/**
*
- * @param language ISO 639-1 language code (plus optional "-<country code>")
+ * @param language ISO 639-1 language code (plus optional country code)
* @param rawScore confidence of detector in the result.
*/
public LanguageResult(String language, LanguageConfidence confidence, float rawScore) {
@@ -42,6 +42,10 @@ public class LanguageResult {
this.rawScore = rawScore;
}
+ /**
+ * The ISO 639-1 language code (plus optional country code)
+ * @return a string representation of the language code
+ */
public String getLanguage() {
return language;
}
http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java b/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java
index f225565..912e30f 100644
--- a/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java
+++ b/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java
@@ -26,14 +26,7 @@ import java.io.IOException;
*/
public interface Translator {
/**
- * Translate text between given languages. The following languages are supported:
- * Arabic("ar"), Bulgarian("bg"), Catalan("ca"), Chinese-Simplified("zh-CHS"), Chinese-Traditional("zh-CHT"),
- * Czech("cs"), Danish("da"), Dutch("nl"), English("en"), Estonian("et"), Innish("fi"), French("fr"), German("de"),
- * Greek("el"), Haitian-Creole("ht"), Hebrew("he"), Hindi("hi"), Hmong-Daw("mww"), Hungarian("hu"),
- * Indonesian("id"), Italian("it"), Japanese("ja"), Korean("ko"), Latvian("lv"), Lithuanian("lt"), Malay("ms"),
- * Norwegian("no"), Persian("fa"), Polish("pl"), Portuguese("pt"), Romanian("ro"), Russian("ru"), Slovak("sk"),
- * Slovenian("sl"), Spanish("es"), Swedish("sv"), Thai("th"), Turkish("tr"), Ukranian("uk"), Urdu("ur"),
- * Vietnemese("vi").
+ * Translate text between given languages.
* @param text The text to translate.
* @param sourceLanguage The input text language (for example, "en").
* @param targetLanguage The desired language to translate to (for example, "fr").
@@ -45,15 +38,8 @@ public interface Translator {
public String translate(String text, String sourceLanguage, String targetLanguage) throws TikaException, IOException;
/**
- * Translate text to the given language. This method attempts to auto-detect the source language of the text.
- * The following languages are supported:
- * Arabic("ar"), Bulgarian("bg"), Catalan("ca"), Chinese-Simplified("zh-CHS"), Chinese-Traditional("zh-CHT"),
- * Czech("cs"), Danish("da"), Dutch("nl"), English("en"), Estonian("et"), Innish("fi"), French("fr"), German("de"),
- * Greek("el"), Haitian-Creole("ht"), Hebrew("he"), Hindi("hi"), Hmong-Daw("mww"), Hungarian("hu"),
- * Indonesian("id"), Italian("it"), Japanese("ja"), Korean("ko"), Latvian("lv"), Lithuanian("lt"), Malay("ms"),
- * Norwegian("no"), Persian("fa"), Polish("pl"), Portuguese("pt"), Romanian("ro"), Russian("ru"), Slovak("sk"),
- * Slovenian("sl"), Spanish("es"), Swedish("sv"), Thai("th"), Turkish("tr"), Ukranian("uk"), Urdu("ur"),
- * Vietnemese("vi").
+ * Translate text to the given language
+ * This method attempts to auto-detect the source language of the text.
* @param text The text to translate.
* @param targetLanguage The desired language to translate to (for example, "hi").
* @return The translation result. If translation is unavailable, returns the same text back.
http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-parsers/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 64edbeb..fa3b7fc 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -471,6 +471,19 @@
<execute />
</action>
</pluginExecution>
+ <pluginExecution>
+ <pluginExecutionFilter>
+ <groupId>org.codehaus.gmaven</groupId>
+ <artifactId>groovy-maven-plugin</artifactId>
+ <versionRange>[2.0,)</versionRange>
+ <goals>
+ <goal>execute</goal>
+ </goals>
+ </pluginExecutionFilter>
+ <action>
+ <ignore></ignore>
+ </action>
+ </pluginExecution>
</pluginExecutions>
</lifecycleMappingMetadata>
</configuration>
@@ -506,6 +519,7 @@
<plugin>
<groupId>org.codehaus.gmaven</groupId>
<artifactId>groovy-maven-plugin</artifactId>
+ <version>2.0</version>
<dependencies>
<dependency>
<groupId>org.apache.maven</groupId>
http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-translate/src/main/java/org/apache/tika/language/translate/AbstractTranslator.java
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/AbstractTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/AbstractTranslator.java
index d892ab9..2a331bb 100644
--- a/tika-translate/src/main/java/org/apache/tika/language/translate/AbstractTranslator.java
+++ b/tika-translate/src/main/java/org/apache/tika/language/translate/AbstractTranslator.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package org.apache.tika.language.translate;
import java.io.IOException;
@@ -9,7 +25,7 @@ import org.apache.tika.language.detect.LanguageResult;
public abstract class AbstractTranslator implements Translator {
- protected LanguageResult detectLanguage(String text) throws IOException {
+ protected LanguageResult detectLanguage(String text) throws IOException {
LanguageDetector detector = new OptimaizeLangDetector().loadModels();
return detector.detect(text);
}
http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-translate/src/main/java/org/apache/tika/language/translate/GoogleTranslator.java
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/GoogleTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/GoogleTranslator.java
index 29c03c6..cdab2ad 100644
--- a/tika-translate/src/main/java/org/apache/tika/language/translate/GoogleTranslator.java
+++ b/tika-translate/src/main/java/org/apache/tika/language/translate/GoogleTranslator.java
@@ -24,8 +24,6 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Properties;
-import java.util.logging.Logger;
-
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
@@ -51,9 +49,6 @@ public class GoogleTranslator extends AbstractTranslator {
private static final String DEFAULT_KEY = "dummy-secret";
- private static final Logger LOG = Logger.getLogger(GoogleTranslator.class
- .getName());
-
private WebClient client;
private String apiKey;
http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java
new file mode 100644
index 0000000..e97389c
--- /dev/null
+++ b/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java
@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.language.translate;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.Properties;
+
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.tika.exception.TikaException;
+import com.fasterxml.jackson.core.JsonParseException;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+/**
+ * <p>This translator is designed to work with a TCP-IP available
+ * Joshua translation server, specifically the
+ * <a href="https://github.com/joshua-decoder/joshua_translation_engine">
+ * REST-based Joshua server</a>.</p>
+ *
+ * <p>If you were to interact with the server via curl a request
+ * would look as follows</p>
+ *
+ * <pre>
+ * {@code
+ * curl http://localhost:5000/joshua/translate/english \
+ * -i -H "Content-Type: application/json" \
+ * -X POST -d '{"inputLanguage": "Spanish", "inputText": "vuelo"}' -v
+ * }
+ * </pre>
+ *
+ * Joshua requires input to be pre-formatted into sentences, one per line,
+ * so this translation implementation takes care of that.
+ */
+public class JoshuaNetworkTranslator extends AbstractTranslator {
+
+ private static final String PROPERTIES_FILE = "translator.joshua.properties";
+
+ private String JOSHUA_SERVER = "joshua.server.url";
+
+ private String networkServer;
+
+ private WebClient client;
+
+ /**
+ * Default constructor which looks up the
+ * <code>translator.joshua.properties</code> file as a class resource and,
+ * when it is found, loads the value of <code>joshua.server.url</code>
+ * from it into a local field.
+ *
+ * <p>No remote call is made here. Availability is checked on each
+ * translation by testing whether that locally held value is null, so
+ * the property should be populated within the
+ * <code>translator.joshua.properties</code> file. If the resource is
+ * missing or cannot be read, the server value is simply never assigned.</p>
+ *
+ * <p>NOTE(review): the properties file added alongside this class defines
+ * the key <code>joshua.server</code> rather than
+ * <code>joshua.server.url</code>; verify that the two names agree.</p>
+ */
+ public JoshuaNetworkTranslator() {
+ Properties props = new Properties();
+ InputStream stream;
+ stream = JoshuaNetworkTranslator.class.getResourceAsStream(PROPERTIES_FILE);
+ try {
+ if(stream != null) {
+ props.load(stream);
+ networkServer = props.getProperty(JOSHUA_SERVER);
+ }
+ } catch (IOException e) {
+ // Error with properties file. Translation will not work.
+ e.printStackTrace();
+ }
+ }
+
+ /**
+ * Translates <code>text</code> by calling the configured Joshua REST server.
+ *
+ * <p>If {@link #isAvailable()} is false the input text is returned
+ * unchanged. If <code>sourceLanguage</code> is null, the language is
+ * guessed via
+ * {@link org.apache.tika.language.translate.AbstractTranslator#detectLanguage(String)}
+ * and the detected code is used as the source language.</p>
+ *
+ * <p>The text is then pre-processed for Joshua, which expects one sentence
+ * per line: every '.' found from index 1 onward is overwritten with a
+ * newline character. NOTE(review): the period is replaced rather than
+ * followed by a newline, and a '.' at index 0 is never matched; confirm
+ * that both are intended.</p>
+ *
+ * <p>A GET request is issued to
+ * <code>&lt;server&gt;/&lt;targetLanguage&gt;/</code> with the
+ * <code>inputLanguage</code> and <code>inputText</code> query parameters.
+ * The response lines are concatenated (without line separators) and parsed
+ * as JSON. When the first "code" value is "200", the first element of the
+ * node found under "text" is returned. NOTE(review): the response reader
+ * is not closed within this method.</p>
+ *
+ * @param text the text to translate
+ * @param sourceLanguage the language of <code>text</code>, or null to have it detected
+ * @param targetLanguage the language to translate to; appended to the server URL
+ * @return the translated text, or the original text when the translator is unavailable
+ * @throws TikaException if the reply carries a "code" other than "200", has
+ * no "code" value at all, or is not well-formed JSON
+ * @throws IOException propagated from language detection or from reading
+ * and parsing the response
+ * @see org.apache.tika.language.translate.Translator#translate
+ * (java.lang.String, java.lang.String, java.lang.String)
+ */
+ @Override
+ public String translate(String text, String sourceLanguage,
+ String targetLanguage) throws TikaException, IOException {
+ if (!this.isAvailable())
+ return text;
+
+ //make an attempt to guess the language if one is not provided.
+ if (sourceLanguage == null)
+ sourceLanguage = detectLanguage(text).getLanguage();
+
+ //process input text into sentences, one per line:
+ // each '.' from index 1 onward is overwritten with \n (replaced, not kept)
+ StringBuilder sb = new StringBuilder(text);
+ int i = 0;
+ while ((i = sb.indexOf(".", i + 1)) != -1) {
+ sb.replace(i, i + 1, "\n");
+ }
+
+ text = sb.toString();
+
+ //create client for <server>/<targetLanguage>/, adding the separating '/' only when missing
+ if (!networkServer.endsWith("/")) {
+ client = WebClient.create(networkServer + "/" + targetLanguage + "/");
+ } else {
+ client = WebClient.create(networkServer + targetLanguage + "/");
+ }
+
+ //make the request
+ Response response = client.accept(MediaType.APPLICATION_JSON)
+ .query("inputLanguage", sourceLanguage)
+ .query("inputText", text).get();
+ BufferedReader reader = new BufferedReader(new InputStreamReader(
+ (InputStream) response.getEntity(), UTF_8));
+ String line = null;
+ StringBuffer responseText = new StringBuffer();
+ while ((line = reader.readLine()) != null) {
+ responseText.append(line);
+ }
+
+ try {
+ ObjectMapper mapper = new ObjectMapper();
+ JsonNode jsonResp = mapper.readTree(responseText.toString());
+
+ if (!jsonResp.findValuesAsText("code").isEmpty()) {
+ String code = jsonResp.findValuesAsText("code").get(0);
+ if (code.equals("200")) {
+ return jsonResp.findValue("text").get(0).asText();
+ } else {
+ throw new TikaException(jsonResp.findValue("message").get(0).asText());
+ }
+ } else {
+ throw new TikaException("Return message not recognized: " +
+ responseText.toString().substring(0, Math.min(responseText.length(), 100)));
+ }
+ } catch (JsonParseException e) {
+ throw new TikaException("Error requesting translation from '" +
+ sourceLanguage + "' to '" + targetLanguage + "', JSON response "
+ + "from Joshua REST Server is not well formatted: " + responseText.toString());
+ }
+ }
+
+ /**
+ * Translates <code>text</code> into <code>targetLanguage</code>, guessing
+ * the source language first.
+ *
+ * <p>When no Joshua server is configured ({@link #isAvailable()} is false)
+ * the input text is returned unchanged. Otherwise the source language is
+ * detected via
+ * {@link org.apache.tika.language.translate.AbstractTranslator#detectLanguage(String)}
+ * and the work is delegated to
+ * {@link org.apache.tika.language.translate.JoshuaNetworkTranslator#translate(String, String, String)}.</p>
+ *
+ * @param text the text to translate
+ * @param targetLanguage the language to translate to
+ * @return the translation, or the original text when the translator is unavailable
+ * @throws TikaException if the delegated translation fails
+ * @throws IOException if language detection or the delegated translation fails on I/O
+ * @see org.apache.tika.language.translate.Translator#translate(java.lang.String, java.lang.String)
+ */
+ @Override
+ public String translate(String text, String targetLanguage)
+ throws TikaException, IOException {
+ // Return the input untouched only when the translator is NOT available.
+ // The previous check was inverted and skipped translation whenever a
+ // server was configured.
+ if (!isAvailable()) {
+ return text;
+ }
+ String sourceLanguage = detectLanguage(text).getLanguage();
+ return translate(text, sourceLanguage, targetLanguage);
+ }
+
+ /**
+ * Reports whether a Joshua server value has been loaded, i.e. whether the
+ * locally held server URL is non-null. No remote call is made.
+ *
+ * @return true when a server URL is held, false otherwise
+ * @see org.apache.tika.language.translate.Translator#isAvailable()
+ */
+ @Override
+ public boolean isAvailable() {
+ if (networkServer == null) {
+ return false;
+ }
+ return true;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-translate/src/main/java/org/apache/tika/language/translate/MosesTranslator.java
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/MosesTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/MosesTranslator.java
index 8a976fe..fb9c743 100644
--- a/tika-translate/src/main/java/org/apache/tika/language/translate/MosesTranslator.java
+++ b/tika-translate/src/main/java/org/apache/tika/language/translate/MosesTranslator.java
@@ -76,6 +76,7 @@ public class MosesTranslator extends ExternalTranslator {
public String translate(String text, String sourceLanguage, String targetLanguage) throws TikaException, IOException {
if (!isAvailable() || !checkCommand(buildCheckCommand(smtPath), 1)) return text;
File tmpFile = new File(TMP_FILE_NAME);
+ @SuppressWarnings("resource")
OutputStreamWriter out = new OutputStreamWriter(new FileOutputStream(tmpFile), Charset.defaultCharset());
out.append(text).append('\n').close();
@@ -84,6 +85,7 @@ public class MosesTranslator extends ExternalTranslator {
File tmpTranslatedFile = new File(TMP_FILE_NAME + ".translated");
StringBuilder stringBuilder = new StringBuilder();
+ @SuppressWarnings("resource")
BufferedReader reader = new BufferedReader(new InputStreamReader(
new FileInputStream(tmpTranslatedFile),
Charset.defaultCharset()
http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.google.properties
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.google.properties b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.google.properties
index edbc732..4e622ce 100644
--- a/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.google.properties
+++ b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.google.properties
@@ -12,11 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-# Must set the client keys in this file to use translation. Please see
-# https://code.google.com/p/microsoft-translator-java-api/ and
-# http://msdn.microsoft.com/en-us/library/hh454950.aspx for help with
-# getting these keys. As of now (6/2014) 2,000,000 characters/month
-# are free.
+
#
# To use the Google translation service, you <em>must</em> set your API-key
# as described in GoogleTranslator. If you do not want translation
http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties
new file mode 100644
index 0000000..81071f3
--- /dev/null
+++ b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# The property below is partially described within
+# https://github.com/joshua-decoder/joshua_translation_engine#requesting-translations
+# if left as null, then translation will not occur and the source text
+# will be returned.
+# An example would be http://localhost:5000/joshua/translate/
+joshua.server=
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.lingo24.properties
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.lingo24.properties b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.lingo24.properties
index 04e0883..24756ac 100644
--- a/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.lingo24.properties
+++ b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.lingo24.properties
@@ -12,11 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-# Must set the client keys in this file to use translation. Please see
-# https://code.google.com/p/microsoft-translator-java-api/ and
-# http://msdn.microsoft.com/en-us/library/hh454950.aspx for help with
-# getting these keys. As of now (6/2014) 2,000,000 characters/month
-# are free.
#
# To use the Lingo24 translation service, you <em>must</em> set your API-key
# as described in Lingo24Translator. If you do not want translation
http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.moses.properties
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.moses.properties b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.moses.properties
index 72f2d20..55f9176 100644
--- a/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.moses.properties
+++ b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.moses.properties
@@ -12,11 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-# Must set the client keys in this file to use translation. Please see
-# https://code.google.com/p/microsoft-translator-java-api/ and
-# http://msdn.microsoft.com/en-us/library/hh454950.aspx for help with
-# getting these keys. As of now (6/2014) 2,000,000 characters/month
-# are free.
# smt_path is the full path to the Moses jar to run.
# script_path is the full path to the script to pass to the smt jar.
http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.yandex.properties
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.yandex.properties b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.yandex.properties
index 57c11e0..602445e 100644
--- a/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.yandex.properties
+++ b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.yandex.properties
@@ -1,3 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
# To use the YANDEX translate service, you <em>must</em> set your API-Key
# as described in Translate API, https://tech.yandex.com/translate/
# If you do not want translation please set the value to "dummy-key".
http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-translate/src/test/java/org/apache/tika/language/translate/JoshuaNetworkTranslatorTest.java
----------------------------------------------------------------------
diff --git a/tika-translate/src/test/java/org/apache/tika/language/translate/JoshuaNetworkTranslatorTest.java b/tika-translate/src/test/java/org/apache/tika/language/translate/JoshuaNetworkTranslatorTest.java
new file mode 100644
index 0000000..2cf7b3a
--- /dev/null
+++ b/tika-translate/src/test/java/org/apache/tika/language/translate/JoshuaNetworkTranslatorTest.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.language.translate;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import static org.junit.Assert.assertTrue;
+
+public class JoshuaNetworkTranslatorTest {
+
+ /** Translator under test; a fresh instance is built before each test. */
+ JoshuaNetworkTranslator translator;
+
+ /** Creates the translator through its default constructor. */
+ @Before
+ public void setUp() {
+ translator = new JoshuaNetworkTranslator();
+ }
+
+ /**
+ * Translates "hola" from "es" to "en" and, only when the translator reports
+ * itself available, checks that the result equals "hello" ignoring case.
+ */
+ @Test
+ public void testSimpleSpanishToEnglishTranslation() throws Exception {
+ String spanish = "hola";
+ String english = "hello";
+ String actual = translator.translate(spanish, "es", "en");
+ // Without a configured server there is nothing to verify.
+ if (!translator.isAvailable()) {
+ return;
+ }
+ String failureMessage = "Translate " + spanish + " to " + english + " (was " + actual + ")";
+ assertTrue(failureMessage, english.equalsIgnoreCase(actual));
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-translate/src/test/java/org/apache/tika/language/translate/MicrosoftTranslatorTest.java
----------------------------------------------------------------------
diff --git a/tika-translate/src/test/java/org/apache/tika/language/translate/MicrosoftTranslatorTest.java b/tika-translate/src/test/java/org/apache/tika/language/translate/MicrosoftTranslatorTest.java
index a35281d..45d246e 100644
--- a/tika-translate/src/test/java/org/apache/tika/language/translate/MicrosoftTranslatorTest.java
+++ b/tika-translate/src/test/java/org/apache/tika/language/translate/MicrosoftTranslatorTest.java
@@ -16,7 +16,6 @@
*/
package org.apache.tika.language.translate;
-import org.apache.tika.Tika;
import org.junit.Before;
import org.junit.Test;
http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-translate/src/test/java/org/apache/tika/language/translate/YandexTranslatorTest.java
----------------------------------------------------------------------
diff --git a/tika-translate/src/test/java/org/apache/tika/language/translate/YandexTranslatorTest.java b/tika-translate/src/test/java/org/apache/tika/language/translate/YandexTranslatorTest.java
index adac4be..2c5d969 100644
--- a/tika-translate/src/test/java/org/apache/tika/language/translate/YandexTranslatorTest.java
+++ b/tika-translate/src/test/java/org/apache/tika/language/translate/YandexTranslatorTest.java
@@ -1,4 +1,3 @@
-
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -18,7 +17,6 @@
package org.apache.tika.language.translate;
-import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.fail;
import static org.junit.Assume.assumeTrue;
[2/7] tika git commit: Merge branch 'master' into TIKA-1343
Posted by le...@apache.org.
Merge branch 'master' into TIKA-1343
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/4aff4839
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/4aff4839
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/4aff4839
Branch: refs/heads/master
Commit: 4aff4839aece41a739b93169cf7a475ecfc5c70c
Parents: d4fb28f c93ff3e
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Thu May 5 14:03:01 2016 -0700
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Thu May 5 14:03:01 2016 -0700
----------------------------------------------------------------------
.../org/apache/tika/mime/tika-mimetypes.xml | 8 +
.../tika/parser/code/SourceCodeParser.java | 142 ++++++++--------
.../apache/tika/parser/image/ICNSParser.java | 117 +++++++++++++
.../org/apache/tika/parser/image/ICNSType.java | 170 +++++++++++++++++++
.../parser/mp4/DirectFileReadDataSource.java | 2 +-
.../org/apache/tika/parser/mp4/MP4Parser.java | 13 +-
.../services/org.apache.tika.parser.Parser | 3 +-
.../org/apache/tika/mime/TestMimeTypes.java | 8 +
.../tika/parser/image/ICNSParserTest.java | 65 +++++++
.../test/resources/test-documents/testICNS.icns | Bin 0 -> 2472 bytes
.../test-documents/testICNS_basic.icns | Bin 0 -> 18199 bytes
.../resources/test-documents/testKeynoteNew.key | Bin 0 -> 274397 bytes
.../test-documents/testNumbersNew.numbers | Bin 0 -> 179147 bytes
.../resources/test-documents/testPagesNew.pages | Bin 0 -> 237567 bytes
14 files changed, 447 insertions(+), 81 deletions(-)
----------------------------------------------------------------------
[4/7] tika git commit: Improve logging and trivial code conventions
Posted by le...@apache.org.
Improve logging and trivial code conventions
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/a1250ff3
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/a1250ff3
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/a1250ff3
Branch: refs/heads/master
Commit: a1250ff33c68065e4a812285dfa6a6bd2a6a22de
Parents: fe559b8
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Wed Sep 21 08:05:35 2016 -0700
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Wed Sep 21 08:05:35 2016 -0700
----------------------------------------------------------------------
.../translate/JoshuaNetworkTranslator.java | 18 +++++++++++-------
.../translate/translator.joshua.properties | 2 +-
2 files changed, 12 insertions(+), 8 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/a1250ff3/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java
index e97389c..8e1f768 100644
--- a/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java
+++ b/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java
@@ -29,6 +29,9 @@ import javax.ws.rs.core.Response;
import org.apache.cxf.jaxrs.client.WebClient;
import org.apache.tika.exception.TikaException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
@@ -54,10 +57,12 @@ import com.fasterxml.jackson.databind.ObjectMapper;
* so this translation implementation takes care of that.
*/
public class JoshuaNetworkTranslator extends AbstractTranslator {
+
+ private static final Logger LOG = LoggerFactory.getLogger(JoshuaNetworkTranslator.class);
private static final String PROPERTIES_FILE = "translator.joshua.properties";
- private String JOSHUA_SERVER = "joshua.server.url";
+ private static final String JOSHUA_SERVER = "joshua.server.url";
private String networkServer;
@@ -82,8 +87,7 @@ public class JoshuaNetworkTranslator extends AbstractTranslator {
networkServer = props.getProperty(JOSHUA_SERVER);
}
} catch (IOException e) {
- // Error with properties file. Translation will not work.
- e.printStackTrace();
+ LOG.error("An error occured whilst reading translator.joshua.properties file", e);
}
}
@@ -119,7 +123,7 @@ public class JoshuaNetworkTranslator extends AbstractTranslator {
sb.replace(i, i + 1, "\n");
}
- text = sb.toString();
+ String inputText = sb.toString();
//create client
if (!networkServer.endsWith("/")) {
@@ -131,10 +135,10 @@ public class JoshuaNetworkTranslator extends AbstractTranslator {
//make the reuest
Response response = client.accept(MediaType.APPLICATION_JSON)
.query("inputLanguage", sourceLanguage)
- .query("inputText", text).get();
+ .query("inputText", inputText).get();
BufferedReader reader = new BufferedReader(new InputStreamReader(
(InputStream) response.getEntity(), UTF_8));
- String line = null;
+ String line;
StringBuffer responseText = new StringBuffer();
while ((line = reader.readLine()) != null) {
responseText.append(line);
@@ -146,7 +150,7 @@ public class JoshuaNetworkTranslator extends AbstractTranslator {
if (!jsonResp.findValuesAsText("code").isEmpty()) {
String code = jsonResp.findValuesAsText("code").get(0);
- if (code.equals("200")) {
+ if ("200".equals(code)) {
return jsonResp.findValue("text").get(0).asText();
} else {
throw new TikaException(jsonResp.findValue("message").get(0).asText());
http://git-wip-us.apache.org/repos/asf/tika/blob/a1250ff3/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties
index 81071f3..4894f48 100644
--- a/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties
+++ b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties
@@ -19,4 +19,4 @@
# if left as null, then translation will not occur and the source text
# will be returned.
# An example would be http://localhost:5000/joshua/translate/
-joshua.server=
\ No newline at end of file
+joshua.server=http://localhost:5000/joshua/translate/
\ No newline at end of file
[7/7] tika git commit: TIKA-1343 Create a Tika Translator
implementation that uses JoshuaDecoder
Posted by le...@apache.org.
TIKA-1343 Create a Tika Translator implementation that uses JoshuaDecoder
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/dadbf55c
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/dadbf55c
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/dadbf55c
Branch: refs/heads/master
Commit: dadbf55c51d166846aa0d365fd2ed340b604bfae
Parents: 5657ae6
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Mon Oct 24 22:20:04 2016 -0700
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Mon Oct 24 22:20:04 2016 -0700
----------------------------------------------------------------------
.../translate/JoshuaNetworkTranslator.java | 44 ++++++++++----------
...rg.apache.tika.language.translate.Translator | 3 +-
.../translate/translator.joshua.properties | 2 +-
.../translate/JoshuaNetworkTranslatorTest.java | 2 +-
4 files changed, 27 insertions(+), 24 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/dadbf55c/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java
index 8e1f768..8cf0adf 100644
--- a/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java
+++ b/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java
@@ -22,6 +22,8 @@ import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.List;
import java.util.Properties;
import javax.ws.rs.core.MediaType;
@@ -35,6 +37,8 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import com.fasterxml.jackson.jaxrs.json.JacksonJsonProvider;
/**
* <p>This translator is designed to work with a TCP-IP available
@@ -57,7 +61,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
* so this translation implementation takes care of that.
*/
public class JoshuaNetworkTranslator extends AbstractTranslator {
-
+
private static final Logger LOG = LoggerFactory.getLogger(JoshuaNetworkTranslator.class);
private static final String PROPERTIES_FILE = "translator.joshua.properties";
@@ -65,8 +69,6 @@ public class JoshuaNetworkTranslator extends AbstractTranslator {
private static final String JOSHUA_SERVER = "joshua.server.url";
private String networkServer;
-
- private WebClient client;
/**
* Default constructor which first checks for the presence of
@@ -124,40 +126,40 @@ public class JoshuaNetworkTranslator extends AbstractTranslator {
}
String inputText = sb.toString();
+ WebClient client;
+ final List<Object> providers = new ArrayList<>();
+ JacksonJsonProvider jacksonJsonProvider = new JacksonJsonProvider();
+ providers.add(jacksonJsonProvider);
//create client
if (!networkServer.endsWith("/")) {
- client = WebClient.create(networkServer + "/" + targetLanguage + "/");
+ client = WebClient.create(networkServer + "/" + targetLanguage, providers);
} else {
- client = WebClient.create(networkServer + targetLanguage + "/");
+ client = WebClient.create(networkServer + targetLanguage, providers);
}
+ ObjectMapper requestMapper = new ObjectMapper();
+ ObjectNode jsonNode = requestMapper.createObjectNode();
+ jsonNode.put("inputLanguage", sourceLanguage);
+ jsonNode.put("inputText", inputText);
//make the reuest
- Response response = client.accept(MediaType.APPLICATION_JSON)
- .query("inputLanguage", sourceLanguage)
- .query("inputText", inputText).get();
+ Response response = client.accept(MediaType.APPLICATION_JSON).type(MediaType.APPLICATION_JSON).post(jsonNode);
BufferedReader reader = new BufferedReader(new InputStreamReader(
(InputStream) response.getEntity(), UTF_8));
String line;
- StringBuffer responseText = new StringBuffer();
+ StringBuilder responseText = new StringBuilder();
while ((line = reader.readLine()) != null) {
responseText.append(line);
}
try {
- ObjectMapper mapper = new ObjectMapper();
- JsonNode jsonResp = mapper.readTree(responseText.toString());
-
- if (!jsonResp.findValuesAsText("code").isEmpty()) {
- String code = jsonResp.findValuesAsText("code").get(0);
- if ("200".equals(code)) {
- return jsonResp.findValue("text").get(0).asText();
- } else {
- throw new TikaException(jsonResp.findValue("message").get(0).asText());
- }
+ ObjectMapper responseMapper = new ObjectMapper();
+ JsonNode jsonResp = responseMapper.readTree(responseText.toString());
+
+ if (jsonResp.findValuesAsText("outputText") != null) {
+ return jsonResp.findValuesAsText("outputText").get(0);
} else {
- throw new TikaException("Return message not recognized: " +
- responseText.toString().substring(0, Math.min(responseText.length(), 100)));
+ throw new TikaException(jsonResp.findValue("message").get(0).asText());
}
} catch (JsonParseException e) {
throw new TikaException("Error requesting translation from '" +
http://git-wip-us.apache.org/repos/asf/tika/blob/dadbf55c/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator b/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator
index 773daf3..f3dcad4 100644
--- a/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator
+++ b/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator
@@ -16,4 +16,5 @@
org.apache.tika.language.translate.MicrosoftTranslator
org.apache.tika.language.translate.GoogleTranslator
org.apache.tika.language.translate.Lingo24Translator
-org.apache.tika.language.translate.CachedTranslator
\ No newline at end of file
+org.apache.tika.language.translate.CachedTranslator
+org.apache.tika.language.translate.JoshuaNetworkTranslator
http://git-wip-us.apache.org/repos/asf/tika/blob/dadbf55c/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties
index 4894f48..53bd773 100644
--- a/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties
+++ b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties
@@ -19,4 +19,4 @@
# if left as null, then translation will not occur and the source text
# will be returned.
# An example would be http://localhost:5000/joshua/translate/
-joshua.server=http://localhost:5000/joshua/translate/
\ No newline at end of file
+joshua.server.url=http://localhost:5000/joshua/translate/
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/dadbf55c/tika-translate/src/test/java/org/apache/tika/language/translate/JoshuaNetworkTranslatorTest.java
----------------------------------------------------------------------
diff --git a/tika-translate/src/test/java/org/apache/tika/language/translate/JoshuaNetworkTranslatorTest.java b/tika-translate/src/test/java/org/apache/tika/language/translate/JoshuaNetworkTranslatorTest.java
index 2cf7b3a..4413926 100644
--- a/tika-translate/src/test/java/org/apache/tika/language/translate/JoshuaNetworkTranslatorTest.java
+++ b/tika-translate/src/test/java/org/apache/tika/language/translate/JoshuaNetworkTranslatorTest.java
@@ -34,7 +34,7 @@ public class JoshuaNetworkTranslatorTest {
public void testSimpleSpanishToEnglishTranslation() throws Exception {
String source = "hola";
String expected = "hello";
- String translated = translator.translate(source, "es", "en");
+ String translated = translator.translate(source, "spanish", "english");
if (translator.isAvailable()) assertTrue("Translate " + source + " to " + expected + " (was " + translated + ")",
expected.equalsIgnoreCase(translated));
}
[6/7] tika git commit: Merge branch 'TIKA-1343' of
https://github.com/lewismc/tika into TIKA-1343
Posted by le...@apache.org.
Merge branch 'TIKA-1343' of https://github.com/lewismc/tika into TIKA-1343
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/5657ae66
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/5657ae66
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/5657ae66
Branch: refs/heads/master
Commit: 5657ae6616cd461a19676952f40082b2ec291dac
Parents: 7ca105e d50a693
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Mon Oct 24 10:49:26 2016 -0700
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Mon Oct 24 10:49:26 2016 -0700
----------------------------------------------------------------------
.../tika/language/detect/LanguageResult.java | 6 +-
.../tika/language/translate/Translator.java | 20 +-
tika-parsers/pom.xml | 14 ++
.../language/translate/AbstractTranslator.java | 2 +-
.../language/translate/GoogleTranslator.java | 5 -
.../translate/JoshuaNetworkTranslator.java | 193 +++++++++++++++++++
.../language/translate/MosesTranslator.java | 2 +
.../translate/translator.google.properties | 6 +-
.../translate/translator.joshua.properties | 22 +++
.../translate/translator.lingo24.properties | 5 -
.../translate/translator.moses.properties | 5 -
.../translate/JoshuaNetworkTranslatorTest.java | 41 ++++
.../translate/MicrosoftTranslatorTest.java | 1 -
.../translate/YandexTranslatorTest.java | 2 -
14 files changed, 282 insertions(+), 42 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/5657ae66/tika-parsers/pom.xml
----------------------------------------------------------------------
[3/7] tika git commit: Merge master into TIKA-1343
Posted by le...@apache.org.
Merge master into TIKA-1343
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/fe559b80
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/fe559b80
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/fe559b80
Branch: refs/heads/master
Commit: fe559b80bcad1f107904ca7a89724a26ea2921a1
Parents: 4aff483 23a11ef
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Fri Jul 1 13:35:52 2016 -0700
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Fri Jul 1 13:35:52 2016 -0700
----------------------------------------------------------------------
CHANGES.txt | 38 +-
pom.xml | 2 +-
tika-app/pom.xml | 2 +-
.../main/java/org/apache/tika/cli/TikaCLI.java | 9 +-
tika-batch/pom.xml | 2 +-
tika-bundle/pom.xml | 2 +-
tika-core/pom.xml | 2 +-
.../org/apache/tika/detect/NameDetector.java | 15 +-
.../tika/detect/ZeroSizeFileDetector.java | 45 +
.../java/org/apache/tika/io/EndianUtils.java | 829 +++---
.../tika/metadata/TikaCoreProperties.java | 7 +
.../java/org/apache/tika/mime/MediaType.java | 3 +
.../org/apache/tika/mime/MediaTypeRegistry.java | 2 +
.../org/apache/tika/mime/tika-mimetypes.xml | 69 +-
.../java/org/apache/tika/TikaDetectionTest.java | 2 +-
.../src/test/java/org/apache/tika/TikaTest.java | 6 +-
.../apache/tika/detect/NameDetectorTest.java | 10 +
.../tika/detect/ZeroSizeFileDetectorTest.java | 64 +
.../org/apache/tika/io/EndianUtilsTest.java | 35 +
tika-example/pom.xml | 2 +-
tika-java7/pom.xml | 2 +-
tika-langdetect/pom.xml | 3 +-
...apache.tika.language.detect.LanguageDetector | 15 +
tika-parent/pom.xml | 4 +-
tika-parsers/pom.xml | 4 +-
.../parser/apple/AppleSingleFileParser.java | 205 ++
.../org/apache/tika/parser/dbf/DBFCell.java | 147 +
.../apache/tika/parser/dbf/DBFColumnHeader.java | 97 +
.../apache/tika/parser/dbf/DBFFileHeader.java | 144 +
.../org/apache/tika/parser/dbf/DBFParser.java | 155 ++
.../org/apache/tika/parser/dbf/DBFReader.java | 207 ++
.../java/org/apache/tika/parser/dbf/DBFRow.java | 62 +
.../apache/tika/parser/geo/topic/GeoParser.java | 14 +-
.../tika/parser/html/HtmlEncodingDetector.java | 16 +-
.../apache/tika/parser/html/HtmlHandler.java | 3 +
.../tika/parser/image/xmp/JempboxExtractor.java | 30 +
.../iwork/iwana/IWork13PackageParser.java | 86 +
.../tika/parser/mail/MailContentHandler.java | 110 +-
.../microsoft/AbstractPOIFSExtractor.java | 32 +-
.../tika/parser/microsoft/HSLFExtractor.java | 32 +-
.../parser/microsoft/JackcessExtractor.java | 4 +-
.../parser/microsoft/MSOwnerFileParser.java | 81 +
.../tika/parser/microsoft/OfficeParser.java | 2 +-
.../tika/parser/microsoft/WordExtractor.java | 22 +-
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 12 +-
.../ooxml/XSLFPowerPointExtractorDecorator.java | 58 +-
.../ooxml/XSSFExcelExtractorDecorator.java | 99 +-
.../microsoft/xml/AbstractXML2003Parser.java | 128 +
.../parser/microsoft/xml/HyperlinkHandler.java | 96 +
.../microsoft/xml/SpreadsheetMLParser.java | 175 ++
.../tika/parser/microsoft/xml/WordMLParser.java | 306 +++
.../parser/ner/grobid/GrobidNERecogniser.java | 28 +-
.../tika/parser/ocr/TesseractOCRParser.java | 87 +-
.../tika/parser/pdf/AbstractPDF2XHTML.java | 578 ++++
.../org/apache/tika/parser/pdf/OCR2XHTML.java | 127 +
.../org/apache/tika/parser/pdf/PDF2XHTML.java | 518 +---
.../org/apache/tika/parser/pdf/PDFParser.java | 7 +
.../apache/tika/parser/pdf/PDFParserConfig.java | 274 +-
.../tika/parser/pkg/ZipContainerDetector.java | 12 +
.../tika/parser/rtf/RTFEmbObjHandler.java | 7 +-
.../tika/parser/rtf/RTFObjDataParser.java | 43 +-
.../apache/tika/parser/rtf/TextExtractor.java | 11 +-
.../services/org.apache.tika.parser.Parser | 7 +-
.../apache/tika/parser/pdf/PDFParser.properties | 10 +-
.../tika/detect/TestContainerAwareDetector.java | 11 +
.../org/apache/tika/mime/TestMimeTypes.java | 38 +-
.../parser/apple/AppleSingleFileParserTest.java | 46 +
.../apache/tika/parser/dbf/DBFParserTest.java | 158 ++
.../apache/tika/parser/html/HtmlParserTest.java | 60 +-
.../parser/image/xmp/JempboxExtractorTest.java | 29 +-
.../tika/parser/mail/RFC822ParserTest.java | 115 +
.../tika/parser/microsoft/ExcelParserTest.java | 28 +-
.../parser/microsoft/MSOwnerFileParserTest.java | 31 +
.../microsoft/POIContainerExtractionTest.java | 4 +-
.../parser/microsoft/PowerPointParserTest.java | 13 +-
.../tika/parser/microsoft/WordParserTest.java | 19 +
.../ooxml/OOXMLContainerExtractionTest.java | 2 +-
.../parser/microsoft/ooxml/OOXMLParserTest.java | 43 +-
.../parser/microsoft/xml/XML2003ParserTest.java | 109 +
.../apache/tika/parser/pdf/PDFParserTest.java | 74 +-
.../apache/tika/parser/rtf/RTFParserTest.java | 127 +-
.../test-documents/testAppleSingleFile.pdf | Bin 0 -> 1893 bytes
.../test/resources/test-documents/testDBF.dbf | Bin 0 -> 890 bytes
.../test-documents/testDBF_gb18030.dbf | Bin 0 -> 144 bytes
.../test/resources/test-documents/testDJVU.djvu | Bin 0 -> 89 bytes
.../resources/test-documents/testEXCEL2003.xml | 100 +
.../test-documents/testEXCEL_hyperlinks.xls | Bin 0 -> 29696 bytes
.../test-documents/testEXCEL_hyperlinks.xlsx | Bin 0 -> 10038 bytes
.../resources/test-documents/testEXCEL_poi.xlsx | Bin 0 -> 3360 bytes
.../test-documents/testEndNoteImportFile.enw | 10 +
.../test-documents/testExcel_embeddedPDF.xls | Bin 0 -> 38400 bytes
.../test-documents/testExcel_embeddedPDF.xlsx | Bin 0 -> 25602 bytes
.../resources/test-documents/testICalendar.ics | 15 +
.../test-documents/testKeynote2013.key | Bin 0 -> 274397 bytes
.../resources/test-documents/testKeynoteNew.key | Bin 274397 -> 0 bytes
.../resources/test-documents/testMSOwnerFile | Bin 0 -> 162 bytes
.../test-documents/testNumbers2013.numbers | Bin 0 -> 179147 bytes
.../test-documents/testNumbersNew.numbers | Bin 179147 -> 0 bytes
.../test-documents/testPPT_EmbeddedPDF.ppt | Bin 0 -> 187392 bytes
.../test-documents/testPPT_EmbeddedPDF.pptx | Bin 0 -> 108637 bytes
.../test-documents/testPages2013.pages | Bin 0 -> 237567 bytes
.../resources/test-documents/testPagesNew.pages | Bin 237567 -> 0 bytes
.../test-documents/testRFC822_date_utf8 | 8 +
.../resources/test-documents/testRFC822_eml | 33 +
.../resources/test-documents/testVCalendar.vcs | 10 +
.../resources/test-documents/testWORD2003.xml | 2542 ++++++++++++++++++
.../test-documents/testWindowsMediaMeta.asx | 6 +
.../test/resources/test-documents/testXMP.xmp | 178 ++
.../test-documents/test_recursive_embedded.doc | Bin 0 -> 31744 bytes
tika-serialization/pom.xml | 2 +-
tika-server/pom.xml | 2 +-
tika-translate/pom.xml | 2 +-
.../translate/translator.yandex.properties | 2 +-
tika-xmp/pom.xml | 2 +-
114 files changed, 7822 insertions(+), 1203 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/fe559b80/tika-parsers/pom.xml
----------------------------------------------------------------------
[5/7] tika git commit: Merge branch 'master' into TIKA-1343
Posted by le...@apache.org.
Merge branch 'master' into TIKA-1343
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d50a6936
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d50a6936
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d50a6936
Branch: refs/heads/master
Commit: d50a69361bd0196fb2595313cb47222f61701ba4
Parents: a1250ff 07aea36
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Wed Sep 21 08:06:47 2016 -0700
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Wed Sep 21 08:06:47 2016 -0700
----------------------------------------------------------------------
CHANGES.txt | 30 +
tika-bundle/pom.xml | 2 +-
.../main/java/org/apache/tika/config/Field.java | 45 +
.../org/apache/tika/config/Initializable.java | 33 +
.../main/java/org/apache/tika/config/Param.java | 191 +++++
.../java/org/apache/tika/config/ParamField.java | 158 ++++
.../java/org/apache/tika/config/TikaConfig.java | 47 +-
.../tika/exception/TikaConfigException.java | 39 +
.../org/apache/tika/parser/AbstractParser.java | 10 +
.../java/org/apache/tika/parser/Parser.java | 1 +
.../tika/parser/external/ExternalParser.java | 85 +-
.../apache/tika/sax/XHTMLContentHandler.java | 5 +-
.../org/apache/tika/utils/AnnotationUtils.java | 138 +++
.../apache/tika/utils/ServiceLoaderUtils.java | 30 +
.../org/apache/tika/mime/tika-mimetypes.xml | 67 +-
.../java/org/apache/tika/config/ParamTest.java | 71 ++
.../tika/parser/DummyInitializableParser.java | 68 ++
.../tika/parser/DummyParameterizedParser.java | 113 +++
.../tika/parser/InitializableParserTest.java | 45 +
.../tika/parser/ParameterizedParserTest.java | 125 +++
.../apache/tika/utils/AnnotationUtilsTest.java | 190 +++++
.../tika/config/TIKA-1508-configurable.xml | 37 +
.../tika/config/TIKA-1986-bad-parameters.xml | 26 +
.../apache/tika/config/TIKA-1986-bad-types.xml | 26 +
.../apache/tika/config/TIKA-1986-bad-values.xml | 26 +
.../tika/config/TIKA-1986-initializable.xml | 28 +
.../TIKA-1986-parameterized-decorated.xml | 39 +
.../tika/config/TIKA-1986-parameterized.xml | 38 +
.../tika/config/TIKA-1986-some-parameters.xml | 28 +
tika-parent/pom.xml | 12 +-
tika-parsers/pom.xml | 26 +-
.../chm/accessor/ChmDirectoryListingSet.java | 11 +-
.../apache/tika/parser/chm/core/ChmCommons.java | 5 +-
.../tika/parser/chm/core/ChmExtractor.java | 4 +-
.../apache/tika/parser/chm/lzx/ChmLzxBlock.java | 4 +-
.../tika/parser/mail/MailContentHandler.java | 13 +-
.../org/apache/tika/parser/mat/MatParser.java | 5 +
.../tika/parser/microsoft/ExcelExtractor.java | 34 +-
.../microsoft/TikaExcelDataFormatter.java | 41 +
.../microsoft/TikaExcelGeneralFormat.java | 90 ++
.../tika/parser/microsoft/WordExtractor.java | 20 +
.../microsoft/ooxml/MetadataExtractor.java | 15 +-
.../ooxml/XSSFExcelExtractorDecorator.java | 20 +-
.../ooxml/XWPFWordExtractorDecorator.java | 52 +-
.../microsoft/xml/AbstractXML2003Parser.java | 4 +
.../tika/parser/microsoft/xml/WordMLParser.java | 3 +
.../tika/parser/ocr/TesseractOCRConfig.java | 181 +++-
.../tika/parser/ocr/TesseractOCRParser.java | 113 ++-
.../parser/odf/OpenDocumentContentParser.java | 3 +
.../tika/parser/pdf/AbstractPDF2XHTML.java | 16 +-
.../org/apache/tika/parser/pdf/PDF2XHTML.java | 1 -
.../org/apache/tika/parser/pdf/PDFParser.java | 29 +-
.../apache/tika/parser/pdf/PDFParserConfig.java | 86 +-
.../parser/recognition/ObjectRecogniser.java | 75 ++
.../recognition/ObjectRecognitionParser.java | 171 ++++
.../parser/recognition/RecognisedObject.java | 91 ++
.../tf/TensorflowImageRecParser.java | 152 ++++
.../tf/TensorflowRESTRecogniser.java | 142 ++++
.../apache/tika/parser/txt/CharsetDetector.java | 416 +++++----
.../apache/tika/parser/txt/CharsetMatch.java | 139 ++-
.../tika/parser/txt/CharsetRecog_2022.java | 28 +-
.../tika/parser/txt/CharsetRecog_UTF8.java | 24 +-
.../tika/parser/txt/CharsetRecog_Unicode.java | 99 ++-
.../tika/parser/txt/CharsetRecog_mbcs.java | 44 +-
.../tika/parser/txt/CharsetRecog_sbcs.java | 835 ++++++++++---------
.../tika/parser/txt/CharsetRecognizer.java | 31 +-
.../parser/ocr/TesseractOCRConfig.properties | 13 +-
.../org/apache/tika/parser/ocr/rotation.py | 72 ++
.../recognition/tf/InceptionRestDockerfile | 41 +
.../parser/recognition/tf/classify_image.py | 212 +++++
.../tika/parser/recognition/tf/inceptionapi.py | 319 +++++++
.../org/apache/tika/mime/TestMimeTypes.java | 13 +
.../tika/parser/chm/TestChmExtractor.java | 21 +-
.../apache/tika/parser/html/HtmlParserTest.java | 140 +++-
.../tika/parser/mail/RFC822ParserTest.java | 68 +-
.../apache/tika/parser/mbox/MboxParserTest.java | 1 -
.../tika/parser/microsoft/ExcelParserTest.java | 10 +
.../tika/parser/microsoft/WordParserTest.java | 11 +-
.../parser/microsoft/ooxml/OOXMLParserTest.java | 28 +-
.../parser/microsoft/xml/XML2003ParserTest.java | 1 +
.../tika/parser/ocr/TesseractOCRConfigTest.java | 61 +-
.../tika/parser/ocr/TesseractOCRParserTest.java | 18 +-
.../apache/tika/parser/pdf/PDFParserTest.java | 44 +-
.../ObjectRecognitionParserTest.java | 89 ++
.../tf/TensorflowImageRecParserTest.java | 58 ++
.../parser/pdf/tika-config-non-primitives.xml | 29 +
.../org/apache/tika/parser/pdf/tika-config.xml | 26 +
.../recognition/tika-config-tflow-rest.xml | 30 +
.../parser/recognition/tika-config-tflow.xml | 29 +
.../resources/test-documents/testChm_oom.chm | Bin 0 -> 4315 bytes
.../test-documents/testEXCEL_big_numbers.xls | Bin 0 -> 26112 bytes
.../test-documents/testEXCEL_big_numbers.xlsx | Bin 0 -> 8396 bytes
.../test-documents/testEmailWithPNGAtt.eml | 354 ++++++++
.../resources/test-documents/testHTML_head.html | 32 +
.../test-documents/testOpenOffice2.odt | Bin 26448 -> 26460 bytes
.../resources/test-documents/testStataDTA.dta | Bin 0 -> 1207 bytes
.../resources/test-documents/testStataDTA.txt | 15 +
.../resources/test-documents/testWORD2003.xml | 2 +-
.../test-documents/testWORD_boldHyperlink.doc | Bin 0 -> 27136 bytes
.../test-documents/testWORD_boldHyperlink.docx | Bin 0 -> 12382 bytes
.../testWORD_totalTimeOutOfRange.docx | Bin 0 -> 11047 bytes
.../TesseractOCRConfig-full.properties | 6 +
.../TesseractOCRConfig-partial.properties | 8 +-
tika-translate/pom.xml | 2 +-
104 files changed, 5612 insertions(+), 917 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/d50a6936/tika-parsers/pom.xml
----------------------------------------------------------------------