You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/08/20 12:01:13 UTC
svn commit: r1696748 - in /tika/trunk/tika-app/src:
main/java/org/apache/tika/cli/ main/java/org/apache/tika/gui/
test/java/org/apache/tika/cli/
Author: nick
Date: Thu Aug 20 10:01:13 2015
New Revision: 1696748
URL: http://svn.apache.org/r1696748
Log:
TIKA-1710 patch from Yaniv Kunda - Use Commons IO instead of the Tika Core IO copies, and java.nio.charset.StandardCharsets.UTF_8 instead of IOUtils.UTF_8
Modified:
tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java
tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java
tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1696748&r1=1696747&r2=1696748&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Thu Aug 20 10:01:13 2015
@@ -52,6 +52,9 @@ import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;
+import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.log4j.Level;
@@ -72,9 +75,6 @@ import org.apache.tika.exception.TikaExc
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.fork.ForkParser;
import org.apache.tika.gui.TikaGUI;
-import org.apache.tika.io.CloseShieldInputStream;
-import org.apache.tika.io.FilenameUtils;
-import org.apache.tika.io.IOUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.language.LanguageProfilerBuilder;
import org.apache.tika.language.ProfilingHandler;
@@ -106,6 +106,8 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* Simple command line interface for Apache Tika.
*/
@@ -855,7 +857,7 @@ public class TikaCLI {
for (File mf : dir.listFiles()) {
if (mf.isFile()) {
BufferedReader r = new BufferedReader(new InputStreamReader(
- new FileInputStream(mf), IOUtils.UTF_8));
+ new FileInputStream(mf), UTF_8));
String line;
while ((line = r.readLine()) != null) {
if (line.startsWith("!:mime") ||
@@ -969,7 +971,7 @@ public class TikaCLI {
} else if (System.getProperty("os.name")
.toLowerCase(Locale.ROOT).startsWith("mac os x")) {
// TIKA-324: Override the default encoding on Mac OS X
- return new OutputStreamWriter(output, IOUtils.UTF_8);
+ return new OutputStreamWriter(output, UTF_8);
} else {
return new OutputStreamWriter(output, Charset.defaultCharset());
}
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java?rev=1696748&r1=1696747&r2=1696748&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java Thu Aug 20 10:01:13 2015
@@ -47,10 +47,6 @@ import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.awt.event.KeyEvent;
import java.awt.event.WindowEvent;
-import java.awt.event.ActionEvent;
-import java.awt.event.ActionListener;
-import java.awt.event.KeyEvent;
-import java.awt.event.WindowEvent;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
@@ -65,10 +61,10 @@ import java.util.HashMap;
import java.util.Map;
import java.util.Set;
+import org.apache.commons.io.IOUtils;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.DocumentSelector;
-import org.apache.tika.io.IOUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.serialization.JsonMetadataList;
@@ -91,6 +87,8 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* Simple Swing GUI for Apache Tika. You can drag and drop files on top
* of the window to have them parsed.
@@ -481,11 +479,8 @@ public class TikaGUI extends JFrame
URL url = e.getURL();
InputStream stream = url.openStream();
try {
- StringWriter writer = new StringWriter();
- IOUtils.copy(stream, writer, IOUtils.UTF_8.name());
-
JEditorPane editor =
- new JEditorPane("text/plain", writer.toString());
+ new JEditorPane("text/plain", IOUtils.toString(stream, UTF_8));
editor.setEditable(false);
editor.setBackground(Color.WHITE);
editor.setCaretPosition(0);
Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java?rev=1696748&r1=1696747&r2=1696748&view=diff
==============================================================================
--- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java (original)
+++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java Thu Aug 20 10:01:13 2015
@@ -16,6 +16,7 @@
*/
package org.apache.tika.cli;
+import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
@@ -27,7 +28,7 @@ import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.commons.io.FileUtils;
-import org.apache.tika.io.IOUtils;
+import org.apache.commons.io.IOUtils;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
@@ -47,7 +48,7 @@ public class TikaCLIBatchCommandLineTest
OutputStream os = null;
try {
os = new FileOutputStream(testFile);
- IOUtils.write("test output", os, "UTF-8");
+ IOUtils.write("test output", os, UTF_8);
} catch (IOException e) {
throw new RuntimeException("Couldn't open testFile");
} finally {
Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java?rev=1696748&r1=1696747&r2=1696748&view=diff
==============================================================================
--- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java (original)
+++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java Thu Aug 20 10:01:13 2015
@@ -17,6 +17,7 @@
package org.apache.tika.cli;
+import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
@@ -31,7 +32,7 @@ import java.io.Reader;
import java.util.List;
import org.apache.commons.io.FileUtils;
-import org.apache.tika.io.IOUtils;
+import org.apache.commons.io.IOUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.serialization.JsonMetadataList;
import org.apache.tika.parser.RecursiveParserWrapper;
@@ -54,9 +55,9 @@ public class TikaCLIBatchIntegrationTest
tempDir.delete();
tempDir.mkdir();
outBuffer = new ByteArrayOutputStream();
- PrintStream outWriter = new PrintStream(outBuffer, true, IOUtils.UTF_8.name());
+ PrintStream outWriter = new PrintStream(outBuffer, true, UTF_8.name());
ByteArrayOutputStream errBuffer = new ByteArrayOutputStream();
- PrintStream errWriter = new PrintStream(errBuffer, true, IOUtils.UTF_8.name());
+ PrintStream errWriter = new PrintStream(errBuffer, true, UTF_8.name());
out = System.out;
err = System.err;
System.setOut(outWriter);
@@ -65,8 +66,8 @@ public class TikaCLIBatchIntegrationTest
@After
public void tearDown() throws Exception {
- System.setOut(new PrintStream(out, true, IOUtils.UTF_8.name()));
- System.setErr(new PrintStream(err, true, IOUtils.UTF_8.name()));
+ System.setOut(new PrintStream(out, true, UTF_8.name()));
+ System.setErr(new PrintStream(err, true, UTF_8.name()));
FileUtils.deleteDirectory(tempDir);
}
@@ -104,7 +105,7 @@ public class TikaCLIBatchIntegrationTest
};
TikaCLI.main(params);
reader = new InputStreamReader(
- new FileInputStream(new File(tempDir, "test_recursive_embedded.docx.json")), IOUtils.UTF_8);
+ new FileInputStream(new File(tempDir, "test_recursive_embedded.docx.json")), UTF_8);
List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
assertEquals(12, metadataList.size());
assertTrue(metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).contains("human events"));
@@ -123,7 +124,7 @@ public class TikaCLIBatchIntegrationTest
assertTrue("bad_xml.xml.xml", new File(tempDir, "bad_xml.xml.xml").isFile());
assertTrue("coffee.xls.xml", new File(tempDir, "coffee.xls.xml").exists());
- String sysOutString = new String(outBuffer.toByteArray(), IOUtils.UTF_8);
+ String sysOutString = new String(outBuffer.toByteArray(), UTF_8);
assertTrue(sysOutString.contains("MY_CUSTOM_LOG_CONFIG"));
}
@@ -139,7 +140,7 @@ public class TikaCLIBatchIntegrationTest
};
TikaCLI.main(params);
reader = new InputStreamReader(
- new FileInputStream(new File(tempDir, "test_recursive_embedded.docx.json")), IOUtils.UTF_8);
+ new FileInputStream(new File(tempDir, "test_recursive_embedded.docx.json")), UTF_8);
List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
assertEquals(12, metadataList.size());
assertEquals("59f626e09a8c16ab6dbc2800c685f772", metadataList.get(0).get("X-TIKA:digest:MD5"));
@@ -159,7 +160,7 @@ public class TikaCLIBatchIntegrationTest
};
TikaCLI.main(params);
reader = new InputStreamReader(
- new FileInputStream(new File(tempDir, "test_recursive_embedded.docx.json")), IOUtils.UTF_8);
+ new FileInputStream(new File(tempDir, "test_recursive_embedded.docx.json")), UTF_8);
List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
assertEquals(12, metadataList.size());
assertNotNull(metadataList.get(0).get("X-TIKA:digest:SHA512"));
Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java?rev=1696748&r1=1696747&r2=1696748&view=diff
==============================================================================
--- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java (original)
+++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Thu Aug 20 10:01:13 2015
@@ -16,6 +16,7 @@
*/
package org.apache.tika.cli;
+import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
@@ -26,7 +27,6 @@ import java.net.URI;
import org.apache.commons.io.FileUtils;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.IOUtils;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
@@ -50,7 +50,7 @@ public class TikaCLITest {
outContent = new ByteArrayOutputStream();
resourcePrefix = testDataURI.toString();
stdout = System.out;
- System.setOut(new PrintStream(outContent, true, IOUtils.UTF_8.name()));
+ System.setOut(new PrintStream(outContent, true, UTF_8.name()));
}
/**
@@ -74,7 +74,7 @@ public class TikaCLITest {
public void testListParserDetail() throws Exception{
String[] params = {"--list-parser-detail"};
TikaCLI.main(params);
- assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("application/vnd.oasis.opendocument.text-web"));
+ assertTrue(outContent.toString(UTF_8.name()).contains("application/vnd.oasis.opendocument.text-web"));
}
/**
@@ -99,11 +99,11 @@ public class TikaCLITest {
public void testXMLOutput() throws Exception{
String[] params = {"-x", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
- assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("?xml version=\"1.0\" encoding=\"UTF-8\"?"));
+ assertTrue(outContent.toString(UTF_8.name()).contains("?xml version=\"1.0\" encoding=\"UTF-8\"?"));
params = new String[]{"-x", "--digest=SHA256", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
- assertTrue(outContent.toString(IOUtils.UTF_8.name())
+ assertTrue(outContent.toString(UTF_8.name())
.contains("<meta name=\"X-TIKA:digest:SHA256\" content=\"e90779adbac09c4ee"));
}
@@ -119,7 +119,7 @@ public class TikaCLITest {
TikaCLI.main(params);
assertTrue(outContent.toString("UTF-8").contains("html xmlns=\"http://www.w3.org/1999/xhtml"));
assertTrue("Expanded <title></title> element should be present",
- outContent.toString(IOUtils.UTF_8.name()).contains("<title></title>"));
+ outContent.toString(UTF_8.name()).contains("<title></title>"));
params = new String[]{"-h", "--digest=SHA384", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
@@ -136,7 +136,7 @@ public class TikaCLITest {
public void testTextOutput() throws Exception{
String[] params = {"-t", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
- assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("finished off the cake"));
+ assertTrue(outContent.toString(UTF_8.name()).contains("finished off the cake"));
}
/**
@@ -147,12 +147,12 @@ public class TikaCLITest {
public void testMetadataOutput() throws Exception{
String[] params = {"-m", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
- assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("text/plain"));
+ assertTrue(outContent.toString(UTF_8.name()).contains("text/plain"));
params = new String[]{"-m", "--digest=SHA512", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
- assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("text/plain"));
- assertTrue(outContent.toString(IOUtils.UTF_8.name())
+ assertTrue(outContent.toString(UTF_8.name()).contains("text/plain"));
+ assertTrue(outContent.toString(UTF_8.name())
.contains("X-TIKA:digest:SHA512: dd459d99bc19ff78fd31fbae46e0"));
}
@@ -165,7 +165,7 @@ public class TikaCLITest {
public void testJsonMetadataOutput() throws Exception {
String[] params = {"--json", "--digest=MD2", resourcePrefix + "testJsonMultipleInts.html"};
TikaCLI.main(params);
- String json = outContent.toString(IOUtils.UTF_8.name());
+ String json = outContent.toString(UTF_8.name());
//TIKA-1310
assertTrue(json.contains("\"fb:admins\":\"1,2,3,4\","));
@@ -187,7 +187,7 @@ public class TikaCLITest {
public void testJsonMetadataPrettyPrintOutput() throws Exception {
String[] params = {"--json", "-r", resourcePrefix + "testJsonMultipleInts.html"};
TikaCLI.main(params);
- String json = outContent.toString(IOUtils.UTF_8.name());
+ String json = outContent.toString(UTF_8.name());
assertTrue(json.contains(" \"X-Parsed-By\": [\n" +
" \"org.apache.tika.parser.DefaultParser\",\n" +
@@ -210,7 +210,7 @@ public class TikaCLITest {
public void testLanguageOutput() throws Exception{
String[] params = {"-l", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
- assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("en"));
+ assertTrue(outContent.toString(UTF_8.name()).contains("en"));
}
/**
@@ -222,7 +222,7 @@ public class TikaCLITest {
public void testDetectOutput() throws Exception{
String[] params = {"-d", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
- assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("text/plain"));
+ assertTrue(outContent.toString(UTF_8.name()).contains("text/plain"));
}
/**
@@ -234,7 +234,7 @@ public class TikaCLITest {
public void testListMetModels() throws Exception{
String[] params = {"--list-met-models", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
- assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("text/plain"));
+ assertTrue(outContent.toString(UTF_8.name()).contains("text/plain"));
}
/**
@@ -246,7 +246,7 @@ public class TikaCLITest {
public void testListSupportedTypes() throws Exception{
String[] params = {"--list-supported-types", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
- assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("supertype: application/octet-stream"));
+ assertTrue(outContent.toString(UTF_8.name()).contains("supertype: application/octet-stream"));
}
/**
@@ -319,7 +319,7 @@ public class TikaCLITest {
public void testMultiValuedMetadata() throws Exception {
String[] params = {"-m", resourcePrefix + "testMultipleSheets.numbers"};
TikaCLI.main(params);
- String content = outContent.toString(IOUtils.UTF_8.name());
+ String content = outContent.toString(UTF_8.name());
assertTrue(content.contains("sheetNames: Checking"));
assertTrue(content.contains("sheetNames: Secon sheet"));
assertTrue(content.contains("sheetNames: Logical Sheet 3"));
@@ -333,7 +333,7 @@ public class TikaCLITest {
new File("subdir/foo.txt").delete();
new File("subdir").delete();
TikaCLI.main(params);
- String content = outContent.toString(IOUtils.UTF_8.name());
+ String content = outContent.toString(UTF_8.name());
assertTrue(content.contains("Extracting 'subdir/foo.txt'"));
// clean up. TODO: These should be in target.
new File("target/subdir/foo.txt").delete();
@@ -359,7 +359,7 @@ public class TikaCLITest {
public void testConfig() throws Exception {
String[] params = new String[]{"--config="+testDataFile.toString()+"/tika-config1.xml", resourcePrefix+"bad_xml.xml"};
TikaCLI.main(params);
- String content = outContent.toString(IOUtils.UTF_8.name());
+ String content = outContent.toString(UTF_8.name());
assertTrue(content.contains("apple"));
assertTrue(content.contains("org.apache.tika.parser.html.HtmlParser"));
}
@@ -368,7 +368,7 @@ public class TikaCLITest {
public void testJsonRecursiveMetadataParserMetadataOnly() throws Exception {
String[] params = new String[]{"-m", "-J", "-r", resourcePrefix+"test_recursive_embedded.docx"};
TikaCLI.main(params);
- String content = outContent.toString(IOUtils.UTF_8.name());
+ String content = outContent.toString(UTF_8.name());
assertTrue(content.contains("[\n" +
" {\n" +
" \"Application-Name\": \"Microsoft Office Word\",\n" +
@@ -384,7 +384,7 @@ public class TikaCLITest {
public void testJsonRecursiveMetadataParserDefault() throws Exception {
String[] params = new String[]{"-J", "-r", resourcePrefix+"test_recursive_embedded.docx"};
TikaCLI.main(params);
- String content = outContent.toString(IOUtils.UTF_8.name());
+ String content = outContent.toString(UTF_8.name());
assertTrue(content.contains("\"X-TIKA:content\": \"\\u003chtml xmlns\\u003d\\\"http://www.w3.org/1999/xhtml"));
}
@@ -392,7 +392,7 @@ public class TikaCLITest {
public void testJsonRecursiveMetadataParserText() throws Exception {
String[] params = new String[]{"-J", "-r", "-t", resourcePrefix+"test_recursive_embedded.docx"};
TikaCLI.main(params);
- String content = outContent.toString(IOUtils.UTF_8.name());
+ String content = outContent.toString(UTF_8.name());
assertTrue(content.contains("\\n\\nembed_4\\n"));
assertTrue(content.contains("\\n\\nembed_0"));
}
@@ -401,7 +401,7 @@ public class TikaCLITest {
public void testDigestInJson() throws Exception {
String[] params = new String[]{"-J", "-r", "-t", "--digest=MD5", resourcePrefix+"test_recursive_embedded.docx"};
TikaCLI.main(params);
- String content = outContent.toString(IOUtils.UTF_8.name());
+ String content = outContent.toString(UTF_8.name());
assertTrue(content.contains("\"X-TIKA:digest:MD5\": \"59f626e09a8c16ab6dbc2800c685f772\","));
assertTrue(content.contains("\"X-TIKA:digest:MD5\": \"f9627095ef86c482e61d99f0cc1cf87d\""));
}