You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/08/20 12:01:13 UTC

svn commit: r1696748 - in /tika/trunk/tika-app/src: main/java/org/apache/tika/cli/ main/java/org/apache/tika/gui/ test/java/org/apache/tika/cli/

Author: nick
Date: Thu Aug 20 10:01:13 2015
New Revision: 1696748

URL: http://svn.apache.org/r1696748
Log:
TIKA-1710 patch from Yaniv Kunda - Use Commons IO instead of the Tika Core IO copies, and use java.nio.charset.StandardCharsets instead of the IOUtils.UTF_8 constant

Modified:
    tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
    tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
    tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java
    tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java
    tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java

Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1696748&r1=1696747&r2=1696748&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Thu Aug 20 10:01:13 2015
@@ -52,6 +52,9 @@ import java.util.Map.Entry;
 import java.util.Set;
 import java.util.TreeSet;
 
+import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.log4j.Level;
@@ -72,9 +75,6 @@ import org.apache.tika.exception.TikaExc
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.fork.ForkParser;
 import org.apache.tika.gui.TikaGUI;
-import org.apache.tika.io.CloseShieldInputStream;
-import org.apache.tika.io.FilenameUtils;
-import org.apache.tika.io.IOUtils;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.language.LanguageProfilerBuilder;
 import org.apache.tika.language.ProfilingHandler;
@@ -106,6 +106,8 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 /**
  * Simple command line interface for Apache Tika.
  */
@@ -855,7 +857,7 @@ public class TikaCLI {
         for (File mf : dir.listFiles()) {
             if (mf.isFile()) {
                 BufferedReader r = new BufferedReader(new InputStreamReader(
-                        new FileInputStream(mf), IOUtils.UTF_8));
+                        new FileInputStream(mf), UTF_8));
                 String line;
                 while ((line = r.readLine()) != null) {
                     if (line.startsWith("!:mime") ||
@@ -969,7 +971,7 @@ public class TikaCLI {
         } else if (System.getProperty("os.name")
                 .toLowerCase(Locale.ROOT).startsWith("mac os x")) {
             // TIKA-324: Override the default encoding on Mac OS X
-            return new OutputStreamWriter(output, IOUtils.UTF_8);
+            return new OutputStreamWriter(output, UTF_8);
         } else {
             return new OutputStreamWriter(output, Charset.defaultCharset());
         }

Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java?rev=1696748&r1=1696747&r2=1696748&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java Thu Aug 20 10:01:13 2015
@@ -47,10 +47,6 @@ import java.awt.event.ActionEvent;
 import java.awt.event.ActionListener;
 import java.awt.event.KeyEvent;
 import java.awt.event.WindowEvent;
-import java.awt.event.ActionEvent;
-import java.awt.event.ActionListener;
-import java.awt.event.KeyEvent;
-import java.awt.event.WindowEvent;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
@@ -65,10 +61,10 @@ import java.util.HashMap;
 import java.util.Map;
 import java.util.Set;
 
+import org.apache.commons.io.IOUtils;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.DocumentSelector;
-import org.apache.tika.io.IOUtils;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.serialization.JsonMetadataList;
@@ -91,6 +87,8 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 /**
  * Simple Swing GUI for Apache Tika. You can drag and drop files on top
  * of the window to have them parsed.
@@ -481,11 +479,8 @@ public class TikaGUI extends JFrame
                 URL url = e.getURL();
                 InputStream stream = url.openStream();
                 try {
-                    StringWriter writer = new StringWriter();
-                    IOUtils.copy(stream, writer, IOUtils.UTF_8.name());
-
                     JEditorPane editor =
-                        new JEditorPane("text/plain", writer.toString());
+                        new JEditorPane("text/plain", IOUtils.toString(stream, UTF_8));
                     editor.setEditable(false);
                     editor.setBackground(Color.WHITE);
                     editor.setCaretPosition(0);

Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java?rev=1696748&r1=1696747&r2=1696748&view=diff
==============================================================================
--- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java (original)
+++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java Thu Aug 20 10:01:13 2015
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.cli;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
@@ -27,7 +28,7 @@ import java.util.LinkedHashMap;
 import java.util.Map;
 
 import org.apache.commons.io.FileUtils;
-import org.apache.tika.io.IOUtils;
+import org.apache.commons.io.IOUtils;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@@ -47,7 +48,7 @@ public class TikaCLIBatchCommandLineTest
         OutputStream os = null;
         try {
             os = new FileOutputStream(testFile);
-            IOUtils.write("test output", os, "UTF-8");
+            IOUtils.write("test output", os, UTF_8);
         } catch (IOException e) {
             throw new RuntimeException("Couldn't open testFile");
         } finally {

Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java?rev=1696748&r1=1696747&r2=1696748&view=diff
==============================================================================
--- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java (original)
+++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java Thu Aug 20 10:01:13 2015
@@ -17,6 +17,7 @@
 
 package org.apache.tika.cli;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
@@ -31,7 +32,7 @@ import java.io.Reader;
 import java.util.List;
 
 import org.apache.commons.io.FileUtils;
-import org.apache.tika.io.IOUtils;
+import org.apache.commons.io.IOUtils;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.serialization.JsonMetadataList;
 import org.apache.tika.parser.RecursiveParserWrapper;
@@ -54,9 +55,9 @@ public class TikaCLIBatchIntegrationTest
         tempDir.delete();
         tempDir.mkdir();
         outBuffer = new ByteArrayOutputStream();
-        PrintStream outWriter = new PrintStream(outBuffer, true, IOUtils.UTF_8.name());
+        PrintStream outWriter = new PrintStream(outBuffer, true, UTF_8.name());
         ByteArrayOutputStream errBuffer = new ByteArrayOutputStream();
-        PrintStream errWriter = new PrintStream(errBuffer, true, IOUtils.UTF_8.name());
+        PrintStream errWriter = new PrintStream(errBuffer, true, UTF_8.name());
         out = System.out;
         err = System.err;
         System.setOut(outWriter);
@@ -65,8 +66,8 @@ public class TikaCLIBatchIntegrationTest
 
     @After
     public void tearDown() throws Exception {
-        System.setOut(new PrintStream(out, true, IOUtils.UTF_8.name()));
-        System.setErr(new PrintStream(err, true, IOUtils.UTF_8.name()));
+        System.setOut(new PrintStream(out, true, UTF_8.name()));
+        System.setErr(new PrintStream(err, true, UTF_8.name()));
         FileUtils.deleteDirectory(tempDir);
     }
 
@@ -104,7 +105,7 @@ public class TikaCLIBatchIntegrationTest
             };
             TikaCLI.main(params);
             reader = new InputStreamReader(
-                    new FileInputStream(new File(tempDir, "test_recursive_embedded.docx.json")), IOUtils.UTF_8);
+                    new FileInputStream(new File(tempDir, "test_recursive_embedded.docx.json")), UTF_8);
             List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
             assertEquals(12, metadataList.size());
             assertTrue(metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).contains("human events"));
@@ -123,7 +124,7 @@ public class TikaCLIBatchIntegrationTest
 
         assertTrue("bad_xml.xml.xml", new File(tempDir, "bad_xml.xml.xml").isFile());
         assertTrue("coffee.xls.xml", new File(tempDir, "coffee.xls.xml").exists());
-        String sysOutString = new String(outBuffer.toByteArray(), IOUtils.UTF_8);
+        String sysOutString = new String(outBuffer.toByteArray(), UTF_8);
         assertTrue(sysOutString.contains("MY_CUSTOM_LOG_CONFIG"));
     }
 
@@ -139,7 +140,7 @@ public class TikaCLIBatchIntegrationTest
             };
             TikaCLI.main(params);
             reader = new InputStreamReader(
-                    new FileInputStream(new File(tempDir, "test_recursive_embedded.docx.json")), IOUtils.UTF_8);
+                    new FileInputStream(new File(tempDir, "test_recursive_embedded.docx.json")), UTF_8);
             List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
             assertEquals(12, metadataList.size());
             assertEquals("59f626e09a8c16ab6dbc2800c685f772", metadataList.get(0).get("X-TIKA:digest:MD5"));
@@ -159,7 +160,7 @@ public class TikaCLIBatchIntegrationTest
             };
             TikaCLI.main(params);
             reader = new InputStreamReader(
-                    new FileInputStream(new File(tempDir, "test_recursive_embedded.docx.json")), IOUtils.UTF_8);
+                    new FileInputStream(new File(tempDir, "test_recursive_embedded.docx.json")), UTF_8);
             List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
             assertEquals(12, metadataList.size());
             assertNotNull(metadataList.get(0).get("X-TIKA:digest:SHA512"));

Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java?rev=1696748&r1=1696747&r2=1696748&view=diff
==============================================================================
--- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java (original)
+++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Thu Aug 20 10:01:13 2015
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.cli;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
@@ -26,7 +27,6 @@ import java.net.URI;
 
 import org.apache.commons.io.FileUtils;
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.IOUtils;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@@ -50,7 +50,7 @@ public class TikaCLITest {
         outContent = new ByteArrayOutputStream();
         resourcePrefix = testDataURI.toString();
         stdout = System.out;
-        System.setOut(new PrintStream(outContent, true, IOUtils.UTF_8.name()));
+        System.setOut(new PrintStream(outContent, true, UTF_8.name()));
     }
 
     /**
@@ -74,7 +74,7 @@ public class TikaCLITest {
     public void testListParserDetail() throws Exception{
         String[] params = {"--list-parser-detail"};
         TikaCLI.main(params);
-        assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("application/vnd.oasis.opendocument.text-web"));
+        assertTrue(outContent.toString(UTF_8.name()).contains("application/vnd.oasis.opendocument.text-web"));
     }
 
     /**
@@ -99,11 +99,11 @@ public class TikaCLITest {
     public void testXMLOutput() throws Exception{
         String[] params = {"-x", resourcePrefix + "alice.cli.test"};
         TikaCLI.main(params);
-        assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("?xml version=\"1.0\" encoding=\"UTF-8\"?"));
+        assertTrue(outContent.toString(UTF_8.name()).contains("?xml version=\"1.0\" encoding=\"UTF-8\"?"));
 
         params = new String[]{"-x", "--digest=SHA256", resourcePrefix + "alice.cli.test"};
         TikaCLI.main(params);
-        assertTrue(outContent.toString(IOUtils.UTF_8.name())
+        assertTrue(outContent.toString(UTF_8.name())
                 .contains("<meta name=\"X-TIKA:digest:SHA256\" content=\"e90779adbac09c4ee"));
 
     }
@@ -119,7 +119,7 @@ public class TikaCLITest {
         TikaCLI.main(params);
         assertTrue(outContent.toString("UTF-8").contains("html xmlns=\"http://www.w3.org/1999/xhtml"));
         assertTrue("Expanded <title></title> element should be present",
-                outContent.toString(IOUtils.UTF_8.name()).contains("<title></title>"));
+                outContent.toString(UTF_8.name()).contains("<title></title>"));
 
         params = new String[]{"-h", "--digest=SHA384", resourcePrefix + "alice.cli.test"};
         TikaCLI.main(params);
@@ -136,7 +136,7 @@ public class TikaCLITest {
     public void testTextOutput() throws Exception{
         String[] params = {"-t", resourcePrefix + "alice.cli.test"};
         TikaCLI.main(params);
-        assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("finished off the cake"));
+        assertTrue(outContent.toString(UTF_8.name()).contains("finished off the cake"));
     }
 
     /**
@@ -147,12 +147,12 @@ public class TikaCLITest {
     public void testMetadataOutput() throws Exception{
         String[] params = {"-m", resourcePrefix + "alice.cli.test"};
         TikaCLI.main(params);
-        assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("text/plain"));
+        assertTrue(outContent.toString(UTF_8.name()).contains("text/plain"));
 
         params = new String[]{"-m", "--digest=SHA512", resourcePrefix + "alice.cli.test"};
         TikaCLI.main(params);
-        assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("text/plain"));
-        assertTrue(outContent.toString(IOUtils.UTF_8.name())
+        assertTrue(outContent.toString(UTF_8.name()).contains("text/plain"));
+        assertTrue(outContent.toString(UTF_8.name())
                 .contains("X-TIKA:digest:SHA512: dd459d99bc19ff78fd31fbae46e0"));
     }
 
@@ -165,7 +165,7 @@ public class TikaCLITest {
     public void testJsonMetadataOutput() throws Exception {
         String[] params = {"--json", "--digest=MD2", resourcePrefix + "testJsonMultipleInts.html"};
         TikaCLI.main(params);
-        String json = outContent.toString(IOUtils.UTF_8.name());
+        String json = outContent.toString(UTF_8.name());
         //TIKA-1310
         assertTrue(json.contains("\"fb:admins\":\"1,2,3,4\","));
         
@@ -187,7 +187,7 @@ public class TikaCLITest {
     public void testJsonMetadataPrettyPrintOutput() throws Exception {
         String[] params = {"--json", "-r", resourcePrefix + "testJsonMultipleInts.html"};
         TikaCLI.main(params);
-        String json = outContent.toString(IOUtils.UTF_8.name());
+        String json = outContent.toString(UTF_8.name());
 
         assertTrue(json.contains("  \"X-Parsed-By\": [\n" +
                 "    \"org.apache.tika.parser.DefaultParser\",\n" +
@@ -210,7 +210,7 @@ public class TikaCLITest {
     public void testLanguageOutput() throws Exception{
         String[] params = {"-l", resourcePrefix + "alice.cli.test"};
         TikaCLI.main(params);
-        assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("en"));
+        assertTrue(outContent.toString(UTF_8.name()).contains("en"));
     }
 
     /**
@@ -222,7 +222,7 @@ public class TikaCLITest {
     public void testDetectOutput() throws Exception{
         String[] params = {"-d", resourcePrefix + "alice.cli.test"};
         TikaCLI.main(params);
-        assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("text/plain"));
+        assertTrue(outContent.toString(UTF_8.name()).contains("text/plain"));
     }
 
     /**
@@ -234,7 +234,7 @@ public class TikaCLITest {
     public void testListMetModels() throws Exception{
         String[] params = {"--list-met-models", resourcePrefix + "alice.cli.test"};
         TikaCLI.main(params);
-        assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("text/plain"));
+        assertTrue(outContent.toString(UTF_8.name()).contains("text/plain"));
     }
 
     /**
@@ -246,7 +246,7 @@ public class TikaCLITest {
     public void testListSupportedTypes() throws Exception{
         String[] params = {"--list-supported-types", resourcePrefix + "alice.cli.test"};
         TikaCLI.main(params);
-        assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("supertype: application/octet-stream"));
+        assertTrue(outContent.toString(UTF_8.name()).contains("supertype: application/octet-stream"));
     }
 
     /**
@@ -319,7 +319,7 @@ public class TikaCLITest {
     public void testMultiValuedMetadata() throws Exception {
         String[] params = {"-m", resourcePrefix + "testMultipleSheets.numbers"};
         TikaCLI.main(params);
-        String content = outContent.toString(IOUtils.UTF_8.name());
+        String content = outContent.toString(UTF_8.name());
         assertTrue(content.contains("sheetNames: Checking"));
         assertTrue(content.contains("sheetNames: Secon sheet"));
         assertTrue(content.contains("sheetNames: Logical Sheet 3"));
@@ -333,7 +333,7 @@ public class TikaCLITest {
         new File("subdir/foo.txt").delete();
         new File("subdir").delete();
         TikaCLI.main(params);
-        String content = outContent.toString(IOUtils.UTF_8.name());
+        String content = outContent.toString(UTF_8.name());
         assertTrue(content.contains("Extracting 'subdir/foo.txt'"));
         // clean up. TODO: These should be in target.
         new File("target/subdir/foo.txt").delete();
@@ -359,7 +359,7 @@ public class TikaCLITest {
     public void testConfig() throws Exception {
         String[] params = new String[]{"--config="+testDataFile.toString()+"/tika-config1.xml", resourcePrefix+"bad_xml.xml"};
         TikaCLI.main(params);
-        String content = outContent.toString(IOUtils.UTF_8.name());
+        String content = outContent.toString(UTF_8.name());
         assertTrue(content.contains("apple"));
         assertTrue(content.contains("org.apache.tika.parser.html.HtmlParser"));
     }
@@ -368,7 +368,7 @@ public class TikaCLITest {
     public void testJsonRecursiveMetadataParserMetadataOnly() throws Exception {
         String[] params = new String[]{"-m", "-J", "-r", resourcePrefix+"test_recursive_embedded.docx"};
         TikaCLI.main(params);
-        String content = outContent.toString(IOUtils.UTF_8.name());
+        String content = outContent.toString(UTF_8.name());
         assertTrue(content.contains("[\n" +
                 "  {\n" +
                 "    \"Application-Name\": \"Microsoft Office Word\",\n" +
@@ -384,7 +384,7 @@ public class TikaCLITest {
     public void testJsonRecursiveMetadataParserDefault() throws Exception {
         String[] params = new String[]{"-J", "-r", resourcePrefix+"test_recursive_embedded.docx"};
         TikaCLI.main(params);
-        String content = outContent.toString(IOUtils.UTF_8.name());
+        String content = outContent.toString(UTF_8.name());
         assertTrue(content.contains("\"X-TIKA:content\": \"\\u003chtml xmlns\\u003d\\\"http://www.w3.org/1999/xhtml"));
     }
 
@@ -392,7 +392,7 @@ public class TikaCLITest {
     public void testJsonRecursiveMetadataParserText() throws Exception {
         String[] params = new String[]{"-J", "-r", "-t", resourcePrefix+"test_recursive_embedded.docx"};
         TikaCLI.main(params);
-        String content = outContent.toString(IOUtils.UTF_8.name());
+        String content = outContent.toString(UTF_8.name());
         assertTrue(content.contains("\\n\\nembed_4\\n"));
         assertTrue(content.contains("\\n\\nembed_0"));
     }
@@ -401,7 +401,7 @@ public class TikaCLITest {
     public void testDigestInJson() throws Exception {
         String[] params = new String[]{"-J", "-r", "-t", "--digest=MD5", resourcePrefix+"test_recursive_embedded.docx"};
         TikaCLI.main(params);
-        String content = outContent.toString(IOUtils.UTF_8.name());
+        String content = outContent.toString(UTF_8.name());
         assertTrue(content.contains("\"X-TIKA:digest:MD5\": \"59f626e09a8c16ab6dbc2800c685f772\","));
         assertTrue(content.contains("\"X-TIKA:digest:MD5\": \"f9627095ef86c482e61d99f0cc1cf87d\""));
     }