You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/08/20 11:40:08 UTC

svn commit: r1696741 - in /tika/trunk/tika-core/src/main/java/org/apache/tika: config/ detect/ embedder/ fork/ io/ language/ parser/external/

Author: nick
Date: Thu Aug 20 09:40:08 2015
New Revision: 1696741

URL: http://svn.apache.org/r1696741
Log:
TIKA-1710 patch from Yaniv Kunda - Use java.nio.charset.StandardCharsets

Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NNExampleModelDetector.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/io/IOUtils.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java?rev=1696741&r1=1696740&r2=1696741&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java Thu Aug 20 09:40:08 2015
@@ -29,7 +29,8 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.regex.Pattern;
-import org.apache.tika.io.IOUtils;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
 
 /**
  * Internal utility class that Tika uses to look up service providers.
@@ -345,7 +346,7 @@ public class ServiceLoader {
         InputStream stream = resource.openStream();
         try {
             BufferedReader reader =
-                new BufferedReader(new InputStreamReader(stream, IOUtils.UTF_8));
+                new BufferedReader(new InputStreamReader(stream, UTF_8));
             String line = reader.readLine();
             while (line != null) {
                 line = COMMENT.matcher(line).replaceFirst("");

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java?rev=1696741&r1=1696740&r2=1696741&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java Thu Aug 20 09:40:08 2015
@@ -21,14 +21,15 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.nio.ByteBuffer;
 import java.nio.CharBuffer;
-import java.nio.charset.Charset;
 import java.util.Locale;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
-import org.apache.tika.io.IOUtils;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 /**
  * Content type detection based on magic bytes, i.e. type-specific patterns
  * near the beginning of the document input stream.
@@ -41,8 +42,6 @@ import org.apache.tika.mime.MediaType;
  */
 public class MagicDetector implements Detector {
 
-    private static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
-
     public static MagicDetector parse(
             MediaType mediaType,
             String type, String offset, String value, String mask) {
@@ -98,7 +97,7 @@ public class MagicDetector implements De
         } else if (type.equals("stringignorecase")) {
             decoded = decodeString(value.toLowerCase(Locale.ROOT), type);
         } else if (type.equals("byte")) {
-            decoded = tmpVal.getBytes(IOUtils.UTF_8);
+            decoded = tmpVal.getBytes(UTF_8);
         } else if (type.equals("host16") || type.equals("little16")) {
             int i = Integer.parseInt(tmpVal, radix);
             decoded = new byte[] { (byte) (i & 0x00FF), (byte) (i >> 8) };
@@ -394,7 +393,7 @@ public class MagicDetector implements De
                     flags = Pattern.CASE_INSENSITIVE;
                 }
                 
-                Pattern p = Pattern.compile(new String(this.pattern, IOUtils.UTF_8), flags);
+                Pattern p = Pattern.compile(new String(this.pattern, UTF_8), flags);
 
                 ByteBuffer bb = ByteBuffer.wrap(buffer);
                 CharBuffer result = ISO_8859_1.decode(bb);

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NNExampleModelDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NNExampleModelDetector.java?rev=1696741&r1=1696740&r2=1696741&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NNExampleModelDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NNExampleModelDetector.java Thu Aug 20 09:40:08 2015
@@ -1,4 +1,3 @@
-
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -21,13 +20,14 @@ package org.apache.tika.detect;
 import java.io.BufferedReader;
 import java.io.File;
 import java.io.IOException;
-import java.io.UnsupportedEncodingException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.net.URL;
 
 import org.apache.tika.mime.MediaType;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 public class NNExampleModelDetector extends TrainedModelDetector {
 	private static final String EXAMPLE_NNMODEL_FILE = "tika-example.nnmodel";
 
@@ -43,14 +43,8 @@ public class NNExampleModelDetector exte
 
 	@Override
 	public void loadDefaultModels(InputStream modelStream) {
-	    BufferedReader bReader = null;
-	    try{
-                bReader = new BufferedReader(
-			         new InputStreamReader(modelStream, "UTF-8"));
-	    }
-	    catch(UnsupportedEncodingException e){
-                e.printStackTrace();
-	    }
+	    BufferedReader bReader =
+                new BufferedReader(new InputStreamReader(modelStream, UTF_8));
 
 		NNTrainedModelBuilder nnBuilder = new NNTrainedModelBuilder();
 		String line;

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java?rev=1696741&r1=1696740&r2=1696741&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java Thu Aug 20 09:40:08 2015
@@ -22,10 +22,11 @@ import java.net.URLDecoder;
 import java.util.Map;
 import java.util.regex.Pattern;
 
-import org.apache.tika.io.IOUtils;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 /**
  * Content type detection based on the resource name. An instance of this
  * class contains a set of regular expression patterns that are matched
@@ -120,7 +121,7 @@ public class NameDetector implements Det
             int percent = name.indexOf('%');
             if (percent != -1) {
                 try {
-                    name = URLDecoder.decode(name, IOUtils.UTF_8.name());
+                    name = URLDecoder.decode(name, UTF_8.name());
                 } catch (UnsupportedEncodingException e) {
                     throw new IllegalStateException("UTF-8 not supported", e);
                 }

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java?rev=1696741&r1=1696740&r2=1696741&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java Thu Aug 20 09:40:08 2015
@@ -16,7 +16,6 @@
  */
 package org.apache.tika.detect;
 
-import java.io.BufferedWriter;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
@@ -36,6 +35,8 @@ import org.apache.tika.metadata.Metadata
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.io.TemporaryResources;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 public abstract class TrainedModelDetector implements Detector {
 	private final Map<MediaType, TrainedModel> MODEL_MAP = new HashMap<MediaType, TrainedModel>();
 
@@ -146,7 +147,7 @@ public abstract class TrainedModelDetect
 	private synchronized void writeHisto(final float[] histogram)
 			throws IOException {
 	        String histPath = new TemporaryResources().createTemporaryFile().getAbsolutePath();
-	        Writer writer = new OutputStreamWriter(new FileOutputStream(histPath),"UTF-8");
+	        Writer writer = new OutputStreamWriter(new FileOutputStream(histPath), UTF_8);
 		int n = histogram.length;// excluding the last one for storing the
 									// max value
 		for (int i = 0; i < n; i++) {

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java?rev=1696741&r1=1696740&r2=1696741&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java Thu Aug 20 09:40:08 2015
@@ -39,6 +39,8 @@ import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.external.ExternalParser;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 /**
  * Embedder that uses an external program (like sed or exiftool) to embed text
  * content and metadata into a given document.
@@ -413,7 +415,7 @@ public class ExternalEmbedder implements
             if (process.exitValue() != 0) {
                 throw new TikaException("There was an error executing the command line" +
                         "\nExecutable Command:\n\n" + cmd +
-                        "\nExecutable Error:\n\n" + stdErrOutputStream.toString(IOUtils.UTF_8.name()));
+                        "\nExecutable Error:\n\n" + stdErrOutputStream.toString(UTF_8.name()));
             }
         }
     }

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java?rev=1696741&r1=1696740&r2=1696741&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java Thu Aug 20 09:40:08 2015
@@ -34,6 +34,8 @@ import org.apache.tika.io.IOExceptionWit
 import org.apache.tika.io.IOUtils;
 import org.xml.sax.ContentHandler;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 class ForkClient {
 
     private final List<ForkResource> resources = new ArrayList<ForkResource>();
@@ -262,7 +264,7 @@ class ForkClient {
             String manifest =
                 "Main-Class: " + ForkServer.class.getName() + "\n";
             jar.putNextEntry(new ZipEntry("META-INF/MANIFEST.MF"));
-            jar.write(manifest.getBytes(IOUtils.UTF_8));
+            jar.write(manifest.getBytes(UTF_8));
 
             Class<?>[] bootstrap = {
                     ForkServer.class, ForkObjectInputStream.class,

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/io/IOUtils.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/io/IOUtils.java?rev=1696741&r1=1696740&r2=1696741&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/io/IOUtils.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/io/IOUtils.java Thu Aug 20 09:40:08 2015
@@ -30,10 +30,11 @@ import java.io.Reader;
 import java.io.StringWriter;
 import java.io.Writer;
 import java.nio.channels.Channel;
-import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.List;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 /**
  * General IO stream manipulation utilities.
  * <p>
@@ -76,9 +77,8 @@ import java.util.List;
  * @since Apache Tika 0.4, copied (partially) from Commons IO 1.4
  */
 public class IOUtils {
-
-    //TODO: switch to StandardCharsets when we move to Java 1.7
-    public static final Charset UTF_8 = Charset.forName("UTF-8");
+    // TODO Remove this when we've finished TIKA-1706 and TIKA-1710
+    public static final Charset UTF_8 = java.nio.charset.StandardCharsets.UTF_8;
 
     /**
      * The default buffer size to use.
@@ -258,7 +258,7 @@ public class IOUtils {
      */
     @Deprecated
     public static byte[] toByteArray(String input) throws IOException {
-        return input.getBytes(IOUtils.UTF_8);
+        return input.getBytes(UTF_8);
     }
 
     // read char[]
@@ -396,7 +396,7 @@ public class IOUtils {
      */
     @Deprecated
     public static String toString(byte[] input) throws IOException {
-        return new String(input, IOUtils.UTF_8);
+        return new String(input, UTF_8);
     }
 
     /**
@@ -418,7 +418,7 @@ public class IOUtils {
             throws IOException {
         // If no encoding is specified, default to UTF-8.
         if (encoding == null) {
-            return new String(input, IOUtils.UTF_8);
+            return new String(input, UTF_8);
         } else {
             return new String(input, encoding);
         }
@@ -440,7 +440,7 @@ public class IOUtils {
      * @since Commons IO 1.1
      */
     public static List<String> readLines(InputStream input) throws IOException {
-        InputStreamReader reader = new InputStreamReader(input, IOUtils.UTF_8);
+        InputStreamReader reader = new InputStreamReader(input, UTF_8);
         return readLines(reader);
     }
 
@@ -534,7 +534,7 @@ public class IOUtils {
      * @since Commons IO 1.1
      */
     public static InputStream toInputStream(String input) {
-        byte[] bytes = input.getBytes(IOUtils.UTF_8);
+        byte[] bytes = input.getBytes(UTF_8);
         return new ByteArrayInputStream(bytes);
     }
 
@@ -552,7 +552,7 @@ public class IOUtils {
      * @since Commons IO 1.1
      */
     public static InputStream toInputStream(String input, String encoding) throws IOException {
-        byte[] bytes = encoding != null ? input.getBytes(encoding) : input.getBytes(IOUtils.UTF_8);
+        byte[] bytes = encoding != null ? input.getBytes(encoding) : input.getBytes(UTF_8);
         return new ByteArrayInputStream(bytes);
     }
 
@@ -590,7 +590,7 @@ public class IOUtils {
      */
     public static void write(byte[] data, Writer output) throws IOException {
         if (data != null) {
-            output.write(new String(data, IOUtils.UTF_8));
+            output.write(new String(data, UTF_8));
         }
     }
 
@@ -658,7 +658,7 @@ public class IOUtils {
     public static void write(char[] data, OutputStream output)
             throws IOException {
         if (data != null) {
-            output.write(new String(data).getBytes(IOUtils.UTF_8));
+            output.write(new String(data).getBytes(UTF_8));
         }
     }
 
@@ -784,7 +784,7 @@ public class IOUtils {
     public static void write(String data, OutputStream output)
             throws IOException {
         if (data != null) {
-            output.write(data.getBytes(IOUtils.UTF_8));
+            output.write(data.getBytes(UTF_8));
         }
     }
 
@@ -853,7 +853,7 @@ public class IOUtils {
     public static void write(StringBuffer data, OutputStream output)
             throws IOException {
         if (data != null) {
-            output.write(data.toString().getBytes(IOUtils.UTF_8));
+            output.write(data.toString().getBytes(UTF_8));
         }
     }
 
@@ -959,7 +959,7 @@ public class IOUtils {
      */
     public static void copy(InputStream input, Writer output)
             throws IOException {
-        InputStreamReader in = new InputStreamReader(input, IOUtils.UTF_8);
+        InputStreamReader in = new InputStreamReader(input, UTF_8);
         copy(in, output);
     }
 
@@ -1066,7 +1066,7 @@ public class IOUtils {
      */
     public static void copy(Reader input, OutputStream output)
             throws IOException {
-        OutputStreamWriter out = new OutputStreamWriter(output, IOUtils.UTF_8);
+        OutputStreamWriter out = new OutputStreamWriter(output, UTF_8);
         copy(input, out);
         // XXX Unless anyone is planning on rewriting OutputStreamWriter, we
         // have to flush here.

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java?rev=1696741&r1=1696740&r2=1696741&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java Thu Aug 20 09:40:08 2015
@@ -25,7 +25,7 @@ import java.util.Map;
 import java.util.Properties;
 import java.util.Set;
 
-import org.apache.tika.io.IOUtils;
+import static java.nio.charset.StandardCharsets.UTF_8;
 
 /**
  * Identifier of the language that best matches a given content profile.
@@ -77,7 +77,7 @@ public class LanguageIdentifier {
                 LanguageIdentifier.class.getResourceAsStream(language + PROFILE_SUFFIX);
             try {
                 BufferedReader reader =
-                    new BufferedReader(new InputStreamReader(stream, IOUtils.UTF_8));
+                    new BufferedReader(new InputStreamReader(stream, UTF_8));
                 String line = reader.readLine();
                 while (line != null) {
                     if (line.length() > 0 && !line.startsWith("#")) {

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java?rev=1696741&r1=1696740&r2=1696741&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java Thu Aug 20 09:40:08 2015
@@ -35,7 +35,9 @@ import java.util.List;
 import java.util.Map;
 
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.IOUtils;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 /**
  * This class runs a ngram analysis over submitted text, results might be used
  * for automatic language identification.
@@ -342,7 +344,7 @@ public class LanguageProfilerBuilder {
 
         ngrams.clear();
         ngramcounts = new int[maxLength + 1];
-        BufferedReader reader = new BufferedReader(new InputStreamReader(is, IOUtils.UTF_8));
+        BufferedReader reader = new BufferedReader(new InputStreamReader(is, UTF_8));
         String line = null;
 
         while ((line = reader.readLine()) != null) {
@@ -406,7 +408,7 @@ public class LanguageProfilerBuilder {
      */
     public void save(OutputStream os) throws IOException {
         os.write(("# NgramProfile generated at " + new Date() + 
-                  " for Apache Tika Language Identification\n").getBytes(IOUtils.UTF_8));
+                  " for Apache Tika Language Identification\n").getBytes(UTF_8));
 
         // And then each ngram
 
@@ -433,7 +435,7 @@ public class LanguageProfilerBuilder {
         for (int i = 0; i < list.size(); i++) {
             NGramEntry e = list.get(i);
             String line = e.toString() + " " + e.getCount() + "\n";
-            os.write(line.getBytes(IOUtils.UTF_8));
+            os.write(line.getBytes(UTF_8));
         }
         os.flush();
     }

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java?rev=1696741&r1=1696740&r2=1696741&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java Thu Aug 20 09:40:08 2015
@@ -44,6 +44,8 @@ import org.apache.tika.sax.XHTMLContentH
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 /**
  * Parser that uses an external program (like catdoc or pdf2txt) to extract
  *  text content and metadata from a given document.
@@ -236,7 +238,7 @@ public class ExternalParser extends Abst
      */
     private void extractOutput(InputStream stream, XHTMLContentHandler xhtml)
             throws SAXException, IOException {
-        Reader reader = new InputStreamReader(stream, IOUtils.UTF_8);
+        Reader reader = new InputStreamReader(stream, UTF_8);
         try {
             xhtml.startDocument();
             xhtml.startElement("p");
@@ -307,7 +309,7 @@ public class ExternalParser extends Abst
        Thread t = new Thread() {
           public void run() {
              BufferedReader reader;
-              reader = new BufferedReader(new InputStreamReader(stream, IOUtils.UTF_8));
+              reader = new BufferedReader(new InputStreamReader(stream, UTF_8));
              try {
                 String line;
                 while ( (line = reader.readLine()) != null ) {