You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/08/20 11:40:08 UTC
svn commit: r1696741 - in /tika/trunk/tika-core/src/main/java/org/apache/tika: config/ detect/ embedder/ fork/ io/ language/ parser/external/
Author: nick
Date: Thu Aug 20 09:40:08 2015
New Revision: 1696741
URL: http://svn.apache.org/r1696741
Log:
TIKA-1710 patch from Yaniv Kunda - Use java.nio.charset.StandardCharsets
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NNExampleModelDetector.java
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java
tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java
tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java
tika/trunk/tika-core/src/main/java/org/apache/tika/io/IOUtils.java
tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java?rev=1696741&r1=1696740&r2=1696741&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java Thu Aug 20 09:40:08 2015
@@ -29,7 +29,8 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
-import org.apache.tika.io.IOUtils;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
/**
* Internal utility class that Tika uses to look up service providers.
@@ -345,7 +346,7 @@ public class ServiceLoader {
InputStream stream = resource.openStream();
try {
BufferedReader reader =
- new BufferedReader(new InputStreamReader(stream, IOUtils.UTF_8));
+ new BufferedReader(new InputStreamReader(stream, UTF_8));
String line = reader.readLine();
while (line != null) {
line = COMMENT.matcher(line).replaceFirst("");
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java?rev=1696741&r1=1696740&r2=1696741&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java Thu Aug 20 09:40:08 2015
@@ -21,14 +21,15 @@ import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
-import java.nio.charset.Charset;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* Content type detection based on magic bytes, i.e. type-specific patterns
* near the beginning of the document input stream.
@@ -41,8 +42,6 @@ import org.apache.tika.mime.MediaType;
*/
public class MagicDetector implements Detector {
- private static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
-
public static MagicDetector parse(
MediaType mediaType,
String type, String offset, String value, String mask) {
@@ -98,7 +97,7 @@ public class MagicDetector implements De
} else if (type.equals("stringignorecase")) {
decoded = decodeString(value.toLowerCase(Locale.ROOT), type);
} else if (type.equals("byte")) {
- decoded = tmpVal.getBytes(IOUtils.UTF_8);
+ decoded = tmpVal.getBytes(UTF_8);
} else if (type.equals("host16") || type.equals("little16")) {
int i = Integer.parseInt(tmpVal, radix);
decoded = new byte[] { (byte) (i & 0x00FF), (byte) (i >> 8) };
@@ -394,7 +393,7 @@ public class MagicDetector implements De
flags = Pattern.CASE_INSENSITIVE;
}
- Pattern p = Pattern.compile(new String(this.pattern, IOUtils.UTF_8), flags);
+ Pattern p = Pattern.compile(new String(this.pattern, UTF_8), flags);
ByteBuffer bb = ByteBuffer.wrap(buffer);
CharBuffer result = ISO_8859_1.decode(bb);
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NNExampleModelDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NNExampleModelDetector.java?rev=1696741&r1=1696740&r2=1696741&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NNExampleModelDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NNExampleModelDetector.java Thu Aug 20 09:40:08 2015
@@ -1,4 +1,3 @@
-
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -21,13 +20,14 @@ package org.apache.tika.detect;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
-import java.io.UnsupportedEncodingException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import org.apache.tika.mime.MediaType;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
public class NNExampleModelDetector extends TrainedModelDetector {
private static final String EXAMPLE_NNMODEL_FILE = "tika-example.nnmodel";
@@ -43,14 +43,8 @@ public class NNExampleModelDetector exte
@Override
public void loadDefaultModels(InputStream modelStream) {
- BufferedReader bReader = null;
- try{
- bReader = new BufferedReader(
- new InputStreamReader(modelStream, "UTF-8"));
- }
- catch(UnsupportedEncodingException e){
- e.printStackTrace();
- }
+ BufferedReader bReader =
+ new BufferedReader(new InputStreamReader(modelStream, UTF_8));
NNTrainedModelBuilder nnBuilder = new NNTrainedModelBuilder();
String line;
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java?rev=1696741&r1=1696740&r2=1696741&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java Thu Aug 20 09:40:08 2015
@@ -22,10 +22,11 @@ import java.net.URLDecoder;
import java.util.Map;
import java.util.regex.Pattern;
-import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* Content type detection based on the resource name. An instance of this
* class contains a set of regular expression patterns that are matched
@@ -120,7 +121,7 @@ public class NameDetector implements Det
int percent = name.indexOf('%');
if (percent != -1) {
try {
- name = URLDecoder.decode(name, IOUtils.UTF_8.name());
+ name = URLDecoder.decode(name, UTF_8.name());
} catch (UnsupportedEncodingException e) {
throw new IllegalStateException("UTF-8 not supported", e);
}
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java?rev=1696741&r1=1696740&r2=1696741&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java Thu Aug 20 09:40:08 2015
@@ -16,7 +16,6 @@
*/
package org.apache.tika.detect;
-import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
@@ -36,6 +35,8 @@ import org.apache.tika.metadata.Metadata
import org.apache.tika.mime.MediaType;
import org.apache.tika.io.TemporaryResources;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
public abstract class TrainedModelDetector implements Detector {
private final Map<MediaType, TrainedModel> MODEL_MAP = new HashMap<MediaType, TrainedModel>();
@@ -146,7 +147,7 @@ public abstract class TrainedModelDetect
private synchronized void writeHisto(final float[] histogram)
throws IOException {
String histPath = new TemporaryResources().createTemporaryFile().getAbsolutePath();
- Writer writer = new OutputStreamWriter(new FileOutputStream(histPath),"UTF-8");
+ Writer writer = new OutputStreamWriter(new FileOutputStream(histPath), UTF_8);
int n = histogram.length;// excluding the last one for storing the
// max value
for (int i = 0; i < n; i++) {
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java?rev=1696741&r1=1696740&r2=1696741&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java Thu Aug 20 09:40:08 2015
@@ -39,6 +39,8 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.external.ExternalParser;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* Embedder that uses an external program (like sed or exiftool) to embed text
* content and metadata into a given document.
@@ -413,7 +415,7 @@ public class ExternalEmbedder implements
if (process.exitValue() != 0) {
throw new TikaException("There was an error executing the command line" +
"\nExecutable Command:\n\n" + cmd +
- "\nExecutable Error:\n\n" + stdErrOutputStream.toString(IOUtils.UTF_8.name()));
+ "\nExecutable Error:\n\n" + stdErrOutputStream.toString(UTF_8.name()));
}
}
}
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java?rev=1696741&r1=1696740&r2=1696741&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java Thu Aug 20 09:40:08 2015
@@ -34,6 +34,8 @@ import org.apache.tika.io.IOExceptionWit
import org.apache.tika.io.IOUtils;
import org.xml.sax.ContentHandler;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
class ForkClient {
private final List<ForkResource> resources = new ArrayList<ForkResource>();
@@ -262,7 +264,7 @@ class ForkClient {
String manifest =
"Main-Class: " + ForkServer.class.getName() + "\n";
jar.putNextEntry(new ZipEntry("META-INF/MANIFEST.MF"));
- jar.write(manifest.getBytes(IOUtils.UTF_8));
+ jar.write(manifest.getBytes(UTF_8));
Class<?>[] bootstrap = {
ForkServer.class, ForkObjectInputStream.class,
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/io/IOUtils.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/io/IOUtils.java?rev=1696741&r1=1696740&r2=1696741&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/io/IOUtils.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/io/IOUtils.java Thu Aug 20 09:40:08 2015
@@ -30,10 +30,11 @@ import java.io.Reader;
import java.io.StringWriter;
import java.io.Writer;
import java.nio.channels.Channel;
-import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* General IO stream manipulation utilities.
* <p>
@@ -76,9 +77,8 @@ import java.util.List;
* @since Apache Tika 0.4, copied (partially) from Commons IO 1.4
*/
public class IOUtils {
-
- //TODO: switch to StandardCharsets when we move to Java 1.7
- public static final Charset UTF_8 = Charset.forName("UTF-8");
+ // TODO Remove this when we've finished TIKA-1706 and TIKA-1710
+ public static final Charset UTF_8 = java.nio.charset.StandardCharsets.UTF_8;
/**
* The default buffer size to use.
@@ -258,7 +258,7 @@ public class IOUtils {
*/
@Deprecated
public static byte[] toByteArray(String input) throws IOException {
- return input.getBytes(IOUtils.UTF_8);
+ return input.getBytes(UTF_8);
}
// read char[]
@@ -396,7 +396,7 @@ public class IOUtils {
*/
@Deprecated
public static String toString(byte[] input) throws IOException {
- return new String(input, IOUtils.UTF_8);
+ return new String(input, UTF_8);
}
/**
@@ -418,7 +418,7 @@ public class IOUtils {
throws IOException {
// If no encoding is specified, default to UTF-8.
if (encoding == null) {
- return new String(input, IOUtils.UTF_8);
+ return new String(input, UTF_8);
} else {
return new String(input, encoding);
}
@@ -440,7 +440,7 @@ public class IOUtils {
* @since Commons IO 1.1
*/
public static List<String> readLines(InputStream input) throws IOException {
- InputStreamReader reader = new InputStreamReader(input, IOUtils.UTF_8);
+ InputStreamReader reader = new InputStreamReader(input, UTF_8);
return readLines(reader);
}
@@ -534,7 +534,7 @@ public class IOUtils {
* @since Commons IO 1.1
*/
public static InputStream toInputStream(String input) {
- byte[] bytes = input.getBytes(IOUtils.UTF_8);
+ byte[] bytes = input.getBytes(UTF_8);
return new ByteArrayInputStream(bytes);
}
@@ -552,7 +552,7 @@ public class IOUtils {
* @since Commons IO 1.1
*/
public static InputStream toInputStream(String input, String encoding) throws IOException {
- byte[] bytes = encoding != null ? input.getBytes(encoding) : input.getBytes(IOUtils.UTF_8);
+ byte[] bytes = encoding != null ? input.getBytes(encoding) : input.getBytes(UTF_8);
return new ByteArrayInputStream(bytes);
}
@@ -590,7 +590,7 @@ public class IOUtils {
*/
public static void write(byte[] data, Writer output) throws IOException {
if (data != null) {
- output.write(new String(data, IOUtils.UTF_8));
+ output.write(new String(data, UTF_8));
}
}
@@ -658,7 +658,7 @@ public class IOUtils {
public static void write(char[] data, OutputStream output)
throws IOException {
if (data != null) {
- output.write(new String(data).getBytes(IOUtils.UTF_8));
+ output.write(new String(data).getBytes(UTF_8));
}
}
@@ -784,7 +784,7 @@ public class IOUtils {
public static void write(String data, OutputStream output)
throws IOException {
if (data != null) {
- output.write(data.getBytes(IOUtils.UTF_8));
+ output.write(data.getBytes(UTF_8));
}
}
@@ -853,7 +853,7 @@ public class IOUtils {
public static void write(StringBuffer data, OutputStream output)
throws IOException {
if (data != null) {
- output.write(data.toString().getBytes(IOUtils.UTF_8));
+ output.write(data.toString().getBytes(UTF_8));
}
}
@@ -959,7 +959,7 @@ public class IOUtils {
*/
public static void copy(InputStream input, Writer output)
throws IOException {
- InputStreamReader in = new InputStreamReader(input, IOUtils.UTF_8);
+ InputStreamReader in = new InputStreamReader(input, UTF_8);
copy(in, output);
}
@@ -1066,7 +1066,7 @@ public class IOUtils {
*/
public static void copy(Reader input, OutputStream output)
throws IOException {
- OutputStreamWriter out = new OutputStreamWriter(output, IOUtils.UTF_8);
+ OutputStreamWriter out = new OutputStreamWriter(output, UTF_8);
copy(input, out);
// XXX Unless anyone is planning on rewriting OutputStreamWriter, we
// have to flush here.
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java?rev=1696741&r1=1696740&r2=1696741&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java Thu Aug 20 09:40:08 2015
@@ -25,7 +25,7 @@ import java.util.Map;
import java.util.Properties;
import java.util.Set;
-import org.apache.tika.io.IOUtils;
+import static java.nio.charset.StandardCharsets.UTF_8;
/**
* Identifier of the language that best matches a given content profile.
@@ -77,7 +77,7 @@ public class LanguageIdentifier {
LanguageIdentifier.class.getResourceAsStream(language + PROFILE_SUFFIX);
try {
BufferedReader reader =
- new BufferedReader(new InputStreamReader(stream, IOUtils.UTF_8));
+ new BufferedReader(new InputStreamReader(stream, UTF_8));
String line = reader.readLine();
while (line != null) {
if (line.length() > 0 && !line.startsWith("#")) {
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java?rev=1696741&r1=1696740&r2=1696741&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java Thu Aug 20 09:40:08 2015
@@ -35,7 +35,9 @@ import java.util.List;
import java.util.Map;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.IOUtils;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* This class runs a ngram analysis over submitted text, results might be used
* for automatic language identification.
@@ -342,7 +344,7 @@ public class LanguageProfilerBuilder {
ngrams.clear();
ngramcounts = new int[maxLength + 1];
- BufferedReader reader = new BufferedReader(new InputStreamReader(is, IOUtils.UTF_8));
+ BufferedReader reader = new BufferedReader(new InputStreamReader(is, UTF_8));
String line = null;
while ((line = reader.readLine()) != null) {
@@ -406,7 +408,7 @@ public class LanguageProfilerBuilder {
*/
public void save(OutputStream os) throws IOException {
os.write(("# NgramProfile generated at " + new Date() +
- " for Apache Tika Language Identification\n").getBytes(IOUtils.UTF_8));
+ " for Apache Tika Language Identification\n").getBytes(UTF_8));
// And then each ngram
@@ -433,7 +435,7 @@ public class LanguageProfilerBuilder {
for (int i = 0; i < list.size(); i++) {
NGramEntry e = list.get(i);
String line = e.toString() + " " + e.getCount() + "\n";
- os.write(line.getBytes(IOUtils.UTF_8));
+ os.write(line.getBytes(UTF_8));
}
os.flush();
}
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java?rev=1696741&r1=1696740&r2=1696741&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java Thu Aug 20 09:40:08 2015
@@ -44,6 +44,8 @@ import org.apache.tika.sax.XHTMLContentH
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* Parser that uses an external program (like catdoc or pdf2txt) to extract
* text content and metadata from a given document.
@@ -236,7 +238,7 @@ public class ExternalParser extends Abst
*/
private void extractOutput(InputStream stream, XHTMLContentHandler xhtml)
throws SAXException, IOException {
- Reader reader = new InputStreamReader(stream, IOUtils.UTF_8);
+ Reader reader = new InputStreamReader(stream, UTF_8);
try {
xhtml.startDocument();
xhtml.startElement("p");
@@ -307,7 +309,7 @@ public class ExternalParser extends Abst
Thread t = new Thread() {
public void run() {
BufferedReader reader;
- reader = new BufferedReader(new InputStreamReader(stream, IOUtils.UTF_8));
+ reader = new BufferedReader(new InputStreamReader(stream, UTF_8));
try {
String line;
while ( (line = reader.readLine()) != null ) {