You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/09/01 16:55:14 UTC
svn commit: r1164100 - in /tika/trunk:
tika-core/src/main/java/org/apache/tika/io/
tika-parsers/src/main/java/org/apache/tika/parser/font/
tika-parsers/src/main/java/org/apache/tika/parser/microsoft/
tika-parsers/src/main/java/org/apache/tika/parser/pkg/
tika-parsers/src/test/java/org/apache/tika/parser/microsoft/
Author: jukka
Date: Thu Sep 1 14:55:13 2011
New Revision: 1164100
URL: http://svn.apache.org/viewvc?rev=1164100&view=rev
Log:
TIKA-701: Fix problems with TemporaryFiles
Add a TikaInputStream.cast() method to simplify conditional code
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java?rev=1164100&r1=1164099&r2=1164100&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java Thu Sep 1 14:55:13 2011
@@ -99,6 +99,7 @@ public class TikaInputStream extends Tag
* is expected to explicitly close the original stream when it's no
* longer used.
*
+ * @since Apache Tika 1.0
* @param stream normal input stream
* @return a TikaInputStream instance
*/
@@ -154,6 +155,22 @@ public class TikaInputStream extends Tag
}
/**
+ * Returns the given stream cast to a TikaInputStream, or
+ * <code>null</code> if the stream is not a TikaInputStream.
+ *
+ * @since Apache Tika 1.0
+ * @param stream normal input stream
+ * @return a TikaInputStream instance
+ */
+ public static TikaInputStream cast(InputStream stream) {
+ if (stream instanceof TikaInputStream) {
+ return (TikaInputStream) stream;
+ } else {
+ return null;
+ }
+ }
+
+ /**
* Creates a TikaInputStream from the given array of bytes.
* <p>
* Note that you must always explicitly close the returned stream as in
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java?rev=1164100&r1=1164099&r2=1164100&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java Thu Sep 1 14:55:13 2011
@@ -55,9 +55,9 @@ public class TrueTypeParser extends Abst
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
TrueTypeFont font;
- TikaInputStream tis = TikaInputStream.get(stream);
TTFParser parser = new TTFParser();
- if (tis.hasFile()) {
+ TikaInputStream tis = TikaInputStream.cast(stream);
+ if (tis != null && tis.hasFile()) {
font = parser.parseTTF(tis.getFile());
} else {
font = parser.parseTTF(stream);
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=1164100&r1=1164099&r2=1164100&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Thu Sep 1 14:55:13 2011
@@ -163,17 +163,15 @@ public class OfficeParser extends Abstra
xhtml.startDocument();
NPOIFSFileSystem filesystem;
- if(stream instanceof TikaInputStream) {
- TikaInputStream tstream = (TikaInputStream)stream;
- if(tstream.getOpenContainer() != null) {
- filesystem = (NPOIFSFileSystem)tstream.getOpenContainer();
- } else if(tstream.hasFile()) {
- filesystem = new NPOIFSFileSystem(tstream.getFileChannel());
- } else {
- filesystem = new NPOIFSFileSystem(tstream);
- }
- } else {
+ TikaInputStream tstream = TikaInputStream.cast(stream);
+ if (tstream == null) {
filesystem = new NPOIFSFileSystem(stream);
+ } else if (tstream.getOpenContainer() instanceof NPOIFSFileSystem) {
+ filesystem = (NPOIFSFileSystem) tstream.getOpenContainer();
+ } else if (tstream.hasFile()) {
+ filesystem = new NPOIFSFileSystem(tstream.getFileChannel());
+ } else {
+ filesystem = new NPOIFSFileSystem(tstream);
}
// Parse summary entries first, to make metadata available early
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1164100&r1=1164099&r2=1164100&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Thu Sep 1 14:55:13 2011
@@ -18,7 +18,6 @@ package org.apache.tika.parser.microsoft
import static org.apache.tika.mime.MediaType.application;
-import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.channels.FileChannel;
@@ -29,7 +28,6 @@ import java.util.Set;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.tika.detect.Detector;
-import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -92,10 +90,8 @@ public class POIFSContainerDetector impl
}
// We can only detect the exact type when given a TikaInputStream
- if (TikaInputStream.isTikaInputStream(input)) {
- // No TemporaryResources as this is for sure a TikaInputStream
- TikaInputStream tis = TikaInputStream.get(input);
-
+ TikaInputStream tis = TikaInputStream.cast(input);
+ if (tis != null) {
// Look for known top level entry names to detect the document type
Set<String> names = getTopLevelNames(tis);
if (names.contains("Workbook")) {
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java?rev=1164100&r1=1164099&r2=1164100&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java Thu Sep 1 14:55:13 2011
@@ -16,7 +16,6 @@
*/
package org.apache.tika.parser.pkg;
-import java.io.File;
import java.io.IOException;
import java.io.InputStream;
@@ -30,7 +29,6 @@ import org.apache.poi.openxml4j.opc.Pack
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.IOUtils;
-import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -66,41 +64,29 @@ public class ZipContainerDetector implem
}
// We can only detect the exact type when given a TikaInputStream
- if (!TikaInputStream.isTikaInputStream(input)) {
- return MediaType.APPLICATION_ZIP;
- }
-
- TemporaryResources tmp = new TemporaryResources();
- ZipFile zip = null;
- try {
- File file = TikaInputStream.get(input, tmp).getFile();
- zip = new ZipFile(file);
-
- MediaType type = detectOpenDocument(zip);
- if (type == null) {
- type = detectOfficeOpenXML(zip, TikaInputStream.get(input));
- }
- if (type == null) {
- type = detectIWork(zip);
- }
- if (type == null && zip.getEntry("META-INF/MANIFEST.MF") != null) {
- type = MediaType.application("java-archive");
- }
- if (type == null) {
- type = MediaType.APPLICATION_ZIP;
- }
- return type;
- } catch (IOException e) {
- return MediaType.APPLICATION_ZIP;
- } finally {
- if (zip!=null) {
- try {
- zip.close();
- } catch (IOException e) {
+ TikaInputStream tis = TikaInputStream.cast(input);
+ if (tis != null) {
+ try {
+ ZipFile zip = new ZipFile(tis.getFile());
+
+ MediaType type = detectOpenDocument(zip);
+ if (type == null) {
+ type = detectOfficeOpenXML(zip, tis);
}
+ if (type == null) {
+ type = detectIWork(zip);
+ }
+ if (type != null) {
+ return type;
+ } else if (zip.getEntry("META-INF/MANIFEST.MF") != null) {
+ return MediaType.application("java-archive");
+ }
+ } catch (IOException ignore) {
}
- tmp.close();
}
+
+ // Fallback: it's still a zip file, we just don't know which kind
+ return MediaType.APPLICATION_ZIP;
}
private MediaType detectOpenDocument(ZipFile zip) {
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java?rev=1164100&r1=1164099&r2=1164100&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java Thu Sep 1 14:55:13 2011
@@ -17,6 +17,7 @@
package org.apache.tika.parser.microsoft;
import java.io.InputStream;
+import java.net.URL;
import java.util.ArrayList;
import java.util.List;
@@ -50,29 +51,30 @@ public abstract class AbstractPOIContain
protected TrackingHandler process(String filename, ContainerExtractor extractor, boolean recurse) throws Exception {
TikaInputStream stream = getTestFile(filename);
- assertEquals(true, extractor.isSupported(stream));
-
- // Process it
- TrackingHandler handler = new TrackingHandler();
- if(recurse) {
- extractor.extract(stream, extractor, handler);
- } else {
- extractor.extract(stream, null, handler);
+ try {
+ assertEquals(true, extractor.isSupported(stream));
+
+ // Process it
+ TrackingHandler handler = new TrackingHandler();
+ if(recurse) {
+ extractor.extract(stream, extractor, handler);
+ } else {
+ extractor.extract(stream, null, handler);
+ }
+
+ // So they can check what happened
+ return handler;
+ } finally {
+ stream.close();
}
-
- // So they can check what happened
- return handler;
}
protected TikaInputStream getTestFile(String filename) throws Exception {
- InputStream input = AbstractPOIContainerExtractionTest.class.getResourceAsStream(
- "/test-documents/" + filename);
+ URL input = AbstractPOIContainerExtractionTest.class.getResource(
+ "/test-documents/" + filename);
assertNotNull(filename + " not found", input);
-
- TikaInputStream stream = TikaInputStream.get(input);
- assertNotNull(stream);
-
- return stream;
+
+ return TikaInputStream.get(input);
}
public static class TrackingHandler implements EmbeddedResourceHandler {