You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/04/11 13:52:38 UTC
svn commit: r1091046 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/detect/ main/java/org/apache/tika/parser/microsoft/
test/java/org/apache/tika/detect/
Author: nick
Date: Mon Apr 11 11:52:37 2011
New Revision: 1091046
URL: http://svn.apache.org/viewvc?rev=1091046&view=rev
Log:
TIKA-622 - Switch the POI-based parser from the old POIFS to the new, lower-memory NPOIFS
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java?rev=1091046&r1=1091045&r2=1091046&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java Mon Apr 11 11:52:37 2011
@@ -28,7 +28,7 @@ import java.util.HashSet;
import java.util.Set;
import org.apache.poi.poifs.filesystem.Entry;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.io.TaggedInputStream;
import org.apache.tika.io.TikaInputStream;
@@ -141,9 +141,13 @@ public class POIFSContainerDetector impl
TaggedInputStream tagged = new TaggedInputStream(
new BufferedInputStream(new FileInputStream(file)));
try {
- // POIFSFileSystem might try close the stream
- POIFSFileSystem fs =
- new POIFSFileSystem(new CloseShieldInputStream(tagged));
+ NPOIFSFileSystem fs;
+ if (stream.hasFile()) {
+ fs = new NPOIFSFileSystem(stream.getFile());
+ } else {
+ // Load from a stream, but prevent the stream being closed
+ fs = new NPOIFSFileSystem(new CloseShieldInputStream(tagged));
+ }
// Optimize a possible later parsing process by keeping
// a reference to the already opened POI file system
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=1091046&r1=1091045&r2=1091046&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Mon Apr 11 11:52:37 2011
@@ -59,7 +59,7 @@ import org.apache.poi.hssf.record.common
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.parser.ParseContext;
@@ -130,7 +130,7 @@ public class ExcelExtractor extends Abst
* or writing the extracted content
*/
protected void parse(
- POIFSFileSystem filesystem, XHTMLContentHandler xhtml,
+ NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml,
Locale locale) throws IOException, SAXException, TikaException {
TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale, this);
listener.processFile(filesystem, isListenForAllRecords());
@@ -243,7 +243,7 @@ public class ExcelExtractor extends Abst
* @throws IOException on any IO errors.
* @throws SAXException on any SAX parsing errors.
*/
- public void processFile(POIFSFileSystem filesystem, boolean listenForAllRecords)
+ public void processFile(NPOIFSFileSystem filesystem, boolean listenForAllRecords)
throws IOException, SAXException, TikaException {
// Set up listener and register the records we want to process
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1091046&r1=1091045&r2=1091046&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Mon Apr 11 11:52:37 2011
@@ -16,25 +16,25 @@
*/
package org.apache.tika.parser.microsoft;
+import java.io.IOException;
+import java.util.List;
+
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hslf.model.OLEShape;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
-import java.io.IOException;
-import java.util.List;
-
public class HSLFExtractor extends AbstractPOIFSExtractor {
public HSLFExtractor(ParseContext context) {
super(context);
}
protected void parse(
- POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
+ NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
throws IOException, SAXException, TikaException {
PowerPointExtractor powerPointExtractor =
new PowerPointExtractor(filesystem);
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=1091046&r1=1091045&r2=1091046&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Mon Apr 11 11:52:37 2011
@@ -19,7 +19,11 @@ package org.apache.tika.parser.microsoft
import java.io.IOException;
import java.io.InputStream;
import java.security.GeneralSecurityException;
-import java.util.*;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Set;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
@@ -27,6 +31,7 @@ import org.apache.poi.poifs.crypt.Decryp
import org.apache.poi.poifs.crypt.EncryptionInfo;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
@@ -95,6 +100,10 @@ public class OfficeParser extends Abstra
return detectType(fs.getRoot());
}
+ public static POIFSDocumentType detectType(NPOIFSFileSystem fs) {
+ return detectType(fs.getRoot());
+ }
+
public static POIFSDocumentType detectType(DirectoryEntry node) {
for (Entry entry : node) {
POIFSDocumentType type = detectType(entry);
@@ -154,12 +163,18 @@ public class OfficeParser extends Abstra
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
- POIFSFileSystem filesystem;
- if(stream instanceof TikaInputStream &&
- ((TikaInputStream)stream).getOpenContainer() != null) {
- filesystem = (POIFSFileSystem)((TikaInputStream)stream).getOpenContainer();
+ NPOIFSFileSystem filesystem;
+ if(stream instanceof TikaInputStream) {
+ TikaInputStream tstream = (TikaInputStream)stream;
+ if(tstream.getOpenContainer() != null) {
+ filesystem = (NPOIFSFileSystem)tstream.getOpenContainer();
+ } else if(tstream.hasFile()) {
+ filesystem = new NPOIFSFileSystem(tstream.getFile());
+ } else {
+ filesystem = new NPOIFSFileSystem(tstream);
+ }
} else {
- filesystem = new POIFSFileSystem(stream);
+ filesystem = new NPOIFSFileSystem(stream);
}
// Parse summary entries first, to make metadata available early
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java?rev=1091046&r1=1091045&r2=1091046&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java Mon Apr 11 11:52:37 2011
@@ -29,7 +29,7 @@ import org.apache.poi.hpsf.SummaryInform
import org.apache.poi.hpsf.UnexpectedPropertySetTypeException;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PagedText;
@@ -52,14 +52,14 @@ class SummaryExtractor {
this.metadata = metadata;
}
- public void parseSummaries(POIFSFileSystem filesystem)
+ public void parseSummaries(NPOIFSFileSystem filesystem)
throws IOException, TikaException {
parseSummaryEntryIfExists(filesystem, SUMMARY_INFORMATION);
parseSummaryEntryIfExists(filesystem, DOCUMENT_SUMMARY_INFORMATION);
}
private void parseSummaryEntryIfExists(
- POIFSFileSystem filesystem, String entryName)
+ NPOIFSFileSystem filesystem, String entryName)
throws IOException, TikaException {
try {
DocumentEntry entry =
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1091046&r1=1091045&r2=1091046&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Mon Apr 11 11:52:37 2011
@@ -40,7 +40,7 @@ import org.apache.poi.hwpf.usermodel.Tab
import org.apache.poi.hwpf.usermodel.TableRow;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.Entry;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.parser.ParseContext;
@@ -54,11 +54,11 @@ public class WordExtractor extends Abstr
}
protected void parse(
- POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
+ NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
throws IOException, SAXException, TikaException {
HWPFDocument document;
try {
- document = new HWPFDocument(filesystem);
+ document = new HWPFDocument(filesystem.getRoot());
} catch(OldWordFileFormatException e) {
parseWord6(filesystem, xhtml);
return;
@@ -345,9 +345,9 @@ public class WordExtractor extends Abstr
}
protected void parseWord6(
- POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
+ NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
throws IOException, SAXException, TikaException {
- HWPFOldDocument doc = new HWPFOldDocument(filesystem);
+ HWPFOldDocument doc = new HWPFOldDocument(filesystem.getRoot());
Word6Extractor extractor = new Word6Extractor(doc);
for(String p : extractor.getParagraphText()) {
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1091046&r1=1091045&r2=1091046&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Mon Apr 11 11:52:37 2011
@@ -21,7 +21,7 @@ import java.io.InputStream;
import junit.framework.TestCase;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -70,7 +70,7 @@ public class TestContainerAwareDetector
assertEquals(
MediaType.parse("application/vnd.ms-powerpoint"),
detector.detect(stream, new Metadata()));
- assertTrue(stream.getOpenContainer() instanceof POIFSFileSystem);
+ assertTrue(stream.getOpenContainer() instanceof NPOIFSFileSystem);
} finally {
stream.close();
}