You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2010/11/13 11:21:46 UTC
svn commit: r1034718 - in /tika/trunk:
tika-core/src/main/java/org/apache/tika/extractor/
tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/
Author: maxcom
Date: Sat Nov 13 10:21:46 2010
New Revision: 1034718
URL: http://svn.apache.org/viewvc?rev=1034718&view=rev
Log:
OOXMLExtractor: use EmbeddedDocumentExtractor
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java?rev=1034718&r1=1034717&r2=1034718&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java Sat Nov 13 10:21:46 2010
@@ -25,6 +25,7 @@ import java.io.InputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.CloseShieldInputStream;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.DelegatingParser;
import org.apache.tika.parser.ParseContext;
@@ -90,7 +91,7 @@ public class ParsingEmbeddedDocumentExtr
// Use the delegate parser to parse this entry
try {
DELEGATING_PARSER.parse(
- new CloseShieldInputStream(stream),
+ TikaInputStream.get(new CloseShieldInputStream(stream)),
new EmbeddedContentHandler(new BodyContentHandler(handler)),
metadata, context);
} catch (TikaException e) {
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java?rev=1034718&r1=1034717&r2=1034718&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java Sat Nov 13 10:21:46 2010
@@ -29,11 +29,11 @@ import org.apache.poi.openxml4j.opc.Pack
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.openxml4j.opc.TargetMode;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.xmlbeans.XmlException;
@@ -56,11 +56,22 @@ public abstract class AbstractOOXMLExtra
protected POIXMLTextExtractor extractor;
+ private final EmbeddedDocumentExtractor embeddedExtractor;
+
private final String type;
- public AbstractOOXMLExtractor(POIXMLTextExtractor extractor, String type) {
+ public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor extractor, String type) {
this.extractor = extractor;
this.type = type;
+
+ EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
+
+ if (ex==null) {
+ embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
+ } else {
+ embeddedExtractor = ex;
+ }
+
}
/**
@@ -138,13 +149,13 @@ public abstract class AbstractOOXMLExtra
Metadata metadata = new Metadata();
metadata.set(Metadata.RESOURCE_NAME_KEY, name);
metadata.set(Metadata.CONTENT_TYPE, type);
-
- Parser parser = context.get(Parser.class, EmptyParser.INSTANCE);
- parser.parse(
- TikaInputStream.get(part.getInputStream()),
- new EmbeddedContentHandler(handler),
- metadata, context
- );
+
+ if (embeddedExtractor.shouldParseEmbedded(metadata)) {
+ embeddedExtractor.parseEmbedded(
+ TikaInputStream.get(part.getInputStream()),
+ new EmbeddedContentHandler(handler),
+ metadata, false);
+ }
}
/**
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java?rev=1034718&r1=1034717&r2=1034718&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java Sat Nov 13 10:21:46 2010
@@ -68,15 +68,15 @@ public class OOXMLExtractorFactory {
POIXMLDocument document = poiExtractor.getDocument();
if (document instanceof XSLFSlideShow) {
extractor = new XSLFPowerPointExtractorDecorator(
- (XSLFPowerPointExtractor) poiExtractor);
+ context, (XSLFPowerPointExtractor) poiExtractor);
} else if (document instanceof XSSFWorkbook) {
extractor = new XSSFExcelExtractorDecorator(
- (XSSFExcelExtractor) poiExtractor, locale);
+ context, (XSSFExcelExtractor) poiExtractor, locale);
} else if (document instanceof XWPFDocument) {
extractor = new XWPFWordExtractorDecorator(
- (XWPFWordExtractor) poiExtractor);
+ context, (XWPFWordExtractor) poiExtractor);
} else {
- extractor = new POIXMLTextExtractorDecorator(poiExtractor);
+ extractor = new POIXMLTextExtractorDecorator(context, poiExtractor);
}
extractor.getMetadataExtractor().extract(metadata);
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java?rev=1034718&r1=1034717&r2=1034718&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java Sat Nov 13 10:21:46 2010
@@ -21,13 +21,14 @@ import java.util.List;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
public class POIXMLTextExtractorDecorator extends AbstractOOXMLExtractor {
- public POIXMLTextExtractorDecorator(POIXMLTextExtractor extractor) {
- super(extractor, null);
+ public POIXMLTextExtractorDecorator(ParseContext context, POIXMLTextExtractor extractor) {
+ super(context, extractor, null);
}
@Override
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java?rev=1034718&r1=1034717&r2=1034718&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java Sat Nov 13 10:21:46 2010
@@ -33,6 +33,7 @@ import org.apache.poi.xslf.usermodel.XSL
import org.apache.poi.xslf.usermodel.XSLFCommonSlideData;
import org.apache.poi.xslf.usermodel.DrawingParagraph;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
@@ -49,8 +50,8 @@ import org.xml.sax.SAXException;
public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
- public XSLFPowerPointExtractorDecorator(XSLFPowerPointExtractor extractor) {
- super(extractor, "application/vnd.openxmlformats-officedocument.presentationml.presentation");
+ public XSLFPowerPointExtractorDecorator(ParseContext context, XSLFPowerPointExtractor extractor) {
+ super(context, extractor, "application/vnd.openxmlformats-officedocument.presentationml.presentation");
}
/**
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=1034718&r1=1034717&r2=1034718&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java Sat Nov 13 10:21:46 2010
@@ -43,6 +43,7 @@ import org.apache.poi.xssf.usermodel.XSS
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.xmlbeans.XmlException;
import org.xml.sax.SAXException;
@@ -58,8 +59,8 @@ public class XSSFExcelExtractorDecorator
private static final String TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
public XSSFExcelExtractorDecorator(
- XSSFExcelExtractor extractor, Locale locale) {
- super(extractor, TYPE);
+ ParseContext context, XSSFExcelExtractor extractor, Locale locale) {
+ super(context, extractor, TYPE);
this.extractor = extractor;
formatter = new DataFormatter(locale);
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=1034718&r1=1034717&r2=1034718&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java Sat Nov 13 10:21:46 2010
@@ -39,6 +39,7 @@ import org.apache.poi.xwpf.usermodel.XWP
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.WordExtractor;
import org.apache.tika.parser.microsoft.WordExtractor.TagAndStyle;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -51,8 +52,8 @@ public class XWPFWordExtractorDecorator
private XWPFDocument document;
private XWPFStyles styles;
- public XWPFWordExtractorDecorator(XWPFWordExtractor extractor) {
- super(extractor, "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
+ public XWPFWordExtractorDecorator(ParseContext context, XWPFWordExtractor extractor) {
+ super(context, extractor, "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
document = (XWPFDocument) extractor.getDocument();
styles = document.getStyles();