You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2010/11/13 11:21:46 UTC

svn commit: r1034718 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/extractor/ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/

Author: maxcom
Date: Sat Nov 13 10:21:46 2010
New Revision: 1034718

URL: http://svn.apache.org/viewvc?rev=1034718&view=rev
Log:
OOXMLExtractor: use EmbeddedDocumentExtractor

Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java?rev=1034718&r1=1034717&r2=1034718&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java Sat Nov 13 10:21:46 2010
@@ -25,6 +25,7 @@ import java.io.InputStream;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.CloseShieldInputStream;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.DelegatingParser;
 import org.apache.tika.parser.ParseContext;
@@ -90,7 +91,7 @@ public class ParsingEmbeddedDocumentExtr
         // Use the delegate parser to parse this entry
         try {
             DELEGATING_PARSER.parse(
-                    new CloseShieldInputStream(stream),
+                    TikaInputStream.get(new CloseShieldInputStream(stream)),
                     new EmbeddedContentHandler(new BodyContentHandler(handler)),
                     metadata, context);
         } catch (TikaException e) {

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java?rev=1034718&r1=1034717&r2=1034718&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java Sat Nov 13 10:21:46 2010
@@ -29,11 +29,11 @@ import org.apache.poi.openxml4j.opc.Pack
 import org.apache.poi.openxml4j.opc.PackagingURIHelper;
 import org.apache.poi.openxml4j.opc.TargetMode;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.xmlbeans.XmlException;
@@ -56,11 +56,22 @@ public abstract class AbstractOOXMLExtra
    
     protected POIXMLTextExtractor extractor;
 
+    private final EmbeddedDocumentExtractor embeddedExtractor;
+
     private final String type;
 
-    public AbstractOOXMLExtractor(POIXMLTextExtractor extractor, String type) {
+    public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor extractor, String type) {
         this.extractor = extractor;
         this.type = type;
+
+        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
+
+        if (ex==null) {
+            embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
+        } else {
+            embeddedExtractor = ex;
+        }
+
     }
 
     /**
@@ -138,13 +149,13 @@ public abstract class AbstractOOXMLExtra
        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, name);
        metadata.set(Metadata.CONTENT_TYPE, type);
-       
-       Parser parser = context.get(Parser.class, EmptyParser.INSTANCE);
-       parser.parse(
-               TikaInputStream.get(part.getInputStream()), 
-               new EmbeddedContentHandler(handler),
-               metadata, context
-       );
+
+       if (embeddedExtractor.shouldParseEmbedded(metadata)) {
+         embeddedExtractor.parseEmbedded(
+                 TikaInputStream.get(part.getInputStream()),
+                 new EmbeddedContentHandler(handler),
+                 metadata, false);
+       }
     }
 
     /**

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java?rev=1034718&r1=1034717&r2=1034718&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java Sat Nov 13 10:21:46 2010
@@ -68,15 +68,15 @@ public class OOXMLExtractorFactory {
             POIXMLDocument document = poiExtractor.getDocument();
             if (document instanceof XSLFSlideShow) {
                 extractor = new XSLFPowerPointExtractorDecorator(
-                        (XSLFPowerPointExtractor) poiExtractor);
+                        context, (XSLFPowerPointExtractor) poiExtractor);
             } else if (document instanceof XSSFWorkbook) {
                 extractor = new XSSFExcelExtractorDecorator(
-                        (XSSFExcelExtractor) poiExtractor, locale);
+                        context, (XSSFExcelExtractor) poiExtractor, locale);
             } else if (document instanceof XWPFDocument) {
                 extractor = new XWPFWordExtractorDecorator(
-                        (XWPFWordExtractor) poiExtractor);
+                        context, (XWPFWordExtractor) poiExtractor);
             } else {
-                extractor = new POIXMLTextExtractorDecorator(poiExtractor);
+                extractor = new POIXMLTextExtractorDecorator(context, poiExtractor);
             }
 
             extractor.getMetadataExtractor().extract(metadata);

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java?rev=1034718&r1=1034717&r2=1034718&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java Sat Nov 13 10:21:46 2010
@@ -21,13 +21,14 @@ import java.util.List;
 
 import org.apache.poi.POIXMLTextExtractor;
 import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
 
 public class POIXMLTextExtractorDecorator extends AbstractOOXMLExtractor {
 
-    public POIXMLTextExtractorDecorator(POIXMLTextExtractor extractor) {
-        super(extractor, null);
+    public POIXMLTextExtractorDecorator(ParseContext context, POIXMLTextExtractor extractor) {
+        super(context, extractor, null);
     }
 
     @Override

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java?rev=1034718&r1=1034717&r2=1034718&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java Sat Nov 13 10:21:46 2010
@@ -33,6 +33,7 @@ import org.apache.poi.xslf.usermodel.XSL
 import org.apache.poi.xslf.usermodel.XSLFCommonSlideData;
 import org.apache.poi.xslf.usermodel.DrawingParagraph;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.xmlbeans.XmlException;
 import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
@@ -49,8 +50,8 @@ import org.xml.sax.SAXException;
 
 public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
 
-    public XSLFPowerPointExtractorDecorator(XSLFPowerPointExtractor extractor) {
-        super(extractor, "application/vnd.openxmlformats-officedocument.presentationml.presentation");
+    public XSLFPowerPointExtractorDecorator(ParseContext context, XSLFPowerPointExtractor extractor) {
+        super(context, extractor, "application/vnd.openxmlformats-officedocument.presentationml.presentation");
     }
 
     /**

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=1034718&r1=1034717&r2=1034718&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java Sat Nov 13 10:21:46 2010
@@ -43,6 +43,7 @@ import org.apache.poi.xssf.usermodel.XSS
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.xmlbeans.XmlException;
 import org.xml.sax.SAXException;
@@ -58,8 +59,8 @@ public class XSSFExcelExtractorDecorator
     private static final String TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
 
     public XSSFExcelExtractorDecorator(
-            XSSFExcelExtractor extractor, Locale locale) {
-        super(extractor, TYPE);
+            ParseContext context, XSSFExcelExtractor extractor, Locale locale) {
+        super(context, extractor, TYPE);
 
         this.extractor = extractor;
         formatter = new DataFormatter(locale);

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=1034718&r1=1034717&r2=1034718&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java Sat Nov 13 10:21:46 2010
@@ -39,6 +39,7 @@ import org.apache.poi.xwpf.usermodel.XWP
 import org.apache.poi.xwpf.usermodel.XWPFTable;
 import org.apache.poi.xwpf.usermodel.XWPFTableCell;
 import org.apache.poi.xwpf.usermodel.XWPFTableRow;
+import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.microsoft.WordExtractor;
 import org.apache.tika.parser.microsoft.WordExtractor.TagAndStyle;
 import org.apache.tika.sax.XHTMLContentHandler;
@@ -51,8 +52,8 @@ public class XWPFWordExtractorDecorator 
     private XWPFDocument document;
     private XWPFStyles styles;
 
-    public XWPFWordExtractorDecorator(XWPFWordExtractor extractor) {
-        super(extractor, "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
+    public XWPFWordExtractorDecorator(ParseContext context, XWPFWordExtractor extractor) {
+        super(context, extractor, "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
         
         document = (XWPFDocument) extractor.getDocument();
         styles = document.getStyles();