You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/02/18 16:34:03 UTC

svn commit: r911443 - in /lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft: OfficeParser.java SummaryExtractor.java ooxml/OOXMLExtractorFactory.java ooxml/OOXMLParser.java

Author: jukka
Date: Thu Feb 18 15:34:02 2010
New Revision: 911443

URL: http://svn.apache.org/viewvc?rev=911443&view=rev
Log:
TIKA-378: TikaConfig should notify users if it cannot initialize some parser

Make the Microsoft Office parsers loadable even when the POI libraries are not present

Added:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java   (with props)
Modified:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=911443&r1=911442&r2=911443&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Thu Feb 18 15:34:02 2010
@@ -16,12 +16,10 @@
  */
 package org.apache.tika.parser.microsoft;
 
-import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Arrays;
 import java.util.Collections;
-import java.util.Date;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Locale;
@@ -29,18 +27,10 @@
 
 import org.apache.poi.hdgf.extractor.VisioTextExtractor;
 import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
-import org.apache.poi.hpsf.CustomProperties;
-import org.apache.poi.hpsf.DocumentSummaryInformation;
-import org.apache.poi.hpsf.MarkUnsupportedException;
-import org.apache.poi.hpsf.NoPropertySetStreamException;
-import org.apache.poi.hpsf.PropertySet;
-import org.apache.poi.hpsf.SummaryInformation;
-import org.apache.poi.hpsf.UnexpectedPropertySetTypeException;
 import org.apache.poi.hslf.extractor.PowerPointExtractor;
 import org.apache.poi.hwpf.extractor.WordExtractor;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.DocumentEntry;
-import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.tika.exception.TikaException;
@@ -57,12 +47,6 @@
  */
 public class OfficeParser implements Parser {
 
-    private static final String SUMMARY_INFORMATION =
-        SummaryInformation.DEFAULT_STREAM_NAME;
-
-    private static final String DOCUMENT_SUMMARY_INFORMATION =
-        DocumentSummaryInformation.DEFAULT_STREAM_NAME;
-
     private static final Set<MediaType> SUPPORTED_TYPES =
         Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
                 MediaType.application("x-tika-msoffice"),
@@ -90,10 +74,7 @@
         POIFSFileSystem filesystem = new POIFSFileSystem(stream);
 
         // Parse summary entries first, to make metadata available early
-        parseSummaryEntryIfExists(
-                filesystem, SUMMARY_INFORMATION, metadata);
-        parseSummaryEntryIfExists(
-                filesystem, DOCUMENT_SUMMARY_INFORMATION, metadata);
+        new SummaryExtractor(metadata).parseSummaries(filesystem);
 
         // Parse remaining document entries
         boolean outlookExtracted = false;
@@ -169,91 +150,10 @@
         parse(stream, handler, metadata, new ParseContext());
     }
 
-    private void parseSummaryEntryIfExists(
-            POIFSFileSystem filesystem, String entryName, Metadata metadata)
-            throws IOException, TikaException {
-        try {
-            DocumentEntry entry =
-                (DocumentEntry) filesystem.getRoot().getEntry(entryName);
-            PropertySet properties =
-                new PropertySet(new DocumentInputStream(entry));
-            if (properties.isSummaryInformation()) {
-                parse(new SummaryInformation(properties), metadata);
-            }
-            if (properties.isDocumentSummaryInformation()) {
-                parse(new DocumentSummaryInformation(properties), metadata);
-            }
-        } catch (FileNotFoundException e) {
-            // entry does not exist, just skip it
-        } catch (NoPropertySetStreamException e) {
-            throw new TikaException("Not a HPSF document", e);
-        } catch (UnexpectedPropertySetTypeException e) {
-            throw new TikaException("Unexpected HPSF document", e);
-        } catch (MarkUnsupportedException e) {
-            throw new TikaException("Invalid DocumentInputStream", e);
-        }
-    }
-
-    private void parse(SummaryInformation summary, Metadata metadata) {
-        set(metadata, Metadata.TITLE, summary.getTitle());
-        set(metadata, Metadata.AUTHOR, summary.getAuthor());
-        set(metadata, Metadata.KEYWORDS, summary.getKeywords());
-        set(metadata, Metadata.SUBJECT, summary.getSubject());
-        set(metadata, Metadata.LAST_AUTHOR, summary.getLastAuthor());
-        set(metadata, Metadata.COMMENTS, summary.getComments());
-        set(metadata, Metadata.TEMPLATE, summary.getTemplate());
-        set(metadata, Metadata.APPLICATION_NAME, summary.getApplicationName());
-        set(metadata, Metadata.REVISION_NUMBER, summary.getRevNumber());
-        set(metadata, Metadata.CREATION_DATE, summary.getCreateDateTime());
-        set(metadata, Metadata.CHARACTER_COUNT, summary.getCharCount());
-        set(metadata, Metadata.EDIT_TIME, summary.getEditTime());
-        set(metadata, Metadata.LAST_SAVED, summary.getLastSaveDateTime());
-        set(metadata, Metadata.PAGE_COUNT, summary.getPageCount());
-        set(metadata, Metadata.SECURITY, summary.getSecurity());
-        set(metadata, Metadata.WORD_COUNT, summary.getWordCount());
-        set(metadata, Metadata.LAST_PRINTED, summary.getLastPrinted());
-    }
-
-    private void parse(DocumentSummaryInformation summary, Metadata metadata) {
-        set(metadata, Metadata.COMPANY, summary.getCompany());
-        set(metadata, Metadata.MANAGER, summary.getManager());
-        set(metadata, Metadata.LANGUAGE, getLanguage(summary));
-        set(metadata, Metadata.CATEGORY, summary.getCategory());
-    }
-
-    private String getLanguage(DocumentSummaryInformation summary) {
-        CustomProperties customProperties = summary.getCustomProperties();
-        if (customProperties != null) {
-            Object value = customProperties.get("Language");
-            if (value instanceof String) {
-                return (String) value;
-            }
-        }
-        return null;
-    }
-
     private void setType(Metadata metadata, String type) {
         metadata.set(Metadata.CONTENT_TYPE, type);
     }
 
-    private void set(Metadata metadata, String name, String value) {
-        if (value != null) {
-            metadata.set(name, value);
-        }
-    }
-
-    private void set(Metadata metadata, String name, Date value) {
-        if (value != null) {
-            metadata.set(name, value.toString());
-        }
-    }
-
-    private void set(Metadata metadata, String name, long value) {
-        if (value > 0) {
-            metadata.set(name, Long.toString(value));
-        }
-    }
-
     /**
      * Outputs a section of text if the given text is non-empty.
      *

Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java?rev=911443&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java Thu Feb 18 15:34:02 2010
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Date;
+
+import org.apache.poi.hpsf.CustomProperties;
+import org.apache.poi.hpsf.DocumentSummaryInformation;
+import org.apache.poi.hpsf.MarkUnsupportedException;
+import org.apache.poi.hpsf.NoPropertySetStreamException;
+import org.apache.poi.hpsf.PropertySet;
+import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.hpsf.UnexpectedPropertySetTypeException;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Extracts the summary and document summary properties of a POIFS document into Tika metadata.
+ */
+class SummaryExtractor {
+
+    private static final String SUMMARY_INFORMATION =
+        SummaryInformation.DEFAULT_STREAM_NAME;
+
+    private static final String DOCUMENT_SUMMARY_INFORMATION =
+        DocumentSummaryInformation.DEFAULT_STREAM_NAME;
+
+    private final Metadata metadata;
+
+    public SummaryExtractor(Metadata metadata) {
+        this.metadata = metadata;
+    }
+
+    public void parseSummaries(POIFSFileSystem filesystem)
+            throws IOException, TikaException {
+        parseSummaryEntryIfExists(filesystem, SUMMARY_INFORMATION);
+        parseSummaryEntryIfExists(filesystem, DOCUMENT_SUMMARY_INFORMATION);
+    }
+
+    private void parseSummaryEntryIfExists(
+            POIFSFileSystem filesystem, String entryName)
+            throws IOException, TikaException {
+        try {
+            DocumentEntry entry =
+                (DocumentEntry) filesystem.getRoot().getEntry(entryName);
+            PropertySet properties =
+                new PropertySet(new DocumentInputStream(entry));
+            if (properties.isSummaryInformation()) {
+                parse(new SummaryInformation(properties));
+            }
+            if (properties.isDocumentSummaryInformation()) {
+                parse(new DocumentSummaryInformation(properties));
+            }
+        } catch (FileNotFoundException e) {
+            // entry does not exist, just skip it
+        } catch (NoPropertySetStreamException e) {
+            throw new TikaException("Not a HPSF document", e);
+        } catch (UnexpectedPropertySetTypeException e) {
+            throw new TikaException("Unexpected HPSF document", e);
+        } catch (MarkUnsupportedException e) {
+            throw new TikaException("Invalid DocumentInputStream", e);
+        }
+    }
+
+    private void parse(SummaryInformation summary) {
+        set(Metadata.TITLE, summary.getTitle());
+        set(Metadata.AUTHOR, summary.getAuthor());
+        set(Metadata.KEYWORDS, summary.getKeywords());
+        set(Metadata.SUBJECT, summary.getSubject());
+        set(Metadata.LAST_AUTHOR, summary.getLastAuthor());
+        set(Metadata.COMMENTS, summary.getComments());
+        set(Metadata.TEMPLATE, summary.getTemplate());
+        set(Metadata.APPLICATION_NAME, summary.getApplicationName());
+        set(Metadata.REVISION_NUMBER, summary.getRevNumber());
+        set(Metadata.CREATION_DATE, summary.getCreateDateTime());
+        set(Metadata.CHARACTER_COUNT, summary.getCharCount());
+        set(Metadata.EDIT_TIME, summary.getEditTime());
+        set(Metadata.LAST_SAVED, summary.getLastSaveDateTime());
+        set(Metadata.PAGE_COUNT, summary.getPageCount());
+        set(Metadata.SECURITY, summary.getSecurity());
+        set(Metadata.WORD_COUNT, summary.getWordCount());
+        set(Metadata.LAST_PRINTED, summary.getLastPrinted());
+    }
+
+    private void parse(DocumentSummaryInformation summary) {
+        set(Metadata.COMPANY, summary.getCompany());
+        set(Metadata.MANAGER, summary.getManager());
+        set(Metadata.LANGUAGE, getLanguage(summary));
+        set(Metadata.CATEGORY, summary.getCategory());
+    }
+
+    private String getLanguage(DocumentSummaryInformation summary) {
+        CustomProperties customProperties = summary.getCustomProperties();
+        if (customProperties != null) {
+            Object value = customProperties.get("Language");
+            if (value instanceof String) {
+                return (String) value;
+            }
+        }
+        return null;
+    }
+
+    private void set(String name, String value) {
+        if (value != null) {
+            metadata.set(name, value);
+        }
+    }
+
+    private void set(String name, Date value) {
+        if (value != null) {
+            metadata.set(name, value.toString());
+        }
+    }
+
+    private void set(String name, long value) {
+        if (value > 0) {
+            metadata.set(name, Long.toString(value));
+        }
+    }
+
+}

Propchange: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java?rev=911443&r1=911442&r2=911443&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java Thu Feb 18 15:34:02 2010
@@ -16,16 +16,26 @@
  */
 package org.apache.tika.parser.microsoft.ooxml;
 
+import java.io.IOException;
+import java.io.InputStream;
 import java.util.Locale;
 
 import org.apache.poi.POIXMLDocument;
 import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.extractor.ExtractorFactory;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
 import org.apache.poi.xslf.XSLFSlideShow;
 import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
 import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
 import org.apache.poi.xssf.usermodel.XSSFWorkbook;
 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
 import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
 
 /**
  * Figures out the correct {@link OOXMLExtractor} for the supplied document and
@@ -33,20 +43,39 @@
  */
 public class OOXMLExtractorFactory {
 
-    public static OOXMLExtractor createExtractor(
-            POIXMLTextExtractor extractor, Locale locale) {
-        POIXMLDocument document = extractor.getDocument();
-
-        if (document instanceof XSLFSlideShow) {
-            return new XSLFPowerPointExtractorDecorator(
-                    (XSLFPowerPointExtractor) extractor);
-        } else if (document instanceof XSSFWorkbook) {
-            return new XSSFExcelExtractorDecorator(
-                    (XSSFExcelExtractor) extractor, locale);
-        } else if (document instanceof XWPFDocument) {
-            return new XWPFWordExtractorDecorator((XWPFWordExtractor) extractor);
-        } else {
-            return new POIXMLTextExtractorDecorator(extractor);
+    public static void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, Locale locale)
+            throws IOException, SAXException, TikaException {
+        try {
+            OOXMLExtractor extractor;
+
+            POIXMLTextExtractor poiExtractor =
+                (POIXMLTextExtractor) ExtractorFactory.createExtractor(stream);
+            POIXMLDocument document = poiExtractor.getDocument();
+            if (document instanceof XSLFSlideShow) {
+                extractor = new XSLFPowerPointExtractorDecorator(
+                        (XSLFPowerPointExtractor) poiExtractor);
+            } else if (document instanceof XSSFWorkbook) {
+                extractor = new XSSFExcelExtractorDecorator(
+                        (XSSFExcelExtractor) poiExtractor, locale);
+            } else if (document instanceof XWPFDocument) {
+                extractor = new XWPFWordExtractorDecorator(
+                        (XWPFWordExtractor) poiExtractor);
+            } else {
+                extractor = new POIXMLTextExtractorDecorator(poiExtractor);
+            }
+
+            extractor.getMetadataExtractor().extract(metadata);
+            extractor.getXHTML(handler, metadata);
+        } catch (InvalidFormatException e) {
+            throw new TikaException("Error creating OOXML extractor", e);
+        } catch (OpenXML4JException e) {
+            throw new TikaException("Error creating OOXML extractor", e);
+        } catch (XmlException e) {
+            throw new TikaException("Error creating OOXML extractor", e);
+
         }
     }
+
 }

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java?rev=911443&r1=911442&r2=911443&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java Thu Feb 18 15:34:02 2010
@@ -24,16 +24,11 @@
 import java.util.Locale;
 import java.util.Set;
 
-import org.apache.poi.POIXMLTextExtractor;
-import org.apache.poi.extractor.ExtractorFactory;
-import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
-import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
-import org.apache.xmlbeans.XmlException;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -70,20 +65,8 @@
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
-        try {
-            Locale locale = context.get(Locale.class, Locale.getDefault());
-            OOXMLExtractor extractor = OOXMLExtractorFactory.createExtractor(
-                    (POIXMLTextExtractor) ExtractorFactory.createExtractor(stream),
-                    locale);
-            extractor.getMetadataExtractor().extract(metadata);
-            extractor.getXHTML(handler, metadata);
-        } catch (InvalidFormatException e) {
-            throw new TikaException("Error creating OOXML extractor", e);
-        } catch (OpenXML4JException e) {
-            throw new TikaException("Error creating OOXML extractor", e);
-        } catch (XmlException e) {
-            throw new TikaException("Error creating OOXML extractor", e);
-        }
+        Locale locale = context.get(Locale.class, Locale.getDefault());
+        OOXMLExtractorFactory.parse(stream, handler, metadata, locale);
     }
 
     /**