You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/02/18 16:34:03 UTC
svn commit: r911443 - in
/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft:
OfficeParser.java SummaryExtractor.java ooxml/OOXMLExtractorFactory.java
ooxml/OOXMLParser.java
Author: jukka
Date: Thu Feb 18 15:34:02 2010
New Revision: 911443
URL: http://svn.apache.org/viewvc?rev=911443&view=rev
Log:
TIKA-378: TikaConfig should notify users if it cannot initialize some parser
Make the Microsoft Office parsers loadable even when the POI libraries are not present
Added:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java (with props)
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=911443&r1=911442&r2=911443&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Thu Feb 18 15:34:02 2010
@@ -16,12 +16,10 @@
*/
package org.apache.tika.parser.microsoft;
-import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
-import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
@@ -29,18 +27,10 @@
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
-import org.apache.poi.hpsf.CustomProperties;
-import org.apache.poi.hpsf.DocumentSummaryInformation;
-import org.apache.poi.hpsf.MarkUnsupportedException;
-import org.apache.poi.hpsf.NoPropertySetStreamException;
-import org.apache.poi.hpsf.PropertySet;
-import org.apache.poi.hpsf.SummaryInformation;
-import org.apache.poi.hpsf.UnexpectedPropertySetTypeException;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DocumentEntry;
-import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.exception.TikaException;
@@ -57,12 +47,6 @@
*/
public class OfficeParser implements Parser {
- private static final String SUMMARY_INFORMATION =
- SummaryInformation.DEFAULT_STREAM_NAME;
-
- private static final String DOCUMENT_SUMMARY_INFORMATION =
- DocumentSummaryInformation.DEFAULT_STREAM_NAME;
-
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
MediaType.application("x-tika-msoffice"),
@@ -90,10 +74,7 @@
POIFSFileSystem filesystem = new POIFSFileSystem(stream);
// Parse summary entries first, to make metadata available early
- parseSummaryEntryIfExists(
- filesystem, SUMMARY_INFORMATION, metadata);
- parseSummaryEntryIfExists(
- filesystem, DOCUMENT_SUMMARY_INFORMATION, metadata);
+ new SummaryExtractor(metadata).parseSummaries(filesystem);
// Parse remaining document entries
boolean outlookExtracted = false;
@@ -169,91 +150,10 @@
parse(stream, handler, metadata, new ParseContext());
}
- private void parseSummaryEntryIfExists(
- POIFSFileSystem filesystem, String entryName, Metadata metadata)
- throws IOException, TikaException {
- try {
- DocumentEntry entry =
- (DocumentEntry) filesystem.getRoot().getEntry(entryName);
- PropertySet properties =
- new PropertySet(new DocumentInputStream(entry));
- if (properties.isSummaryInformation()) {
- parse(new SummaryInformation(properties), metadata);
- }
- if (properties.isDocumentSummaryInformation()) {
- parse(new DocumentSummaryInformation(properties), metadata);
- }
- } catch (FileNotFoundException e) {
- // entry does not exist, just skip it
- } catch (NoPropertySetStreamException e) {
- throw new TikaException("Not a HPSF document", e);
- } catch (UnexpectedPropertySetTypeException e) {
- throw new TikaException("Unexpected HPSF document", e);
- } catch (MarkUnsupportedException e) {
- throw new TikaException("Invalid DocumentInputStream", e);
- }
- }
-
- private void parse(SummaryInformation summary, Metadata metadata) {
- set(metadata, Metadata.TITLE, summary.getTitle());
- set(metadata, Metadata.AUTHOR, summary.getAuthor());
- set(metadata, Metadata.KEYWORDS, summary.getKeywords());
- set(metadata, Metadata.SUBJECT, summary.getSubject());
- set(metadata, Metadata.LAST_AUTHOR, summary.getLastAuthor());
- set(metadata, Metadata.COMMENTS, summary.getComments());
- set(metadata, Metadata.TEMPLATE, summary.getTemplate());
- set(metadata, Metadata.APPLICATION_NAME, summary.getApplicationName());
- set(metadata, Metadata.REVISION_NUMBER, summary.getRevNumber());
- set(metadata, Metadata.CREATION_DATE, summary.getCreateDateTime());
- set(metadata, Metadata.CHARACTER_COUNT, summary.getCharCount());
- set(metadata, Metadata.EDIT_TIME, summary.getEditTime());
- set(metadata, Metadata.LAST_SAVED, summary.getLastSaveDateTime());
- set(metadata, Metadata.PAGE_COUNT, summary.getPageCount());
- set(metadata, Metadata.SECURITY, summary.getSecurity());
- set(metadata, Metadata.WORD_COUNT, summary.getWordCount());
- set(metadata, Metadata.LAST_PRINTED, summary.getLastPrinted());
- }
-
- private void parse(DocumentSummaryInformation summary, Metadata metadata) {
- set(metadata, Metadata.COMPANY, summary.getCompany());
- set(metadata, Metadata.MANAGER, summary.getManager());
- set(metadata, Metadata.LANGUAGE, getLanguage(summary));
- set(metadata, Metadata.CATEGORY, summary.getCategory());
- }
-
- private String getLanguage(DocumentSummaryInformation summary) {
- CustomProperties customProperties = summary.getCustomProperties();
- if (customProperties != null) {
- Object value = customProperties.get("Language");
- if (value instanceof String) {
- return (String) value;
- }
- }
- return null;
- }
-
private void setType(Metadata metadata, String type) {
metadata.set(Metadata.CONTENT_TYPE, type);
}
- private void set(Metadata metadata, String name, String value) {
- if (value != null) {
- metadata.set(name, value);
- }
- }
-
- private void set(Metadata metadata, String name, Date value) {
- if (value != null) {
- metadata.set(name, value.toString());
- }
- }
-
- private void set(Metadata metadata, String name, long value) {
- if (value > 0) {
- metadata.set(name, Long.toString(value));
- }
- }
-
/**
* Outputs a section of text if the given text is non-empty.
*
Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java?rev=911443&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java Thu Feb 18 15:34:02 2010
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Date;
+
+import org.apache.poi.hpsf.CustomProperties;
+import org.apache.poi.hpsf.DocumentSummaryInformation;
+import org.apache.poi.hpsf.MarkUnsupportedException;
+import org.apache.poi.hpsf.NoPropertySetStreamException;
+import org.apache.poi.hpsf.PropertySet;
+import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.hpsf.UnexpectedPropertySetTypeException;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Extracts the HPSF summary and document summary properties of a POIFS file system into Tika metadata.
+ */
+class SummaryExtractor {
+
+ private static final String SUMMARY_INFORMATION =
+ SummaryInformation.DEFAULT_STREAM_NAME;
+
+ private static final String DOCUMENT_SUMMARY_INFORMATION =
+ DocumentSummaryInformation.DEFAULT_STREAM_NAME;
+
+ private final Metadata metadata;
+
+ public SummaryExtractor(Metadata metadata) {
+ this.metadata = metadata;
+ }
+
+ public void parseSummaries(POIFSFileSystem filesystem)
+ throws IOException, TikaException {
+ parseSummaryEntryIfExists(filesystem, SUMMARY_INFORMATION);
+ parseSummaryEntryIfExists(filesystem, DOCUMENT_SUMMARY_INFORMATION);
+ }
+
+ private void parseSummaryEntryIfExists(
+ POIFSFileSystem filesystem, String entryName)
+ throws IOException, TikaException {
+ try {
+ DocumentEntry entry =
+ (DocumentEntry) filesystem.getRoot().getEntry(entryName);
+ PropertySet properties =
+ new PropertySet(new DocumentInputStream(entry));
+ if (properties.isSummaryInformation()) {
+ parse(new SummaryInformation(properties));
+ }
+ if (properties.isDocumentSummaryInformation()) {
+ parse(new DocumentSummaryInformation(properties));
+ }
+ } catch (FileNotFoundException e) {
+ // entry does not exist, just skip it
+ } catch (NoPropertySetStreamException e) {
+ throw new TikaException("Not a HPSF document", e);
+ } catch (UnexpectedPropertySetTypeException e) {
+ throw new TikaException("Unexpected HPSF document", e);
+ } catch (MarkUnsupportedException e) {
+ throw new TikaException("Invalid DocumentInputStream", e);
+ }
+ }
+
+ private void parse(SummaryInformation summary) {
+ set(Metadata.TITLE, summary.getTitle());
+ set(Metadata.AUTHOR, summary.getAuthor());
+ set(Metadata.KEYWORDS, summary.getKeywords());
+ set(Metadata.SUBJECT, summary.getSubject());
+ set(Metadata.LAST_AUTHOR, summary.getLastAuthor());
+ set(Metadata.COMMENTS, summary.getComments());
+ set(Metadata.TEMPLATE, summary.getTemplate());
+ set(Metadata.APPLICATION_NAME, summary.getApplicationName());
+ set(Metadata.REVISION_NUMBER, summary.getRevNumber());
+ set(Metadata.CREATION_DATE, summary.getCreateDateTime());
+ set(Metadata.CHARACTER_COUNT, summary.getCharCount());
+ set(Metadata.EDIT_TIME, summary.getEditTime());
+ set(Metadata.LAST_SAVED, summary.getLastSaveDateTime());
+ set(Metadata.PAGE_COUNT, summary.getPageCount());
+ set(Metadata.SECURITY, summary.getSecurity());
+ set(Metadata.WORD_COUNT, summary.getWordCount());
+ set(Metadata.LAST_PRINTED, summary.getLastPrinted());
+ }
+
+ private void parse(DocumentSummaryInformation summary) {
+ set(Metadata.COMPANY, summary.getCompany());
+ set(Metadata.MANAGER, summary.getManager());
+ set(Metadata.LANGUAGE, getLanguage(summary));
+ set(Metadata.CATEGORY, summary.getCategory());
+ }
+
+ private String getLanguage(DocumentSummaryInformation summary) {
+ CustomProperties customProperties = summary.getCustomProperties();
+ if (customProperties != null) {
+ Object value = customProperties.get("Language");
+ if (value instanceof String) {
+ return (String) value;
+ }
+ }
+ return null;
+ }
+
+ private void set(String name, String value) {
+ if (value != null) {
+ metadata.set(name, value);
+ }
+ }
+
+ private void set(String name, Date value) {
+ if (value != null) {
+ metadata.set(name, value.toString());
+ }
+ }
+
+ private void set(String name, long value) {
+ if (value > 0) {
+ metadata.set(name, Long.toString(value));
+ }
+ }
+
+}
Propchange: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java?rev=911443&r1=911442&r2=911443&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java Thu Feb 18 15:34:02 2010
@@ -16,16 +16,26 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
+import java.io.IOException;
+import java.io.InputStream;
import java.util.Locale;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.extractor.ExtractorFactory;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.xslf.XSLFSlideShow;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
* Figures out the correct {@link OOXMLExtractor} for the supplied document and
@@ -33,20 +43,39 @@
*/
public class OOXMLExtractorFactory {
- public static OOXMLExtractor createExtractor(
- POIXMLTextExtractor extractor, Locale locale) {
- POIXMLDocument document = extractor.getDocument();
-
- if (document instanceof XSLFSlideShow) {
- return new XSLFPowerPointExtractorDecorator(
- (XSLFPowerPointExtractor) extractor);
- } else if (document instanceof XSSFWorkbook) {
- return new XSSFExcelExtractorDecorator(
- (XSSFExcelExtractor) extractor, locale);
- } else if (document instanceof XWPFDocument) {
- return new XWPFWordExtractorDecorator((XWPFWordExtractor) extractor);
- } else {
- return new POIXMLTextExtractorDecorator(extractor);
+ public static void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, Locale locale)
+ throws IOException, SAXException, TikaException {
+ try {
+ OOXMLExtractor extractor;
+
+ POIXMLTextExtractor poiExtractor =
+ (POIXMLTextExtractor) ExtractorFactory.createExtractor(stream);
+ POIXMLDocument document = poiExtractor.getDocument();
+ if (document instanceof XSLFSlideShow) {
+ extractor = new XSLFPowerPointExtractorDecorator(
+ (XSLFPowerPointExtractor) poiExtractor);
+ } else if (document instanceof XSSFWorkbook) {
+ extractor = new XSSFExcelExtractorDecorator(
+ (XSSFExcelExtractor) poiExtractor, locale);
+ } else if (document instanceof XWPFDocument) {
+ extractor = new XWPFWordExtractorDecorator(
+ (XWPFWordExtractor) poiExtractor);
+ } else {
+ extractor = new POIXMLTextExtractorDecorator(poiExtractor);
+ }
+
+ extractor.getMetadataExtractor().extract(metadata);
+ extractor.getXHTML(handler, metadata);
+ } catch (InvalidFormatException e) {
+ throw new TikaException("Error creating OOXML extractor", e);
+ } catch (OpenXML4JException e) {
+ throw new TikaException("Error creating OOXML extractor", e);
+ } catch (XmlException e) {
+ throw new TikaException("Error creating OOXML extractor", e);
+
}
}
+
}
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java?rev=911443&r1=911442&r2=911443&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java Thu Feb 18 15:34:02 2010
@@ -24,16 +24,11 @@
import java.util.Locale;
import java.util.Set;
-import org.apache.poi.POIXMLTextExtractor;
-import org.apache.poi.extractor.ExtractorFactory;
-import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
-import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
-import org.apache.xmlbeans.XmlException;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -70,20 +65,8 @@
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
- try {
- Locale locale = context.get(Locale.class, Locale.getDefault());
- OOXMLExtractor extractor = OOXMLExtractorFactory.createExtractor(
- (POIXMLTextExtractor) ExtractorFactory.createExtractor(stream),
- locale);
- extractor.getMetadataExtractor().extract(metadata);
- extractor.getXHTML(handler, metadata);
- } catch (InvalidFormatException e) {
- throw new TikaException("Error creating OOXML extractor", e);
- } catch (OpenXML4JException e) {
- throw new TikaException("Error creating OOXML extractor", e);
- } catch (XmlException e) {
- throw new TikaException("Error creating OOXML extractor", e);
- }
+ Locale locale = context.get(Locale.class, Locale.getDefault());
+ OOXMLExtractorFactory.parse(stream, handler, metadata, locale);
}
/**