You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/09/02 16:40:56 UTC
svn commit: r991955 [5/6] - in /tika/trunk:
tika-core/src/test/java/org/apache/tika/
tika-core/src/test/java/org/apache/tika/detect/
tika-core/src/test/java/org/apache/tika/language/
tika-core/src/test/java/org/apache/tika/sax/ tika-core/src/test/resou...
Propchange: tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/testlargerbuffer.html
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java?rev=991955&r1=991954&r2=991955&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java Thu Sep 2 14:40:55 2010
@@ -1,62 +1,62 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.jpeg;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-public class JpegParser implements Parser {
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.singleton(MediaType.image("jpeg"));
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- /**
- * @deprecated This method will be removed in Apache Tika 1.0.
- */
- public void parse(
- InputStream stream, ContentHandler handler, Metadata metadata)
- throws IOException, SAXException, TikaException {
- parse(stream, handler, metadata, new ParseContext());
- }
-
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- new JpegExtractor(metadata).parse(stream);
-
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
- xhtml.endDocument();
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.jpeg;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class JpegParser implements Parser {
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.singleton(MediaType.image("jpeg"));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ /**
+ * @deprecated This method will be removed in Apache Tika 1.0.
+ */
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata)
+ throws IOException, SAXException, TikaException {
+ parse(stream, handler, metadata, new ParseContext());
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ new JpegExtractor(metadata).parse(stream);
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.endDocument();
+ }
+
+}
Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java?rev=991955&r1=991954&r2=991955&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java Thu Sep 2 14:40:55 2010
@@ -1,78 +1,78 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft.ooxml;
-
-import java.io.IOException;
-
-import org.apache.poi.POIXMLDocument;
-import org.apache.poi.POIXMLTextExtractor;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.apache.xmlbeans.XmlException;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Base class for all Tika OOXML extractors.
- *
- * Tika extractors decorate POI extractors so that the parsed content of
- * documents is returned as a sequence of XHTML SAX events. Subclasses must
- * implement the buildXHTML method {@link #buildXHTML(XHTMLContentHandler)} that
- * populates the {@link XHTMLContentHandler} object received as parameter.
- */
-public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
- protected POIXMLTextExtractor extractor;
-
- private final String type;
-
- public AbstractOOXMLExtractor(POIXMLTextExtractor extractor, String type) {
- this.extractor = extractor;
- this.type = type;
- }
-
- /**
- * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getDocument()
- */
- public POIXMLDocument getDocument() {
- return extractor.getDocument();
- }
-
- /**
- * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getMetadataExtractor()
- */
- public MetadataExtractor getMetadataExtractor() {
- return new MetadataExtractor(extractor, type);
- }
-
- /**
- * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getXHTML(org.xml.sax.ContentHandler,
- * org.apache.tika.metadata.Metadata)
- */
- public void getXHTML(ContentHandler handler, Metadata metadata)
- throws SAXException, XmlException, IOException {
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
- buildXHTML(xhtml);
- xhtml.endDocument();
- }
-
- /**
- * Populates the {@link XHTMLContentHandler} object received as parameter.
- */
- protected abstract void buildXHTML(XHTMLContentHandler xhtml)
- throws SAXException, XmlException, IOException;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Base class for all Tika OOXML extractors.
+ *
+ * Tika extractors decorate POI extractors so that the parsed content of
+ * documents is returned as a sequence of XHTML SAX events. Subclasses must
+ * implement the buildXHTML method {@link #buildXHTML(XHTMLContentHandler)} that
+ * populates the {@link XHTMLContentHandler} object received as parameter.
+ */
+public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
+ protected POIXMLTextExtractor extractor;
+
+ private final String type;
+
+ public AbstractOOXMLExtractor(POIXMLTextExtractor extractor, String type) {
+ this.extractor = extractor;
+ this.type = type;
+ }
+
+ /**
+ * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getDocument()
+ */
+ public POIXMLDocument getDocument() {
+ return extractor.getDocument();
+ }
+
+ /**
+ * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getMetadataExtractor()
+ */
+ public MetadataExtractor getMetadataExtractor() {
+ return new MetadataExtractor(extractor, type);
+ }
+
+ /**
+ * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getXHTML(org.xml.sax.ContentHandler,
+ * org.apache.tika.metadata.Metadata)
+ */
+ public void getXHTML(ContentHandler handler, Metadata metadata)
+ throws SAXException, XmlException, IOException {
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ buildXHTML(xhtml);
+ xhtml.endDocument();
+ }
+
+ /**
+ * Populates the {@link XHTMLContentHandler} object received as parameter.
+ */
+ protected abstract void buildXHTML(XHTMLContentHandler xhtml)
+ throws SAXException, XmlException, IOException;
+}
Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java?rev=991955&r1=991954&r2=991955&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java Thu Sep 2 14:40:55 2010
@@ -1,147 +1,147 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft.ooxml;
-
-import java.util.Date;
-
-import org.apache.poi.POIXMLTextExtractor;
-import org.apache.poi.POIXMLProperties.CoreProperties;
-import org.apache.poi.POIXMLProperties.ExtendedProperties;
-import org.apache.poi.openxml4j.opc.internal.PackagePropertiesPart;
-import org.apache.poi.openxml4j.util.Nullable;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.PagedText;
-import org.apache.tika.metadata.Property;
-import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
-
-/**
- * OOXML metadata extractor.
- *
- * Currently POI doesn't support metadata extraction for OOXML.
- *
- * @see OOXMLExtractor#getMetadataExtractor()
- */
-public class MetadataExtractor {
-
- private final POIXMLTextExtractor extractor;
-
- private final String type;
-
- public MetadataExtractor(POIXMLTextExtractor extractor, String type) {
- this.extractor = extractor;
- this.type = type;
- }
-
- public void extract(Metadata metadata) throws TikaException {
- addProperty(metadata, Metadata.CONTENT_TYPE, type);
- extractMetadata(extractor.getCoreProperties(), metadata);
- extractMetadata(extractor.getExtendedProperties(), metadata);
- }
-
- private void extractMetadata(CoreProperties properties, Metadata metadata) {
- PackagePropertiesPart propsHolder = properties
- .getUnderlyingProperties();
-
- addProperty(metadata, Metadata.CATEGORY, propsHolder.getCategoryProperty());
- addProperty(metadata, Metadata.CONTENT_STATUS, propsHolder
- .getContentStatusProperty());
- addProperty(metadata, Metadata.DATE, propsHolder
- .getCreatedPropertyString());
- addProperty(metadata, Metadata.CREATION_DATE, propsHolder
- .getCreatedProperty());
- addProperty(metadata, Metadata.CREATOR, propsHolder
- .getCreatorProperty());
- addProperty(metadata, Metadata.AUTHOR, propsHolder
- .getCreatorProperty());
- addProperty(metadata, Metadata.DESCRIPTION, propsHolder
- .getDescriptionProperty());
- addProperty(metadata, Metadata.IDENTIFIER, propsHolder
- .getIdentifierProperty());
- addProperty(metadata, Metadata.KEYWORDS, propsHolder
- .getKeywordsProperty());
- addProperty(metadata, Metadata.LANGUAGE, propsHolder
- .getLanguageProperty());
- addProperty(metadata, Metadata.LAST_AUTHOR, propsHolder
- .getLastModifiedByProperty());
- addProperty(metadata, Metadata.LAST_PRINTED, propsHolder
- .getLastPrintedPropertyString());
- addProperty(metadata, Metadata.LAST_MODIFIED, propsHolder
- .getModifiedProperty());
- addProperty(metadata, Metadata.REVISION_NUMBER, propsHolder
- .getRevisionProperty());
- addProperty(metadata, Metadata.SUBJECT, propsHolder
- .getSubjectProperty());
- addProperty(metadata, Metadata.TITLE, propsHolder.getTitleProperty());
- addProperty(metadata, Metadata.VERSION, propsHolder.getVersionProperty());
- }
-
- private void extractMetadata(ExtendedProperties properties,
- Metadata metadata) {
- CTProperties propsHolder = properties.getUnderlyingProperties();
-
- addProperty(metadata, Metadata.APPLICATION_NAME, propsHolder
- .getApplication());
- addProperty(metadata, Metadata.APPLICATION_VERSION, propsHolder
- .getAppVersion());
- addProperty(metadata, Metadata.CHARACTER_COUNT, propsHolder
- .getCharacters());
- addProperty(metadata, Metadata.CHARACTER_COUNT_WITH_SPACES, propsHolder
- .getCharactersWithSpaces());
- addProperty(metadata, Metadata.PUBLISHER, propsHolder.getCompany());
- addProperty(metadata, Metadata.LINE_COUNT, propsHolder.getLines());
- addProperty(metadata, Metadata.MANAGER, propsHolder.getManager());
- addProperty(metadata, Metadata.NOTES, propsHolder.getNotes());
- addProperty(metadata, Metadata.PAGE_COUNT, propsHolder.getPages());
- if (propsHolder.getPages() > 0) {
- metadata.set(PagedText.N_PAGES, propsHolder.getPages());
- } else if (propsHolder.getSlides() > 0) {
- metadata.set(PagedText.N_PAGES, propsHolder.getSlides());
- }
- addProperty(metadata, Metadata.PARAGRAPH_COUNT, propsHolder.getParagraphs());
- addProperty(metadata, Metadata.PRESENTATION_FORMAT, propsHolder
- .getPresentationFormat());
- addProperty(metadata, Metadata.SLIDE_COUNT, propsHolder.getSlides());
- addProperty(metadata, Metadata.TEMPLATE, propsHolder.getTemplate());
- addProperty(metadata, Metadata.TOTAL_TIME, propsHolder.getTotalTime());
- addProperty(metadata, Metadata.WORD_COUNT, propsHolder.getWords());
- }
-
- private void addProperty(Metadata metadata, Property property, Nullable<Date> value) {
- if (value.getValue() != null) {
- metadata.set(property, value.getValue());
- }
- }
-
- private void addProperty(Metadata metadata, String name, Nullable<?> value) {
- if (value.getValue() != null) {
- addProperty(metadata, name, value.getValue().toString());
- }
- }
-
- private void addProperty(Metadata metadata, String name, String value) {
- if (value != null) {
- metadata.set(name, value);
- }
- }
-
- private void addProperty(Metadata metadata, String name, long value) {
- if (value > 0) {
- metadata.set(name, Long.toString(value));
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.util.Date;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.POIXMLProperties.CoreProperties;
+import org.apache.poi.POIXMLProperties.ExtendedProperties;
+import org.apache.poi.openxml4j.opc.internal.PackagePropertiesPart;
+import org.apache.poi.openxml4j.util.Nullable;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PagedText;
+import org.apache.tika.metadata.Property;
+import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
+
+/**
+ * OOXML metadata extractor.
+ *
+ * Currently POI doesn't support metadata extraction for OOXML.
+ *
+ * @see OOXMLExtractor#getMetadataExtractor()
+ */
+public class MetadataExtractor {
+
+ private final POIXMLTextExtractor extractor;
+
+ private final String type;
+
+ public MetadataExtractor(POIXMLTextExtractor extractor, String type) {
+ this.extractor = extractor;
+ this.type = type;
+ }
+
+ public void extract(Metadata metadata) throws TikaException {
+ addProperty(metadata, Metadata.CONTENT_TYPE, type);
+ extractMetadata(extractor.getCoreProperties(), metadata);
+ extractMetadata(extractor.getExtendedProperties(), metadata);
+ }
+
+ private void extractMetadata(CoreProperties properties, Metadata metadata) {
+ PackagePropertiesPart propsHolder = properties
+ .getUnderlyingProperties();
+
+ addProperty(metadata, Metadata.CATEGORY, propsHolder.getCategoryProperty());
+ addProperty(metadata, Metadata.CONTENT_STATUS, propsHolder
+ .getContentStatusProperty());
+ addProperty(metadata, Metadata.DATE, propsHolder
+ .getCreatedPropertyString());
+ addProperty(metadata, Metadata.CREATION_DATE, propsHolder
+ .getCreatedProperty());
+ addProperty(metadata, Metadata.CREATOR, propsHolder
+ .getCreatorProperty());
+ addProperty(metadata, Metadata.AUTHOR, propsHolder
+ .getCreatorProperty());
+ addProperty(metadata, Metadata.DESCRIPTION, propsHolder
+ .getDescriptionProperty());
+ addProperty(metadata, Metadata.IDENTIFIER, propsHolder
+ .getIdentifierProperty());
+ addProperty(metadata, Metadata.KEYWORDS, propsHolder
+ .getKeywordsProperty());
+ addProperty(metadata, Metadata.LANGUAGE, propsHolder
+ .getLanguageProperty());
+ addProperty(metadata, Metadata.LAST_AUTHOR, propsHolder
+ .getLastModifiedByProperty());
+ addProperty(metadata, Metadata.LAST_PRINTED, propsHolder
+ .getLastPrintedPropertyString());
+ addProperty(metadata, Metadata.LAST_MODIFIED, propsHolder
+ .getModifiedProperty());
+ addProperty(metadata, Metadata.REVISION_NUMBER, propsHolder
+ .getRevisionProperty());
+ addProperty(metadata, Metadata.SUBJECT, propsHolder
+ .getSubjectProperty());
+ addProperty(metadata, Metadata.TITLE, propsHolder.getTitleProperty());
+ addProperty(metadata, Metadata.VERSION, propsHolder.getVersionProperty());
+ }
+
+ private void extractMetadata(ExtendedProperties properties,
+ Metadata metadata) {
+ CTProperties propsHolder = properties.getUnderlyingProperties();
+
+ addProperty(metadata, Metadata.APPLICATION_NAME, propsHolder
+ .getApplication());
+ addProperty(metadata, Metadata.APPLICATION_VERSION, propsHolder
+ .getAppVersion());
+ addProperty(metadata, Metadata.CHARACTER_COUNT, propsHolder
+ .getCharacters());
+ addProperty(metadata, Metadata.CHARACTER_COUNT_WITH_SPACES, propsHolder
+ .getCharactersWithSpaces());
+ addProperty(metadata, Metadata.PUBLISHER, propsHolder.getCompany());
+ addProperty(metadata, Metadata.LINE_COUNT, propsHolder.getLines());
+ addProperty(metadata, Metadata.MANAGER, propsHolder.getManager());
+ addProperty(metadata, Metadata.NOTES, propsHolder.getNotes());
+ addProperty(metadata, Metadata.PAGE_COUNT, propsHolder.getPages());
+ if (propsHolder.getPages() > 0) {
+ metadata.set(PagedText.N_PAGES, propsHolder.getPages());
+ } else if (propsHolder.getSlides() > 0) {
+ metadata.set(PagedText.N_PAGES, propsHolder.getSlides());
+ }
+ addProperty(metadata, Metadata.PARAGRAPH_COUNT, propsHolder.getParagraphs());
+ addProperty(metadata, Metadata.PRESENTATION_FORMAT, propsHolder
+ .getPresentationFormat());
+ addProperty(metadata, Metadata.SLIDE_COUNT, propsHolder.getSlides());
+ addProperty(metadata, Metadata.TEMPLATE, propsHolder.getTemplate());
+ addProperty(metadata, Metadata.TOTAL_TIME, propsHolder.getTotalTime());
+ addProperty(metadata, Metadata.WORD_COUNT, propsHolder.getWords());
+ }
+
+ private void addProperty(Metadata metadata, Property property, Nullable<Date> value) {
+ if (value.getValue() != null) {
+ metadata.set(property, value.getValue());
+ }
+ }
+
+ private void addProperty(Metadata metadata, String name, Nullable<?> value) {
+ if (value.getValue() != null) {
+ addProperty(metadata, name, value.getValue().toString());
+ }
+ }
+
+ private void addProperty(Metadata metadata, String name, String value) {
+ if (value != null) {
+ metadata.set(name, value);
+ }
+ }
+
+ private void addProperty(Metadata metadata, String name, long value) {
+ if (value > 0) {
+ metadata.set(name, Long.toString(value));
+ }
+ }
+}
Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java?rev=991955&r1=991954&r2=991955&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java Thu Sep 2 14:40:55 2010
@@ -1,53 +1,53 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft.ooxml;
-
-import java.io.IOException;
-
-import org.apache.poi.POIXMLDocument;
-import org.apache.tika.metadata.Metadata;
-import org.apache.xmlbeans.XmlException;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Interface implemented by all Tika OOXML extractors.
- *
- * @see org.apache.poi.POIXMLTextExtractor
- */
-public interface OOXMLExtractor {
-
- /**
- * Returns the opened document.
- *
- * @see POIXMLTextExtractor#getDocument()
- */
- POIXMLDocument getDocument();
-
- /**
- * {@link POIXMLTextExtractor#getMetadataTextExtractor()} not yet supported
- * for OOXML by POI.
- */
- MetadataExtractor getMetadataExtractor();
-
- /**
- * Parses the document into a sequence of XHTML SAX events sent to the
- * given content handler.
- */
- void getXHTML(ContentHandler handler, Metadata metadata)
- throws SAXException, XmlException, IOException;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.tika.metadata.Metadata;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Interface implemented by all Tika OOXML extractors.
+ *
+ * @see org.apache.poi.POIXMLTextExtractor
+ */
+public interface OOXMLExtractor {
+
+ /**
+ * Returns the opened document.
+ *
+ * @see POIXMLTextExtractor#getDocument()
+ */
+ POIXMLDocument getDocument();
+
+ /**
+ * {@link POIXMLTextExtractor#getMetadataTextExtractor()} not yet supported
+ * for OOXML by POI.
+ */
+ MetadataExtractor getMetadataExtractor();
+
+ /**
+ * Parses the document into a sequence of XHTML SAX events sent to the
+ * given content handler.
+ */
+ void getXHTML(ContentHandler handler, Metadata metadata)
+ throws SAXException, XmlException, IOException;
+}
Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java?rev=991955&r1=991954&r2=991955&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java Thu Sep 2 14:40:55 2010
@@ -1,99 +1,99 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft.ooxml;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Locale;
-
-import org.apache.poi.POIXMLDocument;
-import org.apache.poi.POIXMLTextExtractor;
-import org.apache.poi.extractor.ExtractorFactory;
-import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
-import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
-import org.apache.poi.openxml4j.opc.OPCPackage;
-import org.apache.poi.xslf.XSLFSlideShow;
-import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
-import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
-import org.apache.poi.xssf.usermodel.XSSFWorkbook;
-import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
-import org.apache.poi.xwpf.usermodel.XWPFDocument;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.xmlbeans.XmlException;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Figures out the correct {@link OOXMLExtractor} for the supplied document and
- * returns it.
- */
-public class OOXMLExtractorFactory {
-
- public static void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, Locale locale)
- throws IOException, SAXException, TikaException {
- try {
- OOXMLExtractor extractor;
-
- POIXMLTextExtractor poiExtractor;
- if(stream instanceof TikaInputStream &&
- ((TikaInputStream)stream).getOpenContainer() != null) {
- poiExtractor = ExtractorFactory.createExtractor(
- (OPCPackage)((TikaInputStream)stream).getOpenContainer()
- );
- } else {
- poiExtractor = (POIXMLTextExtractor) ExtractorFactory.createExtractor(stream);
- }
-
- POIXMLDocument document = poiExtractor.getDocument();
- if (document instanceof XSLFSlideShow) {
- extractor = new XSLFPowerPointExtractorDecorator(
- (XSLFPowerPointExtractor) poiExtractor);
- } else if (document instanceof XSSFWorkbook) {
- extractor = new XSSFExcelExtractorDecorator(
- (XSSFExcelExtractor) poiExtractor, locale);
- } else if (document instanceof XWPFDocument) {
- extractor = new XWPFWordExtractorDecorator(
- (XWPFWordExtractor) poiExtractor);
- } else {
- extractor = new POIXMLTextExtractorDecorator(poiExtractor);
- }
-
- extractor.getMetadataExtractor().extract(metadata);
- extractor.getXHTML(handler, metadata);
- } catch (IllegalArgumentException e) {
- if (e.getMessage().startsWith("No supported documents found")) {
- throw new TikaException(
- "TIKA-418: RuntimeException while getting content"
- + " for thmx and xps file types", e);
- } else {
- throw new TikaException("Error creating OOXML extractor", e);
- }
- } catch (InvalidFormatException e) {
- throw new TikaException("Error creating OOXML extractor", e);
- } catch (OpenXML4JException e) {
- throw new TikaException("Error creating OOXML extractor", e);
- } catch (XmlException e) {
- throw new TikaException("Error creating OOXML extractor", e);
-
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Locale;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.extractor.ExtractorFactory;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.xslf.XSLFSlideShow;
+import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Figures out the correct {@link OOXMLExtractor} for the supplied document and
+ * returns it.
+ */
+public class OOXMLExtractorFactory {
+
+ public static void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, Locale locale)
+ throws IOException, SAXException, TikaException {
+ try {
+ OOXMLExtractor extractor;
+
+ POIXMLTextExtractor poiExtractor;
+ if(stream instanceof TikaInputStream &&
+ ((TikaInputStream)stream).getOpenContainer() != null) {
+ poiExtractor = ExtractorFactory.createExtractor(
+ (OPCPackage)((TikaInputStream)stream).getOpenContainer()
+ );
+ } else {
+ poiExtractor = (POIXMLTextExtractor) ExtractorFactory.createExtractor(stream);
+ }
+
+ POIXMLDocument document = poiExtractor.getDocument();
+ if (document instanceof XSLFSlideShow) {
+ extractor = new XSLFPowerPointExtractorDecorator(
+ (XSLFPowerPointExtractor) poiExtractor);
+ } else if (document instanceof XSSFWorkbook) {
+ extractor = new XSSFExcelExtractorDecorator(
+ (XSSFExcelExtractor) poiExtractor, locale);
+ } else if (document instanceof XWPFDocument) {
+ extractor = new XWPFWordExtractorDecorator(
+ (XWPFWordExtractor) poiExtractor);
+ } else {
+ extractor = new POIXMLTextExtractorDecorator(poiExtractor);
+ }
+
+ extractor.getMetadataExtractor().extract(metadata);
+ extractor.getXHTML(handler, metadata);
+ } catch (IllegalArgumentException e) {
+ if (e.getMessage().startsWith("No supported documents found")) {
+ throw new TikaException(
+ "TIKA-418: RuntimeException while getting content"
+ + " for thmx and xps file types", e);
+ } else {
+ throw new TikaException("Error creating OOXML extractor", e);
+ }
+ } catch (InvalidFormatException e) {
+ throw new TikaException("Error creating OOXML extractor", e);
+ } catch (OpenXML4JException e) {
+ throw new TikaException("Error creating OOXML extractor", e);
+ } catch (XmlException e) {
+ throw new TikaException("Error creating OOXML extractor", e);
+
+ }
+ }
+
+}
Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java?rev=991955&r1=991954&r2=991955&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java Thu Sep 2 14:40:55 2010
@@ -1,81 +1,81 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft.ooxml;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Locale;
-import java.util.Set;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Office Open XML (OOXML) parser.
- *
- */
-public class OOXMLParser implements Parser {
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- MediaType.application("x-tika-ooxml"),
- MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation"),
- MediaType.application("vnd.ms-powerpoint.presentation.macroenabled.12"),
- MediaType.application("vnd.openxmlformats-officedocument.presentationml.template"),
- MediaType.application("vnd.openxmlformats-officedocument.presentationml.slideshow"),
- MediaType.application("vnd.ms-powerpoint.slideshow.macroenabled.12"),
- MediaType.application("vnd.ms-powerpoint.addin.macroenabled.12"),
- MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
- MediaType.application("vnd.ms-excel.sheet.macroenabled.12"),
- MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.template"),
- MediaType.application("vnd.ms-excel.template.macroenabled.12"),
- MediaType.application("vnd.ms-excel.addin.macroenabled.12"),
- MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document"),
- MediaType.application("vnd.ms-word.document.macroenabled.12"),
- MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.template"),
- MediaType.application("vnd.ms-word.template.macroenabled.12"))));
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- Locale locale = context.get(Locale.class, Locale.getDefault());
- OOXMLExtractorFactory.parse(stream, handler, metadata, locale);
- }
-
- /**
- * @deprecated This method will be removed in Apache Tika 1.0.
- */
- public void parse(
- InputStream stream, ContentHandler handler, Metadata metadata)
- throws IOException, SAXException, TikaException {
- parse(stream, handler, metadata, new ParseContext());
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Office Open XML (OOXML) parser.
+ *
+ */
+public class OOXMLParser implements Parser {
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.application("x-tika-ooxml"),
+ MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation"),
+ MediaType.application("vnd.ms-powerpoint.presentation.macroenabled.12"),
+ MediaType.application("vnd.openxmlformats-officedocument.presentationml.template"),
+ MediaType.application("vnd.openxmlformats-officedocument.presentationml.slideshow"),
+ MediaType.application("vnd.ms-powerpoint.slideshow.macroenabled.12"),
+ MediaType.application("vnd.ms-powerpoint.addin.macroenabled.12"),
+ MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
+ MediaType.application("vnd.ms-excel.sheet.macroenabled.12"),
+ MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.template"),
+ MediaType.application("vnd.ms-excel.template.macroenabled.12"),
+ MediaType.application("vnd.ms-excel.addin.macroenabled.12"),
+ MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document"),
+ MediaType.application("vnd.ms-word.document.macroenabled.12"),
+ MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.template"),
+ MediaType.application("vnd.ms-word.template.macroenabled.12"))));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ Locale locale = context.get(Locale.class, Locale.getDefault());
+ OOXMLExtractorFactory.parse(stream, handler, metadata, locale);
+ }
+
+ /**
+ * @deprecated This method will be removed in Apache Tika 1.0.
+ */
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata)
+ throws IOException, SAXException, TikaException {
+ parse(stream, handler, metadata, new ParseContext());
+ }
+
+}
Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java?rev=991955&r1=991954&r2=991955&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java Thu Sep 2 14:40:55 2010
@@ -1,34 +1,34 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft.ooxml;
-
-import org.apache.poi.POIXMLTextExtractor;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.SAXException;
-
-public class POIXMLTextExtractorDecorator extends AbstractOOXMLExtractor {
-
- public POIXMLTextExtractorDecorator(POIXMLTextExtractor extractor) {
- super(extractor, null);
- }
-
- @Override
- protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException {
- // extract document content as a single string (not structured)
- xhtml.element("p", extractor.getText());
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+public class POIXMLTextExtractorDecorator extends AbstractOOXMLExtractor {
+
+ public POIXMLTextExtractorDecorator(POIXMLTextExtractor extractor) {
+ super(extractor, null);
+ }
+
+ @Override
+ protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException {
+ // extract document content as a single string (not structured)
+ xhtml.element("p", extractor.getText());
+ }
+}
Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java?rev=991955&r1=991954&r2=991955&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java Thu Sep 2 14:40:55 2010
@@ -1,96 +1,96 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft.ooxml;
-
-import java.io.IOException;
-
-import org.apache.poi.xslf.XSLFSlideShow;
-import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
-import org.apache.poi.xslf.usermodel.XMLSlideShow;
-import org.apache.poi.xslf.usermodel.XSLFSlide;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.apache.xmlbeans.XmlException;
-import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
-import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
-import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentList;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTShape;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
-import org.xml.sax.SAXException;
-
-public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
-
- public XSLFPowerPointExtractorDecorator(XSLFPowerPointExtractor extractor) {
- super(extractor, "application/vnd.openxmlformats-officedocument.presentationml.presentation");
- }
-
- /**
- * @see org.apache.poi.xslf.extractor.XSLFPowerPointExtractor#getText()
- */
- @Override
- protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
- XmlException, IOException {
- XSLFSlideShow slideShow = (XSLFSlideShow) extractor.getDocument();
- XMLSlideShow xmlSlideShow = new XMLSlideShow(slideShow);
-
- XSLFSlide[] slides = xmlSlideShow.getSlides();
- for (XSLFSlide slide : slides) {
- CTSlide rawSlide = slide._getCTSlide();
- CTSlideIdListEntry slideId = slide._getCTSlideId();
-
- CTNotesSlide notes = xmlSlideShow._getXSLFSlideShow().getNotes(
- slideId);
- CTCommentList comments = xmlSlideShow._getXSLFSlideShow()
- .getSlideComments(slideId);
-
- xhtml.startElement("div");
- extractShapeContent(rawSlide.getCSld().getSpTree(), xhtml);
-
- if (comments != null) {
- for (CTComment comment : comments.getCmArray()) {
- xhtml.element("p", comment.getText());
- }
- }
-
- if (notes != null) {
- extractShapeContent(notes.getCSld().getSpTree(), xhtml);
- }
- xhtml.endElement("div");
- }
- }
-
- private void extractShapeContent(CTGroupShape gs, XHTMLContentHandler xhtml)
- throws SAXException {
- CTShape[] shapes = gs.getSpArray();
- for (CTShape shape : shapes) {
- CTTextBody textBody = shape.getTxBody();
- if (textBody != null) {
- CTTextParagraph[] paras = textBody.getPArray();
- for (CTTextParagraph textParagraph : paras) {
- CTRegularTextRun[] textRuns = textParagraph.getRArray();
- for (CTRegularTextRun textRun : textRuns) {
- xhtml.element("p", textRun.getT());
- }
- }
- }
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+
+import org.apache.poi.xslf.XSLFSlideShow;
+import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xslf.usermodel.XMLSlideShow;
+import org.apache.poi.xslf.usermodel.XSLFSlide;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.xmlbeans.XmlException;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentList;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTShape;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
+import org.xml.sax.SAXException;
+
+public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
+
+ public XSLFPowerPointExtractorDecorator(XSLFPowerPointExtractor extractor) {
+ super(extractor, "application/vnd.openxmlformats-officedocument.presentationml.presentation");
+ }
+
+ /**
+ * @see org.apache.poi.xslf.extractor.XSLFPowerPointExtractor#getText()
+ */
+ @Override
+ protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
+ XmlException, IOException {
+ XSLFSlideShow slideShow = (XSLFSlideShow) extractor.getDocument();
+ XMLSlideShow xmlSlideShow = new XMLSlideShow(slideShow);
+
+ XSLFSlide[] slides = xmlSlideShow.getSlides();
+ for (XSLFSlide slide : slides) {
+ CTSlide rawSlide = slide._getCTSlide();
+ CTSlideIdListEntry slideId = slide._getCTSlideId();
+
+ CTNotesSlide notes = xmlSlideShow._getXSLFSlideShow().getNotes(
+ slideId);
+ CTCommentList comments = xmlSlideShow._getXSLFSlideShow()
+ .getSlideComments(slideId);
+
+ xhtml.startElement("div");
+ extractShapeContent(rawSlide.getCSld().getSpTree(), xhtml);
+
+ if (comments != null) {
+ for (CTComment comment : comments.getCmArray()) {
+ xhtml.element("p", comment.getText());
+ }
+ }
+
+ if (notes != null) {
+ extractShapeContent(notes.getCSld().getSpTree(), xhtml);
+ }
+ xhtml.endElement("div");
+ }
+ }
+
+ private void extractShapeContent(CTGroupShape gs, XHTMLContentHandler xhtml)
+ throws SAXException {
+ CTShape[] shapes = gs.getSpArray();
+ for (CTShape shape : shapes) {
+ CTTextBody textBody = shape.getTxBody();
+ if (textBody != null) {
+ CTTextParagraph[] paras = textBody.getPArray();
+ for (CTTextParagraph textParagraph : paras) {
+ CTRegularTextRun[] textRuns = textParagraph.getRArray();
+ for (CTRegularTextRun textRun : textRuns) {
+ xhtml.element("p", textRun.getT());
+ }
+ }
+ }
+ }
+ }
+}
Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=991955&r1=991954&r2=991955&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java Thu Sep 2 14:40:55 2010
@@ -1,162 +1,162 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft.ooxml;
-
-import java.io.IOException;
-import java.util.Iterator;
-import java.util.Locale;
-
-import org.apache.poi.hssf.extractor.ExcelExtractor;
-import org.apache.poi.ss.usermodel.Cell;
-import org.apache.poi.ss.usermodel.CellStyle;
-import org.apache.poi.ss.usermodel.Comment;
-import org.apache.poi.ss.usermodel.DataFormatter;
-import org.apache.poi.ss.usermodel.HeaderFooter;
-import org.apache.poi.ss.usermodel.Row;
-import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
-import org.apache.poi.xssf.usermodel.XSSFCell;
-import org.apache.poi.xssf.usermodel.XSSFSheet;
-import org.apache.poi.xssf.usermodel.XSSFWorkbook;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaMetadataKeys;
-import org.apache.tika.exception.TikaException;
-import org.apache.xmlbeans.XmlException;
-import org.xml.sax.SAXException;
-
-public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
-
- /**
- * Internal <code>DataFormatter</code> for formatting Numbers.
- */
- private final DataFormatter formatter;
-
- private final XSSFExcelExtractor extractor;
- private static final String TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
-
- public XSSFExcelExtractorDecorator(
- XSSFExcelExtractor extractor, Locale locale) {
- super(extractor, TYPE);
-
- this.extractor = extractor;
- formatter = new DataFormatter(locale);
- }
-
- /**
- * @see org.apache.poi.xssf.extractor.XSSFExcelExtractor#getText()
- */
- @Override
- protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
- XmlException, IOException {
- XSSFWorkbook document = (XSSFWorkbook) extractor.getDocument();
-
- for (int i = 0; i < document.getNumberOfSheets(); i++) {
- xhtml.startElement("div");
- XSSFSheet sheet = (XSSFSheet) document.getSheetAt(i);
- xhtml.element("h1", document.getSheetName(i));
-
- // Header(s), if present
- extractHeaderFooter(sheet.getFirstHeader(), xhtml);
- extractHeaderFooter(sheet.getOddHeader(), xhtml);
- extractHeaderFooter(sheet.getEvenHeader(), xhtml);
-
- xhtml.startElement("table");
- xhtml.startElement("tbody");
-
- // Rows and cells
- for (Object rawR : sheet) {
- xhtml.startElement("tr");
- Row row = (Row) rawR;
- for (Iterator<Cell> ri = row.cellIterator(); ri.hasNext();) {
- xhtml.startElement("td");
- Cell cell = ri.next();
-
- int type = cell.getCellType();
- if (type == Cell.CELL_TYPE_FORMULA) {
- type = cell.getCachedFormulaResultType();
- }
- if (type == Cell.CELL_TYPE_STRING) {
- xhtml.characters(cell.getRichStringCellValue()
- .getString());
- } else if (type == Cell.CELL_TYPE_NUMERIC) {
- CellStyle style = cell.getCellStyle();
- xhtml.characters(
- formatter.formatRawCellContents(cell.getNumericCellValue(),
- style.getDataFormat(),
- style.getDataFormatString()));
- } else {
- XSSFCell xc = (XSSFCell) cell;
- String rawValue = xc.getRawValue();
- if (rawValue != null) {
- xhtml.characters(rawValue);
- }
-
- }
-
- // Output the comment in the same cell as the content
- Comment comment = cell.getCellComment();
- if (comment != null) {
- xhtml.characters(comment.getString().getString());
- }
-
- xhtml.endElement("td");
- }
- xhtml.endElement("tr");
- }
-
- xhtml.endElement("tbody");
- xhtml.endElement("table");
-
- // Finally footer(s), if present
- extractHeaderFooter(sheet.getFirstFooter(), xhtml);
- extractHeaderFooter(sheet.getOddFooter(), xhtml);
- extractHeaderFooter(sheet.getEvenFooter(), xhtml);
-
- xhtml.endElement("div");
- }
- }
-
- private void extractHeaderFooter(HeaderFooter hf, XHTMLContentHandler xhtml)
- throws SAXException {
- String content = ExcelExtractor._extractHeaderFooter(hf);
- if (content.length() > 0) {
- xhtml.element("p", content);
- }
- }
-
- @Override
- public MetadataExtractor getMetadataExtractor() {
- return new MetadataExtractor(extractor, TYPE) {
- @Override
- public void extract(Metadata metadata) throws TikaException {
- super.extract(metadata);
-
- metadata.set(TikaMetadataKeys.PROTECTED, "false");
-
- XSSFWorkbook document = (XSSFWorkbook) extractor.getDocument();
-
- for (int i = 0; i < document.getNumberOfSheets(); i++) {
- XSSFSheet sheet = document.getSheetAt(i);
-
- if (sheet.getProtect()) {
- metadata.set(TikaMetadataKeys.PROTECTED, "true");
- }
- }
- }
- };
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.Locale;
+
+import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.CellStyle;
+import org.apache.poi.ss.usermodel.Comment;
+import org.apache.poi.ss.usermodel.DataFormatter;
+import org.apache.poi.ss.usermodel.HeaderFooter;
+import org.apache.poi.ss.usermodel.Row;
+import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
+import org.apache.poi.xssf.usermodel.XSSFCell;
+import org.apache.poi.xssf.usermodel.XSSFSheet;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.exception.TikaException;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.SAXException;
+
+public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
+
+ /**
+ * Internal <code>DataFormatter</code> for formatting Numbers.
+ */
+ private final DataFormatter formatter;
+
+ private final XSSFExcelExtractor extractor;
+ private static final String TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
+
+ public XSSFExcelExtractorDecorator(
+ XSSFExcelExtractor extractor, Locale locale) {
+ super(extractor, TYPE);
+
+ this.extractor = extractor;
+ formatter = new DataFormatter(locale);
+ }
+
+ /**
+ * @see org.apache.poi.xssf.extractor.XSSFExcelExtractor#getText()
+ */
+ @Override
+ protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
+ XmlException, IOException {
+ XSSFWorkbook document = (XSSFWorkbook) extractor.getDocument();
+
+ for (int i = 0; i < document.getNumberOfSheets(); i++) {
+ xhtml.startElement("div");
+ XSSFSheet sheet = (XSSFSheet) document.getSheetAt(i);
+ xhtml.element("h1", document.getSheetName(i));
+
+ // Header(s), if present
+ extractHeaderFooter(sheet.getFirstHeader(), xhtml);
+ extractHeaderFooter(sheet.getOddHeader(), xhtml);
+ extractHeaderFooter(sheet.getEvenHeader(), xhtml);
+
+ xhtml.startElement("table");
+ xhtml.startElement("tbody");
+
+ // Rows and cells
+ for (Object rawR : sheet) {
+ xhtml.startElement("tr");
+ Row row = (Row) rawR;
+ for (Iterator<Cell> ri = row.cellIterator(); ri.hasNext();) {
+ xhtml.startElement("td");
+ Cell cell = ri.next();
+
+ int type = cell.getCellType();
+ if (type == Cell.CELL_TYPE_FORMULA) {
+ type = cell.getCachedFormulaResultType();
+ }
+ if (type == Cell.CELL_TYPE_STRING) {
+ xhtml.characters(cell.getRichStringCellValue()
+ .getString());
+ } else if (type == Cell.CELL_TYPE_NUMERIC) {
+ CellStyle style = cell.getCellStyle();
+ xhtml.characters(
+ formatter.formatRawCellContents(cell.getNumericCellValue(),
+ style.getDataFormat(),
+ style.getDataFormatString()));
+ } else {
+ XSSFCell xc = (XSSFCell) cell;
+ String rawValue = xc.getRawValue();
+ if (rawValue != null) {
+ xhtml.characters(rawValue);
+ }
+
+ }
+
+ // Output the comment in the same cell as the content
+ Comment comment = cell.getCellComment();
+ if (comment != null) {
+ xhtml.characters(comment.getString().getString());
+ }
+
+ xhtml.endElement("td");
+ }
+ xhtml.endElement("tr");
+ }
+
+ xhtml.endElement("tbody");
+ xhtml.endElement("table");
+
+ // Finally footer(s), if present
+ extractHeaderFooter(sheet.getFirstFooter(), xhtml);
+ extractHeaderFooter(sheet.getOddFooter(), xhtml);
+ extractHeaderFooter(sheet.getEvenFooter(), xhtml);
+
+ xhtml.endElement("div");
+ }
+ }
+
+ private void extractHeaderFooter(HeaderFooter hf, XHTMLContentHandler xhtml)
+ throws SAXException {
+ String content = ExcelExtractor._extractHeaderFooter(hf);
+ if (content.length() > 0) {
+ xhtml.element("p", content);
+ }
+ }
+
+ @Override
+ public MetadataExtractor getMetadataExtractor() {
+ return new MetadataExtractor(extractor, TYPE) {
+ @Override
+ public void extract(Metadata metadata) throws TikaException {
+ super.extract(metadata);
+
+ metadata.set(TikaMetadataKeys.PROTECTED, "false");
+
+ XSSFWorkbook document = (XSSFWorkbook) extractor.getDocument();
+
+ for (int i = 0; i < document.getNumberOfSheets(); i++) {
+ XSSFSheet sheet = document.getSheetAt(i);
+
+ if (sheet.getProtect()) {
+ metadata.set(TikaMetadataKeys.PROTECTED, "true");
+ }
+ }
+ }
+ };
+ }
+}
Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
------------------------------------------------------------------------------
svn:eol-style = native