You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/11/30 21:30:36 UTC
[6/7] tika git commit: TIKA-1321 -- add SAX based docx parser and
integrate it with the recent 2006ml parser work -- initial commit
TIKA-1321 -- add SAX based docx parser and integrate it with the recent 2006ml parser work -- initial commit
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d19e4725
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d19e4725
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d19e4725
Branch: refs/heads/master
Commit: d19e4725ff0549597f9156bb0c1e7759f6ce08d9
Parents: 7b4f6fa
Author: tballison <ta...@mitre.org>
Authored: Wed Nov 30 15:24:57 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Wed Nov 30 15:24:57 2016 -0500
----------------------------------------------------------------------
CHANGES.txt | 3 +
.../java/org/apache/tika/utils/DateUtils.java | 72 +-
.../parser/microsoft/AbstractOfficeParser.java | 69 +
.../parser/microsoft/MSOfficeParserConfig.java | 38 -
.../tika/parser/microsoft/OfficeParser.java | 5 +-
.../parser/microsoft/OfficeParserConfig.java | 82 +
.../microsoft/ooxml/MetadataExtractor.java | 4 +-
.../microsoft/ooxml/OOXMLExtractorFactory.java | 36 +-
.../parser/microsoft/ooxml/OOXMLParser.java | 6 +-
.../ooxml/SXWPFWordExtractorDecorator.java | 222 ++
.../parser/microsoft/ooxml/XWPFListManager.java | 25 +-
.../ooxml/XWPFWordExtractorDecorator.java | 2 +-
.../microsoft/ooxml/xwpf/BinaryDataHandler.java | 120 -
.../ooxml/xwpf/BodyContentHandler.java | 271 --
.../ooxml/xwpf/CorePropertiesHandler.java | 144 -
.../ooxml/xwpf/ExtendedPropertiesHandler.java | 67 -
.../microsoft/ooxml/xwpf/PartHandler.java | 43 -
.../microsoft/ooxml/xwpf/Relationship.java | 52 -
.../ooxml/xwpf/RelationshipsHandler.java | 86 -
.../ooxml/xwpf/RelationshipsManager.java | 58 -
.../microsoft/ooxml/xwpf/Word2006MLHandler.java | 168 -
.../microsoft/ooxml/xwpf/Word2006MLParser.java | 67 -
.../ooxml/xwpf/XWPFDocumentXMLBodyHandler.java | 318 ++
.../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 353 ++
.../microsoft/ooxml/xwpf/XWPFRunProperties.java | 44 +
.../ooxml/xwpf/XWPFTikaBodyPartHandler.java | 224 ++
.../ooxml/xwpf/ml2006/AbstractPartHandler.java | 43 +
.../ooxml/xwpf/ml2006/BinaryDataHandler.java | 120 +
.../ooxml/xwpf/ml2006/BodyPartHandler.java | 64 +
.../xwpf/ml2006/CorePropertiesHandler.java | 144 +
.../xwpf/ml2006/ExtendedPropertiesHandler.java | 67 +
.../ooxml/xwpf/ml2006/PartHandler.java | 34 +
.../ooxml/xwpf/ml2006/Relationship.java | 52 +
.../ooxml/xwpf/ml2006/RelationshipsHandler.java | 86 +
.../ooxml/xwpf/ml2006/RelationshipsManager.java | 58 +
.../ooxml/xwpf/ml2006/Word2006MLDocHandler.java | 171 +
.../ooxml/xwpf/ml2006/Word2006MLParser.java | 71 +
.../services/org.apache.tika.parser.Parser | 2 +-
.../parser/microsoft/ooxml/OOXMLParserTest.java | 31 +
.../ooxml/xwpf/SXWPFExtractorTest.java | 187 +
.../ooxml/xwpf/Word2006MLParserTest.java | 182 -
.../ooxml/xwpf/ml2006/Word2006MLParserTest.java | 171 +
.../test-documents/testWORD_2003ml.xml | 2058 +++++++---
.../test-documents/testWORD_2006ml.docx | Bin 0 -> 165566 bytes
.../test-documents/testWORD_2006ml.xml | 3621 +++++++++++++++---
.../test-documents/testWORD_2006ml_src.docx | Bin 99960 -> 0 bytes
46 files changed, 7350 insertions(+), 2391 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/d19e4725/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 17128b7..2462e1a 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 1.15 - ??
+ * Added experimental SAX parser for .docx files. To select this parser,
+ call setUseSAXDocxExtractor(true) on OfficeParserConfig (TIKA-1321).
+
* Change default behavior to parse embedded documents even if the user
forgets to specify a Parser.class in the ParseContext (TIKA-2096).
Users who wish to parse only the container document should set
http://git-wip-us.apache.org/repos/asf/tika/blob/d19e4725/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java b/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java
index 6b764c8..6ccc74e 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java
@@ -16,9 +16,14 @@
*/
package org.apache.tika.utils;
+import java.text.DateFormat;
+import java.text.DateFormatSymbols;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.GregorianCalendar;
+import java.util.List;
import java.util.Locale;
import java.util.TimeZone;
@@ -26,6 +31,7 @@ import java.util.TimeZone;
* Date related utility methods and constants
*/
public class DateUtils {
+
/**
* The UTC time zone. Not sure if {@link TimeZone#getTimeZone(String)}
* understands "UTC" in all environments, but it'll fall back to GMT
@@ -44,8 +50,49 @@ public class DateUtils {
*/
public static final TimeZone MIDDAY = TimeZone.getTimeZone("GMT-12:00");
+    /**
+     * Creates a thread-local holder for a date format with the given pattern.
+     * <p>
+     * {@link SimpleDateFormat} is not safe for concurrent use, so the format is
+     * built inside {@link ThreadLocal#initialValue()}: every thread that calls
+     * <code>get()</code> receives its own instance instead of sharing one.
+     *
+     * @param format   the {@link SimpleDateFormat} pattern
+     * @param timezone the time zone to apply, or <code>null</code> to leave the
+     *                 format's time zone as constructed
+     * @return a holder that lazily creates one format per thread
+     */
+    private static ThreadLocal<DateFormat> createDateFormat(final String format, final TimeZone timezone) {
+        return new ThreadLocal<DateFormat>() {
+            @Override
+            public DateFormat initialValue() {
+                // Construct here rather than outside the ThreadLocal: an instance captured
+                // from the enclosing method would be handed to every thread.
+                SimpleDateFormat sdf =
+                        new SimpleDateFormat(format, new DateFormatSymbols(Locale.US));
+                if (timezone != null) {
+                    sdf.setTimeZone(timezone);
+                }
+                return sdf;
+            }
+        };
+    }
+
+ /**
+ * Some parsers will have the date as a ISO-8601 string
+ * already, and will set that into the Metadata object.
+ * So we can return Date objects for these, this is the
+ * list (in preference order) of the various ISO-8601
+ * variants that we try when processing a date based
+ * property.
+ */
+ private static final List<ThreadLocal<DateFormat>> ISO_8601_INPUT_FORMATS = loadDateFormats();
+
+    /**
+     * Assembles the input formats, most preferred first: date-time variants
+     * with a 'T' separator, the same variants with a space separator, and
+     * finally the two date-only patterns.
+     *
+     * @return the formats in the order in which they should be tried
+     */
+    private static List<ThreadLocal<DateFormat>> loadDateFormats() {
+        final List<ThreadLocal<DateFormat>> formats = new ArrayList<>();
+
+        // 'T' separator: literal Zulu suffix, numeric zone offset, then no zone at all
+        formats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", UTC));
+        formats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", null));
+        formats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ss", null));
+
+        // space separator: the same three variants
+        formats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ss'Z'", UTC));
+        formats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ss", null) == null ? null : createDateFormat("yyyy-MM-dd' 'HH:mm:ssZ", null));
+        formats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ss", null));
+
+        // no time component: parsed in the MIDDAY zone (GMT-12:00), i.e. midday UTC
+        formats.add(createDateFormat("yyyy-MM-dd", MIDDAY)); // plain date
+        formats.add(createDateFormat("yyyy:MM:dd", MIDDAY)); // image (IPTC/EXIF) date
+
+        return formats;
+    }
+
/**
- * Returns a ISO 8601 representation of the given date. This method
+ * Returns a ISO 8601 representation of the given date. This method
* is thread safe and non-blocking.
*
* @see <a href="https://issues.apache.org/jira/browse/TIKA-495">TIKA-495</a>
@@ -58,7 +105,7 @@ public class DateUtils {
return doFormatDate(calendar);
}
/**
- * Returns a ISO 8601 representation of the given date. This method
+ * Returns a ISO 8601 representation of the given date. This method
* is thread safe and non-blocking.
*
* @see <a href="https://issues.apache.org/jira/browse/TIKA-495">TIKA-495</a>
@@ -66,7 +113,7 @@ public class DateUtils {
* @return ISO 8601 date string, including timezone details
*/
public static String formatDate(Calendar date) {
- // Explicitly switch it into UTC before formatting
+ // Explicitly switch it into UTC before formatting
date.setTimeZone(UTC);
return doFormatDate(date);
}
@@ -98,4 +145,23 @@ public class DateUtils {
calendar.get(Calendar.MINUTE),
calendar.get(Calendar.SECOND));
}
+
+    /**
+     * Tries to parse the date string against each of the known input formats,
+     * in preference order, and returns the result of the first one that succeeds.
+     * <p>
+     * The formats are obtained through thread-local holders; this method is
+     * thread safe provided each holder yields a distinct format per thread.
+     * <p>
+     * Note that {@link DateFormat#parse(String)} parses from the start of the
+     * string and may not consume all of it.
+     *
+     * @param dateString the text to parse; may be <code>null</code>
+     * @return the parsed date, or <code>null</code> if the input is
+     *         <code>null</code> or matches none of the formats
+     */
+    public static Date tryToParse(String dateString) {
+        if (dateString == null) {
+            // DateFormat.parse throws NullPointerException on null; honor the
+            // "returns null if no parse was possible" contract instead
+            return null;
+        }
+        for (ThreadLocal<DateFormat> df : ISO_8601_INPUT_FORMATS) {
+            try {
+                return df.get().parse(dateString);
+            } catch (java.text.ParseException ignored) {
+                // this variant does not match; try the next one
+            }
+        }
+        return null;
+    }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/d19e4725/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
new file mode 100644
index 0000000..d8186bc
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * Intermediate layer to set {@link OfficeParserConfig} uniformly.
+ * <p>
+ * Each parser instance owns a default {@link OfficeParserConfig}. The setters
+ * on this class modify that default, and {@link #configure(ParseContext)}
+ * places it in the {@link ParseContext} when the caller has not supplied
+ * a config.
+ */
+public abstract class AbstractOfficeParser extends AbstractParser {
+
+ private final OfficeParserConfig defaultOfficeParserConfig = new OfficeParserConfig();
+
+ /**
+ * Checks to see if the user has specified an {@link OfficeParserConfig}.
+ * If so, no changes are made; if not, this parser's default config is
+ * added to the context.
+ *
+ * @param parseContext the context to look the config up in; whichever config
+ * was selected is written back to it
+ */
+ public void configure(ParseContext parseContext) {
+ OfficeParserConfig officeParserConfig = parseContext.get(OfficeParserConfig.class, defaultOfficeParserConfig);
+ parseContext.set(OfficeParserConfig.class, officeParserConfig);
+ }
+
+ /**
+ * Reads the setting from this parser's default config; a config supplied
+ * through a {@link ParseContext} is not consulted here.
+ *
+ * @see OfficeParserConfig#getIncludeDeletedContent
+ *
+ * @return the default config's "include deleted content" setting
+ */
+ public boolean getIncludeDeletedContent() {
+ return defaultOfficeParserConfig.getIncludeDeletedContent();
+ }
+
+ /**
+ * Reads the setting from this parser's default config; a config supplied
+ * through a {@link ParseContext} is not consulted here.
+ *
+ * @see OfficeParserConfig#getIncludeMoveFromContent()
+ *
+ * @return the default config's "include moveFrom content" setting
+ */
+
+ public boolean getIncludeMoveFromContent() {
+ return defaultOfficeParserConfig.getIncludeMoveFromContent();
+ }
+
+ /**
+ * Reads the setting from this parser's default config; a config supplied
+ * through a {@link ParseContext} is not consulted here.
+ *
+ * @see OfficeParserConfig#getUseSAXDocxExtractor()
+ *
+ * @return the default config's "use SAX docx extractor" setting
+ */
+ public boolean getUseSAXDocxExtractor() {
+ return defaultOfficeParserConfig.getUseSAXDocxExtractor();
+ }
+
+
+ /**
+ * Updates this parser's default config.
+ * NOTE(review): the {@link Field} annotation presumably exposes this setter
+ * to Tika's configuration mechanism -- confirm.
+ *
+ * @see OfficeParserConfig#setIncludeDeletedContent(boolean)
+ */
+ @Field
+ public void setIncludeDeletedContent(boolean includeDeletedConent) {
+ defaultOfficeParserConfig.setIncludeDeletedContent(includeDeletedConent);
+ }
+
+ /**
+ * Updates this parser's default config.
+ *
+ * @see OfficeParserConfig#setIncludeMoveFromContent(boolean)
+ */
+ @Field
+ public void setIncludeMoveFromContent(boolean includeMoveFromContent) {
+ defaultOfficeParserConfig.setIncludeMoveFromContent(includeMoveFromContent);
+ }
+
+ /**
+ * Updates this parser's default config.
+ *
+ * @see OfficeParserConfig#setUseSAXDocxExtractor(boolean)
+ */
+ @Field
+ public void setUseSAXDocxExtractor(boolean useSAXDocxExtractor) {
+ defaultOfficeParserConfig.setUseSAXDocxExtractor(useSAXDocxExtractor);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/d19e4725/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOfficeParserConfig.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOfficeParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOfficeParserConfig.java
deleted file mode 100644
index 8f8086a..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOfficeParserConfig.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-
-public class MSOfficeParserConfig {
-
- private boolean includeDeletedContent = true;
-
- /**
- * Sets whether or not the parser should include deleted content.
- * <b>This has not been implemented in all MSOffice parsers yet!!!</b>
- * @param includeDeletedContent
- */
- public void setIncludeDeletedContent(boolean includeDeletedContent) {
- this.includeDeletedContent = includeDeletedContent;
- }
-
- public boolean getIncludeDeletedContent() {
- return includeDeletedContent;
- }
-}
-
-
http://git-wip-us.apache.org/repos/asf/tika/blob/d19e4725/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index 5218dfa..7e21ba8 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -48,7 +48,6 @@ import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
@@ -61,7 +60,7 @@ import org.xml.sax.SAXException;
/**
* Defines a Microsoft document content extractor.
*/
-public class OfficeParser extends AbstractParser {
+public class OfficeParser extends AbstractOfficeParser {
/**
* Serial version UID
@@ -98,6 +97,8 @@ public class OfficeParser extends AbstractParser {
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
+
+ configure(context);
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
http://git-wip-us.apache.org/repos/asf/tika/blob/d19e4725/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
new file mode 100644
index 0000000..55f4673
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+
+import java.io.Serializable;
+
+/**
+ * Options for the Microsoft Office parsers. An instance holds three flags:
+ * whether to include deleted content, whether to include "moveFrom" content,
+ * and whether to use the experimental SAX-based docx extractor.
+ */
+public class OfficeParserConfig implements Serializable {
+
+ // defaults: deleted content is included, moveFrom content is not
+ private boolean includeDeletedContent = true;
+ private boolean includeMoveFromContent = false;
+
+ // default: the classic (non-SAX) docx extractor
+ private boolean useSAXDocxExtractor = false;
+
+ /**
+ * Sets whether or not the parser should include deleted content.
+ * <p>
+ * Default: <code>true</code>
+ * <p>
+ * <b>This has only been implemented in the streaming docx parser
+ * ({@link org.apache.tika.parser.microsoft.ooxml.SXWPFWordExtractorDecorator}) so far!!!</b>
+ * @param includeDeletedContent <code>true</code> to include deleted content
+ */
+ public void setIncludeDeletedContent(boolean includeDeletedContent) {
+ this.includeDeletedContent = includeDeletedContent;
+ }
+
+ /**
+ * @return whether deleted content should be included
+ * @see #setIncludeDeletedContent(boolean)
+ */
+ public boolean getIncludeDeletedContent() {
+ return includeDeletedContent;
+ }
+
+ /**
+ * With track changes on, when a section is moved, the content
+ * is stored in both the "moveFrom" section and in the "moveTo" section.
+ * <p>
+ * If you'd like to include the section both in its original location (moveFrom)
+ * and in its new location (moveTo), set this to <code>true</code>
+ * <p>
+ * Default: <code>false</code>
+ * <p>
+ * <b>This has only been implemented in the streaming docx parser
+ * ({@link org.apache.tika.parser.microsoft.ooxml.SXWPFWordExtractorDecorator}) so far!!!</b>
+ * @param includeMoveFromContent <code>true</code> to also include the content
+ * at its original (moveFrom) location
+ */
+ public void setIncludeMoveFromContent(boolean includeMoveFromContent) {
+ this.includeMoveFromContent = includeMoveFromContent;
+ }
+
+ /**
+ * @return whether "moveFrom" content should be included
+ * @see #setIncludeMoveFromContent(boolean)
+ */
+ public boolean getIncludeMoveFromContent() {
+ return includeMoveFromContent;
+ }
+
+ /**
+ * @return whether the experimental SAX-based docx extractor should be used
+ * @see #setUseSAXDocxExtractor(boolean)
+ */
+ public boolean getUseSAXDocxExtractor() {
+ return useSAXDocxExtractor;
+ }
+
+ /**
+ * Use the experimental SAX-based streaming DOCX parser?
+ * If set to <code>false</code>, the classic parser will be used; if <code>true</code>,
+ * the new experimental parser will be used.
+ * <p>
+ * Default: classic parser
+ * @param useSAXDocxExtractor <code>true</code> to select the experimental parser
+ */
+ public void setUseSAXDocxExtractor(boolean useSAXDocxExtractor) {
+ this.useSAXDocxExtractor = useSAXDocxExtractor;
+ }
+}
+
+
http://git-wip-us.apache.org/repos/asf/tika/blob/d19e4725/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
index 91d49c7..d392346 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
@@ -36,6 +36,7 @@ import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.microsoft.SummaryExtractor;
+import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
import org.apache.xmlbeans.impl.values.XmlValueOutOfRangeException;
import org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperty;
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
@@ -57,7 +58,8 @@ public class MetadataExtractor {
public void extract(Metadata metadata) throws TikaException {
if (extractor.getDocument() != null ||
- (extractor instanceof XSSFEventBasedExcelExtractor &&
+ ((extractor instanceof XSSFEventBasedExcelExtractor ||
+ extractor instanceof XWPFEventBasedWordExtractor) &&
extractor.getPackage() != null)) {
extractMetadata(extractor.getCoreProperties(), metadata);
extractMetadata(extractor.getExtendedProperties(), metadata);
http://git-wip-us.apache.org/repos/asf/tika/blob/d19e4725/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index e2c7717..bbee6b7 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -28,17 +28,22 @@ import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
+import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
import org.apache.tika.parser.pkg.ZipContainerDetector;
import org.apache.xmlbeans.XmlException;
import org.xml.sax.ContentHandler;
@@ -83,12 +88,22 @@ public class OOXMLExtractorFactory {
metadata.set(Metadata.CONTENT_TYPE, type.toString());
// Have the appropriate OOXML text extractor picked
- POIXMLTextExtractor poiExtractor = ExtractorFactory.createExtractor(pkg);
+ POIXMLTextExtractor poiExtractor = null;
+ OfficeParserConfig config = context.get(OfficeParserConfig.class, new OfficeParserConfig());
+ if (config.getUseSAXDocxExtractor()) {
+ poiExtractor = trySXWPF(pkg);
+ }
+ if (poiExtractor == null) {
+ poiExtractor = ExtractorFactory.createExtractor(pkg);
+ }
POIXMLDocument document = poiExtractor.getDocument();
if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
extractor = new XSSFExcelExtractorDecorator(
context, (XSSFEventBasedExcelExtractor) poiExtractor, locale);
+ } else if (poiExtractor instanceof XWPFEventBasedWordExtractor) {
+ extractor = new SXWPFWordExtractorDecorator(context,
+ (XWPFEventBasedWordExtractor)poiExtractor);
} else if (document == null) {
throw new TikaException(
"Expecting UserModel based POI OOXML extractor with a document, but none found. " +
@@ -129,4 +144,23 @@ public class OOXMLExtractorFactory {
}
}
+    /**
+     * Returns the event-based (SAX) Word extractor if the package's core
+     * document part has one of the content types that
+     * {@link XWPFWordExtractor#SUPPORTED_TYPES} lists.
+     *
+     * @param pkg the opened OOXML package
+     * @return a new {@link XWPFEventBasedWordExtractor}, or <code>null</code> if
+     *         the package has no resolvable core document part or the part is
+     *         not a supported Word type; the caller then falls back to the
+     *         classic extractor factory
+     */
+    private static POIXMLTextExtractor trySXWPF(OPCPackage pkg) throws XmlException, OpenXML4JException, IOException {
+        PackageRelationshipCollection packageRelationshipCollection = pkg.getRelationshipsByType("http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument");
+        if (packageRelationshipCollection.size() == 0) {
+            // second chance: the purl.oclc.org form of the same relationship type
+            packageRelationshipCollection = pkg.getRelationshipsByType("http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument");
+        }
+
+        if (packageRelationshipCollection.size() == 0) {
+            return null;
+        }
+        PackagePart corePart = pkg.getPart(packageRelationshipCollection.getRelationship(0));
+        if (corePart == null) {
+            // the relationship points at a part that is not in the package; the result of
+            // getPart is null-checked the same way in SXWPFWordExtractorDecorator#loadNumbering
+            return null;
+        }
+        String targetContentType = corePart.getContentType();
+        if (targetContentType == null) {
+            return null;
+        }
+        for (XWPFRelation relation : XWPFWordExtractor.SUPPORTED_TYPES) {
+            if (targetContentType.equals(relation.getContentType())) {
+                return new XWPFEventBasedWordExtractor(pkg);
+            }
+        }
+        return null;
+    }
+
}
http://git-wip-us.apache.org/repos/asf/tika/blob/d19e4725/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
index 22f2cac..a6d0b34 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
@@ -27,15 +27,15 @@ import org.apache.poi.openxml4j.util.ZipSecureFile;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.AbstractOfficeParser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
* Office Open XML (OOXML) parser.
*/
-public class OOXMLParser extends AbstractParser {
+public class OOXMLParser extends AbstractOfficeParser {
static {
//turn off POI's zip bomb detection because we have our own
ZipSecureFile.setMinInflateRatio(-1.0d);
@@ -83,6 +83,8 @@ public class OOXMLParser extends AbstractParser {
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
+ //set OfficeParserConfig if the user hasn't specified one
+ configure(context);
// Have the OOXML file processed
OOXMLExtractorFactory.parse(stream, handler, metadata, context);
}
http://git-wip-us.apache.org/repos/asf/tika/blob/d19e4725/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
new file mode 100644
index 0000000..e08dab1
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -0,0 +1,222 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.xwpf.usermodel.XWPFNumbering;
+import org.apache.poi.xwpf.usermodel.XWPFRelation;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
+import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFDocumentXMLBodyHandler;
+import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
+import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFTikaBodyPartHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.SAXException;
+
+/**
+ * This is an experimental, alternative extractor for docx files.
+ * This streams the main document content rather than loading the
+ * full document into memory.
+ * <p>
+ * This will be better for some use cases than the classic docx extractor; and,
+ * it will be worse for others.
+ * </p>
+ *
+ * @since 1.15
+ */
+public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
+
+
+ private final OPCPackage opcPackage;
+ private final ParseContext context;
+
+    /**
+     * Wraps the event-based Word extractor, keeping hold of the parse context
+     * and of the extractor's package for the part-by-part processing done later.
+     *
+     * @param context   the parse context
+     * @param extractor the event-based extractor whose package will be read
+     */
+    public SXWPFWordExtractorDecorator(ParseContext context,
+                                       XWPFEventBasedWordExtractor extractor) {
+        super(context, extractor);
+        this.opcPackage = extractor.getPackage();
+        this.context = context;
+    }
+
+
+    /**
+     * Emits every part with the main-document content type, followed by every
+     * part with the glossary-document content type.
+     */
+    @Override
+    protected void buildXHTML(XHTMLContentHandler xhtml)
+            throws SAXException, XmlException, IOException {
+        //main document first, then the glossary document
+        XWPFRelation[] documentKinds = {XWPFRelation.DOCUMENT, XWPFRelation.GLOSSARY_DOCUMENT};
+        for (XWPFRelation kind : documentKinds) {
+            List<PackagePart> parts = opcPackage.getPartsByContentType(kind.getContentType());
+            if (parts == null) {
+                continue;
+            }
+            //likely only one of each kind, but handle however many there are
+            for (PackagePart part : parts) {
+                handleDocumentPart(part, xhtml);
+            }
+        }
+    }
+
+    /**
+     * Emits one document part together with the parts related to it, in this
+     * order: headers, the part itself, then footnotes, comments, footers and
+     * endnotes.
+     */
+    private void handleDocumentPart(PackagePart documentPart, XHTMLContentHandler xhtml) throws IOException, SAXException {
+        //a single list manager, built from this part's numbering, serves all related parts
+        XWPFListManager listManager = new XWPFListManager(loadNumbering(documentPart));
+        //TODO: XWPFStyles styles = loadStyles(documentPart);
+
+        //headers
+        handleRelatedParts(documentPart, XWPFRelation.HEADER, listManager, xhtml);
+
+        //main document
+        handlePart(documentPart, listManager, xhtml);
+
+        //for now, just dump other components at end
+        XWPFRelation[] trailingRelations = {
+                XWPFRelation.FOOTNOTE,
+                XWPFRelation.COMMENT,
+                XWPFRelation.FOOTER,
+                XWPFRelation.ENDNOTE
+        };
+        for (XWPFRelation relation : trailingRelations) {
+            handleRelatedParts(documentPart, relation, listManager, xhtml);
+        }
+    }
+
+    /**
+     * Handles, in relationship order, every part that the document part
+     * references through the given relation. An InvalidFormatException ends
+     * the processing of this relation only.
+     */
+    private void handleRelatedParts(PackagePart documentPart, XWPFRelation relation,
+                                    XWPFListManager listManager, XHTMLContentHandler xhtml)
+            throws IOException, SAXException {
+        try {
+            PackageRelationshipCollection related =
+                    documentPart.getRelationshipsByType(relation.getRelation());
+            if (related == null) {
+                return;
+            }
+            for (int idx = 0; idx < related.size(); idx++) {
+                PackagePart relatedPart = documentPart.getRelatedPart(related.getRelationship(idx));
+                handlePart(relatedPart, listManager, xhtml);
+            }
+        } catch (InvalidFormatException e) {
+            //swallow
+        }
+    }
+
+    /**
+     * Streams one package part through a SAX parser and writes the extracted
+     * body content to the XHTML handler.
+     *
+     * @param packagePart     the part to read
+     * @param xwpfListManager list manager used to format list numbers
+     * @param xhtml           destination for the extracted content
+     * @throws IOException  if the part cannot be read, or if a
+     *                      {@link TikaException} is raised while setting up or
+     *                      running the parse (kept as the cause)
+     * @throws SAXException if the part's XML cannot be parsed
+     */
+    private void handlePart(PackagePart packagePart,
+                            XWPFListManager xwpfListManager, XHTMLContentHandler xhtml) throws IOException, SAXException {
+
+        Map<String, String> hyperlinks = loadHyperlinkRelationships(packagePart);
+        //fall back to a default config if none is in the context, as OOXMLExtractorFactory does
+        OfficeParserConfig config = context.get(OfficeParserConfig.class, new OfficeParserConfig());
+        try (InputStream stream = packagePart.getInputStream()) {
+            context.getSAXParser().parse(
+                    new CloseShieldInputStream(stream),
+                    new OfflineContentHandler(new EmbeddedContentHandler(
+                            new XWPFDocumentXMLBodyHandler(
+                                    new XWPFTikaBodyPartHandler(xhtml, xwpfListManager, config),
+                                    hyperlinks))));
+        } catch (TikaException e) {
+            //report the failure to the caller instead of printing to stderr and
+            //silently dropping the part's content
+            throw new IOException("Unable to parse part " + packagePart.getPartName(), e);
+        }
+
+    }
+
+    /**
+     * Collects the hyperlink relationships of a part.
+     *
+     * @param bodyPart the part whose relationships are read
+     * @return relationship id mapped to target URI string; entries with a
+     *         missing id or target are skipped, and the map holds whatever was
+     *         collected before an InvalidFormatException, if one occurs
+     */
+    private Map<String, String> loadHyperlinkRelationships(PackagePart bodyPart) {
+        Map<String, String> idToTarget = new HashMap<>();
+        try {
+            PackageRelationshipCollection links =
+                    bodyPart.getRelationshipsByType(XWPFRelation.HYPERLINK.getRelation());
+            for (int idx = 0; idx < links.size(); idx++) {
+                PackageRelationship link = links.getRelationship(idx);
+                if (link != null) {
+                    String relationshipId = link.getId();
+                    String target = null;
+                    if (link.getTargetURI() != null) {
+                        target = link.getTargetURI().toString();
+                    }
+                    if (relationshipId != null && target != null) {
+                        idToTarget.put(relationshipId, target);
+                    }
+                }
+            }
+        } catch (InvalidFormatException ignored) {
+        }
+        return idToTarget;
+    }
+/*
+ private XWPFStyles loadStyles(PackagePart packagePart) {
+ try {
+ PackageRelationshipCollection stylesParts =
+ packagePart.getRelationshipsByType(XWPFRelation.STYLES.getRelation());
+ if (stylesParts.size() > 0) {
+ PackageRelationship stylesRelationShip = stylesParts.getRelationship(0);
+ if (stylesRelationShip == null) {
+ return null;
+ }
+ PackagePart stylesPart = opcPackage.getPart(stylesRelationShip);
+ if (stylesPart == null) {
+ return null;
+ }
+ return new XWPFStyles(stylesPart);
+ }
+ } catch (IOException|OpenXML4JException e) {
+ //swallow
+ }
+ return null;
+
+ }
+*/
+    /**
+     * Loads the numbering definitions referenced by the given part.
+     *
+     * @param packagePart the part whose first numbering relationship is followed
+     * @return the numbering, or <code>null</code> if the part has no numbering
+     *         relationship, the relationship or its target part is missing, or
+     *         loading fails
+     */
+    private XWPFNumbering loadNumbering(PackagePart packagePart) {
+        try {
+            PackageRelationshipCollection numberingRels =
+                    packagePart.getRelationshipsByType(XWPFRelation.NUMBERING.getRelation());
+            if (numberingRels.size() <= 0) {
+                return null;
+            }
+            //only the first numbering relationship is used
+            PackageRelationship firstRel = numberingRels.getRelationship(0);
+            if (firstRel == null) {
+                return null;
+            }
+            PackagePart target = opcPackage.getPart(firstRel);
+            return (target == null) ? null : new XWPFNumbering(target);
+        } catch (IOException | OpenXML4JException e) {
+            //swallow
+            return null;
+        }
+    }
+
+    /**
+     * Returns only the parts that carry the main-document content type.
+     */
+    @Override
+    protected List<PackagePart> getMainDocumentParts() {
+        String mainDocumentType = XWPFRelation.DOCUMENT.getContentType();
+        return opcPackage.getPartsByContentType(mainDocumentType);
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/d19e4725/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
index a938c2f..2a99126 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
@@ -16,8 +16,9 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
+import java.math.BigInteger;
+
import org.apache.poi.xwpf.usermodel.XWPFAbstractNum;
-import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFNum;
import org.apache.poi.xwpf.usermodel.XWPFNumbering;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
@@ -47,8 +48,8 @@ public class XWPFListManager extends AbstractListManager {
private final XWPFNumbering numbering;
//map of numId (which paragraph series is this a member of?), levelcounts
- public XWPFListManager(XWPFDocument document) {
- numbering = document.getNumbering();
+ public XWPFListManager(XWPFNumbering numbering) { // numbering may be null; getFormattedNumber then returns ""
+ this.numbering = numbering;
}
/**
@@ -57,12 +58,19 @@ public class XWPFListManager extends AbstractListManager {
* @return the formatted number or an empty string if something went wrong
*/
public String getFormattedNumber(final XWPFParagraph paragraph) {
- if (numbering == null) {
+ return getFormattedNumber(paragraph.getNumID(),
+ paragraph.getNumIlvl() == null ? -1 : paragraph.getNumIlvl().intValue());
+ }
+
+ public String getFormattedNumber(BigInteger numId, int iLvl) {
+ if (numbering == null || iLvl < 0 || numId == null) {
return "";
}
- int currNumId = paragraph.getNumID().intValue();
- XWPFNum xwpfNum = numbering.getNum(paragraph.getNumID());
+ int currNumId = numId.intValue();
+
+ XWPFNum xwpfNum = numbering.getNum(numId);
+
if (xwpfNum == null) {
return "";
}
@@ -79,14 +87,15 @@ public class XWPFListManager extends AbstractListManager {
overrideTuples = loadOverrideTuples(ctNum, lc.getNumberOfLevels());
}
- String formattedString = lc.incrementLevel(paragraph.getNumIlvl().intValue(), overrideTuples);
+ String formattedString = lc.incrementLevel(iLvl, overrideTuples);
listLevelMap.put(currAbNumId, lc);
overrideTupleMap.put(currNumId, overrideTuples);
return formattedString;
+
}
-
+
private LevelTuple[] loadOverrideTuples(CTNum ctNum, int length) {
LevelTuple[] levelTuples = new LevelTuple[length];
int overrideLength = ctNum.sizeOfLvlOverrideArray();
http://git-wip-us.apache.org/repos/asf/tika/blob/d19e4725/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
index da3a606..ccbf45e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
@@ -83,7 +83,7 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
protected void buildXHTML(XHTMLContentHandler xhtml)
throws SAXException, XmlException, IOException {
XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
- XWPFListManager listManager = new XWPFListManager(document);
+ XWPFListManager listManager = new XWPFListManager(document.getNumbering());
// headers
if (hfPolicy != null) {
extractHeaders(xhtml, hfPolicy, listManager);
http://git-wip-us.apache.org/repos/asf/tika/blob/d19e4725/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BinaryDataHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BinaryDataHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BinaryDataHandler.java
deleted file mode 100644
index c2177cf..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BinaryDataHandler.java
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.commons.codec.binary.Base64;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.EmbeddedDocumentUtil;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-
-class BinaryDataHandler extends PartHandler {
-
- private final XHTMLContentHandler handler;
- private final Metadata metadata;
- private final ParseContext parseContext;
-
- private boolean inBinaryData = false;
- private StringBuilder buffer = new StringBuilder();
-
- final Base64 base64 = new Base64();
-
-
- public BinaryDataHandler(XHTMLContentHandler handler, Metadata metadata, ParseContext context) {
- this.handler = handler;
- this.metadata = metadata;
- this.parseContext = context;
- }
-
-
- @Override
- public void startDocument() throws SAXException {
- }
-
- @Override
- public void endDocument() throws SAXException {
-
- }
-
- @Override
- void endPart() throws SAXException, TikaException {
- if (hasData()) {
- EmbeddedDocumentExtractor embeddedDocumentExtractor =
- EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext);
- Metadata embeddedMetadata = new Metadata();
- try (TikaInputStream stream = TikaInputStream.get(getInputStream())) {
- embeddedDocumentExtractor.parseEmbedded(stream, handler, embeddedMetadata, false);
- } catch (IOException e) {
- throw new TikaException("error in finishing part", e);
- }
- buffer.setLength(0);
- }
-
- }
-
- @Override
- public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
-
- if (uri.equals(Word2006MLHandler.PKG_NS) && localName.equals("binaryData")) {
- inBinaryData = true;
- }
- }
-
- @Override
- public void endElement(String uri, String localName, String qName) throws SAXException {
- if (uri.equals(Word2006MLHandler.PKG_NS) && localName.equals("binaryData")) {
- inBinaryData = false;
- }
- }
-
- @Override
- public void characters(char[] ch, int start, int length) throws SAXException {
- if (inBinaryData) {
- buffer.append(ch, start, length);
- }
- }
-
- @Override
- public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
-
- }
-
- @Override
- public String getPartContentType() {
- return "";
- }
-
- boolean hasData() {
- return buffer.length() > 0;
- }
-
- private InputStream getInputStream() {
- byte[] bytes = base64.decode(buffer.toString());
- return new ByteArrayInputStream(bytes);
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/d19e4725/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BodyContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BodyContentHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BodyContentHandler.java
deleted file mode 100644
index ea16191..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BodyContentHandler.java
+++ /dev/null
@@ -1,271 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-
-import org.apache.poi.xwpf.usermodel.XWPFRelation;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.microsoft.MSOfficeParserConfig;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-
-/**
- * This class is intended to handle anything that might contain IBodyElements:
- * main document, headers, footers, notes, etc.
- */
-
-class BodyContentHandler extends PartHandler {
-
-
- private enum EditType{
- NONE,
- INSERT,
- DELETE
- };
-
- private final static String W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
- private final static String MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006";
- private final static String OFFICE_DOC_RELATIONSHIP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
-
- private final static char[] TAB = new char[1];
-
- static {
- TAB[0] = '\t';
- }
-
- private final String partName;
- private final RelationshipsManager relationshipsManager;
- private final XHTMLContentHandler handler;
- private final Metadata metadata;
- private final ParseContext parseContext;
- private final boolean includeDeletedContent;
-
- private boolean inR = false;
- private boolean inT = false;
- private boolean inRPr = false;
- private boolean inDelText = false;
- private boolean inAlternateContent = false; //in alternate content section
- private boolean inACChoice = false; //if in alternate, choice or fallback?
- private boolean inACFallback = false;
- private boolean hasWrittenAHref = false;
- private boolean hasWrittenFormatting = false;
- private String editAuthor = null;
- private String editDate = null;
- private EditType editType = EditType.NONE;
- private String hyperlink = null;
-
- private TmpFormatting currFormat = new TmpFormatting();
-
- public BodyContentHandler(String partName, RelationshipsManager relationshipsManager,
- XHTMLContentHandler handler, Metadata metadata, ParseContext context) {
- this.partName = partName;
- this.relationshipsManager = relationshipsManager;
- this.handler = handler;
- this.metadata = metadata;
- this.parseContext = context;
- MSOfficeParserConfig config = context.get(MSOfficeParserConfig.class);
- boolean tmpIncludeDeleted = true;
- if (config != null) {
- tmpIncludeDeleted = config.getIncludeDeletedContent();
- }
- includeDeletedContent = tmpIncludeDeleted;
- }
-
-
- @Override
- public void startDocument() throws SAXException {
- }
-
- @Override
- public void endDocument() throws SAXException {
- }
-
- @Override
- public void startPrefixMapping(String prefix, String uri) throws SAXException {
- }
-
- @Override
- public void endPrefixMapping(String prefix) throws SAXException {
- }
-
- @Override
- public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
- if (uri.equals(MC_NS)) {
- if (localName.equals("AlternateContent")) {
- inAlternateContent = true;
- } else if (localName.equals("Choice")) {
- inACChoice = true;
- } else if (localName.equals("Fallback")) {
- inACFallback = true;
- }
- }
- if (inACFallback) {
- return;
- }
-
- if (uri.equals(W_NS)) {
- if (localName.equals("p")) {
- handler.startElement("p");
- } else if (localName.equals("r")) {
- inR = true;
- } else if (localName.equals("t")) {
- inT = true;
- } else if (localName.equals("tab")) {
- handler.characters(TAB, 0, 1);
- } else if (localName.equals("tbl")) {
- handler.startElement("table");
- } else if (localName.equals("tc")) {
- handler.startElement("td");
- } else if (localName.equals("tr")) {
- handler.startElement("tr");
- } else if (localName.equals("rPr")) {
- inRPr = true;
- } else if (inR && inRPr && localName.equals("i")) {
- //rprs don't have to be inR; ignore those that aren't
- currFormat.italics = true;
- } else if (inR && inRPr && localName.equals("b")) {
- currFormat.bold = true;
- } else if (localName.equals("delText")) {
- inDelText = true;
- } else if (localName.equals("ins")) {
- editAuthor = atts.getValue(W_NS, "author");
- editDate = atts.getValue(W_NS, "date");
- editType = EditType.INSERT;
- } else if (localName.equals("del")) {
- editAuthor = atts.getValue(W_NS, "author");
- editDate = atts.getValue(W_NS, "date");
- editType = EditType.DELETE;
- } else if (localName.equals("hyperlink")) {
- String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
- if (hyperlinkId != null) {
- Relationship relationship = relationshipsManager.getRelationship(getName(), hyperlinkId);
- if (relationship != null && XWPFRelation.HYPERLINK.getRelation().equals(relationship.getContentType())) {
- hyperlink = relationship.getTarget();
- handler.startElement("a", "href", hyperlink);
- hasWrittenAHref = true;
- }
- }
- }
- }
- }
-
- @Override
- public void endElement(String uri, String localName, String qName) throws SAXException {
- if (uri.equals(MC_NS)) {
- if (localName.equals("AlternateContent")) {
- inAlternateContent = false;
- } else if (localName.equals("Choice")) {
- inACChoice = false;
- } else if (localName.equals("Fallback")) {
- inACFallback = false;
- }
- }
- if (uri.equals(W_NS)) {
- if (inACFallback) {
- return;
- }
- if (localName.equals("p")) {
- handler.endElement("p");
- } else if (localName.equals("r")) {
- closeStyleTags();
- inR = false;
- hasWrittenFormatting = false;
- } else if (localName.equals("t")) {
- inT = false;
- } else if (localName.equals("tbl")) {
- handler.endElement("table");
- } else if (localName.equals("tc")) {
- handler.endElement("td");
- } else if (localName.equals("tr")) {
- handler.endElement("tr");
- } else if (localName.equals("rPr")) {
- inRPr = false;
- } else if (localName.equals("delText")) {
- inDelText = false;
- } else if (localName.equals("ins") || localName.equals("del")) {
- editType = EditType.NONE;
- editAuthor = null;
- editDate = null;
- } else if (localName.equals("hyperlink") && hasWrittenAHref) {
- handler.endElement("a");
- hasWrittenAHref = false;
- }
- }
- }
-
- @Override
- public void characters(char[] ch, int start, int length) throws SAXException {
- if (inACFallback) {
- return;
- }
-
- if (inR && !hasWrittenFormatting) {
- if (currFormat.bold) {
- handler.startElement("b");
- }
- if (currFormat.italics) {
- handler.startElement("i");
- }
- hasWrittenFormatting = true;
- }
- if (inT) {
- handler.characters(ch, start, length);
- } else if (includeDeletedContent && inDelText) {
- handler.characters(ch, start, length);
- }
- }
-
- @Override
- public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
- if (inACFallback) {
- return;
- }
-
- if (inT) {
- handler.characters(ch, start, length);
- }
- }
-
- @Override
- public String getPartContentType() {
- return partName;
- }
-
-
-
- void closeStyleTags() throws SAXException {
- if (hasWrittenFormatting) {
- if (currFormat.italics) {
- handler.endElement("i");
- }
- if (currFormat.bold) {
- handler.endElement("b");
- }
- }
-
- currFormat.bold = false;
- currFormat.italics = false;
- }
-
- private class TmpFormatting {
- boolean italics = false;
- boolean bold = false;
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/d19e4725/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/CorePropertiesHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/CorePropertiesHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/CorePropertiesHandler.java
deleted file mode 100644
index b0bca08..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/CorePropertiesHandler.java
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.poi.openxml4j.opc.ContentTypes;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.OfficeOpenXMLCore;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-
-class CorePropertiesHandler extends PartHandler {
-
- final static String DC_NS = "http://purl.org/dc/elements/1.1";
- final static String DC_TERMS_NS = "http://purl.org/dc/terms";
- final static String CP_NS = "http://schemas.openxmlformats.org/package/2006/metadata/core-properties";
-
- private final Metadata metadata;
-
- final StringBuilder buffer = new StringBuilder();
- final Map<String, Map<String, Property>> properties = new HashMap<>();
-
- public CorePropertiesHandler(Metadata metadata) {
- this.metadata = metadata;
- addProperties();
- }
-
- void addProperties() {
- Map<String, Property> dc = properties.get(DC_NS);
- if (dc == null) {
- dc = new HashMap<>();
- }
- dc.put("creator", TikaCoreProperties.CREATOR);
- dc.put("title", TikaCoreProperties.TITLE);
- dc.put("description", TikaCoreProperties.DESCRIPTION);
- properties.put(DC_NS, dc);
-
- Map<String, Property> dcTerms = properties.get(DC_TERMS_NS);
- if (dcTerms == null) {
- dcTerms = new HashMap<>();
- }
- dcTerms.put("created", TikaCoreProperties.CREATED);
- dcTerms.put("modified", TikaCoreProperties.MODIFIED);
-
- properties.put(DC_TERMS_NS, dcTerms);
-
- Map<String, Property> cp = properties.get(CP_NS);
- if (cp == null) {
- cp = new HashMap<>();
- }
- cp.put("category", OfficeOpenXMLCore.CATEGORY);
- cp.put("contentStatus", OfficeOpenXMLCore.CONTENT_STATUS);
- cp.put("lastModifiedBy", OfficeOpenXMLCore.LAST_MODIFIED_BY);
- cp.put("lastPrinted", OfficeOpenXMLCore.LAST_PRINTED);
- cp.put("revision", OfficeOpenXMLCore.REVISION);
- cp.put("subject", OfficeOpenXMLCore.SUBJECT);
- cp.put("version", OfficeOpenXMLCore.VERSION);
- properties.put(CP_NS, cp);
- }
-
- @Override
- public void startDocument() throws SAXException {
- }
-
- @Override
- public void endDocument() throws SAXException {
- buffer.setLength(0);
- }
-
- @Override
- public void startPrefixMapping(String prefix, String uri) throws SAXException {
- }
-
- @Override
- public void endPrefixMapping(String prefix) throws SAXException {
- }
-
- @Override
- public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
-
- }
-
- @Override
- public void endElement(String uri, String localName, String qName) throws SAXException {
- Property prop = getProperty(uri, localName);
- if (prop != null) {
-
- if (prop.isMultiValuePermitted()) {
- metadata.add(prop, buffer.toString());
- } else {
- metadata.set(prop, buffer.toString());
- }
- }
- buffer.setLength(0);
-
- }
-
- private Property getProperty(String uri, String localName) {
- if (uri.endsWith("/")) {
- uri = uri.substring(0, uri.length()-1);
- }
-
- Map<String, Property> m = properties.get(uri);
- if (m != null) {
- return m.get(localName);
- }
- return null;
- }
-
- @Override
- public void characters(char[] ch, int start, int length) throws SAXException {
- buffer.append(ch, start, length);
- }
-
- @Override
- public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
- buffer.append(ch, start, length);
- }
-
- @Override
- public String getPartContentType() {
- return ContentTypes.CORE_PROPERTIES_PART;
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/d19e4725/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ExtendedPropertiesHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ExtendedPropertiesHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ExtendedPropertiesHandler.java
deleted file mode 100644
index 07e5e23..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ExtendedPropertiesHandler.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLExtended;
-import org.apache.tika.metadata.Property;
-
-class ExtendedPropertiesHandler extends CorePropertiesHandler {
-
- final static String EP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties";
-
- public ExtendedPropertiesHandler(Metadata metadata) {
- super(metadata);
- }
-
- @Override
- void addProperties() {
- Map<String, Property> ep = properties.get(EP_NS);
- if (ep == null) {
- ep = new HashMap<>();
- }
- ep.put("AppVersion", OfficeOpenXMLExtended.APP_VERSION);
- ep.put("Application", OfficeOpenXMLExtended.APPLICATION);
- ep.put("Comments", OfficeOpenXMLExtended.COMMENTS);
- ep.put("Company", OfficeOpenXMLExtended.COMPANY);
- ep.put("DocSecurity", OfficeOpenXMLExtended.DOC_SECURITY);
- ep.put("HiddenSlides", OfficeOpenXMLExtended.HIDDEN_SLIDES);
- ep.put("Manager", OfficeOpenXMLExtended.MANAGER);
- ep.put("Notes", OfficeOpenXMLExtended.NOTES);
- ep.put("PresentationFormat", OfficeOpenXMLExtended.PRESENTATION_FORMAT);
- ep.put("Template", OfficeOpenXMLExtended.TEMPLATE);
- ep.put("TotalTime", OfficeOpenXMLExtended.TOTAL_TIME);
- ep.put("Pages", Office.PAGE_COUNT);
- ep.put("Words", Office.WORD_COUNT);
- ep.put("Characters", Office.CHARACTER_COUNT);
- ep.put("CharactersWithSpaces", Office.CHARACTER_COUNT_WITH_SPACES);
- ep.put("Paragraphs", Office.PARAGRAPH_COUNT);
- ep.put("Lines", Office.LINE_COUNT);
- properties.put(EP_NS, ep);
- }
-
- @Override
- public String getPartContentType() {
- return "application/vnd.openxmlformats-officedocument.extended-properties+xml";
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/d19e4725/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/PartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/PartHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/PartHandler.java
deleted file mode 100644
index 79bcafe..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/PartHandler.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-import org.apache.tika.exception.TikaException;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-abstract class PartHandler extends DefaultHandler {
-
- private String name;
-
- public abstract String getPartContentType();
-
- public void setName(String name) {
- this.name = name;
- }
-
- public String getName() {
- return name;
- }
-
- /**
- * Override this to flush buffers, etc if necessary
- */
- void endPart() throws SAXException, TikaException {
-
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/d19e4725/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Relationship.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Relationship.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Relationship.java
deleted file mode 100644
index 19b0dd4..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Relationship.java
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-
-import org.apache.poi.openxml4j.opc.TargetMode;
-
-class Relationship {
-
- private final String contentType;
-
- private final String target;
-
- private final TargetMode targetMode;
-
- public Relationship(String contentType, String target) {
- this(contentType, target, null);
- }
-
- public Relationship(String contentType, String target, TargetMode targetMode) {
- this.contentType = contentType;
- this.target = target;
- this.targetMode = targetMode;
- }
-
- public String getContentType() {
- return contentType;
- }
-
- public String getTarget() {
- return target;
- }
-
- public TargetMode getTargetMode() {
- return targetMode;
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/d19e4725/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsHandler.java
deleted file mode 100644
index 211b048..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsHandler.java
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-
-import org.apache.poi.openxml4j.opc.ContentTypes;
-import org.apache.poi.openxml4j.opc.TargetMode;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-
-class RelationshipsHandler extends PartHandler {
-
- final static String REL_NS = "http://schemas.openxmlformats.org/package/2006/relationships";
-
- private final RelationshipsManager relationshipsManager;
-
- public RelationshipsHandler(RelationshipsManager relationshipsManager) {
- this.relationshipsManager = relationshipsManager;
- }
-
-
- @Override
- public void startDocument() throws SAXException {
- }
-
- @Override
- public void endDocument() throws SAXException {
- }
-
- @Override
- public void startPrefixMapping(String prefix, String uri) throws SAXException {
- }
-
- @Override
- public void endPrefixMapping(String prefix) throws SAXException {
- }
-
- @Override
- public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
- if (uri.equals(REL_NS)) {
- if (localName.equals("Relationship")) {
- String id = atts.getValue("", "Id");
- String type = atts.getValue("", "Type");
- String target = atts.getValue("", "Target");
- String targetModeString = atts.getValue("", "TargetMode");
- TargetMode targetMode = "EXTERNAL".equals(targetModeString)? TargetMode.EXTERNAL :
- TargetMode.INTERNAL;
- relationshipsManager.addRelationship(getName(), id, type, target, targetMode);
- }
- }
- }
-
- @Override
- public void endElement(String uri, String localName, String qName) throws SAXException {
- }
-
- @Override
- public void characters(char[] ch, int start, int length) throws SAXException {
- }
-
- @Override
- public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
-
- }
-
- @Override
- public String getPartContentType() {
- return ContentTypes.RELATIONSHIPS_PART;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/d19e4725/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsManager.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsManager.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsManager.java
deleted file mode 100644
index d1954ac..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsManager.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.poi.openxml4j.opc.TargetMode;
-
-/**
- * Collects the relationships read from {@code .rels} files, grouped by the
- * name derived from the rels file name (e.g. {@code /word/_rels/document.xml.rels}
- * is stored under {@code /word/document.xml}, and {@code /_rels/.rels} under {@code /}).
- */
-class RelationshipsManager {
-
-    // derived name -> (relationship id -> relationship)
-    Map<String, Map<String, Relationship>> map = new HashMap<>();
-
-    /**
-     * Records one relationship declared in the given rels file.
-     *
-     * @param relsFileName name of the {@code .rels} file the relationship came from
-     * @param id           relationship id; a later entry with the same id replaces the earlier one
-     * @param type         relationship type
-     * @param target       relationship target
-     * @param targetMode   whether the target is internal or external
-     */
-    public void addRelationship(String relsFileName, String id, String type, String target, TargetMode targetMode) {
-        String packageName = convertRelsFileNameToPackageName(relsFileName);
-        Map<String, Relationship> thisPackageRels = map.get(packageName);
-        if (thisPackageRels == null) {
-            // first relationship for this name: register the inner map once
-            thisPackageRels = new HashMap<>();
-            map.put(packageName, thisPackageRels);
-        }
-        thisPackageRels.put(id, new Relationship(type, target, targetMode));
-    }
-
-    /**
-     * Looks up a previously added relationship.
-     *
-     * @param packageName name as produced from the rels file name (see class comment)
-     * @param id          relationship id
-     * @return the relationship, or {@code null} if the name or the id is unknown
-     */
-    public Relationship getRelationship(String packageName, String id) {
-        Map<String, Relationship> thisPackageRels = map.get(packageName);
-        if (thisPackageRels != null) {
-            return thisPackageRels.get(id);
-        }
-        return null;
-    }
-
-    /**
-     * Maps a rels file name to the key used in {@link #map}: the root rels
-     * file becomes {@code "/"}; otherwise the first {@code /_rels/} segment
-     * is collapsed to {@code /} and a trailing {@code .rels} is removed.
-     */
-    private String convertRelsFileNameToPackageName(String relsFileName) {
-        if ("/_rels/.rels".equals(relsFileName)) {
-            return "/";
-        }
-
-        String tmp = relsFileName;
-        tmp = tmp.replaceFirst("/_rels/", "/");
-        // the dot must be escaped: ".rels\Z" would also strip e.g. "Xrels"
-        tmp = tmp.replaceFirst("\\.rels\\Z", "");
-        return tmp;
-    }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/d19e4725/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLHandler.java
deleted file mode 100644
index cf919cc..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLHandler.java
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.poi.xwpf.usermodel.XWPFRelation;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-class Word2006MLHandler extends DefaultHandler {
-
- final static String PKG_NS = "http://schemas.microsoft.com/office/2006/xmlPackage";
-
-
- private final XHTMLContentHandler handler;
- private final Metadata metadata;
- private final ParseContext parseContext;
-
- private final Map<String, PartHandler> partHandlers = new HashMap<>();
- private final BinaryDataHandler binaryDataHandler;
- private final RelationshipsManager relationshipsManager = new RelationshipsManager();
- private PartHandler currentPartHandler = null;
-
- public Word2006MLHandler(XHTMLContentHandler handler, Metadata metadata, ParseContext context) {
- this.handler = handler;
- this.metadata = metadata;
- this.parseContext = context;
-
- addPackageHandler(new RelationshipsHandler(relationshipsManager));
-
- addPackageHandler(new BodyContentHandler(
- XWPFRelation.DOCUMENT.getContentType(),
- relationshipsManager,
- handler, metadata, context));
- addPackageHandler(new BodyContentHandler(
- XWPFRelation.FOOTNOTE.getContentType(),
- relationshipsManager,
- handler, metadata, context));
- addPackageHandler(new BodyContentHandler(
- "application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml",
- relationshipsManager,
- handler, metadata, context));
- addPackageHandler(new BodyContentHandler(
- XWPFRelation.HEADER.getContentType(),
- relationshipsManager,
- handler, metadata, context));
- addPackageHandler(new BodyContentHandler(
- XWPFRelation.FOOTER.getContentType(),
- relationshipsManager,
- handler, metadata, context));
- addPackageHandler(new BodyContentHandler(
- "application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml",
- relationshipsManager,
- handler, metadata, context));
- addPackageHandler(new BodyContentHandler(
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document.glossary+xml",
- relationshipsManager,
- handler, metadata, context));
- addPackageHandler(new BodyContentHandler(
- "application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml",
- relationshipsManager,
- handler, metadata, context));
-
- addPackageHandler(new CorePropertiesHandler(metadata));
- addPackageHandler(new ExtendedPropertiesHandler(metadata));
- binaryDataHandler = new BinaryDataHandler(handler, metadata, context);
- }
-
- private void addPackageHandler(PartHandler partHandler) {
- partHandlers.put(partHandler.getPartContentType(), partHandler);
- }
-
-
- @Override
- public void startDocument() throws SAXException {
- }
-
- @Override
- public void endDocument() throws SAXException {
- }
-
- @Override
- public void startPrefixMapping(String prefix, String uri) throws SAXException {
- }
-
- @Override
- public void endPrefixMapping(String prefix) throws SAXException {
-
- }
-
- @Override
- public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
- if (uri.equals(PKG_NS) && localName.equals("part")) {
- //start of a package
- String name = atts.getValue(PKG_NS, "name");
- String contentType = atts.getValue(PKG_NS, "contentType");
- currentPartHandler = partHandlers.get(contentType);
- //for now treat every unknown part type
- //as if it contained binary data
- if (currentPartHandler == null) {
- currentPartHandler = binaryDataHandler;
- }
- if (currentPartHandler != null) {
- currentPartHandler.setName(name);
- }
- } else if (currentPartHandler != null) {
- currentPartHandler.startElement(uri, localName, qName, atts);
- }
-
- }
-
- @Override
- public void endElement(String uri, String localName, String qName) throws SAXException {
- if (uri.equals(PKG_NS) && localName.equals("part")) {
- //do post processing
- if (currentPartHandler != null) {
- try {
- currentPartHandler.endPart();
- } catch (TikaException e) {
- throw new SAXException(e);
- }
- }
- //then reset
- currentPartHandler = null;
- } else if (currentPartHandler != null) {
- currentPartHandler.endElement(uri, localName, qName);
- }
- }
-
- @Override
- public void characters(char[] ch, int start, int length) throws SAXException {
- if (currentPartHandler != null) {
- currentPartHandler.characters(ch, start, length);
- }
- }
-
- @Override
- public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
- if (currentPartHandler != null) {
- currentPartHandler.characters(ch, start, length);
- }
-
- }
-
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/d19e4725/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParser.java
deleted file mode 100644
index 4609bf5..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParser.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.EmbeddedContentHandler;
-import org.apache.tika.sax.OfflineContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-
-public class Word2006MLParser extends AbstractParser {
-
- protected static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(
- MediaType.application("vnd.ms-word2006ml"));
-
- @Override
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- @Override
- public void parse(InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
- final XHTMLContentHandler xhtml =
- new XHTMLContentHandler(handler, metadata);
-
- xhtml.startDocument();
-
- try {
- context.getSAXParser().parse(
- new CloseShieldInputStream(stream),
- new OfflineContentHandler(new EmbeddedContentHandler(
- new Word2006MLHandler(xhtml, metadata, context))));
- } catch (SAXException e) {
- throw new TikaException("XML parse error", e);
- } finally {
- xhtml.endDocument();
- }
- }
-}