You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/16 19:23:06 UTC
svn commit: r1725014 [18/28] - in /tika/branches/2.x:
tika-parser-bundles/tika-multimedia-bundle/ tika-parser-modules/
tika-parser-modules/tika-advanced-module/
tika-parser-modules/tika-advanced-parser-module/
tika-parser-modules/tika-advanced-parser-m...
Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,231 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.util.HashMap;
+import java.util.Map;
+
+class NumbersContentHandler extends DefaultHandler {
+
+ private final XHTMLContentHandler xhtml;
+ private final Metadata metadata;
+
+ private boolean inSheet = false;
+
+ private boolean inText = false;
+ private boolean parseText = false;
+
+ private boolean inMetadata = false;
+ private Property metadataKey;
+ private String metadataPropertyQName;
+
+ private boolean inTable = false;
+ private int numberOfSheets = 0;
+ private int numberOfColumns = -1;
+ private int currentColumn = 0;
+
+ private Map<String, String> menuItems = new HashMap<String, String>();
+ private String currentMenuItemId;
+
+ NumbersContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
+ this.xhtml = xhtml;
+ this.metadata = metadata;
+ }
+
+ @Override
+ public void endDocument() throws SAXException {
+ metadata.set(Metadata.PAGE_COUNT, String.valueOf(numberOfSheets));
+ }
+
+ @Override
+ public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
+ if ("ls:workspace".equals(qName)) {
+ inSheet = true;
+ numberOfSheets++;
+ xhtml.startElement("div");
+ String sheetName = attributes.getValue("ls:workspace-name");
+ metadata.add("sheetNames", sheetName);
+ }
+
+ if ("sf:text".equals(qName)) {
+ inText = true;
+ xhtml.startElement("p");
+ }
+
+ if ("sf:p".equals(qName)) {
+ parseText = true;
+ }
+
+ if ("sf:metadata".equals(qName)) {
+ inMetadata = true;
+ return;
+ }
+
+ if (inMetadata && metadataKey == null) {
+ metadataKey = resolveMetadataKey(localName);
+ metadataPropertyQName = qName;
+ }
+
+ if (inMetadata && metadataKey != null && "sf:string".equals(qName)) {
+ metadata.add(metadataKey, attributes.getValue("sfa:string"));
+ }
+
+ if (!inSheet) {
+ return;
+ }
+
+ if ("sf:tabular-model".equals(qName)) {
+ String tableName = attributes.getValue("sf:name");
+ xhtml.startElement("div");
+ xhtml.characters(tableName);
+ xhtml.endElement("div");
+ inTable = true;
+ xhtml.startElement("table");
+ xhtml.startElement("tr");
+ currentColumn = 0;
+ }
+
+ if ("sf:menu-choices".equals(qName)) {
+ menuItems = new HashMap<String, String>();
+ }
+
+ if (inTable && "sf:grid".equals(qName)) {
+ numberOfColumns = Integer.parseInt(attributes.getValue("sf:numcols"));
+ }
+
+ if (menuItems != null && "sf:t".equals(qName)) {
+ currentMenuItemId = attributes.getValue("sfa:ID");
+ }
+
+ if (currentMenuItemId != null && "sf:ct".equals(qName)) {
+ menuItems.put(currentMenuItemId, attributes.getValue("sfa:s"));
+ }
+
+ if (inTable && "sf:ct".equals(qName)) {
+ if (currentColumn >= numberOfColumns) {
+ currentColumn = 0;
+ xhtml.endElement("tr");
+ xhtml.startElement("tr");
+ }
+
+ xhtml.element("td", attributes.getValue("sfa:s"));
+ currentColumn++;
+ }
+
+ if (inTable && ("sf:n".equals(qName) || "sf:rn".equals(qName))) {
+ if (currentColumn >= numberOfColumns) {
+ currentColumn = 0;
+ xhtml.endElement("tr");
+ xhtml.startElement("tr");
+ }
+
+ xhtml.element("td", attributes.getValue("sf:v"));
+ currentColumn++;
+ }
+
+ if (inTable && "sf:proxied-cell-ref".equals(qName)) {
+ if (currentColumn >= numberOfColumns) {
+ currentColumn = 0;
+ xhtml.endElement("tr");
+ xhtml.startElement("tr");
+ }
+
+ xhtml.element("td", menuItems.get(attributes.getValue("sfa:IDREF")));
+ currentColumn++;
+ }
+
+ if ("sf:chart-name".equals(qName)) {
+ // Extract chart name:
+ xhtml.startElement("div", "class", "chart");
+ xhtml.startElement("h1");
+ xhtml.characters(attributes.getValue("sfa:string"));
+ xhtml.endElement("h1");
+ xhtml.endElement("div");
+ }
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length) throws SAXException {
+ if (parseText && length > 0) {
+ xhtml.characters(ch, start, length);
+ }
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String qName) throws SAXException {
+ if ("ls:workspace".equals(qName)) {
+ inSheet = false;
+ xhtml.endElement("div");
+ }
+
+ if ("sf:text".equals(qName)) {
+ inText = false;
+ xhtml.endElement("p");
+ }
+
+ if ("sf:p".equals(qName)) {
+ parseText = false;
+ }
+
+ if ("sf:metadata".equals(qName)) {
+ inMetadata = false;
+ }
+
+ if (inMetadata && qName.equals(metadataPropertyQName)) {
+ metadataPropertyQName = null;
+ metadataKey = null;
+ }
+
+ if (!inSheet) {
+ return;
+ }
+
+ if ("sf:menu-choices".equals(qName)) {
+ }
+
+ if ("sf:tabular-model".equals(qName)) {
+ inTable = false;
+ xhtml.endElement("tr");
+ xhtml.endElement("table");
+ }
+
+ if (currentMenuItemId != null && "sf:t".equals(qName)) {
+ currentMenuItemId = null;
+ }
+ }
+
+ private Property resolveMetadataKey(String localName) {
+ if ("authors".equals(localName)) {
+ return TikaCoreProperties.CREATOR;
+ }
+ if ("title".equals(localName)) {
+ return TikaCoreProperties.TITLE;
+ }
+ if ("comment".equals(localName)) {
+ return TikaCoreProperties.COMMENTS;
+ }
+ return Property.internalText(localName);
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,448 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+class PagesContentHandler extends DefaultHandler {
+
+ private final XHTMLContentHandler xhtml;
+ private final Metadata metadata;
+
+ /** The (interesting) part of the document we're in. Should be more structured... */
+ private enum DocumentPart {
+ METADATA, PARSABLE_TEXT,
+ HEADERS, HEADER_ODD, HEADER_EVEN, HEADER_FIRST,
+ FOOTERS, FOOTER_ODD, FOOTER_EVEN, FOOTER_FIRST,
+ FOOTNOTES, ANNOTATIONS;
+ }
+ private DocumentPart inPart = null;
+ private boolean ghostText;
+
+ private static String alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+
+ private boolean parseProperty = false;
+ private int pageCount = 0;
+ private int slPageCount = 0;
+
+ private HeaderFooter headers = null;
+ private HeaderFooter footers = null;
+ private Footnotes footnotes = null;
+ private Annotations annotations = null;
+
+ private Map<String, List<List<String>>> tableData =
+ new HashMap<String, List<List<String>>>();
+ private String activeTableId;
+ private int numberOfColumns = 0;
+ private List<String> activeRow = new ArrayList<String>();
+
+ private String metaDataLocalName;
+ private String metaDataQName;
+
+ PagesContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
+ this.xhtml = xhtml;
+ this.metadata = metadata;
+ }
+
+ @Override
+ public void endDocument() throws SAXException {
+ metadata.set(Metadata.PAGE_COUNT, String.valueOf(pageCount));
+ if (pageCount > 0) {
+ doFooter();
+ xhtml.endElement("div");
+ }
+ }
+
+ @Override
+ public void startElement(
+ String uri, String localName, String qName, Attributes attributes)
+ throws SAXException {
+ if (parseProperty) {
+ String value = parsePrimitiveElementValue(qName, attributes);
+ if (value != null) {
+ Object metaDataKey = resolveMetaDataKey(metaDataLocalName);
+ if(metaDataKey instanceof Property) {
+ metadata.set((Property)metaDataKey, value);
+ } else {
+ metadata.add((String)metaDataKey, value);
+ }
+ }
+ }
+
+ if ("sl:publication-info".equals(qName)) {
+ inPart = DocumentPart.METADATA;
+ } else if ("sf:metadata".equals(qName)) {
+ inPart = DocumentPart.METADATA;
+ } else if ("sf:page-start".equals(qName) || "sl:page-group".equals(qName)) {
+ if (pageCount > 0) {
+ doFooter();
+ xhtml.endElement("div");
+ }
+ xhtml.startElement("div");
+ if ("sl:page-group".equals(qName)) {
+ slPageCount++;
+ } else {
+ pageCount++;
+ }
+ doHeader();
+ } else if ("sf:p".equals(qName)) {
+ if (pageCount+slPageCount > 0) {
+ inPart = DocumentPart.PARSABLE_TEXT;
+ xhtml.startElement("p");
+ }
+ } else if ("sf:attachment".equals(qName)) {
+ String kind = attributes.getValue("sf:kind");
+ if ("tabular-attachment".equals(kind)) {
+ activeTableId = attributes.getValue("sfa:ID");
+ tableData.put(activeTableId, new ArrayList<List<String>>());
+ }
+ } else if ("sf:attachment-ref".equals(qName)) {
+ String idRef = attributes.getValue("sfa:IDREF");
+ outputTable(idRef);
+ } else if ("sf:headers".equals(qName)) {
+ headers = new HeaderFooter(qName);
+ inPart = DocumentPart.HEADERS;
+ } else if ("sf:footers".equals(qName)) {
+ footers = new HeaderFooter(qName);
+ inPart = DocumentPart.FOOTERS;
+ } else if ("sf:header".equals(qName)) {
+ inPart = headers.identifyPart(attributes.getValue("sf:name"));
+ } else if ("sf:footer".equals(qName)) {
+ inPart = footers.identifyPart(attributes.getValue("sf:name"));
+ } else if ("sf:page-number".equals(qName)) {
+ if (inPart == DocumentPart.FOOTER_ODD
+ || inPart == DocumentPart.FOOTER_FIRST
+ || inPart == DocumentPart.FOOTER_EVEN) {
+ // We are in a footer
+ footers.hasAutoPageNumber = true;
+ footers.autoPageNumberFormat = attributes.getValue("sf:format");
+ } else {
+ headers.hasAutoPageNumber = true;
+ headers.autoPageNumberFormat = attributes.getValue("sf:format");
+ }
+
+ xhtml.characters(Integer.toString(this.pageCount));
+ } else if ("sf:footnotes".equals(qName)) {
+ footnotes = new Footnotes();
+ inPart = DocumentPart.FOOTNOTES;
+ } else if ("sf:footnote-mark".equals(qName)) {
+ footnotes.recordMark(attributes.getValue("sf:mark"));
+ } else if ("sf:footnote".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
+ // What about non auto-numbered?
+ String footnoteMark = attributes.getValue("sf:autonumber");
+ if (footnotes != null) {
+ String footnoteText = footnotes.footnotes.get(footnoteMark);
+ if (footnoteText != null) {
+ xhtml.startElement("div", "style", "footnote");
+ xhtml.characters("Footnote:" ); // As shown in Pages
+ xhtml.characters(footnoteText);
+ xhtml.endElement("div");
+ }
+ }
+ } else if ("sf:annotations".equals(qName)) {
+ annotations = new Annotations();
+ inPart = DocumentPart.ANNOTATIONS;
+ } else if ("sf:annotation".equals(qName) && inPart == DocumentPart.ANNOTATIONS) {
+ annotations.start(attributes.getValue("sf:target"));
+ } else if ("sf:annotation-field".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
+ xhtml.startElement("div", "style", "annotated");
+
+ String annotationText = annotations.annotations.get(attributes.getValue("sfa:ID"));
+ if (annotationText != null) {
+ xhtml.startElement("div", "style", "annotation");
+ xhtml.characters(annotationText);
+ xhtml.endElement("div");
+ }
+ } else if ("sf:ghost-text".equals(qName)) {
+ ghostText = true;
+ }
+
+ if (activeTableId != null) {
+ parseTableData(qName, attributes);
+ }
+
+ if (inPart == DocumentPart.METADATA) {
+ metaDataLocalName = localName;
+ metaDataQName = qName;
+ parseProperty = true;
+ }
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String qName)
+ throws SAXException {
+ if (metaDataLocalName != null && metaDataLocalName.equals(localName)) {
+ metaDataLocalName = null;
+ parseProperty = false;
+ }
+
+ if ("sl:publication-info".equals(qName)) {
+ inPart = null;
+ } else if ("sf:metadata".equals(qName)) {
+ inPart = null;
+ } else if ("sf:p".equals(qName) && (pageCount+slPageCount) > 0) {
+ inPart = null;
+ xhtml.endElement("p");
+ } else if ("sf:attachment".equals(qName)) {
+ activeTableId = null;
+ } else if ("sf:annotation".equals(qName) && inPart == DocumentPart.ANNOTATIONS) {
+ annotations.end();
+ } else if ("sf:annotation-field".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
+ xhtml.endElement("div");
+ } else if ("sf:ghost-text".equals(qName)) {
+ ghostText = false;
+ }
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length) throws SAXException {
+ if (length > 0) {
+ if (inPart == DocumentPart.PARSABLE_TEXT) {
+ if (!ghostText) {
+ xhtml.characters(ch, start, length);
+ }
+ } else if(inPart != null) {
+ String str = new String(ch, start, length);
+ if (inPart == DocumentPart.HEADER_FIRST) headers.defaultFirst = str;
+ if (inPart == DocumentPart.HEADER_EVEN) headers.defaultEven = str;
+ if (inPart == DocumentPart.HEADER_ODD) headers.defaultOdd = str;
+ if (inPart == DocumentPart.FOOTER_FIRST) footers.defaultFirst = str;
+ if (inPart == DocumentPart.FOOTER_EVEN) footers.defaultEven = str;
+ if (inPart == DocumentPart.FOOTER_ODD) footers.defaultOdd = str;
+ if (inPart == DocumentPart.FOOTNOTES) footnotes.text(str);
+ if (inPart == DocumentPart.ANNOTATIONS) annotations.text(str);
+ }
+ }
+ }
+
+ private void parseTableData(String qName, Attributes attributes) {
+ if ("sf:grid".equals(qName)) {
+ String numberOfColumns = attributes.getValue("sf:numcols");
+ this.numberOfColumns = Integer.parseInt(numberOfColumns);
+ } else if ("sf:ct".equals(qName)) {
+ activeRow.add(attributes.getValue("sfa:s"));
+
+ if (activeRow.size() >= 3) {
+ tableData.get(activeTableId).add(activeRow);
+ activeRow = new ArrayList<String>();
+ }
+ }
+ }
+
+ private void outputTable(String idRef) throws SAXException {
+ List<List<String>> tableData = this.tableData.get(idRef);
+ if (tableData != null) {
+ xhtml.startElement("table");
+ for (List<String> row : tableData) {
+ xhtml.startElement("tr");
+ for (String cell : row) {
+ xhtml.element("td", cell);
+ }
+ xhtml.endElement("tr");
+ }
+ xhtml.endElement("table");
+ }
+ }
+
+ /**
+ * Returns a resolved key that is common in other document types or
+ * returns the specified metaDataLocalName if no common key could be found.
+ * The key could be a simple String key, or could be a {@link Property}
+ *
+ * @param metaDataLocalName The localname of the element containing metadata
+ * @return a resolved key that is common in other document types
+ */
+ private Object resolveMetaDataKey(String metaDataLocalName) {
+ Object metaDataKey = metaDataLocalName;
+ if ("sf:authors".equals(metaDataQName)) {
+ metaDataKey = TikaCoreProperties.CREATOR;
+ } else if ("sf:title".equals(metaDataQName)) {
+ metaDataKey = TikaCoreProperties.TITLE;
+ } else if ("sl:SLCreationDateProperty".equals(metaDataQName)) {
+ metaDataKey = TikaCoreProperties.CREATED;
+ } else if ("sl:SLLastModifiedDateProperty".equals(metaDataQName)) {
+ metaDataKey = Metadata.LAST_MODIFIED;
+ } else if ("sl:language".equals(metaDataQName)) {
+ metaDataKey = TikaCoreProperties.LANGUAGE;
+ }
+ return metaDataKey;
+ }
+
+ /**
+ * Returns the value of a primitive element e.g.:
+ * <sl:number sfa:number="0" sfa:type="f"/> - the number attribute
+ * <sl:string sfa:string="en"/> = the string attribute
+ * <p>
+ * Returns <code>null</code> if the value could not be extracted from
+ * the list of attributes.
+ *
+ * @param qName The fully qualified name of the element containing
+ * the value to extract
+ * @param attributes The list of attributes of which one contains the
+ * value to be extracted
+ * @return the value of a primitive element
+ */
+ private String parsePrimitiveElementValue(
+ String qName, Attributes attributes) {
+ if ("sl:string".equals(qName) || "sf:string".equals(qName)) {
+ return attributes.getValue("sfa:string");
+ } else if ("sl:number".equals(qName)) {
+ return attributes.getValue("sfa:number");
+ } else if ("sl:date".equals(qName)) {
+ return attributes.getValue("sf:val");
+ }
+
+ return null;
+ }
+
+ private void doHeader() throws SAXException {
+ if (headers != null) {
+ headers.output("header");
+ }
+ }
+ private void doFooter() throws SAXException {
+ if (footers != null) {
+ footers.output("footer");
+ }
+ }
+
+ /**
+ * Represents the Headers or Footers in a document
+ */
+ private class HeaderFooter {
+ private String type; // sf:headers or sf:footers
+ private String defaultOdd;
+ private String defaultEven;
+ private String defaultFirst;
+ private boolean hasAutoPageNumber;
+ private String autoPageNumberFormat;
+ // TODO Can there be custom ones?
+
+ private HeaderFooter(String type) {
+ this.type = type;
+ }
+ private DocumentPart identifyPart(String name) {
+ if("SFWPDefaultOddHeaderIdentifier".equals(name))
+ return DocumentPart.HEADER_ODD;
+ if("SFWPDefaultEvenHeaderIdentifier".equals(name))
+ return DocumentPart.HEADER_EVEN;
+ if("SFWPDefaultFirstHeaderIdentifier".equals(name))
+ return DocumentPart.HEADER_FIRST;
+
+ if("SFWPDefaultOddFooterIdentifier".equals(name))
+ return DocumentPart.FOOTER_ODD;
+ if("SFWPDefaultEvenFooterIdentifier".equals(name))
+ return DocumentPart.FOOTER_EVEN;
+ if("SFWPDefaultFirstFooterIdentifier".equals(name))
+ return DocumentPart.FOOTER_FIRST;
+
+ return null;
+ }
+ private void output(String what) throws SAXException {
+ String text = null;
+ if (pageCount == 1 && defaultFirst != null) {
+ text = defaultFirst;
+ } else if (pageCount % 2 == 0 && defaultEven != null) {
+ text = defaultEven;
+ } else {
+ text = defaultOdd;
+ }
+
+ if (text != null) {
+ xhtml.startElement("div", "class", "header");
+ xhtml.characters(text);
+ if (hasAutoPageNumber) {
+ if (autoPageNumberFormat == null) { // raw number
+ xhtml.characters("\t" + pageCount);
+ } else if (autoPageNumberFormat.equals("upper-roman")){
+ xhtml.characters("\t" + AutoPageNumberUtils.asRomanNumerals(pageCount));
+ } else if (autoPageNumberFormat.equals("lower-roman")){
+ xhtml.characters("\t" + AutoPageNumberUtils.asRomanNumeralsLower(pageCount));
+ } else if (autoPageNumberFormat.equals("upper-alpha")){
+ xhtml.characters("\t" + AutoPageNumberUtils.asAlphaNumeric(pageCount));
+ } else if (autoPageNumberFormat.equals("lower-alpha")){
+ xhtml.characters("\t" + AutoPageNumberUtils.asAlphaNumericLower(pageCount));
+ }
+ }
+ xhtml.endElement("div");
+ }
+ }
+ }
+ /**
+ * Represents Footnotes in a document. The way these work
+ * in the file format isn't very clean...
+ */
+ private static class Footnotes {
+ /** Mark -> Text */
+ Map<String,String> footnotes = new HashMap<String, String>();
+ String lastSeenMark = null;
+
+ /**
+ * Normally happens before the text of the mark
+ */
+ private void recordMark(String mark) {
+ lastSeenMark = mark;
+ }
+ private void text(String text) {
+ if (lastSeenMark != null) {
+ if (footnotes.containsKey(lastSeenMark)) {
+ text = footnotes.get(lastSeenMark) + text;
+ }
+ footnotes.put(lastSeenMark, text);
+ }
+ }
+ }
+ /**
+ * Represents Annotations in a document. We currently
+ * just grab all the sf:p text in each one
+ */
+ private class Annotations {
+ /** ID -> Text */
+ Map<String,String> annotations = new HashMap<String, String>();
+ String currentID = null;
+ StringBuffer currentText = null;
+
+ private void start(String id) {
+ currentID = id;
+ currentText = new StringBuffer();
+ }
+ private void text(String text) {
+ if (text != null && text.length() > 0 && currentText != null) {
+ currentText.append(text);
+ }
+ }
+ private void end() {
+ if (currentText.length() > 0) {
+ annotations.put(currentID, currentText.toString());
+ currentID = null;
+ currentText = null;
+ }
+ }
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,173 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Set;
+
+import org.apache.commons.compress.compressors.CompressorException;
+import org.apache.commons.compress.compressors.CompressorInputStream;
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
+import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
+import org.apache.commons.compress.compressors.deflate.DeflateCompressorInputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.apache.commons.compress.compressors.gzip.GzipUtils;
+import org.apache.commons.compress.compressors.pack200.Pack200CompressorInputStream;
+import org.apache.commons.compress.compressors.snappy.FramedSnappyCompressorInputStream;
+import org.apache.commons.compress.compressors.snappy.SnappyCompressorInputStream;
+import org.apache.commons.compress.compressors.xz.XZCompressorInputStream;
+import org.apache.commons.compress.compressors.z.ZCompressorInputStream;
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
/**
 * Parser for various compression formats (bzip2, gzip, xz, pack200, Unix
 * compress, zlib/deflate). The compressed stream is unwrapped with Commons
 * Compress auto-detection and the decompressed content is handed to the
 * embedded-document extractor for further parsing.
 */
public class CompressorParser extends AbstractParser {

    /** Serial version UID */
    private static final long serialVersionUID = 2793565792967222459L;

    private static final MediaType BZIP = MediaType.application("x-bzip");
    private static final MediaType BZIP2 = MediaType.application("x-bzip2");
    private static final MediaType GZIP = MediaType.application("gzip");
    private static final MediaType GZIP_ALT = MediaType.application("x-gzip");
    private static final MediaType COMPRESS = MediaType.application("x-compress");
    private static final MediaType XZ = MediaType.application("x-xz");
    private static final MediaType PACK = MediaType.application("x-java-pack200");
    private static final MediaType SNAPPY = MediaType.application("x-snappy-framed");
    private static final MediaType ZLIB = MediaType.application("zlib");

    // NOTE(review): SNAPPY is recognised by getMediaType() below but is not
    // listed here, so snappy streams are never routed to this parser by
    // type - confirm whether that omission is deliberate.
    private static final Set<MediaType> SUPPORTED_TYPES =
            MediaType.set(BZIP, BZIP2, GZIP, GZIP_ALT, COMPRESS, XZ, PACK, ZLIB);

    /**
     * Maps a concrete Commons Compress stream class onto the media type it
     * decompresses, falling back to application/octet-stream for anything
     * unrecognised.
     *
     * @param stream the already-opened compressor stream
     * @return the matching media type, never null
     */
    static MediaType getMediaType(CompressorInputStream stream) {
        // TODO Add support for the remaining CompressorInputStream formats:
        //   LZMACompressorInputStream
        //   LZWInputStream -> UnshrinkingInputStream
        if (stream instanceof BZip2CompressorInputStream) {
            return BZIP2;
        } else if (stream instanceof GzipCompressorInputStream) {
            return GZIP;
        } else if (stream instanceof XZCompressorInputStream) {
            return XZ;
        } else if (stream instanceof DeflateCompressorInputStream) {
            return ZLIB;
        } else if (stream instanceof ZCompressorInputStream) {
            return COMPRESS;
        } else if (stream instanceof Pack200CompressorInputStream) {
            return PACK;
        } else if (stream instanceof FramedSnappyCompressorInputStream ||
                   stream instanceof SnappyCompressorInputStream) {
            // TODO Add unit tests for this format
            return SNAPPY;
        } else {
            return MediaType.OCTET_STREAM;
        }
    }

    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Decompresses the stream and parses the single contained document.
     * The compressor stream is closed in a finally block; the caller's
     * underlying stream is shielded from that close.
     *
     * @throws TikaException if the stream cannot be decompressed
     */
    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        // At the end we want to close the compression stream to release
        // any associated resources, but the underlying document stream
        // should not be closed
        stream = new CloseShieldInputStream(stream);

        // Ensure that the stream supports the mark feature
        // (wrapped after the close shield, so the buffer is discarded with it)
        stream = new BufferedInputStream(stream);

        CompressorInputStream cis;
        try {
            // Default behaviour: stop at the first concatenated stream,
            // unless the caller supplied options saying otherwise.
            CompressorParserOptions options =
                 context.get(CompressorParserOptions.class, new CompressorParserOptions() {
                     public boolean decompressConcatenated(Metadata metadata) {
                         return false;
                     }
                 });
            CompressorStreamFactory factory =
                    new CompressorStreamFactory(options.decompressConcatenated(metadata));
            cis = factory.createCompressorInputStream(stream);
        } catch (CompressorException e) {
            throw new TikaException("Unable to uncompress document stream", e);
        }

        MediaType type = getMediaType(cis);
        if (!type.equals(MediaType.OCTET_STREAM)) {
            metadata.set(CONTENT_TYPE, type.toString());
        }

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        try {
            Metadata entrydata = new Metadata();
            // Derive the embedded entry's name by stripping (or, for .tbz
            // and .tbz2, rewriting to .tar) the compression suffix.
            String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
            if (name != null) {
                if (name.endsWith(".tbz")) {
                    name = name.substring(0, name.length() - 4) + ".tar";
                } else if (name.endsWith(".tbz2")) {
                    name = name.substring(0, name.length() - 5) + ".tar";
                } else if (name.endsWith(".bz")) {
                    name = name.substring(0, name.length() - 3);
                } else if (name.endsWith(".bz2")) {
                    name = name.substring(0, name.length() - 4);
                } else if (name.endsWith(".xz")) {
                    name = name.substring(0, name.length() - 3);
                } else if (name.endsWith(".zlib")) {
                    name = name.substring(0, name.length() - 5);
                } else if (name.endsWith(".pack")) {
                    name = name.substring(0, name.length() - 5);
                } else if (name.length() > 0) {
                    // Handles .gz/.tgz and similar gzip suffixes
                    name = GzipUtils.getUncompressedFilename(name);
                }
                entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
            }

            // Use the delegate parser to parse the compressed document
            EmbeddedDocumentExtractor extractor = context.get(
                    EmbeddedDocumentExtractor.class,
                    new ParsingEmbeddedDocumentExtractor(context));
            if (extractor.shouldParseEmbedded(entrydata)) {
                extractor.parseEmbedded(cis, xhtml, entrydata, true);
            }
        } finally {
            cis.close();
        }

        xhtml.endDocument();
    }

}
Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/CompressorParserOptions.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/CompressorParserOptions.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/CompressorParserOptions.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/CompressorParserOptions.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import org.apache.tika.metadata.Metadata;
+
/**
 * Interface for setting options for the {@link CompressorParser} by passing
 * an implementation via the {@link org.apache.tika.parser.ParseContext}.
 */
public interface CompressorParserOptions {

    /**
     * @param metadata document metadata
     * @return whether to decompress concatenated streams or not
     */
    boolean decompressConcatenated(Metadata metadata);
}
Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,287 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Date;
+import java.util.Set;
+
+import org.apache.commons.compress.PasswordRequiredException;
+import org.apache.commons.compress.archivers.ArchiveEntry;
+import org.apache.commons.compress.archivers.ArchiveException;
+import org.apache.commons.compress.archivers.ArchiveInputStream;
+import org.apache.commons.compress.archivers.ArchiveStreamFactory;
+import org.apache.commons.compress.archivers.StreamingNotSupportedException;
+import org.apache.commons.compress.archivers.ar.ArArchiveInputStream;
+import org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream;
+import org.apache.commons.compress.archivers.dump.DumpArchiveInputStream;
+import org.apache.commons.compress.archivers.jar.JarArchiveInputStream;
+import org.apache.commons.compress.archivers.sevenz.SevenZFile;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
+import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException.Feature;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Parser for various packaging formats. Package entries will be written to
+ * the XHTML event stream as <div class="package-entry"> elements that
+ * contain the (optional) entry name as a <h1> element and the full
+ * structured body content of the parsed entry.
+ * <p>
+ * User must have JCE Unlimited Strength jars installed for encryption to
+ * work with 7Z files (see: COMPRESS-299 and TIKA-1521). If the jars
+ * are not installed, an IOException will be thrown, and potentially
+ * wrapped in a TikaException.
+ */
+public class PackageParser extends AbstractParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = -5331043266963888708L;
+
+ // Media types of the archive formats this parser can unpack
+ private static final MediaType ZIP = MediaType.APPLICATION_ZIP;
+ private static final MediaType JAR = MediaType.application("java-archive");
+ private static final MediaType AR = MediaType.application("x-archive");
+ private static final MediaType CPIO = MediaType.application("x-cpio");
+ private static final MediaType DUMP = MediaType.application("x-tika-unix-dump");
+ private static final MediaType TAR = MediaType.application("x-tar");
+ private static final MediaType SEVENZ = MediaType.application("x-7z-compressed");
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ MediaType.set(ZIP, JAR, AR, CPIO, DUMP, TAR, SEVENZ);
+
+ /**
+ * Maps a Commons Compress archive stream implementation to the media
+ * type it represents, or {@code application/octet-stream} when the
+ * implementation is not recognised. JAR must be tested before ZIP
+ * because JarArchiveInputStream extends ZipArchiveInputStream.
+ */
+ static MediaType getMediaType(ArchiveInputStream stream) {
+ if (stream instanceof JarArchiveInputStream) {
+ return JAR;
+ } else if (stream instanceof ZipArchiveInputStream) {
+ return ZIP;
+ } else if (stream instanceof ArArchiveInputStream) {
+ return AR;
+ } else if (stream instanceof CpioArchiveInputStream) {
+ return CPIO;
+ } else if (stream instanceof DumpArchiveInputStream) {
+ return DUMP;
+ } else if (stream instanceof TarArchiveInputStream) {
+ return TAR;
+ } else if (stream instanceof SevenZWrapper) {
+ return SEVENZ;
+ } else {
+ return MediaType.OCTET_STREAM;
+ }
+ }
+
+ /** @return whether the given type is ZIP-based (plain ZIP or JAR) */
+ static boolean isZipArchive(MediaType type) {
+ return type.equals(ZIP) || type.equals(JAR);
+ }
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ /**
+ * Unpacks the archive and passes every non-directory entry to the
+ * {@link EmbeddedDocumentExtractor} obtained from the context (a
+ * {@link ParsingEmbeddedDocumentExtractor} by default).
+ *
+ * @throws EncryptedDocumentException if the archive needs a password
+ *         that was not supplied or was wrong
+ */
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+
+ // Ensure that the stream supports the mark feature
+ if (! TikaInputStream.isTikaInputStream(stream))
+ stream = new BufferedInputStream(stream);
+
+
+ TemporaryResources tmp = new TemporaryResources();
+ ArchiveInputStream ais = null;
+ try {
+ ArchiveStreamFactory factory = context.get(ArchiveStreamFactory.class, new ArchiveStreamFactory());
+ // At the end we want to close the archive stream to release
+ // any associated resources, but the underlying document stream
+ // should not be closed
+ ais = factory.createArchiveInputStream(new CloseShieldInputStream(stream));
+
+ } catch (StreamingNotSupportedException sne) {
+ // Most archive formats work on streams, but a few need files
+ if (sne.getFormat().equals(ArchiveStreamFactory.SEVEN_Z)) {
+ // Rework as a file, and wrap
+ // NOTE(review): assumes the stream can reset back to its
+ // start after the failed streaming attempt - confirm
+ stream.reset();
+ TikaInputStream tstream = TikaInputStream.get(stream, tmp);
+
+ // Seven Zip supports passwords, was one given?
+ String password = null;
+ PasswordProvider provider = context.get(PasswordProvider.class);
+ if (provider != null) {
+ password = provider.getPassword(metadata);
+ }
+
+ SevenZFile sevenz;
+ if (password == null) {
+ sevenz = new SevenZFile(tstream.getFile());
+ } else {
+ // "UnicodeLittleUnmarked" = UTF-16LE without a BOM
+ sevenz = new SevenZFile(tstream.getFile(), password.getBytes("UnicodeLittleUnmarked"));
+ }
+
+ // Pending a fix for COMPRESS-269 / TIKA-1525, this bit is a little nasty
+ ais = new SevenZWrapper(sevenz);
+ } else {
+ tmp.close();
+ throw new TikaException("Unknown non-streaming format " + sne.getFormat(), sne);
+ }
+ } catch (ArchiveException e) {
+ tmp.close();
+ throw new TikaException("Unable to unpack document stream", e);
+ }
+
+ MediaType type = getMediaType(ais);
+ if (!type.equals(MediaType.OCTET_STREAM)) {
+ metadata.set(CONTENT_TYPE, type.toString());
+ }
+ // Use the delegate parser to parse the contained document
+ EmbeddedDocumentExtractor extractor = context.get(
+ EmbeddedDocumentExtractor.class,
+ new ParsingEmbeddedDocumentExtractor(context));
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+ try {
+ ArchiveEntry entry = ais.getNextEntry();
+ while (entry != null) {
+ if (!entry.isDirectory()) {
+ parseEntry(ais, entry, extractor, xhtml);
+ }
+ entry = ais.getNextEntry();
+ }
+ } catch (UnsupportedZipFeatureException zfe) {
+ // If it's an encrypted document of unknown password, report as such
+ if (zfe.getFeature() == Feature.ENCRYPTION) {
+ throw new EncryptedDocumentException(zfe);
+ }
+ // NOTE(review): any other unsupported feature is silently
+ // swallowed here (entry iteration just stops); the exception
+ // is NOT rethrown - confirm this is intentional
+ } catch (PasswordRequiredException pre) {
+ // 7z signals missing/wrong passwords via this exception
+ throw new EncryptedDocumentException(pre);
+ } finally {
+ ais.close();
+ tmp.close();
+ }
+
+ xhtml.endDocument();
+ }
+
+ /**
+ * Emits metadata for a single archive entry and, when the extractor
+ * accepts it, parses the entry content as an embedded document.
+ * Entries whose data cannot be read only get their name written out.
+ */
+ private void parseEntry(
+ ArchiveInputStream archive, ArchiveEntry entry,
+ EmbeddedDocumentExtractor extractor, XHTMLContentHandler xhtml)
+ throws SAXException, IOException, TikaException {
+ String name = entry.getName();
+ if (archive.canReadEntryData(entry)) {
+ // Fetch the metadata on the entry contained in the archive
+ Metadata entrydata = handleEntryMetadata(name, null,
+ entry.getLastModifiedDate(), entry.getSize(), xhtml);
+
+ // Recurse into the entry if desired
+ if (extractor.shouldParseEmbedded(entrydata)) {
+ // For detectors to work, we need a mark/reset supporting
+ // InputStream, which ArchiveInputStream isn't, so wrap
+ TemporaryResources tmp = new TemporaryResources();
+ try {
+ TikaInputStream tis = TikaInputStream.get(archive, tmp);
+ extractor.parseEmbedded(tis, xhtml, entrydata, true);
+ } finally {
+ tmp.dispose();
+ }
+ }
+ } else if (name != null && name.length() > 0) {
+ xhtml.element("p", name);
+ }
+ }
+
+ /**
+ * Builds a {@link Metadata} object for one archive entry (resource
+ * name, creation/modification dates, size) and writes a placeholder
+ * {@code <div class="embedded" id="..."/>} element to the XHTML
+ * stream. Backslashes in entry names are normalised to forward
+ * slashes; null dates/size and empty names are simply skipped.
+ */
+ protected static Metadata handleEntryMetadata(
+ String name, Date createAt, Date modifiedAt,
+ Long size, XHTMLContentHandler xhtml)
+ throws SAXException, IOException, TikaException {
+ Metadata entrydata = new Metadata();
+ if (createAt != null) {
+ entrydata.set(TikaCoreProperties.CREATED, createAt);
+ }
+ if (modifiedAt != null) {
+ entrydata.set(TikaCoreProperties.MODIFIED, modifiedAt);
+ }
+ if (size != null) {
+ entrydata.set(Metadata.CONTENT_LENGTH, Long.toString(size));
+ }
+ if (name != null && name.length() > 0) {
+ name = name.replace("\\", "/");
+ entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+ attributes.addAttribute("", "id", "id", "CDATA", name);
+ xhtml.startElement("div", attributes);
+ xhtml.endElement("div");
+
+ entrydata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, name);
+ }
+ return entrydata;
+ }
+
+ // Pending a fix for COMPRESS-269, we have to wrap ourselves
+ // Adapts the random-access SevenZFile API to the streaming
+ // ArchiveInputStream API used by the rest of this parser.
+ private static class SevenZWrapper extends ArchiveInputStream {
+ private SevenZFile file;
+ private SevenZWrapper(SevenZFile file) {
+ this.file = file;
+ }
+
+ @Override
+ public int read() throws IOException {
+ return file.read();
+ }
+ @Override
+ public int read(byte[] b) throws IOException {
+ return file.read(b);
+ }
+ @Override
+ public int read(byte[] b, int off, int len) throws IOException {
+ return file.read(b, off, len);
+ }
+
+ @Override
+ public ArchiveEntry getNextEntry() throws IOException {
+ return file.getNextEntry();
+ }
+
+ @Override
+ public void close() throws IOException {
+ file.close();
+ }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import com.github.junrar.Archive;
+import com.github.junrar.exception.RarException;
+import com.github.junrar.rarfile.FileHeader;
+
+/**
+ * Parser for Rar files.
+ */
+public class RarParser extends AbstractParser {
+    private static final long serialVersionUID = 6157727985054451501L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections
+            .singleton(MediaType.application("x-rar-compressed"));
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext arg0) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Spools the stream to a temporary file (junrar's Archive works on
+     * files), then walks the RAR file headers and hands every
+     * non-directory entry to the {@link EmbeddedDocumentExtractor}
+     * obtained from the context.
+     *
+     * @throws EncryptedDocumentException if the archive is encrypted
+     * @throws TikaException if junrar fails to read the archive
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        EmbeddedDocumentExtractor extractor = context.get(
+                EmbeddedDocumentExtractor.class,
+                new ParsingEmbeddedDocumentExtractor(context));
+
+        Archive rar = null;
+        try (TemporaryResources tmp = new TemporaryResources()) {
+            TikaInputStream tis = TikaInputStream.get(stream, tmp);
+            rar = new Archive(tis.getFile());
+
+            if (rar.isEncrypted()) {
+                throw new EncryptedDocumentException();
+            }
+
+            //Without this BodyContentHandler does not work
+            xhtml.element("div", " ");
+
+            FileHeader header = rar.nextFileHeader();
+            // Stop early if the thread is interrupted (e.g. a parse timeout)
+            while (header != null && !Thread.currentThread().isInterrupted()) {
+                if (!header.isDirectory()) {
+                    try (InputStream subFile = rar.getInputStream(header)) {
+                        // Prefer the Unicode name; fall back to the legacy
+                        // 8-bit name when the Unicode one is empty
+                        Metadata entrydata = PackageParser.handleEntryMetadata(
+                                "".equals(header.getFileNameW()) ? header.getFileNameString() : header.getFileNameW(),
+                                header.getCTime(), header.getMTime(),
+                                header.getFullUnpackSize(),
+                                xhtml
+                        );
+
+                        if (extractor.shouldParseEmbedded(entrydata)) {
+                            // NOTE(review): passes the raw handler rather than
+                            // the xhtml wrapper used elsewhere - confirm
+                            extractor.parseEmbedded(subFile, handler, entrydata, true);
+                        }
+                    }
+                }
+
+                header = rar.nextFileHeader();
+            }
+
+        } catch (RarException e) {
+            throw new TikaException("RarParser Exception", e);
+        } finally {
+            if (rar != null)
+                rar.close();
+
+        }
+
+        xhtml.endDocument();
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,413 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Enumeration;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Locale;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import org.apache.commons.compress.archivers.ArchiveException;
+import org.apache.commons.compress.archivers.ArchiveInputStream;
+import org.apache.commons.compress.archivers.ArchiveStreamFactory;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipFile;
+import org.apache.commons.compress.compressors.CompressorException;
+import org.apache.commons.compress.compressors.CompressorInputStream;
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
+import org.apache.commons.io.IOUtils;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackageAccess;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.iwork.IWorkPackageParser;
+import org.apache.tika.parser.iwork.IWorkPackageParser.IWORKDocumentType;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * A detector that works on Zip documents and other archive and compression
+ * formats to figure out exactly what the file is.
+ */
+public class ZipContainerDetector implements Detector {
+    // Rewrites "...macroenabledtemplate" content types into the
+    // "...macroenabled.12" form used by Tika's media type registry
+    private static final Pattern MACRO_TEMPLATE_PATTERN = Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE);
+
+    // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes
+    private static final String VISIO_DOCUMENT =
+            "http://schemas.microsoft.com/visio/2010/relationships/document";
+    // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes
+    private static final String STRICT_CORE_DOCUMENT =
+            "http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument";
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 2891763938430295453L;
+
+    /**
+     * Peeks at the first 1024 bytes to identify the container format;
+     * ZIP-based files are then opened as files to work out which ZIP
+     * flavour (OOXML, ODF, iWork, JAR, KMZ, IPA, ...) they really are.
+     */
+    public MediaType detect(InputStream input, Metadata metadata)
+            throws IOException {
+        // Check if we have access to the document
+        if (input == null) {
+            return MediaType.OCTET_STREAM;
+        }
+
+        TemporaryResources tmp = new TemporaryResources();
+        try {
+            TikaInputStream tis = TikaInputStream.get(input, tmp);
+
+            byte[] prefix = new byte[1024]; // enough for all known formats
+            int length = tis.peek(prefix);
+
+            MediaType type = detectArchiveFormat(prefix, length);
+            if (PackageParser.isZipArchive(type)
+                    && TikaInputStream.isTikaInputStream(input)) {
+                // NOTE(review): the ZIP refinement only runs when the caller
+                // supplied a TikaInputStream - confirm plain streams should
+                // fall through with the generic type
+                return detectZipFormat(tis);
+            } else if (!type.equals(MediaType.OCTET_STREAM)) {
+                return type;
+            } else {
+                return detectCompressorFormat(prefix, length);
+            }
+        } finally {
+            try {
+                tmp.dispose();
+            } catch (TikaException e) {
+                // ignore
+            }
+        }
+    }
+
+    /** Identifies pure-compression formats, or octet-stream if unknown. */
+    private static MediaType detectCompressorFormat(byte[] prefix, int length) {
+        try {
+            CompressorStreamFactory factory = new CompressorStreamFactory();
+            CompressorInputStream cis = factory.createCompressorInputStream(
+                    new ByteArrayInputStream(prefix, 0, length));
+            try {
+                return CompressorParser.getMediaType(cis);
+            } finally {
+                IOUtils.closeQuietly(cis);
+            }
+        } catch (CompressorException e) {
+            return MediaType.OCTET_STREAM;
+        }
+    }
+
+    /** Identifies archive formats (zip/jar/tar/...), or octet-stream. */
+    private static MediaType detectArchiveFormat(byte[] prefix, int length) {
+        try {
+            ArchiveStreamFactory factory = new ArchiveStreamFactory();
+            ArchiveInputStream ais = factory.createArchiveInputStream(
+                    new ByteArrayInputStream(prefix, 0, length));
+            try {
+                if ((ais instanceof TarArchiveInputStream)
+                        && !TarArchiveInputStream.matches(prefix, length)) {
+                    // ArchiveStreamFactory is too relaxed, see COMPRESS-117
+                    return MediaType.OCTET_STREAM;
+                } else {
+                    return PackageParser.getMediaType(ais);
+                }
+            } finally {
+                IOUtils.closeQuietly(ais);
+            }
+        } catch (ArchiveException e) {
+            return MediaType.OCTET_STREAM;
+        }
+    }
+
+    /**
+     * Works out which flavour of ZIP-based format this file is, trying
+     * the most specific detections in turn and falling back to plain
+     * application/zip when none of them match (or the file won't open).
+     */
+    private static MediaType detectZipFormat(TikaInputStream tis) {
+        try {
+            ZipFile zip = new ZipFile(tis.getFile()); // TODO: hasFile()?
+            try {
+                MediaType type = detectOpenDocument(zip);
+                if (type == null) {
+                    type = detectOPCBased(zip, tis);
+                }
+                if (type == null) {
+                    type = detectIWork(zip);
+                }
+                if (type == null) {
+                    type = detectJar(zip);
+                }
+                if (type == null) {
+                    type = detectKmz(zip);
+                }
+                if (type == null) {
+                    type = detectIpa(zip);
+                }
+                if (type != null) {
+                    return type;
+                }
+            } finally {
+                // TODO: shouldn't we record the open
+                // container so it can be later
+                // reused...?
+                // tis.setOpenContainer(zip);
+                try {
+                    zip.close();
+                } catch (IOException e) {
+                    // ignore
+                }
+            }
+        } catch (IOException e) {
+            // ignore
+        }
+        // Fallback: it's still a zip file, we just don't know what kind of one
+        return MediaType.APPLICATION_ZIP;
+    }
+
+    /**
+     * OpenDocument files, along with EPub files and ASiC ones, have a
+     * mimetype entry in the root of their Zip file. This entry contains
+     * the mimetype of the overall file, stored as a single string.
+     */
+    private static MediaType detectOpenDocument(ZipFile zip) {
+        try {
+            ZipArchiveEntry mimetype = zip.getEntry("mimetype");
+            if (mimetype != null) {
+                try (InputStream stream = zip.getInputStream(mimetype)) {
+                    return MediaType.parse(IOUtils.toString(stream, UTF_8));
+                }
+            } else {
+                return null;
+            }
+        } catch (IOException e) {
+            return null;
+        }
+    }
+
+    /**
+     * Detects OPC (Open Packaging Conventions) based formats - OOXML,
+     * XPS and AutoCAD DWFx - by opening the file with POI. The opened
+     * OPCPackage is stashed on the stream for later reuse by parsers.
+     * Returns null (never throws) when the file is not OPC-based.
+     */
+    private static MediaType detectOPCBased(ZipFile zip, TikaInputStream stream) {
+        try {
+            if (zip.getEntry("_rels/.rels") != null
+                    || zip.getEntry("[Content_Types].xml") != null) {
+                // Use POI to open and investigate it for us
+                OPCPackage pkg = OPCPackage.open(stream.getFile().getPath(), PackageAccess.READ);
+                stream.setOpenContainer(pkg);
+
+                // Is it an OOXML format?
+                MediaType type = detectOfficeOpenXML(pkg);
+                if (type != null) return type;
+
+                // Is it XPS format?
+                type = detectXPSOPC(pkg);
+                if (type != null) return type;
+
+                // Is it an AutoCAD format?
+                type = detectAutoCADOPC(pkg);
+                if (type != null) return type;
+
+                // We don't know what it is, sorry
+                return null;
+            } else {
+                return null;
+            }
+        } catch (IOException e) {
+            return null;
+        } catch (RuntimeException e) {
+            return null;
+        } catch (InvalidFormatException e) {
+            return null;
+        }
+    }
+    /**
+     * Detects the type of an OfficeOpenXML (OOXML) file from
+     * opened Package
+     */
+    public static MediaType detectOfficeOpenXML(OPCPackage pkg) {
+        // Check for the normal Office core document
+        PackageRelationshipCollection core =
+                pkg.getRelationshipsByType(PackageRelationshipTypes.CORE_DOCUMENT);
+        // Otherwise check for some other Office core document types
+        if (core.size() == 0) {
+            core = pkg.getRelationshipsByType(STRICT_CORE_DOCUMENT);
+        }
+        if (core.size() == 0) {
+            core = pkg.getRelationshipsByType(VISIO_DOCUMENT);
+        }
+
+        // If we didn't find a single core document of any type, skip detection
+        if (core.size() != 1) {
+            // Invalid OOXML Package received
+            return null;
+        }
+
+        // Get the type of the core document part
+        PackagePart corePart = pkg.getPart(core.getRelationship(0));
+        String coreType = corePart.getContentType();
+
+        // Turn that into the type of the overall document
+        String docType = coreType.substring(0, coreType.lastIndexOf('.'));
+
+        // The Macro Enabled formats are a little special
+        if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabled")) {
+            docType = docType.toLowerCase(Locale.ROOT) + ".12";
+        }
+
+        if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabledtemplate")) {
+            docType = MACRO_TEMPLATE_PATTERN.matcher(docType).replaceAll("macroenabled.12");
+        }
+
+        // Build the MediaType object and return
+        return MediaType.parse(docType);
+    }
+    /**
+     * Detects Open XML Paper Specification (XPS)
+     */
+    private static MediaType detectXPSOPC(OPCPackage pkg) {
+        PackageRelationshipCollection xps =
+                pkg.getRelationshipsByType("http://schemas.microsoft.com/xps/2005/06/fixedrepresentation");
+        if (xps.size() == 1) {
+            return MediaType.application("vnd.ms-xpsdocument");
+        } else {
+            // Non-XPS Package received
+            return null;
+        }
+    }
+    /**
+     * Detects AutoCAD formats that live in OPC packaging
+     */
+    private static MediaType detectAutoCADOPC(OPCPackage pkg) {
+        PackageRelationshipCollection dwfxSeq =
+                pkg.getRelationshipsByType("http://schemas.autodesk.com/dwfx/2007/relationships/documentsequence");
+        if (dwfxSeq.size() == 1) {
+            return MediaType.parse("model/vnd.dwfx+xps");
+        } else {
+            // Non-AutoCAD Package received
+            return null;
+        }
+    }
+
+    /**
+     * Detects Apple iWork bundles (Keynote, Pages, Numbers) from their
+     * well-known index entries. Returns null for non-iWork files.
+     */
+    private static MediaType detectIWork(ZipFile zip) {
+        if (zip.getEntry(IWorkPackageParser.IWORK_COMMON_ENTRY) != null) {
+            // Locate the appropriate index file entry and read the root
+            // element of the document from it. That is used to identify
+            // the correct type of the iWork container.
+            for (String entryName : IWorkPackageParser.IWORK_CONTENT_ENTRIES) {
+                IWORKDocumentType type = IWORKDocumentType.detectType(zip.getEntry(entryName), zip);
+                if (type != null) {
+                    return type.getType();
+                }
+            }
+
+            // Not sure, fallback to the container type
+            return MediaType.application("vnd.apple.iwork");
+        } else {
+            return null;
+        }
+    }
+
+    /**
+     * Detects JAR files and the JAR-based formats built on top of them
+     * (Android APK, WAR, EAR). Returns null for plain non-JAR zips.
+     */
+    private static MediaType detectJar(ZipFile zip) {
+        if (zip.getEntry("META-INF/MANIFEST.MF") != null) {
+            // It's a Jar file, or something based on Jar
+
+            // Is it an Android APK?
+            if (zip.getEntry("AndroidManifest.xml") != null) {
+                return MediaType.application("vnd.android.package-archive");
+            }
+
+            // Check for WAR and EAR
+            if (zip.getEntry("WEB-INF/") != null) {
+                return MediaType.application("x-tika-java-web-archive");
+            }
+            if (zip.getEntry("META-INF/application.xml") != null) {
+                return MediaType.application("x-tika-java-enterprise-archive");
+            }
+
+            // Looks like a regular Jar Archive
+            return MediaType.application("java-archive");
+        } else {
+            // Some Android APKs miss the default Manifest
+            if (zip.getEntry("AndroidManifest.xml") != null) {
+                return MediaType.application("vnd.android.package-archive");
+            }
+
+            return null;
+        }
+    }
+
+    /**
+     * Detects Google Earth KMZ files: a zip whose root level holds
+     * exactly one .kml file and no other root-level files. Any other
+     * root-level, non-directory entry disqualifies the file.
+     */
+    private static MediaType detectKmz(ZipFile zip) {
+        boolean kmlFound = false;
+
+        Enumeration<ZipArchiveEntry> entries = zip.getEntries();
+        while (entries.hasMoreElements()) {
+            ZipArchiveEntry entry = entries.nextElement();
+            String name = entry.getName();
+            if (!entry.isDirectory()
+                    && name.indexOf('/') == -1 && name.indexOf('\\') == -1) {
+                if (name.endsWith(".kml") && !kmlFound) {
+                    kmlFound = true;
+                } else {
+                    return null;
+                }
+            }
+        }
+
+        if (kmlFound) {
+            return MediaType.application("vnd.google-earth.kmz");
+        } else {
+            return null;
+        }
+    }
+
+    /**
+     * To be considered as an IPA file, it needs to match all of these
+     */
+    private static HashSet<Pattern> ipaEntryPatterns = new HashSet<Pattern>() {
+        private static final long serialVersionUID = 6545295886322115362L;
+        {
+            add(Pattern.compile("^Payload/$"));
+            add(Pattern.compile("^Payload/.*\\.app/$"));
+            add(Pattern.compile("^Payload/.*\\.app/_CodeSignature/$"));
+            add(Pattern.compile("^Payload/.*\\.app/_CodeSignature/CodeResources$"));
+            add(Pattern.compile("^Payload/.*\\.app/Info\\.plist$"));
+            add(Pattern.compile("^Payload/.*\\.app/PkgInfo$"));
+        }};
+    /**
+     * Detects iTunes IPA bundles: every pattern in ipaEntryPatterns
+     * must be matched by at least one entry in the zip.
+     */
+    @SuppressWarnings("unchecked")
+    private static MediaType detectIpa(ZipFile zip) {
+        // Note - consider generalising this logic, if another format needs many regexp matching
+        // Work on a copy so the shared pattern set is never mutated
+        Set<Pattern> tmpPatterns = (Set<Pattern>)ipaEntryPatterns.clone();
+
+        Enumeration<ZipArchiveEntry> entries = zip.getEntries();
+        while (entries.hasMoreElements()) {
+            ZipArchiveEntry entry = entries.nextElement();
+            String name = entry.getName();
+
+            Iterator<Pattern> ip = tmpPatterns.iterator();
+            while (ip.hasNext()) {
+                if (ip.next().matcher(name).matches()) {
+                    ip.remove();
+                }
+            }
+            if (tmpPatterns.isEmpty()) {
+                // We've found everything we need to find
+                return MediaType.application("x-itunes-ipa");
+            }
+        }
+
+        // If we get here, not all required entries were found
+        return null;
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector Sat Jan 16 18:23:01 2016
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+org.apache.tika.parser.pkg.ZipContainerDetector
Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Sat Jan 16 18:23:01 2016
@@ -0,0 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+org.apache.tika.parser.pkg.CompressorParser
+org.apache.tika.parser.pkg.PackageParser
+org.apache.tika.parser.pkg.RarParser
+org.apache.tika.parser.iwork.IWorkPackageParser
+
Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.Test;
+
+/**
+ * Unit tests for the <code>AutoPageNumberUtils</code> helper class, covering
+ * the alphabetic and Roman-numeral page-label formats in both cases.
+ */
+public class AutoPageNumberUtilsTest {
+
+    /**
+     * Verifies upper-case alphabetic page labels at the boundaries of the
+     * one-, two- and three-letter ranges.
+     */
+    @Test
+    public void testAlphaUpper() {
+        assertEquals("A", AutoPageNumberUtils.asAlphaNumeric(1));
+        assertEquals("Z", AutoPageNumberUtils.asAlphaNumeric(26));
+        assertEquals("AA", AutoPageNumberUtils.asAlphaNumeric(27));
+        assertEquals("ZZ", AutoPageNumberUtils.asAlphaNumeric(52));
+        assertEquals("AAA", AutoPageNumberUtils.asAlphaNumeric(53));
+        assertEquals("ZZZ", AutoPageNumberUtils.asAlphaNumeric(78));
+    }
+
+    /**
+     * Verifies lower-case alphabetic page labels at the boundaries of the
+     * one-, two- and three-letter ranges.
+     */
+    @Test
+    public void testAlphaLower() {
+        assertEquals("a", AutoPageNumberUtils.asAlphaNumericLower(1));
+        assertEquals("z", AutoPageNumberUtils.asAlphaNumericLower(26));
+        assertEquals("aa", AutoPageNumberUtils.asAlphaNumericLower(27));
+        assertEquals("zz", AutoPageNumberUtils.asAlphaNumericLower(52));
+        assertEquals("aaa", AutoPageNumberUtils.asAlphaNumericLower(53));
+        assertEquals("zzz", AutoPageNumberUtils.asAlphaNumericLower(78));
+    }
+
+    /**
+     * Verifies upper-case Roman-numeral page labels for representative
+     * page numbers.
+     */
+    @Test
+    public void testRomanUpper() {
+        assertEquals("I", AutoPageNumberUtils.asRomanNumerals(1));
+        assertEquals("XXVI", AutoPageNumberUtils.asRomanNumerals(26));
+        assertEquals("XXVII", AutoPageNumberUtils.asRomanNumerals(27));
+    }
+
+    /**
+     * Verifies lower-case Roman-numeral page labels for representative
+     * page numbers.
+     */
+    @Test
+    public void testRomanLower() {
+        assertEquals("i", AutoPageNumberUtils.asRomanNumeralsLower(1));
+        assertEquals("xxvi", AutoPageNumberUtils.asRomanNumeralsLower(26));
+        assertEquals("xxvii", AutoPageNumberUtils.asRomanNumeralsLower(27));
+    }
+
+}