You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/16 19:23:06 UTC
svn commit: r1725014 [18/28] - in /tika/branches/2.x:
tika-parser-bundles/tika-multimedia-bundle/ tika-parser-modules/
tika-parser-modules/tika-advanced-module/
tika-parser-modules/tika-advanced-parser-module/
tika-parser-modules/tika-advanced-parser-m...
Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,231 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.util.HashMap;
+import java.util.Map;
+
+class NumbersContentHandler extends DefaultHandler {
+
+ private final XHTMLContentHandler xhtml;
+ private final Metadata metadata;
+
+ private boolean inSheet = false;
+
+ private boolean inText = false;
+ private boolean parseText = false;
+
+ private boolean inMetadata = false;
+ private Property metadataKey;
+ private String metadataPropertyQName;
+
+ private boolean inTable = false;
+ private int numberOfSheets = 0;
+ private int numberOfColumns = -1;
+ private int currentColumn = 0;
+
+ private Map<String, String> menuItems = new HashMap<String, String>();
+ private String currentMenuItemId;
+
+ NumbersContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
+ this.xhtml = xhtml;
+ this.metadata = metadata;
+ }
+
+ @Override
+ public void endDocument() throws SAXException {
+ metadata.set(Metadata.PAGE_COUNT, String.valueOf(numberOfSheets));
+ }
+
+ @Override
+ public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
+ if ("ls:workspace".equals(qName)) {
+ inSheet = true;
+ numberOfSheets++;
+ xhtml.startElement("div");
+ String sheetName = attributes.getValue("ls:workspace-name");
+ metadata.add("sheetNames", sheetName);
+ }
+
+ if ("sf:text".equals(qName)) {
+ inText = true;
+ xhtml.startElement("p");
+ }
+
+ if ("sf:p".equals(qName)) {
+ parseText = true;
+ }
+
+ if ("sf:metadata".equals(qName)) {
+ inMetadata = true;
+ return;
+ }
+
+ if (inMetadata && metadataKey == null) {
+ metadataKey = resolveMetadataKey(localName);
+ metadataPropertyQName = qName;
+ }
+
+ if (inMetadata && metadataKey != null && "sf:string".equals(qName)) {
+ metadata.add(metadataKey, attributes.getValue("sfa:string"));
+ }
+
+ if (!inSheet) {
+ return;
+ }
+
+ if ("sf:tabular-model".equals(qName)) {
+ String tableName = attributes.getValue("sf:name");
+ xhtml.startElement("div");
+ xhtml.characters(tableName);
+ xhtml.endElement("div");
+ inTable = true;
+ xhtml.startElement("table");
+ xhtml.startElement("tr");
+ currentColumn = 0;
+ }
+
+ if ("sf:menu-choices".equals(qName)) {
+ menuItems = new HashMap<String, String>();
+ }
+
+ if (inTable && "sf:grid".equals(qName)) {
+ numberOfColumns = Integer.parseInt(attributes.getValue("sf:numcols"));
+ }
+
+ if (menuItems != null && "sf:t".equals(qName)) {
+ currentMenuItemId = attributes.getValue("sfa:ID");
+ }
+
+ if (currentMenuItemId != null && "sf:ct".equals(qName)) {
+ menuItems.put(currentMenuItemId, attributes.getValue("sfa:s"));
+ }
+
+ if (inTable && "sf:ct".equals(qName)) {
+ if (currentColumn >= numberOfColumns) {
+ currentColumn = 0;
+ xhtml.endElement("tr");
+ xhtml.startElement("tr");
+ }
+
+ xhtml.element("td", attributes.getValue("sfa:s"));
+ currentColumn++;
+ }
+
+ if (inTable && ("sf:n".equals(qName) || "sf:rn".equals(qName))) {
+ if (currentColumn >= numberOfColumns) {
+ currentColumn = 0;
+ xhtml.endElement("tr");
+ xhtml.startElement("tr");
+ }
+
+ xhtml.element("td", attributes.getValue("sf:v"));
+ currentColumn++;
+ }
+
+ if (inTable && "sf:proxied-cell-ref".equals(qName)) {
+ if (currentColumn >= numberOfColumns) {
+ currentColumn = 0;
+ xhtml.endElement("tr");
+ xhtml.startElement("tr");
+ }
+
+ xhtml.element("td", menuItems.get(attributes.getValue("sfa:IDREF")));
+ currentColumn++;
+ }
+
+ if ("sf:chart-name".equals(qName)) {
+ // Extract chart name:
+ xhtml.startElement("div", "class", "chart");
+ xhtml.startElement("h1");
+ xhtml.characters(attributes.getValue("sfa:string"));
+ xhtml.endElement("h1");
+ xhtml.endElement("div");
+ }
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length) throws SAXException {
+ if (parseText && length > 0) {
+ xhtml.characters(ch, start, length);
+ }
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String qName) throws SAXException {
+ if ("ls:workspace".equals(qName)) {
+ inSheet = false;
+ xhtml.endElement("div");
+ }
+
+ if ("sf:text".equals(qName)) {
+ inText = false;
+ xhtml.endElement("p");
+ }
+
+ if ("sf:p".equals(qName)) {
+ parseText = false;
+ }
+
+ if ("sf:metadata".equals(qName)) {
+ inMetadata = false;
+ }
+
+ if (inMetadata && qName.equals(metadataPropertyQName)) {
+ metadataPropertyQName = null;
+ metadataKey = null;
+ }
+
+ if (!inSheet) {
+ return;
+ }
+
+ if ("sf:menu-choices".equals(qName)) {
+ }
+
+ if ("sf:tabular-model".equals(qName)) {
+ inTable = false;
+ xhtml.endElement("tr");
+ xhtml.endElement("table");
+ }
+
+ if (currentMenuItemId != null && "sf:t".equals(qName)) {
+ currentMenuItemId = null;
+ }
+ }
+
+ private Property resolveMetadataKey(String localName) {
+ if ("authors".equals(localName)) {
+ return TikaCoreProperties.CREATOR;
+ }
+ if ("title".equals(localName)) {
+ return TikaCoreProperties.TITLE;
+ }
+ if ("comment".equals(localName)) {
+ return TikaCoreProperties.COMMENTS;
+ }
+ return Property.internalText(localName);
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,448 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+class PagesContentHandler extends DefaultHandler {
+
+ private final XHTMLContentHandler xhtml;
+ private final Metadata metadata;
+
+ /** The (interesting) part of the document we're in. Should be more structured... */
+ private enum DocumentPart {
+ METADATA, PARSABLE_TEXT,
+ HEADERS, HEADER_ODD, HEADER_EVEN, HEADER_FIRST,
+ FOOTERS, FOOTER_ODD, FOOTER_EVEN, FOOTER_FIRST,
+ FOOTNOTES, ANNOTATIONS;
+ }
+ private DocumentPart inPart = null;
+ private boolean ghostText;
+
+ private static String alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+
+ private boolean parseProperty = false;
+ private int pageCount = 0;
+ private int slPageCount = 0;
+
+ private HeaderFooter headers = null;
+ private HeaderFooter footers = null;
+ private Footnotes footnotes = null;
+ private Annotations annotations = null;
+
+ private Map<String, List<List<String>>> tableData =
+ new HashMap<String, List<List<String>>>();
+ private String activeTableId;
+ private int numberOfColumns = 0;
+ private List<String> activeRow = new ArrayList<String>();
+
+ private String metaDataLocalName;
+ private String metaDataQName;
+
+ PagesContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
+ this.xhtml = xhtml;
+ this.metadata = metadata;
+ }
+
+ @Override
+ public void endDocument() throws SAXException {
+ metadata.set(Metadata.PAGE_COUNT, String.valueOf(pageCount));
+ if (pageCount > 0) {
+ doFooter();
+ xhtml.endElement("div");
+ }
+ }
+
+ @Override
+ public void startElement(
+ String uri, String localName, String qName, Attributes attributes)
+ throws SAXException {
+ if (parseProperty) {
+ String value = parsePrimitiveElementValue(qName, attributes);
+ if (value != null) {
+ Object metaDataKey = resolveMetaDataKey(metaDataLocalName);
+ if(metaDataKey instanceof Property) {
+ metadata.set((Property)metaDataKey, value);
+ } else {
+ metadata.add((String)metaDataKey, value);
+ }
+ }
+ }
+
+ if ("sl:publication-info".equals(qName)) {
+ inPart = DocumentPart.METADATA;
+ } else if ("sf:metadata".equals(qName)) {
+ inPart = DocumentPart.METADATA;
+ } else if ("sf:page-start".equals(qName) || "sl:page-group".equals(qName)) {
+ if (pageCount > 0) {
+ doFooter();
+ xhtml.endElement("div");
+ }
+ xhtml.startElement("div");
+ if ("sl:page-group".equals(qName)) {
+ slPageCount++;
+ } else {
+ pageCount++;
+ }
+ doHeader();
+ } else if ("sf:p".equals(qName)) {
+ if (pageCount+slPageCount > 0) {
+ inPart = DocumentPart.PARSABLE_TEXT;
+ xhtml.startElement("p");
+ }
+ } else if ("sf:attachment".equals(qName)) {
+ String kind = attributes.getValue("sf:kind");
+ if ("tabular-attachment".equals(kind)) {
+ activeTableId = attributes.getValue("sfa:ID");
+ tableData.put(activeTableId, new ArrayList<List<String>>());
+ }
+ } else if ("sf:attachment-ref".equals(qName)) {
+ String idRef = attributes.getValue("sfa:IDREF");
+ outputTable(idRef);
+ } else if ("sf:headers".equals(qName)) {
+ headers = new HeaderFooter(qName);
+ inPart = DocumentPart.HEADERS;
+ } else if ("sf:footers".equals(qName)) {
+ footers = new HeaderFooter(qName);
+ inPart = DocumentPart.FOOTERS;
+ } else if ("sf:header".equals(qName)) {
+ inPart = headers.identifyPart(attributes.getValue("sf:name"));
+ } else if ("sf:footer".equals(qName)) {
+ inPart = footers.identifyPart(attributes.getValue("sf:name"));
+ } else if ("sf:page-number".equals(qName)) {
+ if (inPart == DocumentPart.FOOTER_ODD
+ || inPart == DocumentPart.FOOTER_FIRST
+ || inPart == DocumentPart.FOOTER_EVEN) {
+ // We are in a footer
+ footers.hasAutoPageNumber = true;
+ footers.autoPageNumberFormat = attributes.getValue("sf:format");
+ } else {
+ headers.hasAutoPageNumber = true;
+ headers.autoPageNumberFormat = attributes.getValue("sf:format");
+ }
+
+ xhtml.characters(Integer.toString(this.pageCount));
+ } else if ("sf:footnotes".equals(qName)) {
+ footnotes = new Footnotes();
+ inPart = DocumentPart.FOOTNOTES;
+ } else if ("sf:footnote-mark".equals(qName)) {
+ footnotes.recordMark(attributes.getValue("sf:mark"));
+ } else if ("sf:footnote".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
+ // What about non auto-numbered?
+ String footnoteMark = attributes.getValue("sf:autonumber");
+ if (footnotes != null) {
+ String footnoteText = footnotes.footnotes.get(footnoteMark);
+ if (footnoteText != null) {
+ xhtml.startElement("div", "style", "footnote");
+ xhtml.characters("Footnote:" ); // As shown in Pages
+ xhtml.characters(footnoteText);
+ xhtml.endElement("div");
+ }
+ }
+ } else if ("sf:annotations".equals(qName)) {
+ annotations = new Annotations();
+ inPart = DocumentPart.ANNOTATIONS;
+ } else if ("sf:annotation".equals(qName) && inPart == DocumentPart.ANNOTATIONS) {
+ annotations.start(attributes.getValue("sf:target"));
+ } else if ("sf:annotation-field".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
+ xhtml.startElement("div", "style", "annotated");
+
+ String annotationText = annotations.annotations.get(attributes.getValue("sfa:ID"));
+ if (annotationText != null) {
+ xhtml.startElement("div", "style", "annotation");
+ xhtml.characters(annotationText);
+ xhtml.endElement("div");
+ }
+ } else if ("sf:ghost-text".equals(qName)) {
+ ghostText = true;
+ }
+
+ if (activeTableId != null) {
+ parseTableData(qName, attributes);
+ }
+
+ if (inPart == DocumentPart.METADATA) {
+ metaDataLocalName = localName;
+ metaDataQName = qName;
+ parseProperty = true;
+ }
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String qName)
+ throws SAXException {
+ if (metaDataLocalName != null && metaDataLocalName.equals(localName)) {
+ metaDataLocalName = null;
+ parseProperty = false;
+ }
+
+ if ("sl:publication-info".equals(qName)) {
+ inPart = null;
+ } else if ("sf:metadata".equals(qName)) {
+ inPart = null;
+ } else if ("sf:p".equals(qName) && (pageCount+slPageCount) > 0) {
+ inPart = null;
+ xhtml.endElement("p");
+ } else if ("sf:attachment".equals(qName)) {
+ activeTableId = null;
+ } else if ("sf:annotation".equals(qName) && inPart == DocumentPart.ANNOTATIONS) {
+ annotations.end();
+ } else if ("sf:annotation-field".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
+ xhtml.endElement("div");
+ } else if ("sf:ghost-text".equals(qName)) {
+ ghostText = false;
+ }
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length) throws SAXException {
+ if (length > 0) {
+ if (inPart == DocumentPart.PARSABLE_TEXT) {
+ if (!ghostText) {
+ xhtml.characters(ch, start, length);
+ }
+ } else if(inPart != null) {
+ String str = new String(ch, start, length);
+ if (inPart == DocumentPart.HEADER_FIRST) headers.defaultFirst = str;
+ if (inPart == DocumentPart.HEADER_EVEN) headers.defaultEven = str;
+ if (inPart == DocumentPart.HEADER_ODD) headers.defaultOdd = str;
+ if (inPart == DocumentPart.FOOTER_FIRST) footers.defaultFirst = str;
+ if (inPart == DocumentPart.FOOTER_EVEN) footers.defaultEven = str;
+ if (inPart == DocumentPart.FOOTER_ODD) footers.defaultOdd = str;
+ if (inPart == DocumentPart.FOOTNOTES) footnotes.text(str);
+ if (inPart == DocumentPart.ANNOTATIONS) annotations.text(str);
+ }
+ }
+ }
+
+ private void parseTableData(String qName, Attributes attributes) {
+ if ("sf:grid".equals(qName)) {
+ String numberOfColumns = attributes.getValue("sf:numcols");
+ this.numberOfColumns = Integer.parseInt(numberOfColumns);
+ } else if ("sf:ct".equals(qName)) {
+ activeRow.add(attributes.getValue("sfa:s"));
+
+ if (activeRow.size() >= 3) {
+ tableData.get(activeTableId).add(activeRow);
+ activeRow = new ArrayList<String>();
+ }
+ }
+ }
+
+ private void outputTable(String idRef) throws SAXException {
+ List<List<String>> tableData = this.tableData.get(idRef);
+ if (tableData != null) {
+ xhtml.startElement("table");
+ for (List<String> row : tableData) {
+ xhtml.startElement("tr");
+ for (String cell : row) {
+ xhtml.element("td", cell);
+ }
+ xhtml.endElement("tr");
+ }
+ xhtml.endElement("table");
+ }
+ }
+
+ /**
+ * Returns a resolved key that is common in other document types or
+ * returns the specified metaDataLocalName if no common key could be found.
+ * The key could be a simple String key, or could be a {@link Property}
+ *
+ * @param metaDataLocalName The localname of the element containing metadata
+ * @return a resolved key that is common in other document types
+ */
+ private Object resolveMetaDataKey(String metaDataLocalName) {
+ Object metaDataKey = metaDataLocalName;
+ if ("sf:authors".equals(metaDataQName)) {
+ metaDataKey = TikaCoreProperties.CREATOR;
+ } else if ("sf:title".equals(metaDataQName)) {
+ metaDataKey = TikaCoreProperties.TITLE;
+ } else if ("sl:SLCreationDateProperty".equals(metaDataQName)) {
+ metaDataKey = TikaCoreProperties.CREATED;
+ } else if ("sl:SLLastModifiedDateProperty".equals(metaDataQName)) {
+ metaDataKey = Metadata.LAST_MODIFIED;
+ } else if ("sl:language".equals(metaDataQName)) {
+ metaDataKey = TikaCoreProperties.LANGUAGE;
+ }
+ return metaDataKey;
+ }
+
+ /**
+ * Returns the value of a primitive element e.g.:
+ * <sl:number sfa:number="0" sfa:type="f"/> - the number attribute
+ * <sl:string sfa:string="en"/> = the string attribute
+ * <p>
+ * Returns <code>null</code> if the value could not be extracted from
+ * the list of attributes.
+ *
+ * @param qName The fully qualified name of the element containing
+ * the value to extract
+ * @param attributes The list of attributes of which one contains the
+ * value to be extracted
+ * @return the value of a primitive element
+ */
+ private String parsePrimitiveElementValue(
+ String qName, Attributes attributes) {
+ if ("sl:string".equals(qName) || "sf:string".equals(qName)) {
+ return attributes.getValue("sfa:string");
+ } else if ("sl:number".equals(qName)) {
+ return attributes.getValue("sfa:number");
+ } else if ("sl:date".equals(qName)) {
+ return attributes.getValue("sf:val");
+ }
+
+ return null;
+ }
+
+ private void doHeader() throws SAXException {
+ if (headers != null) {
+ headers.output("header");
+ }
+ }
+ private void doFooter() throws SAXException {
+ if (footers != null) {
+ footers.output("footer");
+ }
+ }
+
+ /**
+ * Represents the Headers or Footers in a document
+ */
+ private class HeaderFooter {
+ private String type; // sf:headers or sf:footers
+ private String defaultOdd;
+ private String defaultEven;
+ private String defaultFirst;
+ private boolean hasAutoPageNumber;
+ private String autoPageNumberFormat;
+ // TODO Can there be custom ones?
+
+ private HeaderFooter(String type) {
+ this.type = type;
+ }
+ private DocumentPart identifyPart(String name) {
+ if("SFWPDefaultOddHeaderIdentifier".equals(name))
+ return DocumentPart.HEADER_ODD;
+ if("SFWPDefaultEvenHeaderIdentifier".equals(name))
+ return DocumentPart.HEADER_EVEN;
+ if("SFWPDefaultFirstHeaderIdentifier".equals(name))
+ return DocumentPart.HEADER_FIRST;
+
+ if("SFWPDefaultOddFooterIdentifier".equals(name))
+ return DocumentPart.FOOTER_ODD;
+ if("SFWPDefaultEvenFooterIdentifier".equals(name))
+ return DocumentPart.FOOTER_EVEN;
+ if("SFWPDefaultFirstFooterIdentifier".equals(name))
+ return DocumentPart.FOOTER_FIRST;
+
+ return null;
+ }
+ private void output(String what) throws SAXException {
+ String text = null;
+ if (pageCount == 1 && defaultFirst != null) {
+ text = defaultFirst;
+ } else if (pageCount % 2 == 0 && defaultEven != null) {
+ text = defaultEven;
+ } else {
+ text = defaultOdd;
+ }
+
+ if (text != null) {
+ xhtml.startElement("div", "class", "header");
+ xhtml.characters(text);
+ if (hasAutoPageNumber) {
+ if (autoPageNumberFormat == null) { // raw number
+ xhtml.characters("\t" + pageCount);
+ } else if (autoPageNumberFormat.equals("upper-roman")){
+ xhtml.characters("\t" + AutoPageNumberUtils.asRomanNumerals(pageCount));
+ } else if (autoPageNumberFormat.equals("lower-roman")){
+ xhtml.characters("\t" + AutoPageNumberUtils.asRomanNumeralsLower(pageCount));
+ } else if (autoPageNumberFormat.equals("upper-alpha")){
+ xhtml.characters("\t" + AutoPageNumberUtils.asAlphaNumeric(pageCount));
+ } else if (autoPageNumberFormat.equals("lower-alpha")){
+ xhtml.characters("\t" + AutoPageNumberUtils.asAlphaNumericLower(pageCount));
+ }
+ }
+ xhtml.endElement("div");
+ }
+ }
+ }
+ /**
+ * Represents Footnotes in a document. The way these work
+ * in the file format isn't very clean...
+ */
+ private static class Footnotes {
+ /** Mark -> Text */
+ Map<String,String> footnotes = new HashMap<String, String>();
+ String lastSeenMark = null;
+
+ /**
+ * Normally happens before the text of the mark
+ */
+ private void recordMark(String mark) {
+ lastSeenMark = mark;
+ }
+ private void text(String text) {
+ if (lastSeenMark != null) {
+ if (footnotes.containsKey(lastSeenMark)) {
+ text = footnotes.get(lastSeenMark) + text;
+ }
+ footnotes.put(lastSeenMark, text);
+ }
+ }
+ }
+ /**
+ * Represents Annotations in a document. We currently
+ * just grab all the sf:p text in each one
+ */
+ private class Annotations {
+ /** ID -> Text */
+ Map<String,String> annotations = new HashMap<String, String>();
+ String currentID = null;
+ StringBuffer currentText = null;
+
+ private void start(String id) {
+ currentID = id;
+ currentText = new StringBuffer();
+ }
+ private void text(String text) {
+ if (text != null && text.length() > 0 && currentText != null) {
+ currentText.append(text);
+ }
+ }
+ private void end() {
+ if (currentText.length() > 0) {
+ annotations.put(currentID, currentText.toString());
+ currentID = null;
+ currentText = null;
+ }
+ }
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,173 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Set;
+
+import org.apache.commons.compress.compressors.CompressorException;
+import org.apache.commons.compress.compressors.CompressorInputStream;
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
+import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
+import org.apache.commons.compress.compressors.deflate.DeflateCompressorInputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.apache.commons.compress.compressors.gzip.GzipUtils;
+import org.apache.commons.compress.compressors.pack200.Pack200CompressorInputStream;
+import org.apache.commons.compress.compressors.snappy.FramedSnappyCompressorInputStream;
+import org.apache.commons.compress.compressors.snappy.SnappyCompressorInputStream;
+import org.apache.commons.compress.compressors.xz.XZCompressorInputStream;
+import org.apache.commons.compress.compressors.z.ZCompressorInputStream;
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
/**
 * Parser for various compression formats (bzip2, gzip, xz, pack200, Unix
 * compress, zlib/deflate). The compressed stream is unwrapped with Commons
 * Compress auto-detection and the decompressed content is handed to the
 * embedded-document extractor for further parsing.
 */
public class CompressorParser extends AbstractParser {

    /** Serial version UID */
    private static final long serialVersionUID = 2793565792967222459L;

    private static final MediaType BZIP = MediaType.application("x-bzip");
    private static final MediaType BZIP2 = MediaType.application("x-bzip2");
    private static final MediaType GZIP = MediaType.application("gzip");
    private static final MediaType GZIP_ALT = MediaType.application("x-gzip");
    private static final MediaType COMPRESS = MediaType.application("x-compress");
    private static final MediaType XZ = MediaType.application("x-xz");
    private static final MediaType PACK = MediaType.application("x-java-pack200");
    private static final MediaType SNAPPY = MediaType.application("x-snappy-framed");
    private static final MediaType ZLIB = MediaType.application("zlib");

    // NOTE(review): SNAPPY is recognised by getMediaType() below but is not
    // listed here, so snappy streams are never routed to this parser by
    // type - confirm whether that omission is deliberate.
    private static final Set<MediaType> SUPPORTED_TYPES =
            MediaType.set(BZIP, BZIP2, GZIP, GZIP_ALT, COMPRESS, XZ, PACK, ZLIB);

    /**
     * Maps a concrete Commons Compress stream class onto the media type it
     * decompresses, falling back to application/octet-stream for anything
     * unrecognised.
     *
     * @param stream the already-opened compressor stream
     * @return the matching media type, never null
     */
    static MediaType getMediaType(CompressorInputStream stream) {
        // TODO Add support for the remaining CompressorInputStream formats:
        //   LZMACompressorInputStream
        //   LZWInputStream -> UnshrinkingInputStream
        if (stream instanceof BZip2CompressorInputStream) {
            return BZIP2;
        } else if (stream instanceof GzipCompressorInputStream) {
            return GZIP;
        } else if (stream instanceof XZCompressorInputStream) {
            return XZ;
        } else if (stream instanceof DeflateCompressorInputStream) {
            return ZLIB;
        } else if (stream instanceof ZCompressorInputStream) {
            return COMPRESS;
        } else if (stream instanceof Pack200CompressorInputStream) {
            return PACK;
        } else if (stream instanceof FramedSnappyCompressorInputStream ||
                   stream instanceof SnappyCompressorInputStream) {
            // TODO Add unit tests for this format
            return SNAPPY;
        } else {
            return MediaType.OCTET_STREAM;
        }
    }

    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Decompresses the stream and parses the single contained document.
     * The compressor stream is closed in a finally block; the caller's
     * underlying stream is shielded from that close.
     *
     * @throws TikaException if the stream cannot be decompressed
     */
    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        // At the end we want to close the compression stream to release
        // any associated resources, but the underlying document stream
        // should not be closed
        stream = new CloseShieldInputStream(stream);

        // Ensure that the stream supports the mark feature
        // (wrapped after the close shield, so the buffer is discarded with it)
        stream = new BufferedInputStream(stream);

        CompressorInputStream cis;
        try {
            // Default behaviour: stop at the first concatenated stream,
            // unless the caller supplied options saying otherwise.
            CompressorParserOptions options =
                 context.get(CompressorParserOptions.class, new CompressorParserOptions() {
                     public boolean decompressConcatenated(Metadata metadata) {
                         return false;
                     }
                 });
            CompressorStreamFactory factory =
                    new CompressorStreamFactory(options.decompressConcatenated(metadata));
            cis = factory.createCompressorInputStream(stream);
        } catch (CompressorException e) {
            throw new TikaException("Unable to uncompress document stream", e);
        }

        MediaType type = getMediaType(cis);
        if (!type.equals(MediaType.OCTET_STREAM)) {
            metadata.set(CONTENT_TYPE, type.toString());
        }

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        try {
            Metadata entrydata = new Metadata();
            // Derive the embedded entry's name by stripping (or, for .tbz
            // and .tbz2, rewriting to .tar) the compression suffix.
            String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
            if (name != null) {
                if (name.endsWith(".tbz")) {
                    name = name.substring(0, name.length() - 4) + ".tar";
                } else if (name.endsWith(".tbz2")) {
                    name = name.substring(0, name.length() - 5) + ".tar";
                } else if (name.endsWith(".bz")) {
                    name = name.substring(0, name.length() - 3);
                } else if (name.endsWith(".bz2")) {
                    name = name.substring(0, name.length() - 4);
                } else if (name.endsWith(".xz")) {
                    name = name.substring(0, name.length() - 3);
                } else if (name.endsWith(".zlib")) {
                    name = name.substring(0, name.length() - 5);
                } else if (name.endsWith(".pack")) {
                    name = name.substring(0, name.length() - 5);
                } else if (name.length() > 0) {
                    // Handles .gz/.tgz and similar gzip suffixes
                    name = GzipUtils.getUncompressedFilename(name);
                }
                entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
            }

            // Use the delegate parser to parse the compressed document
            EmbeddedDocumentExtractor extractor = context.get(
                    EmbeddedDocumentExtractor.class,
                    new ParsingEmbeddedDocumentExtractor(context));
            if (extractor.shouldParseEmbedded(entrydata)) {
                extractor.parseEmbedded(cis, xhtml, entrydata, true);
            }
        } finally {
            cis.close();
        }

        xhtml.endDocument();
    }

}
Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/CompressorParserOptions.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/CompressorParserOptions.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/CompressorParserOptions.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/CompressorParserOptions.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import org.apache.tika.metadata.Metadata;
+
/**
 * Interface for setting options for the {@link CompressorParser} by passing
 * an implementation via the {@link org.apache.tika.parser.ParseContext}.
 */
public interface CompressorParserOptions {

    /**
     * @param metadata document metadata
     * @return whether to decompress concatenated streams or not
     */
    boolean decompressConcatenated(Metadata metadata);
}
Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,287 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Date;
+import java.util.Set;
+
+import org.apache.commons.compress.PasswordRequiredException;
+import org.apache.commons.compress.archivers.ArchiveEntry;
+import org.apache.commons.compress.archivers.ArchiveException;
+import org.apache.commons.compress.archivers.ArchiveInputStream;
+import org.apache.commons.compress.archivers.ArchiveStreamFactory;
+import org.apache.commons.compress.archivers.StreamingNotSupportedException;
+import org.apache.commons.compress.archivers.ar.ArArchiveInputStream;
+import org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream;
+import org.apache.commons.compress.archivers.dump.DumpArchiveInputStream;
+import org.apache.commons.compress.archivers.jar.JarArchiveInputStream;
+import org.apache.commons.compress.archivers.sevenz.SevenZFile;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
+import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException.Feature;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Parser for various packaging formats. Package entries will be written to
+ * the XHTML event stream as <div class="package-entry"> elements that
+ * contain the (optional) entry name as a <h1> element and the full
+ * structured body content of the parsed entry.
+ * <p>
+ * User must have JCE Unlimited Strength jars installed for encryption to
+ * work with 7Z files (see: COMPRESS-299 and TIKA-1521). If the jars
+ * are not installed, an IOException will be thrown, and potentially
+ * wrapped in a TikaException.
+ */
+public class PackageParser extends AbstractParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = -5331043266963888708L;
+
+ // Media types of the archive formats this parser can unpack
+ private static final MediaType ZIP = MediaType.APPLICATION_ZIP;
+ private static final MediaType JAR = MediaType.application("java-archive");
+ private static final MediaType AR = MediaType.application("x-archive");
+ private static final MediaType CPIO = MediaType.application("x-cpio");
+ private static final MediaType DUMP = MediaType.application("x-tika-unix-dump");
+ private static final MediaType TAR = MediaType.application("x-tar");
+ private static final MediaType SEVENZ = MediaType.application("x-7z-compressed");
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ MediaType.set(ZIP, JAR, AR, CPIO, DUMP, TAR, SEVENZ);
+
+ /**
+ * Maps a Commons Compress archive stream implementation to the media
+ * type it represents, or {@code application/octet-stream} when the
+ * implementation is not recognised. JAR must be tested before ZIP
+ * because JarArchiveInputStream extends ZipArchiveInputStream.
+ */
+ static MediaType getMediaType(ArchiveInputStream stream) {
+ if (stream instanceof JarArchiveInputStream) {
+ return JAR;
+ } else if (stream instanceof ZipArchiveInputStream) {
+ return ZIP;
+ } else if (stream instanceof ArArchiveInputStream) {
+ return AR;
+ } else if (stream instanceof CpioArchiveInputStream) {
+ return CPIO;
+ } else if (stream instanceof DumpArchiveInputStream) {
+ return DUMP;
+ } else if (stream instanceof TarArchiveInputStream) {
+ return TAR;
+ } else if (stream instanceof SevenZWrapper) {
+ return SEVENZ;
+ } else {
+ return MediaType.OCTET_STREAM;
+ }
+ }
+
+ /** @return whether the given type is ZIP-based (plain ZIP or JAR) */
+ static boolean isZipArchive(MediaType type) {
+ return type.equals(ZIP) || type.equals(JAR);
+ }
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ /**
+ * Unpacks the archive and passes every non-directory entry to the
+ * {@link EmbeddedDocumentExtractor} obtained from the context (a
+ * {@link ParsingEmbeddedDocumentExtractor} by default).
+ *
+ * @throws EncryptedDocumentException if the archive needs a password
+ *         that was not supplied or was wrong
+ */
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+
+ // Ensure that the stream supports the mark feature
+ if (! TikaInputStream.isTikaInputStream(stream))
+ stream = new BufferedInputStream(stream);
+
+
+ TemporaryResources tmp = new TemporaryResources();
+ ArchiveInputStream ais = null;
+ try {
+ ArchiveStreamFactory factory = context.get(ArchiveStreamFactory.class, new ArchiveStreamFactory());
+ // At the end we want to close the archive stream to release
+ // any associated resources, but the underlying document stream
+ // should not be closed
+ ais = factory.createArchiveInputStream(new CloseShieldInputStream(stream));
+
+ } catch (StreamingNotSupportedException sne) {
+ // Most archive formats work on streams, but a few need files
+ if (sne.getFormat().equals(ArchiveStreamFactory.SEVEN_Z)) {
+ // Rework as a file, and wrap
+ // NOTE(review): assumes the stream can reset back to its
+ // start after the failed streaming attempt - confirm
+ stream.reset();
+ TikaInputStream tstream = TikaInputStream.get(stream, tmp);
+
+ // Seven Zip supports passwords, was one given?
+ String password = null;
+ PasswordProvider provider = context.get(PasswordProvider.class);
+ if (provider != null) {
+ password = provider.getPassword(metadata);
+ }
+
+ SevenZFile sevenz;
+ if (password == null) {
+ sevenz = new SevenZFile(tstream.getFile());
+ } else {
+ // "UnicodeLittleUnmarked" = UTF-16LE without a BOM
+ sevenz = new SevenZFile(tstream.getFile(), password.getBytes("UnicodeLittleUnmarked"));
+ }
+
+ // Pending a fix for COMPRESS-269 / TIKA-1525, this bit is a little nasty
+ ais = new SevenZWrapper(sevenz);
+ } else {
+ tmp.close();
+ throw new TikaException("Unknown non-streaming format " + sne.getFormat(), sne);
+ }
+ } catch (ArchiveException e) {
+ tmp.close();
+ throw new TikaException("Unable to unpack document stream", e);
+ }
+
+ MediaType type = getMediaType(ais);
+ if (!type.equals(MediaType.OCTET_STREAM)) {
+ metadata.set(CONTENT_TYPE, type.toString());
+ }
+ // Use the delegate parser to parse the contained document
+ EmbeddedDocumentExtractor extractor = context.get(
+ EmbeddedDocumentExtractor.class,
+ new ParsingEmbeddedDocumentExtractor(context));
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+ try {
+ ArchiveEntry entry = ais.getNextEntry();
+ while (entry != null) {
+ if (!entry.isDirectory()) {
+ parseEntry(ais, entry, extractor, xhtml);
+ }
+ entry = ais.getNextEntry();
+ }
+ } catch (UnsupportedZipFeatureException zfe) {
+ // If it's an encrypted document of unknown password, report as such
+ if (zfe.getFeature() == Feature.ENCRYPTION) {
+ throw new EncryptedDocumentException(zfe);
+ }
+ // NOTE(review): any other unsupported feature is silently
+ // swallowed here (entry iteration just stops); the exception
+ // is NOT rethrown - confirm this is intentional
+ } catch (PasswordRequiredException pre) {
+ // 7z signals missing/wrong passwords via this exception
+ throw new EncryptedDocumentException(pre);
+ } finally {
+ ais.close();
+ tmp.close();
+ }
+
+ xhtml.endDocument();
+ }
+
+ /**
+ * Emits metadata for a single archive entry and, when the extractor
+ * accepts it, parses the entry content as an embedded document.
+ * Entries whose data cannot be read only get their name written out.
+ */
+ private void parseEntry(
+ ArchiveInputStream archive, ArchiveEntry entry,
+ EmbeddedDocumentExtractor extractor, XHTMLContentHandler xhtml)
+ throws SAXException, IOException, TikaException {
+ String name = entry.getName();
+ if (archive.canReadEntryData(entry)) {
+ // Fetch the metadata on the entry contained in the archive
+ Metadata entrydata = handleEntryMetadata(name, null,
+ entry.getLastModifiedDate(), entry.getSize(), xhtml);
+
+ // Recurse into the entry if desired
+ if (extractor.shouldParseEmbedded(entrydata)) {
+ // For detectors to work, we need a mark/reset supporting
+ // InputStream, which ArchiveInputStream isn't, so wrap
+ TemporaryResources tmp = new TemporaryResources();
+ try {
+ TikaInputStream tis = TikaInputStream.get(archive, tmp);
+ extractor.parseEmbedded(tis, xhtml, entrydata, true);
+ } finally {
+ tmp.dispose();
+ }
+ }
+ } else if (name != null && name.length() > 0) {
+ xhtml.element("p", name);
+ }
+ }
+
+ /**
+ * Builds a {@link Metadata} object for one archive entry (resource
+ * name, creation/modification dates, size) and writes a placeholder
+ * {@code <div class="embedded" id="..."/>} element to the XHTML
+ * stream. Backslashes in entry names are normalised to forward
+ * slashes; null dates/size and empty names are simply skipped.
+ */
+ protected static Metadata handleEntryMetadata(
+ String name, Date createAt, Date modifiedAt,
+ Long size, XHTMLContentHandler xhtml)
+ throws SAXException, IOException, TikaException {
+ Metadata entrydata = new Metadata();
+ if (createAt != null) {
+ entrydata.set(TikaCoreProperties.CREATED, createAt);
+ }
+ if (modifiedAt != null) {
+ entrydata.set(TikaCoreProperties.MODIFIED, modifiedAt);
+ }
+ if (size != null) {
+ entrydata.set(Metadata.CONTENT_LENGTH, Long.toString(size));
+ }
+ if (name != null && name.length() > 0) {
+ name = name.replace("\\", "/");
+ entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+ attributes.addAttribute("", "id", "id", "CDATA", name);
+ xhtml.startElement("div", attributes);
+ xhtml.endElement("div");
+
+ entrydata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, name);
+ }
+ return entrydata;
+ }
+
+ // Pending a fix for COMPRESS-269, we have to wrap ourselves
+ // Adapts the random-access SevenZFile API to the streaming
+ // ArchiveInputStream API used by the rest of this parser.
+ private static class SevenZWrapper extends ArchiveInputStream {
+ private SevenZFile file;
+ private SevenZWrapper(SevenZFile file) {
+ this.file = file;
+ }
+
+ @Override
+ public int read() throws IOException {
+ return file.read();
+ }
+ @Override
+ public int read(byte[] b) throws IOException {
+ return file.read(b);
+ }
+ @Override
+ public int read(byte[] b, int off, int len) throws IOException {
+ return file.read(b, off, len);
+ }
+
+ @Override
+ public ArchiveEntry getNextEntry() throws IOException {
+ return file.getNextEntry();
+ }
+
+ @Override
+ public void close() throws IOException {
+ file.close();
+ }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import com.github.junrar.Archive;
+import com.github.junrar.exception.RarException;
+import com.github.junrar.rarfile.FileHeader;
+
+/**
+ * Parser for Rar files.
+ */
+public class RarParser extends AbstractParser {
+    private static final long serialVersionUID = 6157727985054451501L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections
+            .singleton(MediaType.application("x-rar-compressed"));
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext arg0) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Spools the stream to a temporary file (junrar's Archive works on
+     * files), then walks the RAR file headers and hands every
+     * non-directory entry to the {@link EmbeddedDocumentExtractor}
+     * obtained from the context.
+     *
+     * @throws EncryptedDocumentException if the archive is encrypted
+     * @throws TikaException if junrar fails to read the archive
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        EmbeddedDocumentExtractor extractor = context.get(
+                EmbeddedDocumentExtractor.class,
+                new ParsingEmbeddedDocumentExtractor(context));
+
+        Archive rar = null;
+        try (TemporaryResources tmp = new TemporaryResources()) {
+            TikaInputStream tis = TikaInputStream.get(stream, tmp);
+            rar = new Archive(tis.getFile());
+
+            if (rar.isEncrypted()) {
+                throw new EncryptedDocumentException();
+            }
+
+            //Without this BodyContentHandler does not work
+            xhtml.element("div", " ");
+
+            FileHeader header = rar.nextFileHeader();
+            // Stop early if the thread is interrupted (e.g. a parse timeout)
+            while (header != null && !Thread.currentThread().isInterrupted()) {
+                if (!header.isDirectory()) {
+                    try (InputStream subFile = rar.getInputStream(header)) {
+                        // Prefer the Unicode name; fall back to the legacy
+                        // 8-bit name when the Unicode one is empty
+                        Metadata entrydata = PackageParser.handleEntryMetadata(
+                                "".equals(header.getFileNameW()) ? header.getFileNameString() : header.getFileNameW(),
+                                header.getCTime(), header.getMTime(),
+                                header.getFullUnpackSize(),
+                                xhtml
+                        );
+
+                        if (extractor.shouldParseEmbedded(entrydata)) {
+                            // NOTE(review): passes the raw handler rather than
+                            // the xhtml wrapper used elsewhere - confirm
+                            extractor.parseEmbedded(subFile, handler, entrydata, true);
+                        }
+                    }
+                }
+
+                header = rar.nextFileHeader();
+            }
+
+        } catch (RarException e) {
+            throw new TikaException("RarParser Exception", e);
+        } finally {
+            if (rar != null)
+                rar.close();
+
+        }
+
+        xhtml.endDocument();
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,413 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Enumeration;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Locale;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import org.apache.commons.compress.archivers.ArchiveException;
+import org.apache.commons.compress.archivers.ArchiveInputStream;
+import org.apache.commons.compress.archivers.ArchiveStreamFactory;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipFile;
+import org.apache.commons.compress.compressors.CompressorException;
+import org.apache.commons.compress.compressors.CompressorInputStream;
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
+import org.apache.commons.io.IOUtils;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackageAccess;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.iwork.IWorkPackageParser;
+import org.apache.tika.parser.iwork.IWorkPackageParser.IWORKDocumentType;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * A detector that works on Zip documents and other archive and compression
+ * formats to figure out exactly what the file is.
+ */
+public class ZipContainerDetector implements Detector {
+    // Rewrites "...macroenabledtemplate" content types into the
+    // "...macroenabled.12" form used by Tika's media type registry
+    private static final Pattern MACRO_TEMPLATE_PATTERN = Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE);
+
+    // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes
+    private static final String VISIO_DOCUMENT =
+            "http://schemas.microsoft.com/visio/2010/relationships/document";
+    // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes
+    private static final String STRICT_CORE_DOCUMENT =
+            "http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument";
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 2891763938430295453L;
+
+    /**
+     * Peeks at the first 1024 bytes to identify the container format;
+     * ZIP-based files are then opened as files to work out which ZIP
+     * flavour (OOXML, ODF, iWork, JAR, KMZ, IPA, ...) they really are.
+     */
+    public MediaType detect(InputStream input, Metadata metadata)
+            throws IOException {
+        // Check if we have access to the document
+        if (input == null) {
+            return MediaType.OCTET_STREAM;
+        }
+
+        TemporaryResources tmp = new TemporaryResources();
+        try {
+            TikaInputStream tis = TikaInputStream.get(input, tmp);
+
+            byte[] prefix = new byte[1024]; // enough for all known formats
+            int length = tis.peek(prefix);
+
+            MediaType type = detectArchiveFormat(prefix, length);
+            if (PackageParser.isZipArchive(type)
+                    && TikaInputStream.isTikaInputStream(input)) {
+                // NOTE(review): the ZIP refinement only runs when the caller
+                // supplied a TikaInputStream - confirm plain streams should
+                // fall through with the generic type
+                return detectZipFormat(tis);
+            } else if (!type.equals(MediaType.OCTET_STREAM)) {
+                return type;
+            } else {
+                return detectCompressorFormat(prefix, length);
+            }
+        } finally {
+            try {
+                tmp.dispose();
+            } catch (TikaException e) {
+                // ignore
+            }
+        }
+    }
+
+    /** Identifies pure-compression formats, or octet-stream if unknown. */
+    private static MediaType detectCompressorFormat(byte[] prefix, int length) {
+        try {
+            CompressorStreamFactory factory = new CompressorStreamFactory();
+            CompressorInputStream cis = factory.createCompressorInputStream(
+                    new ByteArrayInputStream(prefix, 0, length));
+            try {
+                return CompressorParser.getMediaType(cis);
+            } finally {
+                IOUtils.closeQuietly(cis);
+            }
+        } catch (CompressorException e) {
+            return MediaType.OCTET_STREAM;
+        }
+    }
+
+    /** Identifies archive formats (zip/jar/tar/...), or octet-stream. */
+    private static MediaType detectArchiveFormat(byte[] prefix, int length) {
+        try {
+            ArchiveStreamFactory factory = new ArchiveStreamFactory();
+            ArchiveInputStream ais = factory.createArchiveInputStream(
+                    new ByteArrayInputStream(prefix, 0, length));
+            try {
+                if ((ais instanceof TarArchiveInputStream)
+                        && !TarArchiveInputStream.matches(prefix, length)) {
+                    // ArchiveStreamFactory is too relaxed, see COMPRESS-117
+                    return MediaType.OCTET_STREAM;
+                } else {
+                    return PackageParser.getMediaType(ais);
+                }
+            } finally {
+                IOUtils.closeQuietly(ais);
+            }
+        } catch (ArchiveException e) {
+            return MediaType.OCTET_STREAM;
+        }
+    }
+
+    /**
+     * Works out which flavour of ZIP-based format this file is, trying
+     * the most specific detections in turn and falling back to plain
+     * application/zip when none of them match (or the file won't open).
+     */
+    private static MediaType detectZipFormat(TikaInputStream tis) {
+        try {
+            ZipFile zip = new ZipFile(tis.getFile()); // TODO: hasFile()?
+            try {
+                MediaType type = detectOpenDocument(zip);
+                if (type == null) {
+                    type = detectOPCBased(zip, tis);
+                }
+                if (type == null) {
+                    type = detectIWork(zip);
+                }
+                if (type == null) {
+                    type = detectJar(zip);
+                }
+                if (type == null) {
+                    type = detectKmz(zip);
+                }
+                if (type == null) {
+                    type = detectIpa(zip);
+                }
+                if (type != null) {
+                    return type;
+                }
+            } finally {
+                // TODO: shouldn't we record the open
+                // container so it can be later
+                // reused...?
+                // tis.setOpenContainer(zip);
+                try {
+                    zip.close();
+                } catch (IOException e) {
+                    // ignore
+                }
+            }
+        } catch (IOException e) {
+            // ignore
+        }
+        // Fallback: it's still a zip file, we just don't know what kind of one
+        return MediaType.APPLICATION_ZIP;
+    }
+
+    /**
+     * OpenDocument files, along with EPub files and ASiC ones, have a
+     * mimetype entry in the root of their Zip file. This entry contains
+     * the mimetype of the overall file, stored as a single string.
+     */
+    private static MediaType detectOpenDocument(ZipFile zip) {
+        try {
+            ZipArchiveEntry mimetype = zip.getEntry("mimetype");
+            if (mimetype != null) {
+                try (InputStream stream = zip.getInputStream(mimetype)) {
+                    return MediaType.parse(IOUtils.toString(stream, UTF_8));
+                }
+            } else {
+                return null;
+            }
+        } catch (IOException e) {
+            return null;
+        }
+    }
+
+    /**
+     * Detects OPC (Open Packaging Conventions) based formats - OOXML,
+     * XPS and AutoCAD DWFx - by opening the file with POI. The opened
+     * OPCPackage is stashed on the stream for later reuse by parsers.
+     * Returns null (never throws) when the file is not OPC-based.
+     */
+    private static MediaType detectOPCBased(ZipFile zip, TikaInputStream stream) {
+        try {
+            if (zip.getEntry("_rels/.rels") != null
+                    || zip.getEntry("[Content_Types].xml") != null) {
+                // Use POI to open and investigate it for us
+                OPCPackage pkg = OPCPackage.open(stream.getFile().getPath(), PackageAccess.READ);
+                stream.setOpenContainer(pkg);
+
+                // Is it an OOXML format?
+                MediaType type = detectOfficeOpenXML(pkg);
+                if (type != null) return type;
+
+                // Is it XPS format?
+                type = detectXPSOPC(pkg);
+                if (type != null) return type;
+
+                // Is it an AutoCAD format?
+                type = detectAutoCADOPC(pkg);
+                if (type != null) return type;
+
+                // We don't know what it is, sorry
+                return null;
+            } else {
+                return null;
+            }
+        } catch (IOException e) {
+            return null;
+        } catch (RuntimeException e) {
+            return null;
+        } catch (InvalidFormatException e) {
+            return null;
+        }
+    }
+    /**
+     * Detects the type of an OfficeOpenXML (OOXML) file from
+     * opened Package
+     */
+    public static MediaType detectOfficeOpenXML(OPCPackage pkg) {
+        // Check for the normal Office core document
+        PackageRelationshipCollection core =
+                pkg.getRelationshipsByType(PackageRelationshipTypes.CORE_DOCUMENT);
+        // Otherwise check for some other Office core document types
+        if (core.size() == 0) {
+            core = pkg.getRelationshipsByType(STRICT_CORE_DOCUMENT);
+        }
+        if (core.size() == 0) {
+            core = pkg.getRelationshipsByType(VISIO_DOCUMENT);
+        }
+
+        // If we didn't find a single core document of any type, skip detection
+        if (core.size() != 1) {
+            // Invalid OOXML Package received
+            return null;
+        }
+
+        // Get the type of the core document part
+        PackagePart corePart = pkg.getPart(core.getRelationship(0));
+        String coreType = corePart.getContentType();
+
+        // Turn that into the type of the overall document
+        String docType = coreType.substring(0, coreType.lastIndexOf('.'));
+
+        // The Macro Enabled formats are a little special
+        if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabled")) {
+            docType = docType.toLowerCase(Locale.ROOT) + ".12";
+        }
+
+        if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabledtemplate")) {
+            docType = MACRO_TEMPLATE_PATTERN.matcher(docType).replaceAll("macroenabled.12");
+        }
+
+        // Build the MediaType object and return
+        return MediaType.parse(docType);
+    }
+    /**
+     * Detects Open XML Paper Specification (XPS)
+     */
+    private static MediaType detectXPSOPC(OPCPackage pkg) {
+        PackageRelationshipCollection xps =
+                pkg.getRelationshipsByType("http://schemas.microsoft.com/xps/2005/06/fixedrepresentation");
+        if (xps.size() == 1) {
+            return MediaType.application("vnd.ms-xpsdocument");
+        } else {
+            // Non-XPS Package received
+            return null;
+        }
+    }
+    /**
+     * Detects AutoCAD formats that live in OPC packaging
+     */
+    private static MediaType detectAutoCADOPC(OPCPackage pkg) {
+        PackageRelationshipCollection dwfxSeq =
+                pkg.getRelationshipsByType("http://schemas.autodesk.com/dwfx/2007/relationships/documentsequence");
+        if (dwfxSeq.size() == 1) {
+            return MediaType.parse("model/vnd.dwfx+xps");
+        } else {
+            // Non-AutoCAD Package received
+            return null;
+        }
+    }
+
+    /**
+     * Detects Apple iWork bundles (Keynote, Pages, Numbers) from their
+     * well-known index entries. Returns null for non-iWork files.
+     */
+    private static MediaType detectIWork(ZipFile zip) {
+        if (zip.getEntry(IWorkPackageParser.IWORK_COMMON_ENTRY) != null) {
+            // Locate the appropriate index file entry and read the root
+            // element of the document from it. That is used to identify
+            // the correct type of the iWork container.
+            for (String entryName : IWorkPackageParser.IWORK_CONTENT_ENTRIES) {
+                IWORKDocumentType type = IWORKDocumentType.detectType(zip.getEntry(entryName), zip);
+                if (type != null) {
+                    return type.getType();
+                }
+            }
+
+            // Not sure, fallback to the container type
+            return MediaType.application("vnd.apple.iwork");
+        } else {
+            return null;
+        }
+    }
+
+    /**
+     * Detects JAR files and the JAR-based formats built on top of them
+     * (Android APK, WAR, EAR). Returns null for plain non-JAR zips.
+     */
+    private static MediaType detectJar(ZipFile zip) {
+        if (zip.getEntry("META-INF/MANIFEST.MF") != null) {
+            // It's a Jar file, or something based on Jar
+
+            // Is it an Android APK?
+            if (zip.getEntry("AndroidManifest.xml") != null) {
+                return MediaType.application("vnd.android.package-archive");
+            }
+
+            // Check for WAR and EAR
+            if (zip.getEntry("WEB-INF/") != null) {
+                return MediaType.application("x-tika-java-web-archive");
+            }
+            if (zip.getEntry("META-INF/application.xml") != null) {
+                return MediaType.application("x-tika-java-enterprise-archive");
+            }
+
+            // Looks like a regular Jar Archive
+            return MediaType.application("java-archive");
+        } else {
+            // Some Android APKs miss the default Manifest
+            if (zip.getEntry("AndroidManifest.xml") != null) {
+                return MediaType.application("vnd.android.package-archive");
+            }
+
+            return null;
+        }
+    }
+
+    /**
+     * Detects Google Earth KMZ files: a zip whose root level holds
+     * exactly one .kml file and no other root-level files. Any other
+     * root-level, non-directory entry disqualifies the file.
+     */
+    private static MediaType detectKmz(ZipFile zip) {
+        boolean kmlFound = false;
+
+        Enumeration<ZipArchiveEntry> entries = zip.getEntries();
+        while (entries.hasMoreElements()) {
+            ZipArchiveEntry entry = entries.nextElement();
+            String name = entry.getName();
+            if (!entry.isDirectory()
+                    && name.indexOf('/') == -1 && name.indexOf('\\') == -1) {
+                if (name.endsWith(".kml") && !kmlFound) {
+                    kmlFound = true;
+                } else {
+                    return null;
+                }
+            }
+        }
+
+        if (kmlFound) {
+            return MediaType.application("vnd.google-earth.kmz");
+        } else {
+            return null;
+        }
+    }
+
+    /**
+     * To be considered as an IPA file, it needs to match all of these
+     */
+    private static HashSet<Pattern> ipaEntryPatterns = new HashSet<Pattern>() {
+        private static final long serialVersionUID = 6545295886322115362L;
+        {
+            add(Pattern.compile("^Payload/$"));
+            add(Pattern.compile("^Payload/.*\\.app/$"));
+            add(Pattern.compile("^Payload/.*\\.app/_CodeSignature/$"));
+            add(Pattern.compile("^Payload/.*\\.app/_CodeSignature/CodeResources$"));
+            add(Pattern.compile("^Payload/.*\\.app/Info\\.plist$"));
+            add(Pattern.compile("^Payload/.*\\.app/PkgInfo$"));
+        }};
+    /**
+     * Detects iTunes IPA bundles: every pattern in ipaEntryPatterns
+     * must be matched by at least one entry in the zip.
+     */
+    @SuppressWarnings("unchecked")
+    private static MediaType detectIpa(ZipFile zip) {
+        // Note - consider generalising this logic, if another format needs many regexp matching
+        // Work on a copy so the shared pattern set is never mutated
+        Set<Pattern> tmpPatterns = (Set<Pattern>)ipaEntryPatterns.clone();
+
+        Enumeration<ZipArchiveEntry> entries = zip.getEntries();
+        while (entries.hasMoreElements()) {
+            ZipArchiveEntry entry = entries.nextElement();
+            String name = entry.getName();
+
+            Iterator<Pattern> ip = tmpPatterns.iterator();
+            while (ip.hasNext()) {
+                if (ip.next().matcher(name).matches()) {
+                    ip.remove();
+                }
+            }
+            if (tmpPatterns.isEmpty()) {
+                // We've found everything we need to find
+                return MediaType.application("x-itunes-ipa");
+            }
+        }
+
+        // If we get here, not all required entries were found
+        return null;
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector Sat Jan 16 18:23:01 2016
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+org.apache.tika.parser.pkg.ZipContainerDetector
Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Sat Jan 16 18:23:01 2016
@@ -0,0 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+org.apache.tika.parser.pkg.CompressorParser
+org.apache.tika.parser.pkg.PackageParser
+org.apache.tika.parser.pkg.RarParser
+org.apache.tika.parser.iwork.IWorkPackageParser
+
Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.Test;
+
+/**
+ * Unit tests for the <code>AutoPageNumberUtils</code> helper class, covering
+ * the alphabetic and Roman-numeral page-label formats in both cases.
+ */
+public class AutoPageNumberUtilsTest {
+
+    /**
+     * Verifies upper-case alphabetic page labels at the boundaries of the
+     * one-, two- and three-letter ranges.
+     */
+    @Test
+    public void testAlphaUpper() {
+        assertEquals("A", AutoPageNumberUtils.asAlphaNumeric(1));
+        assertEquals("Z", AutoPageNumberUtils.asAlphaNumeric(26));
+        assertEquals("AA", AutoPageNumberUtils.asAlphaNumeric(27));
+        assertEquals("ZZ", AutoPageNumberUtils.asAlphaNumeric(52));
+        assertEquals("AAA", AutoPageNumberUtils.asAlphaNumeric(53));
+        assertEquals("ZZZ", AutoPageNumberUtils.asAlphaNumeric(78));
+    }
+
+    /**
+     * Verifies lower-case alphabetic page labels at the boundaries of the
+     * one-, two- and three-letter ranges.
+     */
+    @Test
+    public void testAlphaLower() {
+        assertEquals("a", AutoPageNumberUtils.asAlphaNumericLower(1));
+        assertEquals("z", AutoPageNumberUtils.asAlphaNumericLower(26));
+        assertEquals("aa", AutoPageNumberUtils.asAlphaNumericLower(27));
+        assertEquals("zz", AutoPageNumberUtils.asAlphaNumericLower(52));
+        assertEquals("aaa", AutoPageNumberUtils.asAlphaNumericLower(53));
+        assertEquals("zzz", AutoPageNumberUtils.asAlphaNumericLower(78));
+    }
+
+    /**
+     * Verifies upper-case Roman-numeral page labels for representative
+     * page numbers.
+     */
+    @Test
+    public void testRomanUpper() {
+        assertEquals("I", AutoPageNumberUtils.asRomanNumerals(1));
+        assertEquals("XXVI", AutoPageNumberUtils.asRomanNumerals(26));
+        assertEquals("XXVII", AutoPageNumberUtils.asRomanNumerals(27));
+    }
+
+    /**
+     * Verifies lower-case Roman-numeral page labels for representative
+     * page numbers.
+     */
+    @Test
+    public void testRomanLower() {
+        assertEquals("i", AutoPageNumberUtils.asRomanNumeralsLower(1));
+        assertEquals("xxvi", AutoPageNumberUtils.asRomanNumeralsLower(26));
+        assertEquals("xxvii", AutoPageNumberUtils.asRomanNumeralsLower(27));
+    }
+
+}