You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 11:11:20 UTC
[14/39] tika git commit: Convert new lines from windows to unix
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
index d80842b..1b692bf 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
@@ -1,510 +1,510 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.rtf;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-
-import java.io.InputStream;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.commons.io.FilenameUtils;
-import org.apache.tika.Tika;
-import org.apache.tika.TikaTest;
-import org.apache.tika.extractor.ContainerExtractor;
-import org.apache.tika.extractor.ParserContainerExtractor;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLCore;
-import org.apache.tika.metadata.RTFMetadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.RecursiveParserWrapper;
-import org.apache.tika.sax.BasicContentHandlerFactory;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-/**
- * Junit test class for the Tika {@link RTFParser}
- */
-public class RTFParserTest extends TikaTest {
-
- private Tika tika = new Tika();
-
- @Test
- public void testBasicExtraction() throws Exception {
-
- XMLResult r = getXML("testRTF.rtf");
- assertEquals("application/rtf", r.metadata.get(Metadata.CONTENT_TYPE));
- assertEquals(1, r.metadata.getValues(Metadata.CONTENT_TYPE).length);
- assertContains("Test", r.xml);
- assertContains("indexation Word", r.xml);
- }
-
- @Test
- public void testUmlautSpacesExtraction2() throws Exception {
- assertContains("<p>\u00DCbersicht</p>",
- getXML("testRTFUmlautSpaces2.rtf").xml);
- }
-
- @Test
- public void testUnicodeUCNControlWordCharacterDoublingExtraction() throws Exception {
- XMLResult r = getXML("testRTFUnicodeUCNControlWordCharacterDoubling.rtf");
-
- assertContains("\u5E74", r.xml);
- assertContains("\u5ff5", r.xml);
- assertContains("0 ", r.xml);
- assertContains("abc", r.xml);
- assertNotContained("\u5E74\u5E74", r.xml);
- }
-
- @Test
- public void testHexEscapeInsideWord() throws Exception {
- XMLResult r = getXML("testRTFHexEscapeInsideWord.rtf");
- assertContains("ESP\u00cdRITO", r.xml);
- }
-
- @Test
- public void testWindowsCodepage1250() throws Exception {
- XMLResult r = getXML("testRTFWindowsCodepage1250.rtf");
- assertContains("za\u017c\u00f3\u0142\u0107 g\u0119\u015bl\u0105 ja\u017a\u0144", r.xml);
- assertContains("ZA\u017b\u00d3\u0141\u0106 G\u0118\u015aL\u0104 JA\u0179\u0143", r.xml);
- }
-
- @Test
- public void testTableCellSeparation() throws Exception {
- String content = getXML("testRTFTableCellSeparation.rtf").xml;
- content = content.replaceAll("(\\s|<\\/?p>)+", " ");
- assertContains("a b c d \u00E4 \u00EB \u00F6 \u00FC", content);
- }
-
- @Test
- public void testTableCellSeparation2() throws Exception {
- String content = getXML("testRTFTableCellSeparation2.rtf").xml.replaceAll("\\s+", " ");
- // TODO: why do we insert extra whitespace...?
- assertContains("Station</p> <p>Fax", content);
- }
-
- @Test
- public void testWordPadCzechCharactersExtraction() throws Exception {
- XMLResult r = getXML("testRTFWordPadCzechCharacters.rtf");
- assertContains("\u010Cl\u00E1nek t\u00FDdne", r.xml);
- assertContains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty", r.xml);
- }
-
- @Test
- public void testWord2010CzechCharactersExtraction() throws Exception {
- XMLResult r = getXML("testRTFWord2010CzechCharacters.rtf");
- assertContains("\u010Cl\u00E1nek t\u00FDdne", r.xml);
- assertContains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty", r.xml);
- }
-
- @Test
- public void testMS932Extraction() throws Exception {
- XMLResult r = getXML("testRTF-ms932.rtf");
- // Hello in Japanese
- assertContains("\u3053\u3093\u306b\u3061\u306f", r.xml);
-
- // Verify title, since it was also encoded with MS932:
- r = getXML("testRTF-ms932.rtf");
- assertEquals("\u30bf\u30a4\u30c8\u30eb", r.metadata.get(TikaCoreProperties.TITLE));
- }
-
- @Test
- public void testUmlautSpacesExtraction() throws Exception {
- XMLResult r = getXML("testRTFUmlautSpaces.rtf");
- assertContains("\u00DCbersicht", r.xml);
- }
-
- @Test
- public void testGothic() throws Exception {
- XMLResult r = getXML("testRTFUnicodeGothic.rtf");
- assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", r.xml);
- }
-
- @Test
- public void testJapaneseText() throws Exception {
- XMLResult r = getXML("testRTFJapanese.rtf");
-
- // Verify title -- this title uses upr escape inside
- // title info field:
- assertEquals("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f\u3000",
- r.metadata.get(TikaCoreProperties.TITLE));
- assertEquals("VMazel", r.metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("VMazel", r.metadata.get(Metadata.AUTHOR));
- assertEquals("StarWriter", r.metadata.get(TikaCoreProperties.COMMENTS));
-
- // Special version of (GHQ)
- assertContains("\uff08\uff27\uff28\uff31\uff09", r.xml);
-
- // 6 other characters
- assertContains("\u6771\u4eac\u90fd\u4e09\u9df9\u5e02", r.xml);
- }
-
- @Test
- public void testMaxLength() throws Exception {
- Metadata metadata = new Metadata();
- InputStream stream = TikaInputStream.get(
- getTestDocumentAsStream("testRTFJapanese.rtf"));
-
- // Test w/ default limit:
- Tika localTika = new Tika();
- String content = localTika.parseToString(stream, metadata);
- // parseToString closes for convenience:
- //stream.close();
- assertTrue(content.length() > 500);
-
- // Test setting max length on the instance:
- localTika.setMaxStringLength(200);
- stream = TikaInputStream.get(getTestDocumentAsStream("testRTFJapanese.rtf"));
- content = localTika.parseToString(stream, metadata);
-
- // parseToString closes for convenience:
- //stream.close();
- assertTrue(content.length() <= 200);
-
- // Test setting max length per-call:
- stream = TikaInputStream.get(getTestDocumentAsStream("testRTFJapanese.rtf"));
- content = localTika.parseToString(stream, metadata, 100);
- // parseToString closes for convenience:
- //stream.close();
- assertTrue(content.length() <= 100);
- }
-
- @Test
- public void testTextWithCurlyBraces() throws Exception {
- XMLResult r = getXML("testRTFWithCurlyBraces.rtf");
- assertContains("{ some text inside curly brackets }", r.xml);
- }
-
- @Test
- public void testControls() throws Exception {
- XMLResult r = getXML("testRTFControls.rtf");
- String content = r.xml;
- assertContains("Thiswordhasanem\u2014dash", content);
- assertContains("Thiswordhasanen\u2013dash", content);
- assertContains("Thiswordhasanon\u2011breakinghyphen", content);
- assertContains("Thiswordhasanonbreaking\u00a0space", content);
- assertContains("Thiswordhasanoptional\u00adhyphen", content);
- assertContains("\u2018Single quoted text\u2019", content);
- assertContains("\u201cDouble quoted text\u201d", content);
- assertContains("\u201cDouble quoted text again\u201d", content);
- }
-
- @Test
- public void testInvalidUnicode() throws Exception {
- XMLResult r = getXML("testRTFInvalidUnicode.rtf");
- String content = r.xml;
- assertContains("Unpaired hi \ufffd here", content);
- assertContains("Unpaired lo \ufffd here", content);
- assertContains("Mismatched pair \ufffd\ufffd here", content);
- }
-
- @Test
- public void testVarious() throws Exception {
- XMLResult r = getXML("testRTFVarious.rtf");
- String content = r.xml;
- assertContains("Footnote appears here", content);
- assertContains("This is a footnote.", content);
- assertContains("This is the header text.", content);
- assertContains("This is the footer text.", content);
- assertContains("Here is a text box", content);
- assertContains("Bold", content);
- assertContains("italic", content);
- assertContains("underline", content);
- assertContains("superscript", content);
- assertContains("subscript", content);
- assertContains("Here is a citation:", content);
- assertContains("Figure 1 This is a caption for Figure 1", content);
- assertContains("(Kramer)", content);
-
- // Table
- assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("(\\s|<\\/?p>)+", " "));
-
- // 2-columns
- assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("(\\s|<\\/?p>)+", " "));
- assertContains("This is a hyperlink", content);
- assertContains("Here is a list:", content);
- for (int row = 1; row <= 3; row++) {
- assertContains("Bullet " + row, content);
- }
- assertContains("Here is a numbered list:", content);
- for (int row = 1; row <= 3; row++) {
- assertContains("Number bullet " + row, content);
- }
-
- for (int row = 1; row <= 2; row++) {
- for (int col = 1; col <= 3; col++) {
- assertContains("Row " + row + " Col " + col, content);
- }
- }
-
- assertContains("Keyword1 Keyword2", content);
- assertEquals("Keyword1 Keyword2",
- r.metadata.get(TikaCoreProperties.KEYWORDS));
-
- assertContains("Subject is here", content);
- assertEquals("Subject is here",
- r.metadata.get(OfficeOpenXMLCore.SUBJECT));
- assertEquals("Subject is here",
- r.metadata.get(Metadata.SUBJECT));
-
- assertContains("Suddenly some Japanese text:", content);
- // Special version of (GHQ)
- assertContains("\uff08\uff27\uff28\uff31\uff09", content);
- // 6 other characters
- assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content);
-
- assertContains("And then some Gothic text:", content);
- assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
- }
-
- @Test
- public void testVariousStyle() throws Exception {
- String content = getXML("testRTFVarious.rtf").xml;
- assertContains("<b>Bold</b>", content);
- assertContains("<i>italic</i>", content);
- }
-
- @Test
- public void testBoldItalic() throws Exception {
- String content = getXML("testRTFBoldItalic.rtf").xml;
- assertContains("<b>bold</b>", content);
- assertContains("<b>bold </b><b><i>italic</i></b>", content);
- assertContains("<b><i>italic </i></b><b>bold</b>", content);
- assertContains("<i>italic</i>", content);
- assertContains("<b>bold then </b><b><i>italic then</i></b><i> not bold</i>", content);
- assertContains("<i>italic then </i><b><i>bold then</i></b><b> not italic</b>", content);
- }
-
- @Test
- public void testHyperlink() throws Exception {
- String content = getXML("testRTFHyperlink.rtf").xml;
- assertContains("our most <a href=\"http://r.office.microsoft.com/r/rlidwelcomeFAQ?clid=1033\">frequently asked questions</a>", content);
- assertEquals(-1, content.indexOf("<p>\t\t</p>"));
- }
-
- @Test
- public void testIgnoredControlWord() throws Exception {
- assertContains("<p>The quick brown fox jumps over the lazy dog</p>", getXML("testRTFIgnoredControlWord.rtf").xml);
- }
-
- @Test
- public void testFontAfterBufferedText() throws Exception {
- assertContains("\u0423\u0432\u0430\u0436\u0430\u0435\u043c\u044b\u0439 \u043a\u043b\u0438\u0435\u043d\u0442!",
- getXML("testFontAfterBufferedText.rtf").xml);
- }
-
- @Test
- public void testListMicrosoftWord() throws Exception {
- String content = getXML("testRTFListMicrosoftWord.rtf").xml;
- assertContains("<ol>\t<li>one</li>", content);
- assertContains("</ol>", content);
- assertContains("<ul>\t<li>first</li>", content);
- assertContains("</ul>", content);
- }
-
- @Test
- public void testListLibreOffice() throws Exception {
- String content = getXML("testRTFListLibreOffice.rtf").xml;
- assertContains("<ol>\t<li>one</li>", content);
- assertContains("</ol>", content);
- assertContains("<ul>\t<li>first</li>", content);
- assertContains("</ul>", content);
- }
-
- // TIKA-782
- @Test
- public void testBinControlWord() throws Exception {
- ByteCopyingHandler embHandler = new ByteCopyingHandler();
- try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testBinControlWord.rtf"))) {
- ContainerExtractor ex = new ParserContainerExtractor();
- assertEquals(true, ex.isSupported(tis));
- ex.extract(tis, ex, embHandler);
- }
- assertEquals(1, embHandler.bytes.size());
-
- byte[] bytes = embHandler.bytes.get(0);
- assertEquals(10, bytes.length);
- //}
- assertEquals(125, (int) bytes[4]);
- //make sure that at least the last value is correct
- assertEquals(-1, (int) bytes[9]);
- }
-
- // TIKA-999
- @Test
- public void testMetaDataCounts() throws Exception {
- XMLResult xml = getXML("test_embedded_package.rtf");
- assertEquals("1", xml.metadata.get(Office.PAGE_COUNT));
- assertEquals("7", xml.metadata.get(Office.WORD_COUNT));
- assertEquals("36", xml.metadata.get(Office.CHARACTER_COUNT));
- assertTrue(xml.metadata.get(Office.CREATION_DATE).startsWith("2012-09-02T"));
- }
-
- // TIKA-1192
- @Test
- public void testListOverride() throws Exception {
- assertContains("Body", getXML("testRTFListOverride.rtf").xml);
- }
-
- // TIKA-1305
- @Test
- public void testCorruptListOverride() throws Exception {
- assertContains("apple", getXML("testRTFCorruptListOverride.rtf").xml);
- }
-
- // TIKA-1010
- @Test
- public void testEmbeddedMonster() throws Exception {
-
- Map<Integer, Pair> expected = new HashMap<>();
- expected.put(2, new Pair("Hw.txt","text/plain; charset=ISO-8859-1"));
- expected.put(3, new Pair("file_0.doc", "application/msword"));
- expected.put(6, new Pair("file_1.xlsx",
- "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
- expected.put(9, new Pair("text.html", "text/html; charset=windows-1252"));
- expected.put(10, new Pair("html-within-zip.zip", "application/zip"));
- expected.put(11, new Pair("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip", "application/zip"));
- expected.put(14, new Pair("testHTML_utf8_\u666E\u6797\u65AF\u987F.html", "text/html; charset=UTF-8"));
- expected.put(17, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg"));
- expected.put(20, new Pair("file_2.xls", "application/vnd.ms-excel"));
- expected.put(23, new Pair("testMSG_\u666E\u6797\u65AF\u987F.msg", "application/vnd.ms-outlook"));
- expected.put(26, new Pair("file_3.pdf", "application/pdf"));
- expected.put(29, new Pair("file_4.ppt", "application/vnd.ms-powerpoint"));
- expected.put(33, new Pair("file_5.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"));
- expected.put(32, new Pair("thumbnail.jpeg", "image/jpeg"));
- expected.put(36, new Pair("file_6.doc", "application/msword"));
- expected.put(39, new Pair("file_7.doc", "application/msword"));
- expected.put(42, new Pair("file_8.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
- expected.put(45, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg"));
-
-
- List<Metadata> metadataList = getRecursiveJson("testRTFEmbeddedFiles.rtf");
- assertEquals(48, metadataList.size());
- for (Map.Entry<Integer, Pair> e : expected.entrySet()) {
- Metadata metadata = metadataList.get(e.getKey());
- Pair p = e.getValue();
- assertNotNull(metadata.get(Metadata.RESOURCE_NAME_KEY));
- //necessary to getName() because MSOffice extractor includes
- //directory: _1457338524/HW.txt
- assertEquals("filename equals ",
- p.fileName, FilenameUtils.getName(
- metadata.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)));
-
- assertEquals(p.mimeType, metadata.get(Metadata.CONTENT_TYPE));
- }
- assertEquals("C:\\Users\\tallison\\AppData\\Local\\Temp\\testJPEG_\u666e\u6797\u65af\u987f.jpg",
- metadataList.get(45).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
- }
-
- //TIKA-1010 test regular (not "embedded") images/picts
- @Test
- public void testRegularImages() throws Exception {
- Parser base = new AutoDetectParser();
- ParseContext ctx = new ParseContext();
- RecursiveParserWrapper parser = new RecursiveParserWrapper(base,
- new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
- ctx.set(org.apache.tika.parser.Parser.class, parser);
- ContentHandler handler = new BodyContentHandler();
- Metadata rootMetadata = new Metadata();
- rootMetadata.add(Metadata.RESOURCE_NAME_KEY, "testRTFRegularImages.rtf");
- try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFRegularImages.rtf"))) {
- parser.parse(tis, handler, rootMetadata, ctx);
- }
- List<Metadata> metadatas = parser.getMetadata();
-
- Metadata meta_jpg_exif = metadatas.get(1);//("testJPEG_EXIF_\u666E\u6797\u65AF\u987F.jpg");
- Metadata meta_jpg = metadatas.get(3);//("testJPEG_\u666E\u6797\u65AF\u987F.jpg");
-
- assertTrue(meta_jpg_exif != null);
- assertTrue(meta_jpg != null);
- // had to comment these out (when moving from 1.x to 2.x
- // because AutoDetectParser within this module does not include image parsing.
-
-// assertTrue(Arrays.asList(meta_jpg_exif.getValues("dc:subject")).contains("serbor"));
-// assertTrue(meta_jpg.get("Comments").contains("Licensed to the Apache"));
- //make sure old metadata doesn't linger between objects
-// assertFalse(Arrays.asList(meta_jpg.getValues("dc:subject")).contains("serbor"));
- assertEquals("false", meta_jpg.get(RTFMetadata.THUMBNAIL));
- assertEquals("false", meta_jpg_exif.get(RTFMetadata.THUMBNAIL));
-
- assertEquals(25, meta_jpg.names().length);
- assertEquals(25, meta_jpg_exif.names().length);
- }
-
- @Test
- public void testMultipleNewlines() throws Exception {
- String content = getXML("testRTFNewlines.rtf").xml;
- content = content.replaceAll("[\r\n]+", " ");
- assertContains("<body><p>one</p> " +
- "<p /> " +
- "<p>two</p> " +
- "<p /> " +
- "<p /> " +
- "<p>three</p> " +
- "<p /> " +
- "<p /> " +
- "<p /> " +
- "<p>four</p>", content);
- }
-
- //TIKA-1010 test linked embedded doc
- @Test
- public void testEmbeddedLinkedDocument() throws Exception {
- Set<MediaType> skipTypes = new HashSet<MediaType>();
- skipTypes.add(MediaType.parse("application/x-emf"));
- skipTypes.add(MediaType.parse("application/x-msmetafile"));
-
- TrackingHandler tracker = new TrackingHandler(skipTypes);
- try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) {
- ContainerExtractor ex = new ParserContainerExtractor();
- assertEquals(true, ex.isSupported(tis));
- ex.extract(tis, ex, tracker);
- }
- //should gracefully skip link and not throw NPE, IOEx, etc
- assertEquals(0, tracker.filenames.size());
-
- tracker = new TrackingHandler();
- try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) {
- ContainerExtractor ex = new ParserContainerExtractor();
- assertEquals(true, ex.isSupported(tis));
- ex.extract(tis, ex, tracker);
- }
- //should gracefully skip link and not throw NPE, IOEx, etc
- assertEquals(2, tracker.filenames.size());
- }
-
- private static class Pair {
- final String fileName;
- final String mimeType;
- Pair(String fileName, String mimeType) {
- this.fileName = fileName;
- this.mimeType = mimeType;
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.rtf;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.io.FilenameUtils;
+import org.apache.tika.Tika;
+import org.apache.tika.TikaTest;
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.RTFMetadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Junit test class for the Tika {@link RTFParser}
+ */
+public class RTFParserTest extends TikaTest {
+
+ private Tika tika = new Tika();
+
+ @Test
+ public void testBasicExtraction() throws Exception {
+
+ XMLResult r = getXML("testRTF.rtf");
+ assertEquals("application/rtf", r.metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals(1, r.metadata.getValues(Metadata.CONTENT_TYPE).length);
+ assertContains("Test", r.xml);
+ assertContains("indexation Word", r.xml);
+ }
+
+ @Test
+ public void testUmlautSpacesExtraction2() throws Exception {
+ assertContains("<p>\u00DCbersicht</p>",
+ getXML("testRTFUmlautSpaces2.rtf").xml);
+ }
+
+ @Test
+ public void testUnicodeUCNControlWordCharacterDoublingExtraction() throws Exception {
+ XMLResult r = getXML("testRTFUnicodeUCNControlWordCharacterDoubling.rtf");
+
+ assertContains("\u5E74", r.xml);
+ assertContains("\u5ff5", r.xml);
+ assertContains("0 ", r.xml);
+ assertContains("abc", r.xml);
+ assertNotContained("\u5E74\u5E74", r.xml);
+ }
+
+ @Test
+ public void testHexEscapeInsideWord() throws Exception {
+ XMLResult r = getXML("testRTFHexEscapeInsideWord.rtf");
+ assertContains("ESP\u00cdRITO", r.xml);
+ }
+
+ @Test
+ public void testWindowsCodepage1250() throws Exception {
+ XMLResult r = getXML("testRTFWindowsCodepage1250.rtf");
+ assertContains("za\u017c\u00f3\u0142\u0107 g\u0119\u015bl\u0105 ja\u017a\u0144", r.xml);
+ assertContains("ZA\u017b\u00d3\u0141\u0106 G\u0118\u015aL\u0104 JA\u0179\u0143", r.xml);
+ }
+
+ @Test
+ public void testTableCellSeparation() throws Exception {
+ String content = getXML("testRTFTableCellSeparation.rtf").xml;
+ content = content.replaceAll("(\\s|<\\/?p>)+", " ");
+ assertContains("a b c d \u00E4 \u00EB \u00F6 \u00FC", content);
+ }
+
+ @Test
+ public void testTableCellSeparation2() throws Exception {
+ String content = getXML("testRTFTableCellSeparation2.rtf").xml.replaceAll("\\s+", " ");
+ // TODO: why do we insert extra whitespace...?
+ assertContains("Station</p> <p>Fax", content);
+ }
+
+ @Test
+ public void testWordPadCzechCharactersExtraction() throws Exception {
+ XMLResult r = getXML("testRTFWordPadCzechCharacters.rtf");
+ assertContains("\u010Cl\u00E1nek t\u00FDdne", r.xml);
+ assertContains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty", r.xml);
+ }
+
+ @Test
+ public void testWord2010CzechCharactersExtraction() throws Exception {
+ XMLResult r = getXML("testRTFWord2010CzechCharacters.rtf");
+ assertContains("\u010Cl\u00E1nek t\u00FDdne", r.xml);
+ assertContains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty", r.xml);
+ }
+
+ @Test
+ public void testMS932Extraction() throws Exception {
+ XMLResult r = getXML("testRTF-ms932.rtf");
+ // Hello in Japanese
+ assertContains("\u3053\u3093\u306b\u3061\u306f", r.xml);
+
+ // Verify title, since it was also encoded with MS932:
+ r = getXML("testRTF-ms932.rtf");
+ assertEquals("\u30bf\u30a4\u30c8\u30eb", r.metadata.get(TikaCoreProperties.TITLE));
+ }
+
+ @Test
+ public void testUmlautSpacesExtraction() throws Exception {
+ XMLResult r = getXML("testRTFUmlautSpaces.rtf");
+ assertContains("\u00DCbersicht", r.xml);
+ }
+
+ @Test
+ public void testGothic() throws Exception {
+ XMLResult r = getXML("testRTFUnicodeGothic.rtf");
+ assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", r.xml);
+ }
+
+ @Test
+ public void testJapaneseText() throws Exception {
+ XMLResult r = getXML("testRTFJapanese.rtf");
+
+ // Verify title -- this title uses upr escape inside
+ // title info field:
+ assertEquals("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f\u3000",
+ r.metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("VMazel", r.metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("VMazel", r.metadata.get(Metadata.AUTHOR));
+ assertEquals("StarWriter", r.metadata.get(TikaCoreProperties.COMMENTS));
+
+ // Special version of (GHQ)
+ assertContains("\uff08\uff27\uff28\uff31\uff09", r.xml);
+
+ // 6 other characters
+ assertContains("\u6771\u4eac\u90fd\u4e09\u9df9\u5e02", r.xml);
+ }
+
+ @Test
+ public void testMaxLength() throws Exception {
+ Metadata metadata = new Metadata();
+ InputStream stream = TikaInputStream.get(
+ getTestDocumentAsStream("testRTFJapanese.rtf"));
+
+ // Test w/ default limit:
+ Tika localTika = new Tika();
+ String content = localTika.parseToString(stream, metadata);
+ // parseToString closes for convenience:
+ //stream.close();
+ assertTrue(content.length() > 500);
+
+ // Test setting max length on the instance:
+ localTika.setMaxStringLength(200);
+ stream = TikaInputStream.get(getTestDocumentAsStream("testRTFJapanese.rtf"));
+ content = localTika.parseToString(stream, metadata);
+
+ // parseToString closes for convenience:
+ //stream.close();
+ assertTrue(content.length() <= 200);
+
+ // Test setting max length per-call:
+ stream = TikaInputStream.get(getTestDocumentAsStream("testRTFJapanese.rtf"));
+ content = localTika.parseToString(stream, metadata, 100);
+ // parseToString closes for convenience:
+ //stream.close();
+ assertTrue(content.length() <= 100);
+ }
+
+ @Test
+ public void testTextWithCurlyBraces() throws Exception {
+ XMLResult r = getXML("testRTFWithCurlyBraces.rtf");
+ assertContains("{ some text inside curly brackets }", r.xml);
+ }
+
+ @Test
+ public void testControls() throws Exception {
+ XMLResult r = getXML("testRTFControls.rtf");
+ String content = r.xml;
+ assertContains("Thiswordhasanem\u2014dash", content);
+ assertContains("Thiswordhasanen\u2013dash", content);
+ assertContains("Thiswordhasanon\u2011breakinghyphen", content);
+ assertContains("Thiswordhasanonbreaking\u00a0space", content);
+ assertContains("Thiswordhasanoptional\u00adhyphen", content);
+ assertContains("\u2018Single quoted text\u2019", content);
+ assertContains("\u201cDouble quoted text\u201d", content);
+ assertContains("\u201cDouble quoted text again\u201d", content);
+ }
+
+ @Test
+ public void testInvalidUnicode() throws Exception {
+ XMLResult r = getXML("testRTFInvalidUnicode.rtf");
+ String content = r.xml;
+ assertContains("Unpaired hi \ufffd here", content);
+ assertContains("Unpaired lo \ufffd here", content);
+ assertContains("Mismatched pair \ufffd\ufffd here", content);
+ }
+
+ @Test
+ public void testVarious() throws Exception {
+ XMLResult r = getXML("testRTFVarious.rtf");
+ String content = r.xml;
+ assertContains("Footnote appears here", content);
+ assertContains("This is a footnote.", content);
+ assertContains("This is the header text.", content);
+ assertContains("This is the footer text.", content);
+ assertContains("Here is a text box", content);
+ assertContains("Bold", content);
+ assertContains("italic", content);
+ assertContains("underline", content);
+ assertContains("superscript", content);
+ assertContains("subscript", content);
+ assertContains("Here is a citation:", content);
+ assertContains("Figure 1 This is a caption for Figure 1", content);
+ assertContains("(Kramer)", content);
+
+ // Table
+ assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("(\\s|<\\/?p>)+", " "));
+
+ // 2-columns
+ assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("(\\s|<\\/?p>)+", " "));
+ assertContains("This is a hyperlink", content);
+ assertContains("Here is a list:", content);
+ for (int row = 1; row <= 3; row++) {
+ assertContains("Bullet " + row, content);
+ }
+ assertContains("Here is a numbered list:", content);
+ for (int row = 1; row <= 3; row++) {
+ assertContains("Number bullet " + row, content);
+ }
+
+ for (int row = 1; row <= 2; row++) {
+ for (int col = 1; col <= 3; col++) {
+ assertContains("Row " + row + " Col " + col, content);
+ }
+ }
+
+ assertContains("Keyword1 Keyword2", content);
+ assertEquals("Keyword1 Keyword2",
+ r.metadata.get(TikaCoreProperties.KEYWORDS));
+
+ assertContains("Subject is here", content);
+ assertEquals("Subject is here",
+ r.metadata.get(OfficeOpenXMLCore.SUBJECT));
+ assertEquals("Subject is here",
+ r.metadata.get(Metadata.SUBJECT));
+
+ assertContains("Suddenly some Japanese text:", content);
+ // Special version of (GHQ)
+ assertContains("\uff08\uff27\uff28\uff31\uff09", content);
+ // 6 other characters
+ assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content);
+
+ assertContains("And then some Gothic text:", content);
+ assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
+ }
+
+ @Test
+ public void testVariousStyle() throws Exception {
+ String content = getXML("testRTFVarious.rtf").xml;
+ assertContains("<b>Bold</b>", content);
+ assertContains("<i>italic</i>", content);
+ }
+
+ @Test
+ public void testBoldItalic() throws Exception {
+ String content = getXML("testRTFBoldItalic.rtf").xml;
+ assertContains("<b>bold</b>", content);
+ assertContains("<b>bold </b><b><i>italic</i></b>", content);
+ assertContains("<b><i>italic </i></b><b>bold</b>", content);
+ assertContains("<i>italic</i>", content);
+ assertContains("<b>bold then </b><b><i>italic then</i></b><i> not bold</i>", content);
+ assertContains("<i>italic then </i><b><i>bold then</i></b><b> not italic</b>", content);
+ }
+
+ @Test
+ public void testHyperlink() throws Exception {
+ String content = getXML("testRTFHyperlink.rtf").xml;
+ assertContains("our most <a href=\"http://r.office.microsoft.com/r/rlidwelcomeFAQ?clid=1033\">frequently asked questions</a>", content);
+ assertEquals(-1, content.indexOf("<p>\t\t</p>"));
+ }
+
+ @Test
+ public void testIgnoredControlWord() throws Exception {
+ assertContains("<p>The quick brown fox jumps over the lazy dog</p>", getXML("testRTFIgnoredControlWord.rtf").xml);
+ }
+
+ @Test
+ public void testFontAfterBufferedText() throws Exception {
+ assertContains("\u0423\u0432\u0430\u0436\u0430\u0435\u043c\u044b\u0439 \u043a\u043b\u0438\u0435\u043d\u0442!",
+ getXML("testFontAfterBufferedText.rtf").xml);
+ }
+
+ @Test
+ public void testListMicrosoftWord() throws Exception {
+ String content = getXML("testRTFListMicrosoftWord.rtf").xml;
+ assertContains("<ol>\t<li>one</li>", content);
+ assertContains("</ol>", content);
+ assertContains("<ul>\t<li>first</li>", content);
+ assertContains("</ul>", content);
+ }
+
+ @Test
+ public void testListLibreOffice() throws Exception {
+ String content = getXML("testRTFListLibreOffice.rtf").xml;
+ assertContains("<ol>\t<li>one</li>", content);
+ assertContains("</ol>", content);
+ assertContains("<ul>\t<li>first</li>", content);
+ assertContains("</ul>", content);
+ }
+
+ // TIKA-782
+ @Test
+ public void testBinControlWord() throws Exception {
+ ByteCopyingHandler embHandler = new ByteCopyingHandler();
+ try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testBinControlWord.rtf"))) {
+ ContainerExtractor ex = new ParserContainerExtractor();
+ assertEquals(true, ex.isSupported(tis));
+ ex.extract(tis, ex, embHandler);
+ }
+ assertEquals(1, embHandler.bytes.size());
+
+ byte[] bytes = embHandler.bytes.get(0);
+ assertEquals(10, bytes.length);
+ //}
+ assertEquals(125, (int) bytes[4]);
+ //make sure that at least the last value is correct
+ assertEquals(-1, (int) bytes[9]);
+ }
+
+ // TIKA-999
+ @Test
+ public void testMetaDataCounts() throws Exception {
+ XMLResult xml = getXML("test_embedded_package.rtf");
+ assertEquals("1", xml.metadata.get(Office.PAGE_COUNT));
+ assertEquals("7", xml.metadata.get(Office.WORD_COUNT));
+ assertEquals("36", xml.metadata.get(Office.CHARACTER_COUNT));
+ assertTrue(xml.metadata.get(Office.CREATION_DATE).startsWith("2012-09-02T"));
+ }
+
+ // TIKA-1192
+ @Test
+ public void testListOverride() throws Exception {
+ assertContains("Body", getXML("testRTFListOverride.rtf").xml);
+ }
+
+ // TIKA-1305
+ @Test
+ public void testCorruptListOverride() throws Exception {
+ assertContains("apple", getXML("testRTFCorruptListOverride.rtf").xml);
+ }
+
+ // TIKA-1010
+ @Test
+ public void testEmbeddedMonster() throws Exception {
+
+ Map<Integer, Pair> expected = new HashMap<>();
+ expected.put(2, new Pair("Hw.txt","text/plain; charset=ISO-8859-1"));
+ expected.put(3, new Pair("file_0.doc", "application/msword"));
+ expected.put(6, new Pair("file_1.xlsx",
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
+ expected.put(9, new Pair("text.html", "text/html; charset=windows-1252"));
+ expected.put(10, new Pair("html-within-zip.zip", "application/zip"));
+ expected.put(11, new Pair("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip", "application/zip"));
+ expected.put(14, new Pair("testHTML_utf8_\u666E\u6797\u65AF\u987F.html", "text/html; charset=UTF-8"));
+ expected.put(17, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg"));
+ expected.put(20, new Pair("file_2.xls", "application/vnd.ms-excel"));
+ expected.put(23, new Pair("testMSG_\u666E\u6797\u65AF\u987F.msg", "application/vnd.ms-outlook"));
+ expected.put(26, new Pair("file_3.pdf", "application/pdf"));
+ expected.put(29, new Pair("file_4.ppt", "application/vnd.ms-powerpoint"));
+ expected.put(33, new Pair("file_5.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"));
+ expected.put(32, new Pair("thumbnail.jpeg", "image/jpeg"));
+ expected.put(36, new Pair("file_6.doc", "application/msword"));
+ expected.put(39, new Pair("file_7.doc", "application/msword"));
+ expected.put(42, new Pair("file_8.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
+ expected.put(45, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg"));
+
+
+ List<Metadata> metadataList = getRecursiveJson("testRTFEmbeddedFiles.rtf");
+ assertEquals(48, metadataList.size());
+ for (Map.Entry<Integer, Pair> e : expected.entrySet()) {
+ Metadata metadata = metadataList.get(e.getKey());
+ Pair p = e.getValue();
+ assertNotNull(metadata.get(Metadata.RESOURCE_NAME_KEY));
+ //necessary to getName() because MSOffice extractor includes
+ //directory: _1457338524/HW.txt
+ assertEquals("filename equals ",
+ p.fileName, FilenameUtils.getName(
+ metadata.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)));
+
+ assertEquals(p.mimeType, metadata.get(Metadata.CONTENT_TYPE));
+ }
+ assertEquals("C:\\Users\\tallison\\AppData\\Local\\Temp\\testJPEG_\u666e\u6797\u65af\u987f.jpg",
+ metadataList.get(45).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
+ }
+
+ //TIKA-1010 test regular (not "embedded") images/picts
+ @Test
+ public void testRegularImages() throws Exception {
+ Parser base = new AutoDetectParser();
+ ParseContext ctx = new ParseContext();
+ RecursiveParserWrapper parser = new RecursiveParserWrapper(base,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
+ ctx.set(org.apache.tika.parser.Parser.class, parser);
+ ContentHandler handler = new BodyContentHandler();
+ Metadata rootMetadata = new Metadata();
+ rootMetadata.add(Metadata.RESOURCE_NAME_KEY, "testRTFRegularImages.rtf");
+ try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFRegularImages.rtf"))) {
+ parser.parse(tis, handler, rootMetadata, ctx);
+ }
+ List<Metadata> metadatas = parser.getMetadata();
+
+ Metadata meta_jpg_exif = metadatas.get(1);//("testJPEG_EXIF_\u666E\u6797\u65AF\u987F.jpg");
+ Metadata meta_jpg = metadatas.get(3);//("testJPEG_\u666E\u6797\u65AF\u987F.jpg");
+
+ assertTrue(meta_jpg_exif != null);
+ assertTrue(meta_jpg != null);
+ // had to comment these out (when moving from 1.x to 2.x
+ // because AutoDetectParser within this module does not include image parsing.
+
+// assertTrue(Arrays.asList(meta_jpg_exif.getValues("dc:subject")).contains("serbor"));
+// assertTrue(meta_jpg.get("Comments").contains("Licensed to the Apache"));
+ //make sure old metadata doesn't linger between objects
+// assertFalse(Arrays.asList(meta_jpg.getValues("dc:subject")).contains("serbor"));
+ assertEquals("false", meta_jpg.get(RTFMetadata.THUMBNAIL));
+ assertEquals("false", meta_jpg_exif.get(RTFMetadata.THUMBNAIL));
+
+ assertEquals(25, meta_jpg.names().length);
+ assertEquals(25, meta_jpg_exif.names().length);
+ }
+
+ @Test
+ public void testMultipleNewlines() throws Exception {
+ String content = getXML("testRTFNewlines.rtf").xml;
+ content = content.replaceAll("[\r\n]+", " ");
+ assertContains("<body><p>one</p> " +
+ "<p /> " +
+ "<p>two</p> " +
+ "<p /> " +
+ "<p /> " +
+ "<p>three</p> " +
+ "<p /> " +
+ "<p /> " +
+ "<p /> " +
+ "<p>four</p>", content);
+ }
+
+ //TIKA-1010 test linked embedded doc
+ @Test
+ public void testEmbeddedLinkedDocument() throws Exception {
+ Set<MediaType> skipTypes = new HashSet<MediaType>();
+ skipTypes.add(MediaType.parse("application/x-emf"));
+ skipTypes.add(MediaType.parse("application/x-msmetafile"));
+
+ TrackingHandler tracker = new TrackingHandler(skipTypes);
+ try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) {
+ ContainerExtractor ex = new ParserContainerExtractor();
+ assertEquals(true, ex.isSupported(tis));
+ ex.extract(tis, ex, tracker);
+ }
+ //should gracefully skip link and not throw NPE, IOEx, etc
+ assertEquals(0, tracker.filenames.size());
+
+ tracker = new TrackingHandler();
+ try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) {
+ ContainerExtractor ex = new ParserContainerExtractor();
+ assertEquals(true, ex.isSupported(tis));
+ ex.extract(tis, ex, tracker);
+ }
+ //should gracefully skip link and not throw NPE, IOEx, etc
+ assertEquals(2, tracker.filenames.size());
+ }
+
+ private static class Pair {
+ final String fileName;
+ final String mimeType;
+ Pair(String fileName, String mimeType) {
+ this.fileName = fileName;
+ this.mimeType = mimeType;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/pom.xml b/tika-parser-modules/tika-parser-package-module/pom.xml
index 8d1238d..2feb22b 100644
--- a/tika-parser-modules/tika-parser-package-module/pom.xml
+++ b/tika-parser-modules/tika-parser-package-module/pom.xml
@@ -1,79 +1,79 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-modules</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-package-module</artifactId>
- <name>Apache Tika parser package module</name>
- <url>http://tika.apache.org/</url>
-
- <properties>
- <!-- NOTE: sync tukaani version with commons-compress -->
- <tukaani.version>1.5</tukaani.version>
- </properties>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>org.tukaani</groupId>
- <artifactId>xz</artifactId>
- <version>${tukaani.version}</version>
- </dependency>
- <dependency>
- <groupId>com.github.junrar</groupId>
- <artifactId>junrar</artifactId>
- <version>0.7</version>
- </dependency>
- <dependency>
- <groupId>commons-io</groupId>
- <artifactId>commons-io</artifactId>
- <version>${commons.io.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-compress</artifactId>
- <version>${commons.compress.version}</version>
- </dependency>
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-package-module</artifactId>
+ <name>Apache Tika parser package module</name>
+ <url>http://tika.apache.org/</url>
+
+ <properties>
+ <!-- NOTE: sync tukaani version with commons-compress -->
+ <tukaani.version>1.5</tukaani.version>
+ </properties>
+
+ <dependencies>
<dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-text-module</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
- </dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.tukaani</groupId>
+ <artifactId>xz</artifactId>
+ <version>${tukaani.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.github.junrar</groupId>
+ <artifactId>junrar</artifactId>
+ <version>0.7</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>${commons.io.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-compress</artifactId>
+ <version>${commons.compress.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-text-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
<dependency>
- <groupId>commons-codec</groupId>
- <artifactId>commons-codec</artifactId>
- <version>${codec.version}</version>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ <version>${codec.version}</version>
</dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/module/pkg/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/module/pkg/internal/Activator.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/module/pkg/internal/Activator.java
index 0fb71fa..2345029 100644
--- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/module/pkg/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/module/pkg/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.pkg.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
- @Override
- public void start(BundleContext context) throws Exception {
-
- registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
- }
-
- @Override
- public void stop(BundleContext context) throws Exception {
-
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.pkg.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+ @Override
+ public void start(BundleContext context) throws Exception {
+
+ registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+ }
+
+ @Override
+ public void stop(BundleContext context) throws Exception {
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java
index 32f0126..4143932 100644
--- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java
+++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java
@@ -1,112 +1,112 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.iwork;
-
-import java.util.Locale;
-
-/**
- * Utility class to allow for conversion from an integer to Roman numerals
- * or alpha-numeric symbols in line with Pages auto numbering formats.
- */
- class AutoPageNumberUtils {
-
- private static final String ALPHABET[] = { "A", "B", "C", "D", "E", "F", "G",
- "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T",
- "U", "V", "W", "X", "Y", "Z" };
-
- private static final int MAX = 26;
-
- public static String asAlphaNumeric(int i) {
- StringBuffer sbuff = new StringBuffer();
- int index = i % MAX;
- int ratio = i / MAX;
-
- if (index == 0) {
- ratio--;
- index = MAX;
- }
-
- for(int j = 0; j <= ratio; j++) {
- sbuff.append(ALPHABET[index - 1]); }
- return sbuff.toString();
- }
-
- public static String asAlphaNumericLower(int i) {
- return asAlphaNumeric(i).toLowerCase(Locale.ROOT);
- }
-
- /*
- * Code copied from jena.apache.org.
- * @see com.hp.hpl.jena.sparql.util.RomanNumeral
- */
- public static String asRomanNumerals(int i) {
- if ( i <= 0 )
- throw new NumberFormatException("Roman numerals are 1-3999 ("+i+")") ;
- if ( i > 3999 )
- throw new NumberFormatException("Roman numerals are 1-3999 ("+i+")") ;
- StringBuffer sbuff = new StringBuffer() ;
-
- i = i2r(sbuff, i, "M", 1000, "CM", 900, "D", 500, "CD", 400 ) ;
- i = i2r(sbuff, i, "C", 100, "XC", 90, "L", 50, "XL", 40 ) ;
- i = i2r(sbuff, i, "X", 10, "IX", 9, "V", 5, "IV", 4) ;
-
- while ( i >= 1 )
- {
- sbuff.append("I") ;
- i -= 1 ;
- }
- return sbuff.toString() ;
-
-
- }
-
- public static String asRomanNumeralsLower(int i) {
- return asRomanNumerals(i).toLowerCase(Locale.ROOT);
- }
-
- private static int i2r(StringBuffer sbuff, int i,
- String tens, int iTens,
- String nines, int iNines,
- String fives, int iFives,
- String fours, int iFours)
- {
- while ( i >= iTens )
- {
- sbuff.append(tens) ;
- i -= iTens ;
- }
-
- if ( i >= iNines )
- {
- sbuff.append(nines) ;
- i -= iNines;
- }
-
- if ( i >= iFives )
- {
- sbuff.append(fives) ;
- i -= iFives ;
- }
- if ( i >= iFours )
- {
- sbuff.append(fours) ;
- i -= iFours ;
- }
- return i ;
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import java.util.Locale;
+
+/**
+ * Utility class to allow for conversion from an integer to Roman numerals
+ * or alpha-numeric symbols in line with Pages auto numbering formats.
+ */
+ class AutoPageNumberUtils {
+
+ private static final String ALPHABET[] = { "A", "B", "C", "D", "E", "F", "G",
+ "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T",
+ "U", "V", "W", "X", "Y", "Z" };
+
+ private static final int MAX = 26;
+
+ public static String asAlphaNumeric(int i) {
+ StringBuffer sbuff = new StringBuffer();
+ int index = i % MAX;
+ int ratio = i / MAX;
+
+ if (index == 0) {
+ ratio--;
+ index = MAX;
+ }
+
+ for(int j = 0; j <= ratio; j++) {
+ sbuff.append(ALPHABET[index - 1]); }
+ return sbuff.toString();
+ }
+
+ public static String asAlphaNumericLower(int i) {
+ return asAlphaNumeric(i).toLowerCase(Locale.ROOT);
+ }
+
+ /*
+ * Code copied from jena.apache.org.
+ * @see com.hp.hpl.jena.sparql.util.RomanNumeral
+ */
+ public static String asRomanNumerals(int i) {
+ if ( i <= 0 )
+ throw new NumberFormatException("Roman numerals are 1-3999 ("+i+")") ;
+ if ( i > 3999 )
+ throw new NumberFormatException("Roman numerals are 1-3999 ("+i+")") ;
+ StringBuffer sbuff = new StringBuffer() ;
+
+ i = i2r(sbuff, i, "M", 1000, "CM", 900, "D", 500, "CD", 400 ) ;
+ i = i2r(sbuff, i, "C", 100, "XC", 90, "L", 50, "XL", 40 ) ;
+ i = i2r(sbuff, i, "X", 10, "IX", 9, "V", 5, "IV", 4) ;
+
+ while ( i >= 1 )
+ {
+ sbuff.append("I") ;
+ i -= 1 ;
+ }
+ return sbuff.toString() ;
+
+
+ }
+
+ public static String asRomanNumeralsLower(int i) {
+ return asRomanNumerals(i).toLowerCase(Locale.ROOT);
+ }
+
+ private static int i2r(StringBuffer sbuff, int i,
+ String tens, int iTens,
+ String nines, int iNines,
+ String fives, int iFives,
+ String fours, int iFours)
+ {
+ while ( i >= iTens )
+ {
+ sbuff.append(tens) ;
+ i -= iTens ;
+ }
+
+ if ( i >= iNines )
+ {
+ sbuff.append(nines) ;
+ i -= iNines;
+ }
+
+ if ( i >= iFives )
+ {
+ sbuff.append(fives) ;
+ i -= iFives ;
+ }
+ if ( i >= iFours )
+ {
+ sbuff.append(fours) ;
+ i -= iFours ;
+ }
+ return i ;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
index 1861931..79d82e8 100644
--- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
+++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
@@ -1,219 +1,219 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.iwork;
-
-import java.io.BufferedInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Set;
-
-import javax.xml.namespace.QName;
-
-import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
-import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
-import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
-import org.apache.commons.compress.archivers.zip.ZipFile;
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.detect.XmlRootExtractor;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.OfflineContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * A parser for the IWork container files. This includes *.key, *.pages and *.numbers files.
- * This parser delegates the relevant entries to a {@link ContentHandler} that parsers the content.
- *
- * Currently supported formats:
- * <ol>
- * <li>Keynote format version 2.x. Currently only tested with Keynote version 5.x
- * <li>Pages format version 1.x. Currently only tested with Pages version 4.0.x
- * <li>Numbers format version 1.x. Currently only tested with Numbers version 2.0.x
- * </ol>
- */
-public class IWorkPackageParser extends AbstractParser {
-
- /** Serial version UID */
- private static final long serialVersionUID = -2160322853809682372L;
-
- /**
- * Which files within an iWork file contain the actual content?
- */
- public final static Set<String> IWORK_CONTENT_ENTRIES = Collections.unmodifiableSet(
- new HashSet<String>(Arrays.asList("index.apxl", "index.xml", "presentation.apxl"))
- );
- /**
- * All iWork files contain one of these, so we can detect based on it
- */
- public final static String IWORK_COMMON_ENTRY = "buildVersionHistory.plist";
-
- public enum IWORKDocumentType {
- KEYNOTE("http://developer.apple.com/namespaces/keynote2", "presentation", MediaType.application("vnd.apple.keynote")),
- NUMBERS("http://developer.apple.com/namespaces/ls", "document", MediaType.application("vnd.apple.numbers")),
- PAGES("http://developer.apple.com/namespaces/sl", "document", MediaType.application("vnd.apple.pages")),
- ENCRYPTED(null, null, MediaType.application("x-tika-iworks-protected"));
-
- private final String namespace;
- private final String part;
- private final MediaType type;
-
- IWORKDocumentType(String namespace, String part, MediaType type) {
- this.namespace = namespace;
- this.part = part;
- this.type = type;
- }
-
- public String getNamespace() {
- return namespace;
- }
-
- public String getPart() {
- return part;
- }
-
- public MediaType getType() {
- return type;
- }
-
- public static IWORKDocumentType detectType(ZipArchiveEntry entry, ZipFile zip) {
- try {
- if (entry == null) {
- return null;
- }
-
- try (InputStream stream = zip.getInputStream(entry)) {
- return detectType(stream);
- }
- } catch (IOException e) {
- return null;
- }
- }
-
- public static IWORKDocumentType detectType(ZipArchiveEntry entry, ZipArchiveInputStream zip) {
- if (entry == null) {
- return null;
- }
-
- return detectType(zip);
- }
-
- private static IWORKDocumentType detectType(InputStream stream) {
- QName qname = new XmlRootExtractor().extractRootElement(stream);
- if (qname != null) {
- String uri = qname.getNamespaceURI();
- String local = qname.getLocalPart();
-
- for (IWORKDocumentType type : values()) {
- if(type.getNamespace().equals(uri) &&
- type.getPart().equals(local)) {
- return type;
- }
- }
- } else {
- // There was a problem with extracting the root type
- // Password Protected iWorks files are funny, but we can usually
- // spot them because they encrypt part of the zip stream
- try {
- stream.read();
- } catch(UnsupportedZipFeatureException e) {
- // Compression field was likely encrypted
- return ENCRYPTED;
- } catch(Exception ignored) {
- }
- }
- return null;
- }
- }
-
- /**
- * This parser handles all iWorks formats.
- */
- private final static Set<MediaType> supportedTypes =
- Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- MediaType.application("vnd.apple.iwork"),
- IWORKDocumentType.KEYNOTE.getType(),
- IWORKDocumentType.NUMBERS.getType(),
- IWORKDocumentType.PAGES.getType()
- )));
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return supportedTypes;
- }
-
    /**
     * Parses an iWork package by streaming its zip entries, delegating each
     * recognised content entry (see {@code IWORK_CONTENT_ENTRIES}) to the
     * format-specific SAX content handler.
     *
     * @param stream   raw zip stream of the iWork package
     * @param handler  SAX handler receiving the extracted XHTML
     * @param metadata metadata sink; CONTENT_TYPE is added once a format is detected
     * @param context  parse context supplying the shared SAX parser
     * @throws IOException   on stream failures
     * @throws SAXException  on content handler failures
     * @throws TikaException if an unexpected iWork format is detected
     */
    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
        ZipArchiveInputStream zip = new ZipArchiveInputStream(stream);
        ZipArchiveEntry entry = zip.getNextZipEntry();

        while (entry != null) {
            // Skip entries that are not known content documents (images, plists, ...).
            if (!IWORK_CONTENT_ENTRIES.contains(entry.getName())) {
                entry = zip.getNextZipEntry();
                continue;
            }

            // Buffer the shared zip stream so the type sniff can be undone with
            // mark/reset before the real XML parse consumes the entry.
            InputStream entryStream = new BufferedInputStream(zip, 4096);
            entryStream.mark(4096);
            IWORKDocumentType type = IWORKDocumentType.detectType(entryStream);
            entryStream.reset();

            if(type != null) {
                XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
                ContentHandler contentHandler;

                switch(type) {
                case KEYNOTE:
                    contentHandler = new KeynoteContentHandler(xhtml, metadata);
                    break;
                case NUMBERS:
                    contentHandler = new NumbersContentHandler(xhtml, metadata);
                    break;
                case PAGES:
                    contentHandler = new PagesContentHandler(xhtml, metadata);
                    break;
                case ENCRYPTED:
                     // We can't do anything for the file right now
                     contentHandler = null;
                     break;
                default:
                    throw new TikaException("Unhandled iWorks file " + type);
                }

                metadata.add(Metadata.CONTENT_TYPE, type.getType().toString());
                xhtml.startDocument();
                if (contentHandler != null) {
                    // Shield the shared zip stream from being closed by the SAX parser.
                    context.getSAXParser().parse(
                            new CloseShieldInputStream(entryStream),
                            new OfflineContentHandler(contentHandler)
                    );
                }
                xhtml.endDocument();
            }

            entry = zip.getNextZipEntry();
        }
        // Don't close the zip InputStream (TIKA-1117).
    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import javax.xml.namespace.QName;
+
+import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.ZipFile;
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.detect.XmlRootExtractor;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * A parser for the IWork container files. This includes *.key, *.pages and *.numbers files.
+ * This parser delegates the relevant entries to a {@link ContentHandler} that parsers the content.
+ *
+ * Currently supported formats:
+ * <ol>
+ * <li>Keynote format version 2.x. Currently only tested with Keynote version 5.x
+ * <li>Pages format version 1.x. Currently only tested with Pages version 4.0.x
+ * <li>Numbers format version 1.x. Currently only tested with Numbers version 2.0.x
+ * </ol>
+ */
+public class IWorkPackageParser extends AbstractParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = -2160322853809682372L;
+
+ /**
+ * Which files within an iWork file contain the actual content?
+ */
+ public final static Set<String> IWORK_CONTENT_ENTRIES = Collections.unmodifiableSet(
+ new HashSet<String>(Arrays.asList("index.apxl", "index.xml", "presentation.apxl"))
+ );
+ /**
+ * All iWork files contain one of these, so we can detect based on it
+ */
+ public final static String IWORK_COMMON_ENTRY = "buildVersionHistory.plist";
+
+ public enum IWORKDocumentType {
+ KEYNOTE("http://developer.apple.com/namespaces/keynote2", "presentation", MediaType.application("vnd.apple.keynote")),
+ NUMBERS("http://developer.apple.com/namespaces/ls", "document", MediaType.application("vnd.apple.numbers")),
+ PAGES("http://developer.apple.com/namespaces/sl", "document", MediaType.application("vnd.apple.pages")),
+ ENCRYPTED(null, null, MediaType.application("x-tika-iworks-protected"));
+
+ private final String namespace;
+ private final String part;
+ private final MediaType type;
+
+ IWORKDocumentType(String namespace, String part, MediaType type) {
+ this.namespace = namespace;
+ this.part = part;
+ this.type = type;
+ }
+
+ public String getNamespace() {
+ return namespace;
+ }
+
+ public String getPart() {
+ return part;
+ }
+
+ public MediaType getType() {
+ return type;
+ }
+
+ public static IWORKDocumentType detectType(ZipArchiveEntry entry, ZipFile zip) {
+ try {
+ if (entry == null) {
+ return null;
+ }
+
+ try (InputStream stream = zip.getInputStream(entry)) {
+ return detectType(stream);
+ }
+ } catch (IOException e) {
+ return null;
+ }
+ }
+
+ public static IWORKDocumentType detectType(ZipArchiveEntry entry, ZipArchiveInputStream zip) {
+ if (entry == null) {
+ return null;
+ }
+
+ return detectType(zip);
+ }
+
+ private static IWORKDocumentType detectType(InputStream stream) {
+ QName qname = new XmlRootExtractor().extractRootElement(stream);
+ if (qname != null) {
+ String uri = qname.getNamespaceURI();
+ String local = qname.getLocalPart();
+
+ for (IWORKDocumentType type : values()) {
+ if(type.getNamespace().equals(uri) &&
+ type.getPart().equals(local)) {
+ return type;
+ }
+ }
+ } else {
+ // There was a problem with extracting the root type
+ // Password Protected iWorks files are funny, but we can usually
+ // spot them because they encrypt part of the zip stream
+ try {
+ stream.read();
+ } catch(UnsupportedZipFeatureException e) {
+ // Compression field was likely encrypted
+ return ENCRYPTED;
+ } catch(Exception ignored) {
+ }
+ }
+ return null;
+ }
+ }
+
+ /**
+ * This parser handles all iWorks formats.
+ */
+ private final static Set<MediaType> supportedTypes =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.application("vnd.apple.iwork"),
+ IWORKDocumentType.KEYNOTE.getType(),
+ IWORKDocumentType.NUMBERS.getType(),
+ IWORKDocumentType.PAGES.getType()
+ )));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return supportedTypes;
+ }
+
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ ZipArchiveInputStream zip = new ZipArchiveInputStream(stream);
+ ZipArchiveEntry entry = zip.getNextZipEntry();
+
+ while (entry != null) {
+ if (!IWORK_CONTENT_ENTRIES.contains(entry.getName())) {
+ entry = zip.getNextZipEntry();
+ continue;
+ }
+
+ InputStream entryStream = new BufferedInputStream(zip, 4096);
+ entryStream.mark(4096);
+ IWORKDocumentType type = IWORKDocumentType.detectType(entryStream);
+ entryStream.reset();
+
+ if(type != null) {
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ ContentHandler contentHandler;
+
+ switch(type) {
+ case KEYNOTE:
+ contentHandler = new KeynoteContentHandler(xhtml, metadata);
+ break;
+ case NUMBERS:
+ contentHandler = new NumbersContentHandler(xhtml, metadata);
+ break;
+ case PAGES:
+ contentHandler = new PagesContentHandler(xhtml, metadata);
+ break;
+ case ENCRYPTED:
+ // We can't do anything for the file right now
+ contentHandler = null;
+ break;
+ default:
+ throw new TikaException("Unhandled iWorks file " + type);
+ }
+
+ metadata.add(Metadata.CONTENT_TYPE, type.getType().toString());
+ xhtml.startDocument();
+ if (contentHandler != null) {
+ context.getSAXParser().parse(
+ new CloseShieldInputStream(entryStream),
+ new OfflineContentHandler(contentHandler)
+ );
+ }
+ xhtml.endDocument();
+ }
+
+ entry = zip.getNextZipEntry();
+ }
+ // Don't close the zip InputStream (TIKA-1117).
+ }
+
+}