You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 11:11:07 UTC

[01/39] tika git commit: Convert new lines from windows to unix

Repository: tika
Updated Branches:
  refs/heads/2.x dd3c2a486 -> c7a6bcac4


http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
index d1e1463..ee9a98b 100644
--- a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
+++ b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
@@ -1,485 +1,485 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mail;
-
-import static java.nio.charset.StandardCharsets.US_ASCII;
-import static org.junit.Assert.*;
-import static org.junit.Assume.assumeTrue;
-import static org.mockito.Matchers.any;
-import static org.mockito.Matchers.eq;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.never;
-import static org.mockito.Mockito.times;
-import static org.mockito.Mockito.verify;
-
-import java.io.ByteArrayInputStream;
-import java.io.File;
-import java.io.InputStream;
-import java.nio.charset.StandardCharsets;
-import java.text.DateFormat;
-import java.text.DateFormatSymbols;
-import java.text.SimpleDateFormat;
-import java.util.Date;
-import java.util.Locale;
-
-import org.apache.james.mime4j.stream.MimeConfig;
-import org.apache.tika.TikaTest;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.ContainerExtractor;
-import org.apache.tika.extractor.ParserContainerExtractor;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.PasswordProvider;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.junit.Test;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.helpers.DefaultHandler;
-
-public class RFC822ParserTest extends TikaTest {
-
-    private static InputStream getStream(String name) {
-        InputStream stream = Thread.currentThread().getContextClassLoader()
-                .getResourceAsStream(name);
-        assertNotNull("Test file not found " + name, stream);
-        return stream;
-    }
-
-    @Test
-    public void testSimple() {
-        Parser parser = new RFC822Parser();
-        Metadata metadata = new Metadata();
-        InputStream stream = getStream("test-documents/testRFC822");
-        ContentHandler handler = mock(DefaultHandler.class);
-
-        try {
-            parser.parse(stream, handler, metadata, new ParseContext());
-            verify(handler).startDocument();
-            //just one body
-            verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
-            verify(handler).endElement(XHTMLContentHandler.XHTML, "p", "p");
-            //no multi-part body parts
-            verify(handler, never()).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class));
-            verify(handler, never()).endElement(XHTMLContentHandler.XHTML, "div", "div");
-            verify(handler).endDocument();
-            //note no leading spaces, and no quotes
-            assertEquals("Julien Nioche (JIRA) <ji...@apache.org>", metadata.get(TikaCoreProperties.CREATOR));
-            assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed",
-                    metadata.get(TikaCoreProperties.TITLE));
-            assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed",
-                    metadata.get(Metadata.SUBJECT));
-        } catch (Exception e) {
-            fail("Exception thrown: " + e.getMessage());
-        }
-    }
-
-    @Test
-    public void testMultipart() {
-        Parser parser = new RFC822Parser();
-        Metadata metadata = new Metadata();
-        InputStream stream = getStream("test-documents/testRFC822-multipart");
-        ContentHandler handler = mock(XHTMLContentHandler.class);
-
-        try {
-            parser.parse(stream, handler, metadata, new ParseContext());
-            verify(handler).startDocument();
-            int bodyExpectedTimes = 4, multipackExpectedTimes = 5;
-            verify(handler, times(multipackExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
-            verify(handler, times(multipackExpectedTimes)).endElement(XHTMLContentHandler.XHTML, "p", "p");
-            verify(handler).endDocument();
-
-        } catch (Exception e) {
-            fail("Exception thrown: " + e.getMessage());
-        }
-
-        //repeat, this time looking at content
-        parser = new RFC822Parser();
-        metadata = new Metadata();
-        stream = getStream("test-documents/testRFC822-multipart");
-        handler = new BodyContentHandler();
-        try {
-            parser.parse(stream, handler, metadata, new ParseContext());
-            //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
-            String bodyText = handler.toString();
-            assertTrue(bodyText.contains("body 1"));
-            assertTrue(bodyText.contains("body 2"));
-            assertFalse(bodyText.contains("R0lGODlhNgE8AMQAA")); //part of encoded gif
-        } catch (Exception e) {
-            fail("Exception thrown: " + e.getMessage());
-        }
-    }
-
-    @Test
-    public void testQuotedPrintable() {
-        Parser parser = new RFC822Parser();
-        Metadata metadata = new Metadata();
-        InputStream stream = getStream("test-documents/testRFC822_quoted");
-        ContentHandler handler = new BodyContentHandler();
-
-        try {
-            parser.parse(stream, handler, metadata, new ParseContext());
-            //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
-            String bodyText = handler.toString();
-            assertTrue(bodyText.contains("D\u00FCsseldorf has non-ascii."));
-            assertTrue(bodyText.contains("Lines can be split like this."));
-            assertTrue(bodyText.contains("Spaces at the end of a line \r\nmust be encoded.\r\n"));
-            assertFalse(bodyText.contains("=")); //there should be no escape sequences
-        } catch (Exception e) {
-            fail("Exception thrown: " + e.getMessage());
-        }
-    }
-
-    @Test
-    public void testBase64() {
-        Parser parser = new RFC822Parser();
-        Metadata metadata = new Metadata();
-        InputStream stream = getStream("test-documents/testRFC822_base64");
-        ContentHandler handler = new BodyContentHandler();
-
-        try {
-            parser.parse(stream, handler, metadata, new ParseContext());
-            //tests correct decoding of base64 text, including ISO-8859-1 bytes into Unicode
-            assertContains("Here is some text, with international characters, voil\u00E0!", handler.toString());
-        } catch (Exception e) {
-            fail("Exception thrown: " + e.getMessage());
-        }
-    }
-
-    @Test
-    public void testI18NHeaders() {
-        Parser parser = new RFC822Parser();
-        Metadata metadata = new Metadata();
-        InputStream stream = getStream("test-documents/testRFC822_i18nheaders");
-        ContentHandler handler = mock(DefaultHandler.class);
-
-        try {
-            parser.parse(stream, handler, metadata, new ParseContext());
-            //tests correct decoding of internationalized headers, both
-            //quoted-printable (Q) and Base64 (B).
-            assertEquals("Keld J\u00F8rn Simonsen <ke...@dkuug.dk>",
-                    metadata.get(TikaCoreProperties.CREATOR));
-            assertEquals("If you can read this you understand the example.",
-                    metadata.get(TikaCoreProperties.TITLE));
-            assertEquals("If you can read this you understand the example.",
-                    metadata.get(Metadata.SUBJECT));
-        } catch (Exception e) {
-            fail("Exception thrown: " + e.getMessage());
-        }
-    }
-
-    /**
-     * The from isn't in the usual form.
-     * See TIKA-618
-     */
-    @Test
-    public void testUnusualFromAddress() throws Exception {
-        Parser parser = new RFC822Parser();
-        Metadata metadata = new Metadata();
-        InputStream stream = getStream("test-documents/testRFC822_oddfrom");
-        ContentHandler handler = mock(DefaultHandler.class);
-
-        parser.parse(stream, handler, metadata, new ParseContext());
-        assertEquals("Saved by Windows Internet Explorer 7",
-                metadata.get(TikaCoreProperties.CREATOR));
-        assertEquals("Air Permit Programs | Air & Radiation | US EPA",
-                metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("Air Permit Programs | Air & Radiation | US EPA",
-                metadata.get(Metadata.SUBJECT));
-    }
-
-    /**
-     * Test for TIKA-640, increase header max beyond 10k bytes
-     */
-    @Test
-    public void testLongHeader() throws Exception {
-        StringBuilder inputBuilder = new StringBuilder();
-        for (int i = 0; i < 2000; ++i) {
-            inputBuilder.append( //len > 50
-                    "really really really really really really long name ");
-        }
-        String name = inputBuilder.toString();
-        byte[] data = ("From: " + name + "\r\n\r\n").getBytes(US_ASCII);
-
-        Parser parser = new RFC822Parser();
-        ContentHandler handler = new DefaultHandler();
-        Metadata metadata = new Metadata();
-        ParseContext context = new ParseContext();
-
-        try {
-            parser.parse(
-                    new ByteArrayInputStream(data), handler, metadata, context);
-            fail();
-        } catch (TikaException expected) {
-        }
-
-        MimeConfig config = new MimeConfig();
-        config.setMaxHeaderLen(-1);
-        config.setMaxLineLen(-1);
-        context.set(MimeConfig.class, config);
-        parser.parse(
-                new ByteArrayInputStream(data), handler, metadata, context);
-        assertEquals(name.trim(), metadata.get(TikaCoreProperties.CREATOR));
-    }
-
-    /**
-     * Test for TIKA-678 - not all headers may be present
-     */
-    @Test
-    public void testSomeMissingHeaders() throws Exception {
-        Parser parser = new RFC822Parser();
-        Metadata metadata = new Metadata();
-        InputStream stream = getStream("test-documents/testRFC822-limitedheaders");
-        ContentHandler handler = new BodyContentHandler();
-
-        parser.parse(stream, handler, metadata, new ParseContext());
-        assertEquals(true, metadata.isMultiValued(TikaCoreProperties.CREATOR));
-        assertEquals("xyz", metadata.getValues(TikaCoreProperties.CREATOR)[0]);
-        assertEquals("abc", metadata.getValues(TikaCoreProperties.CREATOR)[1]);
-        assertEquals(true, metadata.isMultiValued(Metadata.MESSAGE_FROM));
-        assertEquals("xyz", metadata.getValues(Metadata.MESSAGE_FROM)[0]);
-        assertEquals("abc", metadata.getValues(Metadata.MESSAGE_FROM)[1]);
-        assertEquals(true, metadata.isMultiValued(Metadata.MESSAGE_TO));
-        assertEquals("abc", metadata.getValues(Metadata.MESSAGE_TO)[0]);
-        assertEquals("def", metadata.getValues(Metadata.MESSAGE_TO)[1]);
-        assertEquals("abcd", metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("abcd", metadata.get(Metadata.SUBJECT));
-        assertContains("bar biz bat", handler.toString());
-    }
-
-    /**
-     * Test TIKA-1028 - If the mail contains an encrypted attachment (or
-     * an attachment that others triggers an error), parsing should carry
-     * on for the remainder regardless
-     */
-    @Test
-    public void testEncryptedZipAttachment() throws Exception {
-        Parser parser = new RFC822Parser();
-        Metadata metadata = new Metadata();
-        ParseContext context = new ParseContext();
-        InputStream stream = getStream("test-documents/testRFC822_encrypted_zip");
-        ContentHandler handler = new BodyContentHandler();
-        parser.parse(stream, handler, metadata, context);
-
-        // Check we go the metadata
-        assertEquals("Juha Haaga <ju...@gmail.com>", metadata.get(Metadata.MESSAGE_FROM));
-        assertEquals("Test mail for Tika", metadata.get(TikaCoreProperties.TITLE));
-
-        // Check we got the message text, for both Plain Text and HTML
-        assertContains("Includes encrypted zip file", handler.toString());
-        assertContains("password is \"test\".", handler.toString());
-        assertContains("This is the Plain Text part", handler.toString());
-        assertContains("This is the HTML part", handler.toString());
-
-        // We won't get the contents of the zip file, but we will get the name
-        assertContains("text.txt", handler.toString());
-        assertNotContained("ENCRYPTED ZIP FILES", handler.toString());
-
-        // Try again, this time with the password supplied
-        // Check that we also get the zip's contents as well
-        context.set(PasswordProvider.class, new PasswordProvider() {
-            public String getPassword(Metadata metadata) {
-                return "test";
-            }
-        });
-        stream = getStream("test-documents/testRFC822_encrypted_zip");
-        handler = new BodyContentHandler();
-        parser.parse(stream, handler, metadata, context);
-
-        assertContains("Includes encrypted zip file", handler.toString());
-        assertContains("password is \"test\".", handler.toString());
-        assertContains("This is the Plain Text part", handler.toString());
-        assertContains("This is the HTML part", handler.toString());
-
-        // We do get the name of the file in the encrypted zip file
-        assertContains("text.txt", handler.toString());
-
-        // TODO Upgrade to a version of Commons Compress with Encryption
-        //  support, then verify we get the contents of the text file
-        //  held within the encrypted zip
-        assumeTrue(false); // No Zip Encryption support yet
-        assertContains("TEST DATA FOR TIKA.", handler.toString());
-        assertContains("ENCRYPTED ZIP FILES", handler.toString());
-        assertContains("TIKA-1028", handler.toString());
-    }
-
-    /**
-     * Test TIKA-1028 - Ensure we can get the contents of an
-     * un-encrypted zip file
-     */
-    @Test
-    public void testNormalZipAttachment() throws Exception {
-        Parser parser = new RFC822Parser();
-        Metadata metadata = new Metadata();
-        ParseContext context = new ParseContext();
-        InputStream stream = getStream("test-documents/testRFC822_normal_zip");
-        ContentHandler handler = new BodyContentHandler();
-        parser.parse(stream, handler, metadata, context);
-
-        // Check we go the metadata
-        assertEquals("Juha Haaga <ju...@gmail.com>", metadata.get(Metadata.MESSAGE_FROM));
-        assertEquals("Test mail for Tika", metadata.get(TikaCoreProperties.TITLE));
-
-        // Check we got the message text, for both Plain Text and HTML
-        assertContains("Includes a normal, unencrypted zip file", handler.toString());
-        assertContains("This is the Plain Text part", handler.toString());
-        assertContains("This is the HTML part", handler.toString());
-
-        // We get both name and contents of the zip file's contents
-        assertContains("text.txt", handler.toString());
-        assertContains("TEST DATA FOR TIKA.", handler.toString());
-        assertContains("This is text inside an unencrypted zip file", handler.toString());
-        assertContains("TIKA-1028", handler.toString());
-    }
-
-    /**
-     * TIKA-1222 When requested, ensure that the various attachments of
-     * the mail come through properly as embedded resources
-     */
-    @Test
-    public void testGetAttachmentsAsEmbeddedResources() throws Exception {
-        TrackingHandler tracker = new TrackingHandler();
-        ContainerExtractor ex = new ParserContainerExtractor();
-        try (TikaInputStream tis = TikaInputStream.get(getStream("test-documents/testRFC822-multipart"))) {
-            assertEquals(true, ex.isSupported(tis));
-            ex.extract(tis, ex, tracker);
-        }
-
-        // Check we found all 3 parts
-        assertEquals(3, tracker.filenames.size());
-        assertEquals(3, tracker.mediaTypes.size());
-
-        // No filenames available
-        assertEquals(null, tracker.filenames.get(0));
-        assertEquals(null, tracker.filenames.get(1));
-        assertEquals(null, tracker.filenames.get(2));
-        // Types are available
-        assertEquals(MediaType.TEXT_PLAIN, tracker.mediaTypes.get(0));
-        assertEquals(MediaType.TEXT_HTML, tracker.mediaTypes.get(1));
-        assertEquals(MediaType.image("gif"), tracker.mediaTypes.get(2));
-    }
-
-    @Test
-    public void testDetection() throws Exception {
-        //test simple text file
-        XMLResult r = getXML("testRFC822_date_utf8");
-        assertEquals("message/rfc822", r.metadata.get(Metadata.CONTENT_TYPE));
-
-        //test without extension
-        r = getXML("testRFC822_eml");
-        assertEquals("message/rfc822", r.metadata.get(Metadata.CONTENT_TYPE));
-    }
-
-    @Test
-    public void testDates() throws Exception {
-        //tests non-standard dates that mime4j can't parse
-        XMLResult r = getXML("testRFC822_date_utf8");
-        assertEquals("2016-05-16T08:30:32Z", r.metadata.get(TikaCoreProperties.CREATED));
-
-        r = getXML("testRFC822_eml");
-        assertEquals("2016-05-16T08:30:32Z", r.metadata.get(TikaCoreProperties.CREATED));
-
-
-        String expected = "2016-05-15T01:32:00Z";
-
-        for (String dateString : new String[]{
-                "Sun, 15 May 2016 01:32:00 UTC", //make sure this test basically works
-                "Sun, 15 May 2016 01:32:00", //no timezone
-                "Sunday, May 15 2016 1:32 AM",
-                "May 15 2016 1:32am",
-                "May 15 2016 1:32 am",
-                "2016-05-15 01:32:00",
-                "      Sun, 15 May 2016 3:32:00 +0200",//format correctly handled by mime4j if no leading whitespace
-                "      Sun, 14 May 2016 20:32:00 EST",
-        }) {
-            testDate(dateString, expected);
-        }
-
-        //now try days without times
-        expected = "2016-05-15T12:00:00Z";
-        for (String dateString : new String[]{
-                "May 15, 2016",
-                "Sun, 15 May 2016",
-                "15 May 2016",
-        }) {
-            testDate(dateString, expected);
-        }
-    }
-
-    @Test
-    public void testTrickyDates() throws Exception {
-        DateFormat df = new SimpleDateFormat("yyyy-MM-dd", new DateFormatSymbols(Locale.US));
-        //make sure there are no mis-parses of e.g. 90 = year 90 A.D, not 1990
-        Date date1980 = df.parse("1980-01-01");
-        for (String dateString : new String[] {
-                "Mon, 29 Jan 96 14:02 GMT",
-                "7/20/95 1:12pm",
-                "08/14/2000  12:48 AM",
-                "06/24/2008, Tuesday, 11 AM",
-                "11/14/08",
-                "12/02/1996",
-                "96/12/02",
-        }) {
-            Date parsedDate = getDate(dateString);
-            if (parsedDate != null) {
-                assertTrue("date must be after 1980:"+dateString, parsedDate.getTime() > date1980.getTime());
-            }
-        }
-        //TODO: mime4j misparses these to pre 1980 dates
-        //"Wed, 27 Dec 95 11:20:40 EST",
-        //"26 Aug 00 11:14:52 EDT"
-        //
-        //We are still misparsing: 8/1/03 to a pre 1980 date
-
-    }
-
-    private void testDate(String dateString, String expected) throws Exception {
-        Date parsedDate = getDate(dateString);
-        assertNotNull("couldn't parse " + dateString, parsedDate);
-        DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'",
-                new DateFormatSymbols(Locale.US));
-        String parsedDateString = df.format(parsedDate);
-        assertEquals("failed to match: "+dateString, expected, parsedDateString);
-    }
-
-    private Date getDate(String dateString) throws Exception {
-        String mail = "From: dev@tika.apache.org\n"+
-                "Date: "+dateString+"\n";
-        Parser p = new RFC822Parser();
-        Metadata m = new Metadata();
-        try (InputStream is = TikaInputStream.get(mail.getBytes(StandardCharsets.UTF_8))) {
-            p.parse(is, new DefaultHandler(), m, new ParseContext());
-        }
-        return m.getDate(TikaCoreProperties.CREATED);
-    }
-
-    @Test
-    public void testMultipleSubjects() throws Exception {
-        //adapted from govdocs1 303710.txt
-        String s = "From: Shawn Jones [chiroshawn@yahoo.com]\n" +
-                "Subject: 2006N-3502\n" +
-                "Subject: I Urge You to Require Notice of Mercury";
-        Parser p = new RFC822Parser();
-        Metadata m = new Metadata();
-        p.parse(TikaInputStream.get(s.getBytes(StandardCharsets.UTF_8)), new DefaultHandler(), m, new ParseContext());
-        assertEquals("I Urge You to Require Notice of Mercury", m.get(TikaCoreProperties.TITLE));
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mail;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static org.junit.Assert.*;
+import static org.junit.Assume.assumeTrue;
+import static org.mockito.Matchers.any;
+import static org.mockito.Matchers.eq;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.never;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.text.DateFormat;
+import java.text.DateFormatSymbols;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.Locale;
+
+import org.apache.james.mime4j.stream.MimeConfig;
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.junit.Test;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class RFC822ParserTest extends TikaTest {
+
+    private static InputStream getStream(String name) {
+        InputStream stream = Thread.currentThread().getContextClassLoader()
+                .getResourceAsStream(name);
+        assertNotNull("Test file not found " + name, stream);
+        return stream;
+    }
+
+    @Test
+    public void testSimple() {
+        Parser parser = new RFC822Parser();
+        Metadata metadata = new Metadata();
+        InputStream stream = getStream("test-documents/testRFC822");
+        ContentHandler handler = mock(DefaultHandler.class);
+
+        try {
+            parser.parse(stream, handler, metadata, new ParseContext());
+            verify(handler).startDocument();
+            //just one body
+            verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
+            verify(handler).endElement(XHTMLContentHandler.XHTML, "p", "p");
+            //no multi-part body parts
+            verify(handler, never()).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class));
+            verify(handler, never()).endElement(XHTMLContentHandler.XHTML, "div", "div");
+            verify(handler).endDocument();
+            //note no leading spaces, and no quotes
+            assertEquals("Julien Nioche (JIRA) <ji...@apache.org>", metadata.get(TikaCoreProperties.CREATOR));
+            assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed",
+                    metadata.get(TikaCoreProperties.TITLE));
+            assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed",
+                    metadata.get(Metadata.SUBJECT));
+        } catch (Exception e) {
+            fail("Exception thrown: " + e.getMessage());
+        }
+    }
+
+    @Test
+    public void testMultipart() {
+        Parser parser = new RFC822Parser();
+        Metadata metadata = new Metadata();
+        InputStream stream = getStream("test-documents/testRFC822-multipart");
+        ContentHandler handler = mock(XHTMLContentHandler.class);
+
+        try {
+            parser.parse(stream, handler, metadata, new ParseContext());
+            verify(handler).startDocument();
+            int bodyExpectedTimes = 4, multipackExpectedTimes = 5;
+            verify(handler, times(multipackExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
+            verify(handler, times(multipackExpectedTimes)).endElement(XHTMLContentHandler.XHTML, "p", "p");
+            verify(handler).endDocument();
+
+        } catch (Exception e) {
+            fail("Exception thrown: " + e.getMessage());
+        }
+
+        //repeat, this time looking at content
+        parser = new RFC822Parser();
+        metadata = new Metadata();
+        stream = getStream("test-documents/testRFC822-multipart");
+        handler = new BodyContentHandler();
+        try {
+            parser.parse(stream, handler, metadata, new ParseContext());
+            //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
+            String bodyText = handler.toString();
+            assertTrue(bodyText.contains("body 1"));
+            assertTrue(bodyText.contains("body 2"));
+            assertFalse(bodyText.contains("R0lGODlhNgE8AMQAA")); //part of encoded gif
+        } catch (Exception e) {
+            fail("Exception thrown: " + e.getMessage());
+        }
+    }
+
+    @Test
+    public void testQuotedPrintable() {
+        Parser parser = new RFC822Parser();
+        Metadata metadata = new Metadata();
+        InputStream stream = getStream("test-documents/testRFC822_quoted");
+        ContentHandler handler = new BodyContentHandler();
+
+        try {
+            parser.parse(stream, handler, metadata, new ParseContext());
+            //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
+            String bodyText = handler.toString();
+            assertTrue(bodyText.contains("D\u00FCsseldorf has non-ascii."));
+            assertTrue(bodyText.contains("Lines can be split like this."));
+            assertTrue(bodyText.contains("Spaces at the end of a line \r\nmust be encoded.\r\n"));
+            assertFalse(bodyText.contains("=")); //there should be no escape sequences
+        } catch (Exception e) {
+            fail("Exception thrown: " + e.getMessage());
+        }
+    }
+
+    @Test
+    public void testBase64() {
+        Parser parser = new RFC822Parser();
+        Metadata metadata = new Metadata();
+        InputStream stream = getStream("test-documents/testRFC822_base64");
+        ContentHandler handler = new BodyContentHandler();
+
+        try {
+            parser.parse(stream, handler, metadata, new ParseContext());
+            //tests correct decoding of base64 text, including ISO-8859-1 bytes into Unicode
+            assertContains("Here is some text, with international characters, voil\u00E0!", handler.toString());
+        } catch (Exception e) {
+            fail("Exception thrown: " + e.getMessage());
+        }
+    }
+
+    @Test
+    public void testI18NHeaders() {
+        Parser parser = new RFC822Parser();
+        Metadata metadata = new Metadata();
+        InputStream stream = getStream("test-documents/testRFC822_i18nheaders");
+        ContentHandler handler = mock(DefaultHandler.class);
+
+        try {
+            parser.parse(stream, handler, metadata, new ParseContext());
+            //tests correct decoding of internationalized headers, both
+            //quoted-printable (Q) and Base64 (B).
+            assertEquals("Keld J\u00F8rn Simonsen <ke...@dkuug.dk>",
+                    metadata.get(TikaCoreProperties.CREATOR));
+            assertEquals("If you can read this you understand the example.",
+                    metadata.get(TikaCoreProperties.TITLE));
+            assertEquals("If you can read this you understand the example.",
+                    metadata.get(Metadata.SUBJECT));
+        } catch (Exception e) {
+            fail("Exception thrown: " + e.getMessage());
+        }
+    }
+
+    /**
+     * TIKA-618: the From header of this message is not in the usual form;
+     * its text is still expected to show up as the creator.
+     */
+    @Test
+    public void testUnusualFromAddress() throws Exception {
+        final String expectedTitle = "Air Permit Programs | Air & Radiation | US EPA";
+        Metadata metadata = new Metadata();
+        InputStream stream = getStream("test-documents/testRFC822_oddfrom");
+
+        new RFC822Parser().parse(
+                stream, mock(DefaultHandler.class), metadata, new ParseContext());
+
+        assertEquals("Saved by Windows Internet Explorer 7",
+                metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals(expectedTitle, metadata.get(TikaCoreProperties.TITLE));
+        assertEquals(expectedTitle, metadata.get(Metadata.SUBJECT));
+    }
+
+    /**
+     * Test for TIKA-640: a From header of more than 100,000 characters is
+     * rejected with a TikaException when no MimeConfig is supplied, and is
+     * accepted once a MimeConfig with both limits set to -1 is placed in the
+     * ParseContext.
+     */
+    @Test
+    public void testLongHeader() throws Exception {
+        final String chunk = "really really really really really really long name "; //len > 50
+        StringBuilder repeated = new StringBuilder();
+        int remaining = 2000;
+        while (remaining-- > 0) {
+            repeated.append(chunk);
+        }
+        String name = repeated.toString();
+        byte[] data = ("From: " + name + "\r\n\r\n").getBytes(US_ASCII);
+
+        Parser parser = new RFC822Parser();
+        ContentHandler handler = new DefaultHandler();
+        Metadata metadata = new Metadata();
+        ParseContext context = new ParseContext();
+
+        // First attempt: no MimeConfig in the context, the parse must fail.
+        boolean rejected;
+        try {
+            parser.parse(new ByteArrayInputStream(data), handler, metadata, context);
+            rejected = false;
+        } catch (TikaException expected) {
+            rejected = true;
+        }
+        if (!rejected) {
+            fail();
+        }
+
+        // Second attempt: same input, header and line limits both set to -1.
+        MimeConfig config = new MimeConfig();
+        config.setMaxHeaderLen(-1);
+        config.setMaxLineLen(-1);
+        context.set(MimeConfig.class, config);
+        parser.parse(new ByteArrayInputStream(data), handler, metadata, context);
+        assertEquals(name.trim(), metadata.get(TikaCoreProperties.CREATOR));
+    }
+
+    /**
+     * Test for TIKA-678 - not all headers may be present
+     */
+    @Test
+    public void testSomeMissingHeaders() throws Exception {
+        Parser parser = new RFC822Parser();
+        Metadata metadata = new Metadata();
+        InputStream stream = getStream("test-documents/testRFC822-limitedheaders");
+        ContentHandler handler = new BodyContentHandler();
+
+        parser.parse(stream, handler, metadata, new ParseContext());
+        assertEquals(true, metadata.isMultiValued(TikaCoreProperties.CREATOR));
+        assertEquals("xyz", metadata.getValues(TikaCoreProperties.CREATOR)[0]);
+        assertEquals("abc", metadata.getValues(TikaCoreProperties.CREATOR)[1]);
+        assertEquals(true, metadata.isMultiValued(Metadata.MESSAGE_FROM));
+        assertEquals("xyz", metadata.getValues(Metadata.MESSAGE_FROM)[0]);
+        assertEquals("abc", metadata.getValues(Metadata.MESSAGE_FROM)[1]);
+        assertEquals(true, metadata.isMultiValued(Metadata.MESSAGE_TO));
+        assertEquals("abc", metadata.getValues(Metadata.MESSAGE_TO)[0]);
+        assertEquals("def", metadata.getValues(Metadata.MESSAGE_TO)[1]);
+        assertEquals("abcd", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("abcd", metadata.get(Metadata.SUBJECT));
+        assertContains("bar biz bat", handler.toString());
+    }
+
+    /**
+     * Test TIKA-1028 - If the mail contains an encrypted attachment (or
+     * an attachment that otherwise triggers an error), parsing should carry
+     * on for the remainder regardless
+     */
+    @Test
+    public void testEncryptedZipAttachment() throws Exception {
+        final String resource = "test-documents/testRFC822_encrypted_zip";
+        Parser parser = new RFC822Parser();
+        Metadata metadata = new Metadata();
+        ParseContext context = new ParseContext();
+        ContentHandler handler = new BodyContentHandler();
+        parser.parse(getStream(resource), handler, metadata, context);
+        String text = handler.toString();
+
+        // Check we got the metadata
+        assertEquals("Juha Haaga <ju...@gmail.com>", metadata.get(Metadata.MESSAGE_FROM));
+        assertEquals("Test mail for Tika", metadata.get(TikaCoreProperties.TITLE));
+
+        // Check we got the message text, for both Plain Text and HTML
+        assertContains("Includes encrypted zip file", text);
+        assertContains("password is \"test\".", text);
+        assertContains("This is the Plain Text part", text);
+        assertContains("This is the HTML part", text);
+
+        // We won't get the contents of the zip file, but we will get the name
+        assertContains("text.txt", text);
+        assertNotContained("ENCRYPTED ZIP FILES", text);
+
+        // Try again, this time with the password supplied
+        // Check that we also get the zip's contents as well
+        context.set(PasswordProvider.class, new PasswordProvider() {
+            public String getPassword(Metadata metadata) {
+                return "test";
+            }
+        });
+        handler = new BodyContentHandler();
+        parser.parse(getStream(resource), handler, metadata, context);
+        text = handler.toString();
+
+        assertContains("Includes encrypted zip file", text);
+        assertContains("password is \"test\".", text);
+        assertContains("This is the Plain Text part", text);
+        assertContains("This is the HTML part", text);
+
+        // We do get the name of the file in the encrypted zip file
+        assertContains("text.txt", text);
+
+        // TODO Upgrade to a version of Commons Compress with Encryption
+        //  support, then verify we get the contents of the text file
+        //  held within the encrypted zip
+        assumeTrue(false); // No Zip Encryption support yet
+        assertContains("TEST DATA FOR TIKA.", text);
+        assertContains("ENCRYPTED ZIP FILES", text);
+        assertContains("TIKA-1028", text);
+    }
+
+    /**
+     * Test TIKA-1028 - the counterpart with an un-encrypted zip attachment,
+     * whose file name and contents must both be extracted
+     */
+    @Test
+    public void testNormalZipAttachment() throws Exception {
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        InputStream stream = getStream("test-documents/testRFC822_normal_zip");
+        new RFC822Parser().parse(stream, handler, metadata, new ParseContext());
+        String text = handler.toString();
+
+        // Check we got the metadata
+        assertEquals("Juha Haaga <ju...@gmail.com>", metadata.get(Metadata.MESSAGE_FROM));
+        assertEquals("Test mail for Tika", metadata.get(TikaCoreProperties.TITLE));
+
+        // Check we got the message text, for both Plain Text and HTML
+        assertContains("Includes a normal, unencrypted zip file", text);
+        assertContains("This is the Plain Text part", text);
+        assertContains("This is the HTML part", text);
+
+        // We get both name and contents of the zip file's contents
+        assertContains("text.txt", text);
+        assertContains("TEST DATA FOR TIKA.", text);
+        assertContains("This is text inside an unencrypted zip file", text);
+        assertContains("TIKA-1028", text);
+    }
+
+    /**
+     * TIKA-1222 When requested, ensure that the various attachments of
+     * the mail come through properly as embedded resources
+     */
+    @Test
+    public void testGetAttachmentsAsEmbeddedResources() throws Exception {
+        TrackingHandler tracker = new TrackingHandler();
+        ContainerExtractor ex = new ParserContainerExtractor();
+        try (TikaInputStream tis = TikaInputStream.get(getStream("test-documents/testRFC822-multipart"))) {
+            assertEquals(true, ex.isSupported(tis));
+            ex.extract(tis, ex, tracker);
+        }
+
+        // Check we found all 3 parts
+        assertEquals(3, tracker.filenames.size());
+        assertEquals(3, tracker.mediaTypes.size());
+
+        // No filenames available
+        assertEquals(null, tracker.filenames.get(0));
+        assertEquals(null, tracker.filenames.get(1));
+        assertEquals(null, tracker.filenames.get(2));
+        // Types are available
+        assertEquals(MediaType.TEXT_PLAIN, tracker.mediaTypes.get(0));
+        assertEquals(MediaType.TEXT_HTML, tracker.mediaTypes.get(1));
+        assertEquals(MediaType.image("gif"), tracker.mediaTypes.get(2));
+    }
+
+    @Test
+    public void testDetection() throws Exception {
+        //test simple text file
+        XMLResult r = getXML("testRFC822_date_utf8");
+        assertEquals("message/rfc822", r.metadata.get(Metadata.CONTENT_TYPE));
+
+        //test without extension
+        r = getXML("testRFC822_eml");
+        assertEquals("message/rfc822", r.metadata.get(Metadata.CONTENT_TYPE));
+    }
+
+    @Test
+    public void testDates() throws Exception {
+        //tests non-standard dates that mime4j can't parse
+        XMLResult r = getXML("testRFC822_date_utf8");
+        assertEquals("2016-05-16T08:30:32Z", r.metadata.get(TikaCoreProperties.CREATED));
+
+        r = getXML("testRFC822_eml");
+        assertEquals("2016-05-16T08:30:32Z", r.metadata.get(TikaCoreProperties.CREATED));
+
+
+        String expected = "2016-05-15T01:32:00Z";
+
+        for (String dateString : new String[]{
+                "Sun, 15 May 2016 01:32:00 UTC", //make sure this test basically works
+                "Sun, 15 May 2016 01:32:00", //no timezone
+                "Sunday, May 15 2016 1:32 AM",
+                "May 15 2016 1:32am",
+                "May 15 2016 1:32 am",
+                "2016-05-15 01:32:00",
+                "      Sun, 15 May 2016 3:32:00 +0200",//format correctly handled by mime4j if no leading whitespace
+                "      Sun, 14 May 2016 20:32:00 EST",
+        }) {
+            testDate(dateString, expected);
+        }
+
+        //now try days without times
+        expected = "2016-05-15T12:00:00Z";
+        for (String dateString : new String[]{
+                "May 15, 2016",
+                "Sun, 15 May 2016",
+                "15 May 2016",
+        }) {
+            testDate(dateString, expected);
+        }
+    }
+
+    @Test
+    public void testTrickyDates() throws Exception {
+        DateFormat df = new SimpleDateFormat("yyyy-MM-dd", new DateFormatSymbols(Locale.US));
+        //make sure there are no mis-parses of e.g. 90 = year 90 A.D, not 1990
+        Date date1980 = df.parse("1980-01-01");
+        for (String dateString : new String[] {
+                "Mon, 29 Jan 96 14:02 GMT",
+                "7/20/95 1:12pm",
+                "08/14/2000  12:48 AM",
+                "06/24/2008, Tuesday, 11 AM",
+                "11/14/08",
+                "12/02/1996",
+                "96/12/02",
+        }) {
+            Date parsedDate = getDate(dateString);
+            if (parsedDate != null) {
+                assertTrue("date must be after 1980:"+dateString, parsedDate.getTime() > date1980.getTime());
+            }
+        }
+        //TODO: mime4j misparses these to pre 1980 dates
+        //"Wed, 27 Dec 95 11:20:40 EST",
+        //"26 Aug 00 11:14:52 EDT"
+        //
+        //We are still misparsing: 8/1/03 to a pre 1980 date
+
+    }
+
+    /**
+     * Parses {@code dateString} as the Date header of a minimal message and
+     * compares the resulting creation date, rendered in UTC, with
+     * {@code expected}.
+     *
+     * @param dateString raw value placed in the Date header
+     * @param expected   expected date in {@code yyyy-MM-dd'T'HH:mm:ss'Z'} form
+     * @throws Exception if parsing the message fails
+     */
+    private void testDate(String dateString, String expected) throws Exception {
+        Date parsedDate = getDate(dateString);
+        assertNotNull("couldn't parse " + dateString, parsedDate);
+        DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'",
+                new DateFormatSymbols(Locale.US));
+        // The pattern ends in a literal 'Z', so the value has to be rendered
+        // in UTC; otherwise the result depends on the JVM default time zone.
+        df.setTimeZone(java.util.TimeZone.getTimeZone("UTC"));
+        String parsedDateString = df.format(parsedDate);
+        assertEquals("failed to match: "+dateString, expected, parsedDateString);
+    }
+
+    /**
+     * Runs the parser over a two-header message (From and Date) and returns
+     * the creation date found in the metadata.
+     *
+     * @param dateString raw value placed in the Date header
+     * @return the value of {@code TikaCoreProperties.CREATED} as returned by
+     *         {@code Metadata.getDate}
+     * @throws Exception if parsing fails
+     */
+    private Date getDate(String dateString) throws Exception {
+        StringBuilder mail = new StringBuilder("From: dev@tika.apache.org\n");
+        mail.append("Date: ").append(dateString).append("\n");
+        byte[] rawMail = mail.toString().getBytes(StandardCharsets.UTF_8);
+
+        Metadata metadata = new Metadata();
+        try (InputStream is = TikaInputStream.get(rawMail)) {
+            new RFC822Parser().parse(is, new DefaultHandler(), metadata, new ParseContext());
+        }
+        return metadata.getDate(TikaCoreProperties.CREATED);
+    }
+
+    /**
+     * A message with two Subject headers: the title is expected to carry the
+     * value of the last one.
+     *
+     * @throws Exception if parsing fails
+     */
+    @Test
+    public void testMultipleSubjects() throws Exception {
+        //adapted from govdocs1 303710.txt
+        String s = "From: Shawn Jones [chiroshawn@yahoo.com]\n" +
+                "Subject: 2006N-3502\n" +
+                "Subject: I Urge You to Require Notice of Mercury";
+        Parser p = new RFC822Parser();
+        Metadata m = new Metadata();
+        // try-with-resources so the stream is closed even if parse() throws
+        try (InputStream is = TikaInputStream.get(s.getBytes(StandardCharsets.UTF_8))) {
+            p.parse(is, new DefaultHandler(), m, new ParseContext());
+        }
+        assertEquals("I Urge You to Require Notice of Mercury", m.get(TikaCoreProperties.TITLE));
+    }
+}

[24/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
index 6f13a54..9ca3595 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
@@ -1,913 +1,913 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.lzx;
-
-import java.math.BigInteger;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.core.ChmCommons.IntelState;
-import org.apache.tika.parser.chm.core.ChmCommons.LzxState;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.apache.tika.parser.chm.exception.ChmParsingException;
-
-/**
- * Decompresses a chm block. Depending on the chm block type, the most relevant
- * decompression method is chosen. A chm block type can be one of the following:
- * <ul>
- * <li>UNDEFINED - no action taken, i.e. the block is skipped</li>
- * <li>VERBATIM</li>
- * <li>ALIGNED_OFFSET</li>
- * <li>UNCOMPRESSED - the simplest one</li>
- * </ul>
- * In addition there are unknown types (4-7). Currently such a block is treated
- * as having the type of the previous chm block. We need to invent a more
- * appropriate way to handle such types.
- */
-public class ChmLzxBlock {
-    private int block_number;
-    private long block_length;
-    private ChmLzxState state;
-    private byte[] content = null;
-    private ChmSection chmSection = null;
-    private int contentLength = 0;
-
-    // trying to find solution for bad blocks ...
-    private int previousBlockType = -1;
-
-    /**
-     * Creates a block from a raw data segment and decompresses it right away.
-     *
-     * @param blockNumber number of this block
-     * @param dataSegment compressed bytes of this block
-     * @param blockLength requested length of the block
-     * @param prevBlock   previously decoded block, or {@code null} if there
-     *                    is none; its content and block type are reused
-     * @throws TikaException if the parameters are rejected by
-     *                       {@code validateConstructorParams} or extraction fails
-     */
-    public ChmLzxBlock(int blockNumber, byte[] dataSegment, long blockLength,
-            ChmLzxBlock prevBlock) throws TikaException {
-        if (!validateConstructorParams(blockNumber, dataSegment, blockLength)) {
-            throw new TikaException("Check your chm lzx block parameters");
-        }
-        setBlockNumber(blockNumber);
-
-        // The previous block's content is handed to the section only when its
-        // state reports a block length greater than what is still remaining.
-        boolean usePrevContent = prevBlock != null
-                && prevBlock.getState().getBlockLength() > prevBlock.getState().getBlockRemaining();
-        ChmSection section = usePrevContent
-                ? new ChmSection(dataSegment, prevBlock.getContent())
-                : new ChmSection(dataSegment);
-        setChmSection(section);
-
-        setBlockLength(blockLength);
-
-        // we need to take care of previous context
-        checkLzxBlock(prevBlock);
-        int contentSize = (prevBlock == null || blockLength < (int) getBlockLength())
-                ? (int) getBlockLength()
-                : (int) blockLength;
-        setContent(contentSize);
-
-        if (prevBlock != null && prevBlock.getState() != null) {
-            previousBlockType = prevBlock.getState().getBlockType();
-        }
-
-        extractContent();
-    }
-
-    /**
-     * @return the number of bytes written to the content buffer so far
-     */
-    protected int getContentLength() {
-        return this.contentLength;
-    }
-
-    /**
-     * @param contentLength new value of the content length counter
-     */
-    protected void setContentLength(int contentLength) {
-        this.contentLength = contentLength;
-    }
-
-    /**
-     * @return the section this block reads its compressed data from
-     */
-    private ChmSection getChmSection() {
-        return this.chmSection;
-    }
-
-    /**
-     * @param chmSection section this block reads its compressed data from
-     */
-    private void setChmSection(ChmSection chmSection) {
-        this.chmSection = chmSection;
-    }
-
-    /**
-     * Fails fast when no decoder state is available.
-     *
-     * @throws TikaException (a ChmParsingException) if {@code getState()}
-     *                       returns {@code null}
-     */
-    private void assertStateNotNull() throws TikaException {
-        ChmLzxState current = getState();
-        if (current == null) {
-            throw new ChmParsingException("state is null");
-        }
-    }
-
-    /**
-     * Decompresses the section data into {@code content}. The loop runs until
-     * {@code getBlockLength()} bytes have been produced or a pass produces no
-     * additional output.
-     * <p>
-     * Whenever the state reports no bytes remaining in the current LZX block,
-     * the next block header is read first (on the very first header an
-     * optional Intel file size, then block type and block length) and the
-     * tables required by that block type are built.
-     *
-     * @throws TikaException if the state is null or one of the called
-     *                       table-building / decompression steps throws
-     */
-    private void extractContent() throws TikaException {
-        assertStateNotNull();
-        if (getChmSection().getData() != null) {
-            boolean continueLoop = true;
-            while (continueLoop && getContentLength() < getBlockLength()) {
-                if (getState() != null && getState().getBlockRemaining() == 0) {
-                    if (getState().getHadStarted() == LzxState.NOT_STARTED_DECODING) {
-                        getState().setHadStarted(LzxState.STARTED_DECODING);
-                        // A 1 from getSyncBits(1) means two 16-bit values
-                        // follow; together they form the Intel file size.
-                        if (getChmSection().getSyncBits(1) == 1) {
-                            int intelSizeTemp = (getChmSection()
-                                    .getSyncBits(16) << 16)
-                                    + getChmSection().getSyncBits(16);
-                            // negative (overflowed) sizes are stored as 0
-                            if (intelSizeTemp >= 0)
-                                getState().setIntelFileSize(intelSizeTemp);
-                            else
-                                getState().setIntelFileSize(0);
-                        }
-                    }
-                    // Block header: 3 bits of type, then 16 + 8 bits of length.
-                    getState().setBlockType(getChmSection().getSyncBits(3));
-                    getState().setBlockLength(
-                            (getChmSection().getSyncBits(16) << 8)
-                                    + getChmSection().getSyncBits(8));
-                    getState().setBlockRemaining(getState().getBlockLength());
-
-                    // ----------------------------------------
-                    // Block types above 3 (4 - 7) are unknown: fall back to
-                    // the previous block's type when that one is 0, 1 or 2
-                    // ----------------------------------------
-                    if (getState().getBlockType() > 3) {
-                        if (previousBlockType >= 0 && previousBlockType < 3)
-                            getState().setBlockType(previousBlockType);
-                    }
-
-                    switch (getState().getBlockType()) {
-                        case ChmCommons.ALIGNED_OFFSET:
-                            createAlignedTreeTable();
-                            //fall through
-                        case ChmCommons.VERBATIM:
-                            /* Creates mainTreeTable */
-                            createMainTreeTable();
-                            createLengthTreeTable();
-                            // a non-zero length for symbol 0xe8 switches the
-                            // Intel state to STARTED
-                            if (getState().getMainTreeLengtsTable()[0xe8] != 0)
-                                getState().setIntelState(IntelState.STARTED);
-                            break;
-                        case ChmCommons.UNCOMPRESSED:
-                            getState().setIntelState(IntelState.STARTED);
-                            if (getChmSection().getTotal() > 16)
-                                getChmSection().setSwath(
-                                        getChmSection().getSwath() - 1);
-                            // R0, R1 and R2 are each read from 4 bytes whose
-                            // order is reversed before conversion
-                            getState().setR0(
-                                    (new BigInteger(getChmSection()
-                                            .reverseByteOrder(
-                                                    getChmSection().unmarshalBytes(
-                                                            4))).longValue()));
-                            getState().setR1(
-                                    (new BigInteger(getChmSection()
-                                            .reverseByteOrder(
-                                                    getChmSection().unmarshalBytes(
-                                                            4))).longValue()));
-                            getState().setR2(
-                                    (new BigInteger(getChmSection()
-                                            .reverseByteOrder(
-                                                    getChmSection().unmarshalBytes(
-                                                            4))).longValue()));
-                            break;
-                        default:
-                            break;
-                    }
-                } //end of if BlockRemaining == 0
-
-                // tempLen is the content length the decompress call below
-                // should reach: it is capped at the block length and any
-                // excess is kept in the state as the new blockRemaining.
-                int tempLen;
-
-                if (getContentLength() + getState().getBlockRemaining() > getBlockLength()) {
-                    getState().setBlockRemaining(
-                            getContentLength() + getState().getBlockRemaining()
-                                    - (int) getBlockLength());
-                    tempLen = (int) getBlockLength();
-                } else {
-                    tempLen = getContentLength()
-                            + getState().getBlockRemaining();
-                    getState().setBlockRemaining(0);
-                }
-
-                // Each decompress call receives the section's previous
-                // content when there is one, otherwise the section data.
-                int lastLength = getContentLength();
-                switch (getState().getBlockType()) {
-                case ChmCommons.ALIGNED_OFFSET:
-                    // if(prevblock.lzxState.length>prevblock.lzxState.remaining)
-                    decompressAlignedBlock(tempLen, getChmSection().getPrevContent() == null ? getChmSection().getData() : getChmSection().getPrevContent());// prevcontext
-                    break;
-                case ChmCommons.VERBATIM:
-                    decompressVerbatimBlock(tempLen, getChmSection().getPrevContent() == null ? getChmSection().getData() : getChmSection().getPrevContent());
-                    break;
-                case ChmCommons.UNCOMPRESSED:
-                    decompressUncompressedBlock(tempLen, getChmSection().getPrevContent() == null ? getChmSection().getData() : getChmSection().getPrevContent());
-                    break;
-                }
-                getState().increaseFramesRead();
-                if ((getState().getFramesRead() < 32768)
-                        && getState().getIntelFileSize() != 0)
-                    intelE8Decoding();
-
-                // stop as soon as a pass adds nothing to the content
-                continueLoop = getContentLength() > lastLength;
-            }
-        }
-    }
-
-    /**
-     * Post-processing step called from {@code extractContent()} when an Intel
-     * file size is set. In both branches the block length is subtracted from
-     * the state's blockRemaining counter. When the block is longer than
-     * {@code ChmConstants.LZX_PRETREE_TABLEBITS} and the Intel state is not
-     * NOT_STARTED, the content is additionally scanned for 0xe8 bytes whose
-     * following offset is meant to be rewritten.
-     */
-    protected void intelE8Decoding() {
-        if (getBlockLength() <= ChmConstants.LZX_PRETREE_TABLEBITS
-                || (getState().getIntelState() == IntelState.NOT_STARTED)) {
-            getState().setBlockRemaining(
-                    getState().getBlockRemaining() - (int) getBlockLength());
-        } else {
-            // curpos starts at the remaining count as it was before the
-            // subtraction below
-            long curpos = getState().getBlockRemaining();
-            getState().setBlockRemaining(
-                    getState().getBlockRemaining() - (int) getBlockLength());
-            int i = 0;
-            while (i < getBlockLength() - 10) {
-                // NOTE(review): content[i] is a signed byte (-128..127) and is
-                // promoted to int for this comparison, so it can never equal
-                // 0xe8 (232). The condition is always true and the rewrite
-                // code below is never reached. A match would need
-                // (content[i] & 0xff) or (byte) 0xe8 - but the offset handling
-                // below should be reviewed before enabling it.
-                if (content[i] != 0xe8) {
-                    i++;
-                    continue;
-                }
-                // bytes i..i+3 are copied in reverse order, so BigInteger
-                // (big-endian, two's complement) yields their little-endian
-                // signed value
-                byte[] b = new byte[4];
-                b[0] = getContent()[i + 3];
-                b[1] = getContent()[i + 2];
-                b[2] = getContent()[i + 1];
-                b[3] = getContent()[i + 0];
-                long absoff = (new BigInteger(b)).longValue();
-                if ((absoff >= -curpos)
-                        && (absoff < getState().getIntelFileSize())) {
-                    long reloff = (absoff >= 0) ? absoff - curpos : absoff
-                            + getState().getIntelFileSize();
-                    // write the new offset back, lowest byte first
-                    getContent()[i + 0] = (byte) reloff;
-                    getContent()[i + 1] = (byte) (reloff >>> 8);
-                    getContent()[i + 2] = (byte) (reloff >>> 16);
-                    getContent()[i + 3] = (byte) (reloff >>> 24);
-                }
-                i += 4;
-                curpos += 5;
-            }
-        }
-    }
-
-    private short[] createPreLenTable() {
-        short[] tmp = new short[ChmConstants.LZX_PRETREE_MAXSYMBOLS];
-        for (int i = 0; i < ChmConstants.LZX_PRETREE_MAXSYMBOLS; i++) {
-            tmp[i] = (short) getChmSection().getSyncBits(
-                    ChmConstants.LZX_PRETREE_NUM_ELEMENTS_BITS);
-        }
-        return tmp;
-    }
-
-    /**
-     * Builds the length tree: reads the pre-tree lengths, turns them into a
-     * pre-tree table, uses both to fill the length-tree lengths and finally
-     * stores the resulting length tree table in the state.
-     *
-     * @throws TikaException (a ChmParsingException) if one of the
-     *                       intermediate tables is {@code null}, or if a
-     *                       called table-building step throws
-     */
-    private void createLengthTreeTable() throws TikaException {
-        //Read Pre Tree Table
-        short[] prelentable = createPreLenTable();
-
-        // the message names the table that is actually checked here
-        if (prelentable == null) {
-            throw new ChmParsingException("prelentable is null");
-        }
-
-        short[] pretreetable = createTreeTable2(prelentable,
-                (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
-                        + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
-                ChmConstants.LZX_PRETREE_TABLEBITS,
-                ChmConstants.LZX_PRETREE_MAXSYMBOLS);
-
-        if (pretreetable == null) {
-            throw new ChmParsingException("pretreetable is null");
-        }
-
-        //Build Length Tree
-        createLengthTreeLenTable(0, ChmConstants.LZX_NUM_SECONDARY_LENGTHS,
-                pretreetable, prelentable);
-
-        getState().setLengthTreeTable(
-                createTreeTable2(getState().getLengthTreeLengtsTable(),
-                        (1 << ChmConstants.LZX_LENGTH_TABLEBITS)
-                                + (ChmConstants.LZX_LENGTH_MAXSYMBOLS << 1),
-                        ChmConstants.LZX_LENGTH_TABLEBITS,
-                        ChmConstants.LZX_NUM_SECONDARY_LENGTHS));
-    }
-
-    /**
-     * Copies raw bytes from the section into {@code content}, starting at the
-     * current content length.
-     *
-     * @param len         not used by this method
-     * @param prevcontent not used by this method
-     */
-    private void decompressUncompressedBlock(int len, byte[] prevcontent) {
-        if (getContentLength() + getState().getBlockRemaining() <= getBlockLength()) {
-            // everything that remains fits: copy blockRemaining bytes
-            for (int i = getContentLength(); i < (getContentLength() + getState()
-                    .getBlockRemaining()); i++)
-                content[i] = getChmSection().getByte();
-
-            setContentLength(getContentLength()
-                    + getState().getBlockRemaining());
-            getState().setBlockRemaining(0);
-        } else {
-            // does not fit: fill content up to the block length only
-            for (int i = getContentLength(); i < getBlockLength(); i++)
-                content[i] = getChmSection().getByte();
-            getState().setBlockRemaining(
-                    (int) getBlockLength() - getContentLength());// = blockLen -
-                                                                 // contentlen;
-            setContentLength((int) getBlockLength());
-        }
-    }
-
-    private void decompressAlignedBlock(int len, byte[] prevcontent) throws TikaException {
-
-        if ((getChmSection() == null) || (getState() == null)
-                || (getState().getMainTreeTable() == null))
-            throw new ChmParsingException("chm section is null");
-
-        short s;
-        int x, i, border;
-        int matchlen = 0, matchfooter = 0, extra, rundest, runsrc;
-        int matchoffset = 0;
-        for (i = getContentLength(); i < len; i++) {
-            /* new code */
-            //read huffman tree from main tree
-            border = getChmSection().peekBits(
-                    ChmConstants.LZX_MAINTREE_TABLEBITS);
-            if (border >= getState().mainTreeTable.length)
-                throw new ChmParsingException("error decompressing aligned block.");
-                //break;
-            /* end new code */
-            s = getState().mainTreeTable[getChmSection().peekBits(
-                    ChmConstants.LZX_MAINTREE_TABLEBITS)];
-            if (s >= getState().getMainTreeElements()) {
-                x = ChmConstants.LZX_MAINTREE_TABLEBITS;
-                do {
-                    x++;
-                    s <<= 1;
-                    s += getChmSection().checkBit(x);
-                } while ((s = getState().mainTreeTable[s]) >= getState()
-                        .getMainTreeElements());
-            }
-            //System.out.printf("%d,", s);
-            //?getChmSection().getSyncBits(getState().mainTreeTable[s]);
-            getChmSection().getSyncBits(getState().getMainTreeLengtsTable()[s]);
-            if (s < ChmConstants.LZX_NUM_CHARS) {
-                content[i] = (byte) s;
-            } else {
-                s -= ChmConstants.LZX_NUM_CHARS;
-                matchlen = s & ChmConstants.LZX_NUM_PRIMARY_LENGTHS;
-                if (matchlen == ChmConstants.LZX_NUM_PRIMARY_LENGTHS) {
-                    matchfooter = getState().lengthTreeTable[getChmSection()
-                            .peekBits(ChmConstants.LZX_LENGTH_TABLEBITS)];//.LZX_MAINTREE_TABLEBITS)];
-                    if (matchfooter >= ChmConstants.LZX_LENGTH_MAXSYMBOLS/*?LZX_LENGTH_TABLEBITS*/) {
-                        x = ChmConstants.LZX_LENGTH_TABLEBITS;
-                        do {
-                            x++;
-                            matchfooter <<= 1;
-                            matchfooter += getChmSection().checkBit(x);
-                        } while ((matchfooter = getState().lengthTreeTable[matchfooter]) >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS);
-                    }
-                    getChmSection().getSyncBits(
-                            getState().lengthTreeLengtsTable[matchfooter]);
-                    matchlen += matchfooter;
-                }
-                matchlen += ChmConstants.LZX_MIN_MATCH;
-                matchoffset = s >>> 3;
-                if (matchoffset > 2) {
-                    extra = ChmConstants.EXTRA_BITS[matchoffset];
-                    matchoffset = (ChmConstants.POSITION_BASE[matchoffset] - 2);
-                    if (extra > 3) {
-                        extra -= 3;
-                        long verbatim_bits = getChmSection().getSyncBits(extra);
-                        matchoffset += (verbatim_bits << 3);
-                        //READ HUFF SYM in Aligned Tree
-                        int aligned_bits = getChmSection().peekBits(
-                                ChmConstants.LZX_NUM_PRIMARY_LENGTHS);
-                        int t = getState().getAlignedTreeTable()[aligned_bits];
-                        if (t >= getState().getMainTreeElements()) {
-                            x = ChmConstants.LZX_ALIGNED_TABLEBITS; //?LZX_MAINTREE_TABLEBITS; //?LZX_ALIGNED_TABLEBITS
-                            do {
-                                x++;
-                                t <<= 1;
-                                t += getChmSection().checkBit(x);
-                            } while ((t = getState().getAlignedTreeTable()[t]) >= getState()
-                                    .getMainTreeElements());
-                        }
-                        getChmSection().getSyncBits(
-                                getState().getAlignedLenTable()[t]);
-                        matchoffset += t;
-                    } else if (extra == 3) {
-                        int g = getChmSection().peekBits(
-                                ChmConstants.LZX_NUM_PRIMARY_LENGTHS);
-                        int t = getState().getAlignedTreeTable()[g];
-                        if (t >= getState().getMainTreeElements()) {
-                            x = ChmConstants.LZX_ALIGNED_TABLEBITS; //?LZX_MAINTREE_TABLEBITS;
-                            do {
-                                x++;
-                                t <<= 1;
-                                t += getChmSection().checkBit(x);
-                            } while ((t = getState().getAlignedTreeTable()[t]) >= getState()
-                                    .getMainTreeElements());
-                        }
-                        getChmSection().getSyncBits(
-                                getState().getAlignedLenTable()[t]);
-                        matchoffset += t;
-                    } else if (extra > 0) {
-                        long l = getChmSection().getSyncBits(extra);
-                        matchoffset += l;
-                    } else
-                        matchoffset = 1;
-                    getState().setR2(getState().getR1());
-                    getState().setR1(getState().getR0());
-                    getState().setR0(matchoffset);
-                } else if (matchoffset == 0) {
-                    matchoffset = (int) getState().getR0();
-                } else if (matchoffset == 1) {
-                    matchoffset = (int) getState().getR1();
-                    getState().setR1(getState().getR0());
-                    getState().setR0(matchoffset);
-                } else /** match_offset == 2 */
-                {
-                    matchoffset = (int) getState().getR2();
-                    getState().setR2(getState().getR0());
-                    getState().setR0(matchoffset);
-                }
-                rundest = i;
-                runsrc = rundest - matchoffset;
-                i += (matchlen - 1);
-                if (i > len)
-                    break;
-
-                if (runsrc < 0) {
-                    if (matchlen + runsrc <= 0) {
-                        runsrc = prevcontent.length + runsrc;
-                        while (matchlen-- > 0)
-                            content[rundest++] = prevcontent[runsrc++];
-                    } else {
-                        runsrc = prevcontent.length + runsrc;
-                        while (runsrc < prevcontent.length)
-                            content[rundest++] = prevcontent[runsrc++];
-                        matchlen = matchlen + runsrc - prevcontent.length;
-                        runsrc = 0;
-                        while (matchlen-- > 0)
-                            content[rundest++] = content[runsrc++];
-                    }
-
-                } else {
-                    /* copies any wrappes around source data */
-                    while ((runsrc < 0) && (matchlen-- > 0)) {
-                        content[rundest++] = content[(int) (runsrc + getBlockLength())];
-                        runsrc++;
-                    }
-                    /* copies match data - no worries about destination wraps */
-                    while (matchlen-- > 0)
-                        content[rundest++] = content[runsrc++];
-                }
-            }
-        }
-        setContentLength(len);
-    }
-
-    /**
-     * Guards a decode table against being indexed before it exists.
-     *
-     * @param array the table to check
-     * @throws TikaException a {@link ChmParsingException} when {@code array}
-     *             is {@code null}
-     */
-    private void assertShortArrayNotNull(short[] array) throws TikaException {
-        if (array != null) {
-            return;
-        }
-        throw new ChmParsingException("short[] is null");
-    }
-
-    /**
-     * Decodes symbols of a verbatim block into {@code content}, starting at
-     * the current content length and stopping once position {@code len} is
-     * reached. Each main-tree symbol is either a literal byte or a match
-     * (length plus offset) that copies already decoded bytes; a match whose
-     * source lies before the start of {@code content} is read from the tail
-     * of {@code prevcontent}.
-     *
-     * @param len position in {@code content} at which decoding stops; it is
-     *            also stored as the new content length
-     * @param prevcontent bytes that logically precede {@code content}
-     * @throws TikaException if the main tree table is missing or a match
-     *             points outside the buffers
-     */
-    private void decompressVerbatimBlock(int len, byte[] prevcontent) throws TikaException {
-        short s;
-        int x, i;
-        int matchlen = 0, matchfooter = 0, extra, rundest, runsrc;
-        int matchoffset = 0;
-        for (i = getContentLength(); i < len; i++) {
-            /* reads a huffman symbol from the main tree */
-            int f = getChmSection().peekBits(
-                    ChmConstants.LZX_MAINTREE_TABLEBITS);
-            assertShortArrayNotNull(getState().getMainTreeTable());
-            s = getState().getMainTreeTable()[f];
-            if (s >= ChmConstants.LZX_MAIN_MAXSYMBOLS) {
-                x = ChmConstants.LZX_MAINTREE_TABLEBITS;
-                do {
-                    x++;
-                    s <<= 1;
-                    s += getChmSection().checkBit(x);
-                } while ((s = getState().getMainTreeTable()[s]) >= ChmConstants.LZX_MAIN_MAXSYMBOLS);
-            }
-            getChmSection().getSyncBits(getState().getMainTreeLengtsTable()[s]);
-            if (s < ChmConstants.LZX_NUM_CHARS) {
-                /* literal byte */
-                content[i] = (byte) s;
-            } else {
-                /* match: low bits carry the length header, the rest the offset slot */
-                s -= ChmConstants.LZX_NUM_CHARS;
-                matchlen = s & ChmConstants.LZX_NUM_PRIMARY_LENGTHS;
-                if (matchlen == ChmConstants.LZX_NUM_PRIMARY_LENGTHS) {
-                    matchfooter = getState().getLengthTreeTable()[getChmSection()
-                            .peekBits(ChmConstants.LZX_LENGTH_TABLEBITS)];
-                    if (matchfooter >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS) {
-                        x = ChmConstants.LZX_LENGTH_TABLEBITS;
-                        do {
-                            x++;
-                            matchfooter <<= 1;
-                            matchfooter += getChmSection().checkBit(x);
-                        } while ((matchfooter = getState().getLengthTreeTable()[matchfooter]) >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS);
-                    }
-                    getChmSection().getSyncBits(
-                            getState().getLengthTreeLengtsTable()[matchfooter]);
-                    matchlen += matchfooter;
-                }
-                matchlen += ChmConstants.LZX_MIN_MATCH;
-                // shorter than 2
-                matchoffset = s >>> 3;
-                if (matchoffset > 2) {
-                    if (matchoffset != 3) { // should get other bits to retrieve
-                                            // offset
-                        extra = ChmConstants.EXTRA_BITS[matchoffset];
-                        long l = getChmSection().getSyncBits(extra);
-                        matchoffset = (int) (ChmConstants.POSITION_BASE[matchoffset] - 2 + l);
-                    } else {
-                        matchoffset = 1;
-                    }
-                    /* a fresh offset pushes the recent-offset registers down */
-                    getState().setR2(getState().getR1());
-                    getState().setR1(getState().getR0());
-                    getState().setR0(matchoffset);
-                } else if (matchoffset == 0) {
-                    matchoffset = (int) getState().getR0();
-                } else if (matchoffset == 1) {
-                    matchoffset = (int) getState().getR1();
-                    getState().setR1(getState().getR0());
-                    getState().setR0(matchoffset);
-                } else /* match_offset == 2 */
-                {
-                    matchoffset = (int) getState().getR2();
-                    getState().setR2(getState().getR0());
-                    getState().setR0(matchoffset);
-                }
-                rundest = i;
-                runsrc = rundest - matchoffset;
-                i += (matchlen - 1);
-                if (i > len)
-                    break;
-                if (runsrc < 0) {
-                    if (matchlen + runsrc <= 0) {
-                        /* the whole match lies in the previous content */
-                        runsrc = prevcontent.length + runsrc;
-                        while ((matchlen-- > 0) && ((runsrc + 1) > 0)) {
-                            if ((rundest < content.length)
-                                    && (runsrc < content.length))
-                                content[rundest++] = prevcontent[runsrc++];
-                        }
-                    } else {
-                        /*
-                         * The match starts in the previous content and
-                         * continues at the start of this block. The number of
-                         * bytes owed by this block has to be fixed before
-                         * runsrc is advanced; computing it afterwards always
-                         * yields the full match length again.
-                         */
-                        int fromCurrent = matchlen + runsrc;
-                        runsrc = prevcontent.length + runsrc;
-                        while (runsrc < prevcontent.length) {
-                            /*
-                             * Skipping the copy here would never advance
-                             * runsrc and hang the loop, so fail instead.
-                             */
-                            if ((rundest >= content.length)
-                                    || (runsrc >= content.length))
-                                throw new ChmParsingException(
-                                        "match is out of range");
-                            content[rundest++] = prevcontent[runsrc++];
-                        }
-                        matchlen = fromCurrent;
-                        runsrc = 0;
-                        while (matchlen-- > 0)
-                            content[rundest++] = content[runsrc++];
-                    }
-
-                } else {
-                    /* copies match data - no worries about destination wraps */
-                    while (matchlen-- > 0) {
-                        if ((rundest < content.length)
-                                && (runsrc < content.length))
-                            content[rundest++] = content[runsrc++];
-                    }
-                }
-            }
-        }
-        setContentLength(len);
-    }
-
-    private void createLengthTreeLenTable(int offset, int tablelen,
-            short[] pretreetable, short[] prelentable) throws TikaException {
-        if (prelentable == null || getChmSection() == null
-                || pretreetable == null || prelentable == null)
-            throw new ChmParsingException("is null");
-
-        int i = offset; // represents offset
-        int z, y, x;// local counters
-        while (i < tablelen) {
-            //Read HUFF sym to z
-            z = pretreetable[getChmSection().peekBits(
-                    ChmConstants.LZX_PRETREE_TABLEBITS)];
-            if (z >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS) {// 1 bug, should be
-                                                             // 20
-                x = ChmConstants.LZX_PRETREE_TABLEBITS;
-                do {
-                    x++;
-                    z <<= 1;
-                    z += getChmSection().checkBit(x);
-                } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS);
-            }
-            getChmSection().getSyncBits(prelentable[z]);
-            
-            if (z < 17) {
-                z = getState().getLengthTreeLengtsTable()[i] - z;
-                if (z < 0)
-                    z = z + 17;
-                getState().getLengthTreeLengtsTable()[i] = (short) z;
-                i++;
-            } else if (z == 17) {
-                y = getChmSection().getSyncBits(4);
-                y += 4;
-                for (int j = 0; j < y; j++)
-                    if (i < getState().getLengthTreeLengtsTable().length)
-                        getState().getLengthTreeLengtsTable()[i++] = 0;
-            } else if (z == 18) {
-                y = getChmSection().getSyncBits(5);
-                y += 20;
-                for (int j = 0; j < y; j++)
-                    //no tolerate //if (i < getState().getLengthTreeLengtsTable().length)
-                        getState().getLengthTreeLengtsTable()[i++] = 0;
-            } else if (z == 19) {
-                y = getChmSection().getSyncBits(1);
-                y += 4;
-                z = pretreetable[getChmSection().peekBits(
-                        ChmConstants.LZX_PRETREE_TABLEBITS)];
-                if (z >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS) {// 20
-                    x = ChmConstants.LZX_PRETREE_TABLEBITS;// 6
-                    do {
-                        x++;
-                        z <<= 1;
-                        z += getChmSection().checkBit(x);
-                    } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS);//LZX_MAINTREE_TABLEBITS);
-                }
-                getChmSection().getSyncBits(prelentable[z]);
-                z = getState().getLengthTreeLengtsTable()[i] - z;
-                if (z < 0)
-                    z = z + 17;
-                for (int j = 0; j < y; j++)
-                    getState().getLengthTreeLengtsTable()[i++] = (short) z;
-            }
-        }
-    }
-
-    private void createMainTreeTable() throws TikaException {
-        //Read Pre Tree Table
-        short[] prelentable = createPreLenTable();
-        short[] pretreetable = createTreeTable2(prelentable,
-                (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
-                        + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
-                ChmConstants.LZX_PRETREE_TABLEBITS,
-                ChmConstants.LZX_PRETREE_MAXSYMBOLS);
-
-        createMainTreeLenTable(0, ChmConstants.LZX_NUM_CHARS, pretreetable,
-                prelentable);
-        
-        //Read Pre Tree Table
-        prelentable = createPreLenTable();
-        pretreetable = createTreeTable2(prelentable,
-                (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
-                        + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
-                ChmConstants.LZX_PRETREE_TABLEBITS,
-                ChmConstants.LZX_PRETREE_MAXSYMBOLS);
-
-        createMainTreeLenTable(ChmConstants.LZX_NUM_CHARS,
-                getState().mainTreeLengtsTable.length, pretreetable,
-                prelentable);
-
-        getState().setMainTreeTable(
-                createTreeTable2(getState().mainTreeLengtsTable,
-                        (1 << ChmConstants.LZX_MAINTREE_TABLEBITS)
-                                + (ChmConstants.LZX_MAINTREE_MAXSYMBOLS << 1),
-                        ChmConstants.LZX_MAINTREE_TABLEBITS, getState()
-                                .getMainTreeElements()));
-    }
-
-    /**
-     * Reads pre-tree coded code lengths from the bit stream and applies them
-     * to the main tree lengths table of the current state, for the entries
-     * from {@code offset} up to (excluding) {@code tablelen}.
-     *
-     * @param offset first table index to fill
-     * @param tablelen index at which filling stops
-     * @param pretreetable decode table of the pre-tree
-     * @param prelentable code lengths of the pre-tree symbols
-     * @throws TikaException if {@code pretreetable} is {@code null} or a zero
-     *             run leaves the lengths table
-     */
-    private void createMainTreeLenTable(int offset, int tablelen,
-            short[] pretreetable, short[] prelentable) throws TikaException {
-        if (pretreetable == null) {
-            throw new ChmParsingException("pretreetable is null");
-        }
-        int pos = offset;
-        while (pos < tablelen) {
-            /* decodes the next pre-tree symbol */
-            int symbol = pretreetable[getChmSection().peekBits(
-                    ChmConstants.LZX_PRETREE_TABLEBITS)];
-            if (symbol >= ChmConstants.LZX_PRETREE_MAXSYMBOLS) {
-                int bitIndex = ChmConstants.LZX_PRETREE_TABLEBITS;
-                do {
-                    bitIndex++;
-                    symbol <<= 1;
-                    symbol += getChmSection().checkBit(bitIndex);
-                    symbol = pretreetable[symbol];
-                } while (symbol >= ChmConstants.LZX_PRETREE_MAXSYMBOLS);
-            }
-            getChmSection().getSyncBits(prelentable[symbol]);
-
-            if (symbol < 17) {
-                /* single entry: stored length minus symbol, modulo 17 */
-                int delta = getState().getMainTreeLengtsTable()[pos] - symbol;
-                if (delta < 0) {
-                    delta += 17;
-                }
-                getState().mainTreeLengtsTable[pos] = (short) delta;
-                pos++;
-            } else if (symbol == 17) {
-                /* short run of zeros */
-                int run = getChmSection().getSyncBits(4);
-                run += 4;
-                for (int n = 0; n < run; n++) {
-                    assertInRange(getState().getMainTreeLengtsTable(), pos);
-                    getState().mainTreeLengtsTable[pos++] = 0;
-                }
-            } else if (symbol == 18) {
-                /* long run of zeros */
-                int run = getChmSection().getSyncBits(5);
-                run += 20;
-                for (int n = 0; n < run; n++) {
-                    assertInRange(getState().getMainTreeLengtsTable(), pos);
-                    getState().mainTreeLengtsTable[pos++] = 0;
-                }
-            } else if (symbol == 19) {
-                /* run of one delta-coded value, cut short at the table end */
-                int run = getChmSection().getSyncBits(1);
-                run += 4;
-                symbol = pretreetable[getChmSection().peekBits(
-                        ChmConstants.LZX_PRETREE_TABLEBITS)];
-                if (symbol >= ChmConstants.LZX_PRETREE_MAXSYMBOLS) {
-                    int bitIndex = ChmConstants.LZX_PRETREE_TABLEBITS;
-                    do {
-                        bitIndex++;
-                        symbol <<= 1;
-                        symbol += getChmSection().checkBit(bitIndex);
-                        symbol = pretreetable[symbol];
-                    } while (symbol >= ChmConstants.LZX_PRETREE_MAXSYMBOLS);
-                }
-                getChmSection().getSyncBits(prelentable[symbol]);
-                int delta = getState().mainTreeLengtsTable[pos] - symbol;
-                if (delta < 0) {
-                    delta += 17;
-                }
-                for (int n = 0; n < run; n++) {
-                    if (pos < getState().getMainTreeLengtsTable().length) {
-                        getState().mainTreeLengtsTable[pos++] = (short) delta;
-                    }
-                }
-            }
-        }
-    }
-
-    private void assertInRange(short[] array, int index) throws ChmParsingException {
-        if (index >= array.length)
-            throw new ChmParsingException(index + " is bigger than "
-                    + array.length);
-    }
-
-    private short[] createAlignedLenTable() {
-        int tablelen = ChmConstants.LZX_ALIGNED_NUM_ELEMENTS;//LZX_BLOCKTYPE_UNCOMPRESSED;//
-        int bits = ChmConstants.LZX_BLOCKTYPE_UNCOMPRESSED;
-        short[] tmp = new short[tablelen];
-        for (int i = 0; i < tablelen; i++) {
-            tmp[i] = (short) getChmSection().getSyncBits(bits);
-        }
-        return tmp;
-    }
-
-    private void createAlignedTreeTable() throws ChmParsingException {
-        getState().setAlignedLenTable(createAlignedLenTable());
-        getState().setAlignedTreeTable(//setAlignedLenTable(
-                createTreeTable2(getState().getAlignedLenTable(),
-                        (1 << ChmConstants.LZX_NUM_PRIMARY_LENGTHS)
-                                + (ChmConstants.LZX_ALIGNED_MAXSYMBOLS << 1),
-                        ChmConstants.LZX_NUM_PRIMARY_LENGTHS,
-                        ChmConstants.LZX_ALIGNED_MAXSYMBOLS));
-    }
-
-    /**
-     * Builds a huffman decode table from a table of code lengths. Codes of
-     * up to {@code bits} bits are mapped directly: every table index that
-     * starts with the code holds the symbol. Longer codes (up to 16 bits)
-     * are stored as a binary tree in the entries behind the direct part.
-     *
-     * @param lentable code length of each symbol
-     * @param tablelen size of the table to create
-     * @param bits number of bits resolved by the direct part
-     * @param maxsymbol number of symbols to consider
-     * @return the decode table; it is returned whether or not the codes
-     *         fill it completely
-     * @throws ChmParsingException if the codes need more room than the table
-     *             offers
-     */
-    private short[] createTreeTable2(short[] lentable, int tablelen, int bits,
-            int maxsymbol) throws ChmParsingException {
-        short[] table = new short[tablelen];
-        short sym;
-        int leaf;
-        int codeLen = 1;
-        /* the current position in the decode table */
-        int pos = 0;
-        long tableMask = (1 << bits);
-        long bitMask = (tableMask >> 1);
-        long nextSymbol = bitMask;
-
-        /* fills entries for short codes for a direct mapping */
-        for (; codeLen <= bits; codeLen++, bitMask >>= 1) {
-            for (sym = 0; sym < maxsymbol; sym++) {
-                if (lentable.length <= sym || lentable[sym] != codeLen) {
-                    continue;
-                }
-                leaf = pos;
-                if ((pos += bitMask) > tableMask) {
-                    /* table overflow */
-                    throw new ChmParsingException("Table overflow");
-                }
-                for (long remaining = bitMask; remaining > 0; remaining--) {
-                    table[leaf++] = sym;
-                }
-            }
-        }
-
-        /* nothing more to do unless there are codes longer than nbits */
-        if (pos == tableMask) {
-            return table;
-        }
-
-        /* clears the remainder of the table */
-        for (leaf = pos; leaf < tableMask; leaf++) {
-            table[leaf] = 0;
-        }
-
-        /* gives ourselves room for codes to grow by up to 16 more bits */
-        pos <<= 16;
-        tableMask <<= 16;
-        bitMask = 1 << 15;
-
-        for (; codeLen <= 16; codeLen++, bitMask >>= 1) {
-            for (sym = 0; sym < maxsymbol; sym++) {
-                if (lentable.length <= sym || lentable[sym] != codeLen) {
-                    continue;
-                }
-                leaf = pos >> 16;
-                for (int depth = 0; depth < codeLen - bits; depth++) {
-                    /*
-                     * if this path hasn't been taken yet, 'allocate' two
-                     * entries
-                     */
-                    if (table[leaf] == 0
-                            && ((nextSymbol << 1) + 1) < table.length) {
-                        table[(int) (nextSymbol << 1)] = 0;
-                        table[(int) (nextSymbol << 1) + 1] = 0;
-                        table[leaf] = (short) nextSymbol++;
-                    }
-                    /*
-                     * follows the path and select either left or right for
-                     * next bit
-                     */
-                    leaf = table[leaf] << 1;
-                    if (((pos >> (15 - depth)) & 1) != 0) {
-                        leaf++;
-                    }
-                }
-                table[leaf] = sym;
-
-                if ((pos += bitMask) > tableMask) {
-                    /* table overflow */
-                    throw new ChmParsingException("Table overflow");
-                }
-            }
-        }
-
-        return table;
-    }
-
-    /**
-     * Returns the buffer holding the decompressed bytes of this block.
-     *
-     * @return the internal content array itself, not a copy
-     */
-    public byte[] getContent() {
-        return this.content;
-    }
-
-    /**
-     * Returns a copy of a range of the decompressed content.
-     *
-     * @param startOffset start of the range
-     * @param endOffset end of the range
-     * @return the copied range, or a one-byte array when there is no content
-     */
-    public byte[] getContent(int startOffset, int endOffset) {
-        if (getContent() == null) {
-            return new byte[1];
-        }
-        return ChmCommons.copyOfRange(getContent(), startOffset, endOffset);
-    }
-
-    /**
-     * Returns a copy of the decompressed content from {@code start} to its
-     * end.
-     *
-     * @param start start of the range
-     * @return the copied range, or a one-byte array when there is no content
-     */
-    public byte[] getContent(int start) {
-        if (getContent() == null) {
-            return new byte[1];
-        }
-        return ChmCommons.copyOfRange(getContent(), start,
-                getContent().length);
-    }
-
-    /**
-     * Replaces the content buffer with a new zero-filled one.
-     *
-     * @param contentLength size of the new buffer in bytes
-     */
-    private void setContent(int contentLength) {
-        content = new byte[contentLength];
-    }
-
-    /**
-     * Initialises the decoder state of this block: a copy of the previous
-     * block's state when there is one, otherwise a fresh state sized by the
-     * block length.
-     *
-     * @param chmPrevLzxBlock the preceding block, or {@code null} for none
-     * @throws TikaException if there is no previous block and the block
-     *             length does not fit the fresh state
-     */
-    private void checkLzxBlock(ChmLzxBlock chmPrevLzxBlock) throws TikaException {
-        if (chmPrevLzxBlock != null) {
-            //use clone to avoid changing a cached or to be cached block
-            setState(chmPrevLzxBlock.getState().clone());
-            return;
-        }
-        /*
-         * Without a previous block there is nothing to clone; an oversized
-         * length is reported instead of dereferencing the missing block.
-         */
-        if (getBlockLength() >= Integer.MAX_VALUE)
-            throw new ChmParsingException("block length is too big: "
-                    + getBlockLength());
-        setState(new ChmLzxState((int) getBlockLength()));
-    }
-
-    /**
-     * Checks the arguments handed to the constructor.
-     *
-     * @param blockNumber number of the block, must not be negative
-     * @param dataSegment compressed data, must not be {@code null} or empty
-     * @param blockLength length of the block, must be greater than zero
-     * @return {@code true}; every invalid argument is reported by an
-     *         exception instead of a {@code false} result
-     * @throws TikaException a {@link ChmParsingException} naming the first
-     *             invalid argument
-     */
-    private boolean validateConstructorParams(int blockNumber,
-            byte[] dataSegment, long blockLength) throws TikaException {
-        if (blockNumber < 0)
-            throw new ChmParsingException("block number should not be negative");
-        if (dataSegment == null || dataSegment.length == 0)
-            throw new ChmParsingException(
-                    "data segment should not be null or empty");
-        if (blockLength <= 0)
-            throw new ChmParsingException(
-                    "block length should be more than zero");
-        return true;
-    }
-
-    /**
-     * Returns the number of this block.
-     *
-     * @return the block number
-     */
-    public int getBlockNumber() {
-        return this.block_number;
-    }
-
-    /**
-     * Stores the number of this block.
-     *
-     * @param block_number the block number
-     */
-    private void setBlockNumber(int block_number) {
-        this.block_number = block_number;
-    }
-
-    /**
-     * Returns the length of this block.
-     *
-     * @return the block length
-     */
-    private long getBlockLength() {
-        return this.block_length;
-    }
-
-    /**
-     * Stores the length of this block.
-     *
-     * @param block_length the block length
-     */
-    private void setBlockLength(long block_length) {
-        this.block_length = block_length;
-    }
-
-    /**
-     * Returns the decoder state of this block.
-     *
-     * @return the state object itself, not a copy
-     */
-    public ChmLzxState getState() {
-        return this.state;
-    }
-
-    /**
-     * Stores the decoder state of this block.
-     *
-     * @param state the state to use
-     */
-    private void setState(ChmLzxState state) {
-        this.state = state;
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.lzx;
+
+import java.math.BigInteger;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmCommons.IntelState;
+import org.apache.tika.parser.chm.core.ChmCommons.LzxState;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * Decompresses a chm block. Depending on the chm block type, the most relevant
+ * decompression method is chosen. A chm block type can be one of the following:
+ * <ul>
+ * <li>UNDEFINED - no action is taken, i.e. the block is skipped</li>
+ * <li>VERBATIM</li>
+ * <li>ALIGNED_OFFSET</li>
+ * <li>UNCOMPRESSED - the simplest one</li>
+ * </ul>
+ * In addition there are unknown types (4-7). Currently such a type is replaced
+ * by the type of the previous chm block. We need to invent a more appropriate
+ * way to handle such types.
+ */
+public class ChmLzxBlock {
+    private int block_number;
+    private long block_length;
+    private ChmLzxState state;
+    private byte[] content = null;
+    private ChmSection chmSection = null;
+    private int contentLength = 0;
+
+    // trying to find solution for bad blocks ...
+    private int previousBlockType = -1;
+
+    /**
+     * Creates a block and immediately decompresses its content.
+     *
+     * @param blockNumber number of the block
+     * @param dataSegment compressed data of the block
+     * @param blockLength length of the block
+     * @param prevBlock the preceding block whose state (and possibly
+     *            content) is carried over, or {@code null} for none
+     * @throws TikaException if a parameter is invalid or decompression fails
+     */
+    public ChmLzxBlock(int blockNumber, byte[] dataSegment, long blockLength,
+            ChmLzxBlock prevBlock) throws TikaException {
+        if (!validateConstructorParams(blockNumber, dataSegment, blockLength)) {
+            throw new TikaException("Check your chm lzx block parameters");
+        }
+        setBlockNumber(blockNumber);
+
+        /* the previous content is handed on only in this case */
+        boolean withPrevContent = prevBlock != null
+                && prevBlock.getState().getBlockLength() > prevBlock
+                        .getState().getBlockRemaining();
+        if (withPrevContent) {
+            setChmSection(new ChmSection(dataSegment, prevBlock.getContent()));
+        } else {
+            setChmSection(new ChmSection(dataSegment));
+        }
+
+        setBlockLength(blockLength);
+
+        // ============================================
+        // we need to take care of previous context
+        // ============================================
+        checkLzxBlock(prevBlock);
+        int bufferSize;
+        if (prevBlock == null || blockLength < (int) getBlockLength()) {
+            bufferSize = (int) getBlockLength();
+        } else {
+            bufferSize = (int) blockLength;
+        }
+        setContent(bufferSize);
+
+        if (prevBlock != null && prevBlock.getState() != null) {
+            previousBlockType = prevBlock.getState().getBlockType();
+        }
+
+        extractContent();
+    }
+
+    /**
+     * Returns the content length recorded for this block.
+     *
+     * @return the content length
+     */
+    protected int getContentLength() {
+        return this.contentLength;
+    }
+
+    /**
+     * Records the content length of this block.
+     *
+     * @param contentLength the new content length
+     */
+    protected void setContentLength(int contentLength) {
+        this.contentLength = contentLength;
+    }
+
+    /**
+     * Returns the section the compressed bits are read from.
+     *
+     * @return the chm section
+     */
+    private ChmSection getChmSection() {
+        return this.chmSection;
+    }
+
+    /**
+     * Stores the section the compressed bits are read from.
+     *
+     * @param chmSection the chm section
+     */
+    private void setChmSection(ChmSection chmSection) {
+        this.chmSection = chmSection;
+    }
+
+    /**
+     * Makes sure a decoder state is present.
+     *
+     * @throws TikaException a {@link ChmParsingException} when the state is
+     *             {@code null}
+     */
+    private void assertStateNotNull() throws TikaException {
+        if (getState() != null) {
+            return;
+        }
+        throw new ChmParsingException("state is null");
+    }
+
+    /**
+     * Decompresses the section data into {@code content}. The loop runs
+     * until the content length reaches the block length or a round adds no
+     * content. Whenever the state has no block bytes remaining, a new block
+     * header (type and length) is read and the tables or registers that
+     * block type needs are set up first.
+     *
+     * @throws TikaException if the state is missing or decoding fails
+     */
+    private void extractContent() throws TikaException {
+        assertStateNotNull();
+        if (getChmSection().getData() == null) {
+            return;
+        }
+        while (getContentLength() < getBlockLength()) {
+            if (getState().getBlockRemaining() == 0) {
+                /* the intel file size header is read once, before the first block */
+                if (getState().getHadStarted() == LzxState.NOT_STARTED_DECODING) {
+                    getState().setHadStarted(LzxState.STARTED_DECODING);
+                    if (getChmSection().getSyncBits(1) == 1) {
+                        int intelSize = (getChmSection().getSyncBits(16) << 16)
+                                + getChmSection().getSyncBits(16);
+                        if (intelSize < 0) {
+                            getState().setIntelFileSize(0);
+                        } else {
+                            getState().setIntelFileSize(intelSize);
+                        }
+                    }
+                }
+                getState().setBlockType(getChmSection().getSyncBits(3));
+                getState().setBlockLength(
+                        (getChmSection().getSyncBits(16) << 8)
+                                + getChmSection().getSyncBits(8));
+                getState().setBlockRemaining(getState().getBlockLength());
+
+                // ----------------------------------------
+                // Trying to handle 3 - 7 block types
+                // ----------------------------------------
+                if (getState().getBlockType() > 3
+                        && previousBlockType >= 0 && previousBlockType < 3) {
+                    getState().setBlockType(previousBlockType);
+                }
+
+                switch (getState().getBlockType()) {
+                    case ChmCommons.ALIGNED_OFFSET:
+                        createAlignedTreeTable();
+                        //fall through
+                    case ChmCommons.VERBATIM:
+                        /* Creates mainTreeTable */
+                        createMainTreeTable();
+                        createLengthTreeTable();
+                        if (getState().getMainTreeLengtsTable()[0xe8] != 0) {
+                            getState().setIntelState(IntelState.STARTED);
+                        }
+                        break;
+                    case ChmCommons.UNCOMPRESSED:
+                        getState().setIntelState(IntelState.STARTED);
+                        if (getChmSection().getTotal() > 16) {
+                            getChmSection().setSwath(
+                                    getChmSection().getSwath() - 1);
+                        }
+                        getState().setR0(readReversedFourBytes());
+                        getState().setR1(readReversedFourBytes());
+                        getState().setR2(readReversedFourBytes());
+                        break;
+                    default:
+                        break;
+                }
+            } //end of if BlockRemaining == 0
+
+            /* decodes up to the end of this block at most, the rest stays pending */
+            int reachable = getContentLength() + getState().getBlockRemaining();
+            int target;
+            if (reachable > getBlockLength()) {
+                getState().setBlockRemaining(reachable - (int) getBlockLength());
+                target = (int) getBlockLength();
+            } else {
+                target = reachable;
+                getState().setBlockRemaining(0);
+            }
+
+            int lengthBefore = getContentLength();
+            switch (getState().getBlockType()) {
+                case ChmCommons.ALIGNED_OFFSET:
+                    decompressAlignedBlock(target, previousOrOwnData());
+                    break;
+                case ChmCommons.VERBATIM:
+                    decompressVerbatimBlock(target, previousOrOwnData());
+                    break;
+                case ChmCommons.UNCOMPRESSED:
+                    decompressUncompressedBlock(target, previousOrOwnData());
+                    break;
+            }
+            getState().increaseFramesRead();
+            if ((getState().getFramesRead() < 32768)
+                    && getState().getIntelFileSize() != 0) {
+                intelE8Decoding();
+            }
+
+            /* a round without progress ends the loop */
+            if (getContentLength() <= lengthBefore) {
+                break;
+            }
+        }
+    }
+
+    /**
+     * Reads four bytes from the section, reverses their order and
+     * interprets them as a signed big-endian number.
+     */
+    private long readReversedFourBytes() {
+        return (new BigInteger(getChmSection().reverseByteOrder(
+                getChmSection().unmarshalBytes(4)))).longValue();
+    }
+
+    /**
+     * Returns the previous content of the section, or the section data when
+     * there is no previous content.
+     */
+    private byte[] previousOrOwnData() {
+        return getChmSection().getPrevContent() == null ? getChmSection()
+                .getData() : getChmSection().getPrevContent();
+    }
+
+    /**
+     * Adjusts the block-remaining counter after a frame was decoded and, when
+     * Intel E8 translation is active, scans the decoded bytes for 0xE8 markers
+     * in order to rewrite the 32-bit value stored at each marker.
+     * <p>
+     * In both branches the block length is subtracted from the state's
+     * block-remaining counter. The scan itself only runs when the block length
+     * exceeds {@code ChmConstants.LZX_PRETREE_TABLEBITS} and the Intel state is
+     * not {@code IntelState.NOT_STARTED}.
+     * <p>
+     * NOTE(review): {@code content} is a {@code byte[]}, so {@code content[i]}
+     * is promoted to an int in the range -128..127 and can never equal
+     * {@code 0xe8} (232). The {@code != 0xe8} test below is therefore always
+     * true and the rewrite code after it is never reached. Simply changing the
+     * comparison would start executing that code, which reads and overwrites
+     * the four bytes beginning at the marker byte itself, so it needs a careful
+     * review against the LZX specification first.
+     */
+    protected void intelE8Decoding() {
+        if (getBlockLength() <= ChmConstants.LZX_PRETREE_TABLEBITS
+                || (getState().getIntelState() == IntelState.NOT_STARTED)) {
+            // no translation: only the counter is updated
+            getState().setBlockRemaining(
+                    getState().getBlockRemaining() - (int) getBlockLength());
+        } else {
+            // base position for the offset arithmetic, captured before the
+            // counter is reduced
+            long curpos = getState().getBlockRemaining();
+            getState().setBlockRemaining(
+                    getState().getBlockRemaining() - (int) getBlockLength());
+            int i = 0;
+            // the last 10 bytes of the block are never examined
+            while (i < getBlockLength() - 10) {
+                // always true for a signed byte, see NOTE(review) above
+                if (content[i] != 0xe8) {
+                    i++;
+                    continue;
+                }
+                // content[i..i+3] is copied in reverse order so that BigInteger
+                // (big-endian, two's complement) yields the signed
+                // little-endian 32-bit value stored there
+                byte[] b = new byte[4];
+                b[0] = getContent()[i + 3];
+                b[1] = getContent()[i + 2];
+                b[2] = getContent()[i + 1];
+                b[3] = getContent()[i + 0];
+                long absoff = (new BigInteger(b)).longValue();
+                if ((absoff >= -curpos)
+                        && (absoff < getState().getIntelFileSize())) {
+                    long reloff = (absoff >= 0) ? absoff - curpos : absoff
+                            + getState().getIntelFileSize();
+                    // store the translated value back, least significant byte first
+                    getContent()[i + 0] = (byte) reloff;
+                    getContent()[i + 1] = (byte) (reloff >>> 8);
+                    getContent()[i + 2] = (byte) (reloff >>> 16);
+                    getContent()[i + 3] = (byte) (reloff >>> 24);
+                }
+                i += 4;
+                curpos += 5;
+            }
+        }
+    }
+
+    private short[] createPreLenTable() {
+        short[] tmp = new short[ChmConstants.LZX_PRETREE_MAXSYMBOLS];
+        for (int i = 0; i < ChmConstants.LZX_PRETREE_MAXSYMBOLS; i++) {
+            tmp[i] = (short) getChmSection().getSyncBits(
+                    ChmConstants.LZX_PRETREE_NUM_ELEMENTS_BITS);
+        }
+        return tmp;
+    }
+
+    private void createLengthTreeTable() throws TikaException {
+        //Read Pre Tree Table
+        short[] prelentable = createPreLenTable();
+
+        if (prelentable == null) {
+            throw new ChmParsingException("pretreetable is null");
+        }
+
+        short[] pretreetable = createTreeTable2(prelentable,
+                (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
+                        + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
+                ChmConstants.LZX_PRETREE_TABLEBITS,
+                ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+
+        if (pretreetable == null) {
+            throw new ChmParsingException("pretreetable is null");
+        }
+
+        //Build Length Tree
+        createLengthTreeLenTable(0, ChmConstants.LZX_NUM_SECONDARY_LENGTHS,
+                pretreetable, prelentable);
+
+        getState().setLengthTreeTable(
+                createTreeTable2(getState().getLengthTreeLengtsTable(),
+                        (1 << ChmConstants.LZX_LENGTH_TABLEBITS)
+                                + (ChmConstants.LZX_LENGTH_MAXSYMBOLS << 1),
+                        ChmConstants.LZX_LENGTH_TABLEBITS,
+                        ChmConstants.LZX_NUM_SECONDARY_LENGTHS));
+    }
+
+    private void decompressUncompressedBlock(int len, byte[] prevcontent) {
+        if (getContentLength() + getState().getBlockRemaining() <= getBlockLength()) {
+            for (int i = getContentLength(); i < (getContentLength() + getState()
+                    .getBlockRemaining()); i++)
+                content[i] = getChmSection().getByte();
+
+            setContentLength(getContentLength()
+                    + getState().getBlockRemaining());
+            getState().setBlockRemaining(0);
+        } else {
+            for (int i = getContentLength(); i < getBlockLength(); i++)
+                content[i] = getChmSection().getByte();
+            getState().setBlockRemaining(
+                    (int) getBlockLength() - getContentLength());// = blockLen -
+                                                                 // contentlen;
+            setContentLength((int) getBlockLength());
+        }
+    }
+
+    /**
+     * Decodes symbols of an aligned-offset block into {@code content}, starting
+     * at the current content length and stopping once index {@code len} is
+     * reached; the content length is then set to {@code len}.
+     * <p>
+     * Each main-tree symbol below {@code ChmConstants.LZX_NUM_CHARS} is stored
+     * as a literal byte. Any other symbol describes a match: a length
+     * (optionally extended through the length tree) and an offset that is
+     * either one of the three recent offsets R0/R1/R2 kept in the state or a
+     * new offset built from verbatim bits and/or an aligned-tree symbol. The
+     * matched bytes are copied from earlier in {@code content}, or from the
+     * tail of {@code prevcontent} when the match starts before index 0.
+     *
+     * @param len index in {@code content} at which decoding stops
+     * @param prevcontent bytes consulted when a match reaches back before the
+     *        start of {@code content}
+     * @throws TikaException if the section, the state or the main tree table
+     *         is null, or if the peeked main-tree index lies outside the table
+     */
+    private void decompressAlignedBlock(int len, byte[] prevcontent) throws TikaException {
+
+        if ((getChmSection() == null) || (getState() == null)
+                || (getState().getMainTreeTable() == null))
+            throw new ChmParsingException("chm section is null");
+
+        short s;
+        int x, i, border;
+        int matchlen = 0, matchfooter = 0, extra, rundest, runsrc;
+        int matchoffset = 0;
+        for (i = getContentLength(); i < len; i++) {
+            /* new code */
+            //read huffman tree from main tree
+            border = getChmSection().peekBits(
+                    ChmConstants.LZX_MAINTREE_TABLEBITS);
+            if (border >= getState().mainTreeTable.length)
+                throw new ChmParsingException("error decompressing aligned block.");
+                //break;
+            /* end new code */
+            s = getState().mainTreeTable[getChmSection().peekBits(
+                    ChmConstants.LZX_MAINTREE_TABLEBITS)];
+            // an entry at or above the element count is a tree node, not a
+            // symbol: follow it one extra bit at a time until a symbol is hit
+            if (s >= getState().getMainTreeElements()) {
+                x = ChmConstants.LZX_MAINTREE_TABLEBITS;
+                do {
+                    x++;
+                    s <<= 1;
+                    s += getChmSection().checkBit(x);
+                } while ((s = getState().mainTreeTable[s]) >= getState()
+                        .getMainTreeElements());
+            }
+            //System.out.printf("%d,", s);
+            //?getChmSection().getSyncBits(getState().mainTreeTable[s]);
+            // consume exactly as many bits as the decoded symbol's code length
+            getChmSection().getSyncBits(getState().getMainTreeLengtsTable()[s]);
+            if (s < ChmConstants.LZX_NUM_CHARS) {
+                // literal byte
+                content[i] = (byte) s;
+            } else {
+                // match: the low bits of the symbol carry the length header,
+                // the bits above them select the offset slot
+                s -= ChmConstants.LZX_NUM_CHARS;
+                matchlen = s & ChmConstants.LZX_NUM_PRIMARY_LENGTHS;
+                if (matchlen == ChmConstants.LZX_NUM_PRIMARY_LENGTHS) {
+                    // length header saturated: the rest comes from the length tree
+                    matchfooter = getState().lengthTreeTable[getChmSection()
+                            .peekBits(ChmConstants.LZX_LENGTH_TABLEBITS)];//.LZX_MAINTREE_TABLEBITS)];
+                    if (matchfooter >= ChmConstants.LZX_LENGTH_MAXSYMBOLS/*?LZX_LENGTH_TABLEBITS*/) {
+                        x = ChmConstants.LZX_LENGTH_TABLEBITS;
+                        do {
+                            x++;
+                            matchfooter <<= 1;
+                            matchfooter += getChmSection().checkBit(x);
+                        } while ((matchfooter = getState().lengthTreeTable[matchfooter]) >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS);
+                    }
+                    getChmSection().getSyncBits(
+                            getState().lengthTreeLengtsTable[matchfooter]);
+                    matchlen += matchfooter;
+                }
+                matchlen += ChmConstants.LZX_MIN_MATCH;
+                matchoffset = s >>> 3;
+                if (matchoffset > 2) {
+                    // new offset: position base plus extra bits, of which the
+                    // lowest three come from the aligned tree when available
+                    extra = ChmConstants.EXTRA_BITS[matchoffset];
+                    matchoffset = (ChmConstants.POSITION_BASE[matchoffset] - 2);
+                    if (extra > 3) {
+                        extra -= 3;
+                        long verbatim_bits = getChmSection().getSyncBits(extra);
+                        matchoffset += (verbatim_bits << 3);
+                        //READ HUFF SYM in Aligned Tree
+                        int aligned_bits = getChmSection().peekBits(
+                                ChmConstants.LZX_NUM_PRIMARY_LENGTHS);
+                        int t = getState().getAlignedTreeTable()[aligned_bits];
+                        if (t >= getState().getMainTreeElements()) {
+                            x = ChmConstants.LZX_ALIGNED_TABLEBITS; //?LZX_MAINTREE_TABLEBITS; //?LZX_ALIGNED_TABLEBITS
+                            do {
+                                x++;
+                                t <<= 1;
+                                t += getChmSection().checkBit(x);
+                            } while ((t = getState().getAlignedTreeTable()[t]) >= getState()
+                                    .getMainTreeElements());
+                        }
+                        getChmSection().getSyncBits(
+                                getState().getAlignedLenTable()[t]);
+                        matchoffset += t;
+                    } else if (extra == 3) {
+                        // all three extra bits come from the aligned tree
+                        int g = getChmSection().peekBits(
+                                ChmConstants.LZX_NUM_PRIMARY_LENGTHS);
+                        int t = getState().getAlignedTreeTable()[g];
+                        if (t >= getState().getMainTreeElements()) {
+                            x = ChmConstants.LZX_ALIGNED_TABLEBITS; //?LZX_MAINTREE_TABLEBITS;
+                            do {
+                                x++;
+                                t <<= 1;
+                                t += getChmSection().checkBit(x);
+                            } while ((t = getState().getAlignedTreeTable()[t]) >= getState()
+                                    .getMainTreeElements());
+                        }
+                        getChmSection().getSyncBits(
+                                getState().getAlignedLenTable()[t]);
+                        matchoffset += t;
+                    } else if (extra > 0) {
+                        // fewer than three extra bits: read them verbatim
+                        long l = getChmSection().getSyncBits(extra);
+                        matchoffset += l;
+                    } else
+                        matchoffset = 1;
+                    // a new offset becomes R0; the older ones shift down
+                    getState().setR2(getState().getR1());
+                    getState().setR1(getState().getR0());
+                    getState().setR0(matchoffset);
+                } else if (matchoffset == 0) {
+                    matchoffset = (int) getState().getR0();
+                } else if (matchoffset == 1) {
+                    // reuse R1 and swap it with R0
+                    matchoffset = (int) getState().getR1();
+                    getState().setR1(getState().getR0());
+                    getState().setR0(matchoffset);
+                } else /** match_offset == 2 */
+                {
+                    // reuse R2 and swap it with R0
+                    matchoffset = (int) getState().getR2();
+                    getState().setR2(getState().getR0());
+                    getState().setR0(matchoffset);
+                }
+                rundest = i;
+                runsrc = rundest - matchoffset;
+                // the loop header adds the remaining 1 to i
+                i += (matchlen - 1);
+                // a match ending beyond len is dropped without copying
+                if (i > len)
+                    break;
+
+                if (runsrc < 0) {
+                    if (matchlen + runsrc <= 0) {
+                        // the whole match lies in prevcontent
+                        runsrc = prevcontent.length + runsrc;
+                        while (matchlen-- > 0)
+                            content[rundest++] = prevcontent[runsrc++];
+                    } else {
+                        // the match starts in the tail of prevcontent and
+                        // continues at the beginning of content
+                        runsrc = prevcontent.length + runsrc;
+                        while (runsrc < prevcontent.length)
+                            content[rundest++] = prevcontent[runsrc++];
+                        matchlen = matchlen + runsrc - prevcontent.length;
+                        runsrc = 0;
+                        while (matchlen-- > 0)
+                            content[rundest++] = content[runsrc++];
+                    }
+
+                } else {
+                    /* copies any wrappes around source data */
+                    // NOTE(review): runsrc is >= 0 in this branch, so this
+                    // loop body never runs
+                    while ((runsrc < 0) && (matchlen-- > 0)) {
+                        content[rundest++] = content[(int) (runsrc + getBlockLength())];
+                        runsrc++;
+                    }
+                    /* copies match data - no worries about destination wraps */
+                    while (matchlen-- > 0)
+                        content[rundest++] = content[runsrc++];
+                }
+            }
+        }
+        setContentLength(len);
+    }
+
+    /**
+     * Rejects a missing table.
+     *
+     * @param array the table to check
+     * @throws TikaException if {@code array} is null
+     */
+    private void assertShortArrayNotNull(short[] array) throws TikaException {
+        if (array != null) {
+            return;
+        }
+        throw new ChmParsingException("short[] is null");
+    }
+
+    /**
+     * Decodes symbols of a verbatim block into {@code content}, starting at the
+     * current content length and stopping once index {@code len} is reached;
+     * the content length is then set to {@code len}.
+     * <p>
+     * Each main-tree symbol below {@code ChmConstants.LZX_NUM_CHARS} is stored
+     * as a literal byte. Any other symbol describes a match: a length
+     * (optionally extended through the length tree) and an offset that is
+     * either one of the three recent offsets R0/R1/R2 kept in the state or a
+     * new offset built from the position base plus verbatim bits. The matched
+     * bytes are copied from earlier in {@code content}, or from the tail of
+     * {@code prevcontent} when the match starts before index 0.
+     *
+     * @param len index in {@code content} at which decoding stops
+     * @param prevcontent bytes consulted when a match reaches back before the
+     *        start of {@code content}
+     * @throws TikaException if the main tree table is null, if a match needs
+     *         {@code prevcontent} but it is null, or if a match spanning both
+     *         buffers would have to be copied outside of {@code content}
+     */
+    private void decompressVerbatimBlock(int len, byte[] prevcontent) throws TikaException {
+        short s;
+        int x, i;
+        int matchlen = 0, matchfooter = 0, extra, rundest, runsrc;
+        int matchoffset = 0;
+        for (i = getContentLength(); i < len; i++) {
+            int f = getChmSection().peekBits(
+                    ChmConstants.LZX_MAINTREE_TABLEBITS);
+            assertShortArrayNotNull(getState().getMainTreeTable());
+            s = getState().getMainTreeTable()[f];
+            // an entry at or above the symbol limit is a tree node: follow it
+            // one extra bit at a time until a symbol is hit
+            if (s >= ChmConstants.LZX_MAIN_MAXSYMBOLS) {
+                x = ChmConstants.LZX_MAINTREE_TABLEBITS;
+                do {
+                    x++;
+                    s <<= 1;
+                    s += getChmSection().checkBit(x);
+                } while ((s = getState().getMainTreeTable()[s]) >= ChmConstants.LZX_MAIN_MAXSYMBOLS);
+            }
+            // consume exactly as many bits as the decoded symbol's code length
+            getChmSection().getSyncBits(getState().getMainTreeLengtsTable()[s]);
+            if (s < ChmConstants.LZX_NUM_CHARS) {
+                // literal byte
+                content[i] = (byte) s;
+            } else {
+                s -= ChmConstants.LZX_NUM_CHARS;
+                matchlen = s & ChmConstants.LZX_NUM_PRIMARY_LENGTHS;
+                if (matchlen == ChmConstants.LZX_NUM_PRIMARY_LENGTHS) {
+                    // length header saturated: the rest comes from the length tree
+                    matchfooter = getState().getLengthTreeTable()[getChmSection()
+                            .peekBits(ChmConstants.LZX_LENGTH_TABLEBITS)];
+                    if (matchfooter >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS) {
+                        x = ChmConstants.LZX_LENGTH_TABLEBITS;
+                        do {
+                            x++;
+                            matchfooter <<= 1;
+                            matchfooter += getChmSection().checkBit(x);
+                        } while ((matchfooter = getState().getLengthTreeTable()[matchfooter]) >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS);
+                    }
+                    getChmSection().getSyncBits(
+                            getState().getLengthTreeLengtsTable()[matchfooter]);
+                    matchlen += matchfooter;
+                }
+                matchlen += ChmConstants.LZX_MIN_MATCH;
+                // shorter than 2
+                matchoffset = s >>> 3;
+                if (matchoffset > 2) {
+                    if (matchoffset != 3) { // should get other bits to retrieve
+                                            // offset
+                        extra = ChmConstants.EXTRA_BITS[matchoffset];
+                        long l = getChmSection().getSyncBits(extra);
+                        matchoffset = (int) (ChmConstants.POSITION_BASE[matchoffset] - 2 + l);
+                    } else {
+                        matchoffset = 1;
+                    }
+                    // a new offset becomes R0; the older ones shift down
+                    getState().setR2(getState().getR1());
+                    getState().setR1(getState().getR0());
+                    getState().setR0(matchoffset);
+                } else if (matchoffset == 0) {
+                    matchoffset = (int) getState().getR0();
+                } else if (matchoffset == 1) {
+                    // reuse R1 and swap it with R0
+                    matchoffset = (int) getState().getR1();
+                    getState().setR1(getState().getR0());
+                    getState().setR0(matchoffset);
+                } else /* match_offset == 2 */
+                {
+                    // reuse R2 and swap it with R0
+                    matchoffset = (int) getState().getR2();
+                    getState().setR2(getState().getR0());
+                    getState().setR0(matchoffset);
+                }
+                rundest = i;
+                runsrc = rundest - matchoffset;
+                // the loop header adds the remaining 1 to i
+                i += (matchlen - 1);
+                // a match ending beyond len is dropped without copying
+                if (i > len)
+                    break;
+                if (runsrc < 0) {
+                    // the match starts in the previous content; fail with a
+                    // parsing error instead of a NullPointerException
+                    if (prevcontent == null) {
+                        throw new ChmParsingException("previous content is null");
+                    }
+                    if (matchlen + runsrc <= 0) {
+                        // the whole match lies in prevcontent
+                        runsrc = prevcontent.length + runsrc;
+                        while ((matchlen-- > 0) && ((runsrc + 1) > 0))
+                            if ((rundest < content.length)
+                                    && (runsrc < content.length))
+                                content[rundest++] = prevcontent[runsrc++];
+                    } else {
+                        // the match starts in the tail of prevcontent and
+                        // continues at the beginning of content
+                        runsrc = prevcontent.length + runsrc;
+                        while (runsrc < prevcontent.length) {
+                            // Skipping the copy here would leave runsrc
+                            // unchanged and spin forever on malformed input,
+                            // so the condition is reported instead.
+                            if ((rundest >= content.length)
+                                    || (runsrc >= content.length)) {
+                                throw new ChmParsingException(
+                                        "error decompressing verbatim block.");
+                            }
+                            content[rundest++] = prevcontent[runsrc++];
+                        }
+                        matchlen = matchlen + runsrc - prevcontent.length;
+                        runsrc = 0;
+                        while (matchlen-- > 0)
+                            content[rundest++] = content[runsrc++];
+                    }
+
+                } else {
+                    /* copies any wrapped source data */
+                    while ((runsrc < 0) && (matchlen-- > 0)) {
+                        content[rundest++] = content[(int) (runsrc + getBlockLength())];
+                        runsrc++;
+                    }
+                    /* copies match data - no worries about destination wraps */
+                    while (matchlen-- > 0) {
+                        if ((rundest < content.length)
+                                && (runsrc < content.length))
+                            content[rundest++] = content[runsrc++];
+                    }
+                }
+            }
+        }
+        setContentLength(len);
+    }
+
+    /**
+     * Updates the length-tree code lengths held by the state for the indices
+     * {@code offset} (inclusive) to {@code tablelen} (exclusive), driven by
+     * pre-tree symbols decoded from the bit stream.
+     * <p>
+     * Meaning of a decoded pre-tree symbol {@code z}:
+     * <ul>
+     * <li>{@code z < 17}: the new length is the old length minus {@code z},
+     * wrapped into 0..16</li>
+     * <li>{@code 17}: a run of 4..19 zero lengths (4 bits + 4)</li>
+     * <li>{@code 18}: a run of 20..51 zero lengths (5 bits + 20)</li>
+     * <li>{@code 19}: a run of 4..5 equal lengths (1 bit + 4) whose value is
+     * derived from one further pre-tree symbol</li>
+     * </ul>
+     * NOTE(review): only the run for symbol 17 is clipped to the table size;
+     * the runs for 18 and 19 write unchecked and can raise an
+     * ArrayIndexOutOfBoundsException on malformed input.
+     *
+     * @param offset first index to update
+     * @param tablelen index at which updating stops
+     * @param pretreetable decode table of the pre-tree
+     * @param prelentable code lengths of the pre-tree symbols
+     * @throws TikaException if a table or the section is null
+     */
+    private void createLengthTreeLenTable(int offset, int tablelen,
+            short[] pretreetable, short[] prelentable) throws TikaException {
+        // NOTE(review): prelentable is tested twice in this condition
+        if (prelentable == null || getChmSection() == null
+                || pretreetable == null || prelentable == null)
+            throw new ChmParsingException("is null");
+
+        int i = offset; // represents offset
+        int z, y, x;// local counters
+        while (i < tablelen) {
+            //Read HUFF sym to z
+            z = pretreetable[getChmSection().peekBits(
+                    ChmConstants.LZX_PRETREE_TABLEBITS)];
+            if (z >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS) {// 1 bug, should be
+                                                             // 20
+                // tree node rather than a symbol: follow it bit by bit
+                x = ChmConstants.LZX_PRETREE_TABLEBITS;
+                do {
+                    x++;
+                    z <<= 1;
+                    z += getChmSection().checkBit(x);
+                } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS);
+            }
+            // consume the bits of the decoded pre-tree symbol
+            getChmSection().getSyncBits(prelentable[z]);
+            
+            if (z < 17) {
+                z = getState().getLengthTreeLengtsTable()[i] - z;
+                if (z < 0)
+                    z = z + 17;
+                getState().getLengthTreeLengtsTable()[i] = (short) z;
+                i++;
+            } else if (z == 17) {
+                y = getChmSection().getSyncBits(4);
+                y += 4;
+                for (int j = 0; j < y; j++)
+                    if (i < getState().getLengthTreeLengtsTable().length)
+                        getState().getLengthTreeLengtsTable()[i++] = 0;
+            } else if (z == 18) {
+                y = getChmSection().getSyncBits(5);
+                y += 20;
+                for (int j = 0; j < y; j++)
+                    //no tolerate //if (i < getState().getLengthTreeLengtsTable().length)
+                        getState().getLengthTreeLengtsTable()[i++] = 0;
+            } else if (z == 19) {
+                y = getChmSection().getSyncBits(1);
+                y += 4;
+                // the repeated value comes from one more pre-tree symbol
+                z = pretreetable[getChmSection().peekBits(
+                        ChmConstants.LZX_PRETREE_TABLEBITS)];
+                if (z >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS) {// 20
+                    x = ChmConstants.LZX_PRETREE_TABLEBITS;// 6
+                    do {
+                        x++;
+                        z <<= 1;
+                        z += getChmSection().checkBit(x);
+                    } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS);//LZX_MAINTREE_TABLEBITS);
+                }
+                getChmSection().getSyncBits(prelentable[z]);
+                z = getState().getLengthTreeLengtsTable()[i] - z;
+                if (z < 0)
+                    z = z + 17;
+                for (int j = 0; j < y; j++)
+                    getState().getLengthTreeLengtsTable()[i++] = (short) z;
+            }
+        }
+    }
+
+    private void createMainTreeTable() throws TikaException {
+        //Read Pre Tree Table
+        short[] prelentable = createPreLenTable();
+        short[] pretreetable = createTreeTable2(prelentable,
+                (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
+                        + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
+                ChmConstants.LZX_PRETREE_TABLEBITS,
+                ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+
+        createMainTreeLenTable(0, ChmConstants.LZX_NUM_CHARS, pretreetable,
+                prelentable);
+        
+        //Read Pre Tree Table
+        prelentable = createPreLenTable();
+        pretreetable = createTreeTable2(prelentable,
+                (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
+                        + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
+                ChmConstants.LZX_PRETREE_TABLEBITS,
+                ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+
+        createMainTreeLenTable(ChmConstants.LZX_NUM_CHARS,
+                getState().mainTreeLengtsTable.length, pretreetable,
+                prelentable);
+
+        getState().setMainTreeTable(
+                createTreeTable2(getState().mainTreeLengtsTable,
+                        (1 << ChmConstants.LZX_MAINTREE_TABLEBITS)
+                                + (ChmConstants.LZX_MAINTREE_MAXSYMBOLS << 1),
+                        ChmConstants.LZX_MAINTREE_TABLEBITS, getState()
+                                .getMainTreeElements()));
+    }
+
+    /**
+     * Updates the main-tree code lengths held by the state for the indices
+     * {@code offset} (inclusive) to {@code tablelen} (exclusive), driven by
+     * pre-tree symbols decoded from the bit stream.
+     * <p>
+     * Meaning of a decoded pre-tree symbol {@code z}:
+     * <ul>
+     * <li>{@code z < 17}: the new length is the old length minus {@code z},
+     * wrapped into 0..16</li>
+     * <li>{@code 17}: a run of 4..19 zero lengths (4 bits + 4)</li>
+     * <li>{@code 18}: a run of 20..51 zero lengths (5 bits + 20)</li>
+     * <li>{@code 19}: a run of 4..5 equal lengths (1 bit + 4) whose value is
+     * derived from one further pre-tree symbol</li>
+     * </ul>
+     * Zero runs that would leave the table raise an exception; the run for
+     * symbol 19 is silently clipped to the table size.
+     *
+     * @param offset first index to update
+     * @param tablelen index at which updating stops
+     * @param pretreetable decode table of the pre-tree
+     * @param prelentable code lengths of the pre-tree symbols
+     * @throws TikaException if {@code pretreetable} is null or a zero run
+     *         exceeds the table
+     */
+    private void createMainTreeLenTable(int offset, int tablelen,
+            short[] pretreetable, short[] prelentable) throws TikaException {
+        if (pretreetable == null)
+            throw new ChmParsingException("pretreetable is null");
+        int i = offset;
+        int z, y, x;
+        while (i < tablelen) {
+            int f = getChmSection().peekBits(
+                    ChmConstants.LZX_PRETREE_TABLEBITS);
+            z = pretreetable[f];
+            if (z >= ChmConstants.LZX_PRETREE_MAXSYMBOLS) {
+                // tree node rather than a symbol: follow it bit by bit
+                x = ChmConstants.LZX_PRETREE_TABLEBITS;
+                do {
+                    x++;
+                    z <<= 1;
+                    z += getChmSection().checkBit(x);
+                } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+            }
+            // consume the bits of the decoded pre-tree symbol
+            getChmSection().getSyncBits(prelentable[z]);
+            if (z < 17) {
+                z = getState().getMainTreeLengtsTable()[i] - z;
+                if (z < 0)
+                    z = z + 17;
+                getState().mainTreeLengtsTable[i] = (short) z;
+                i++;
+            } else if (z == 17) {
+                y = getChmSection().getSyncBits(4);
+                y += 4;
+                for (int j = 0; j < y; j++) {
+                    assertInRange(getState().getMainTreeLengtsTable(), i);
+                    getState().mainTreeLengtsTable[i++] = 0;
+                }
+            } else if (z == 18) {
+                y = getChmSection().getSyncBits(5);
+                y += 20;
+                for (int j = 0; j < y; j++) {
+                    assertInRange(getState().getMainTreeLengtsTable(), i);
+                    getState().mainTreeLengtsTable[i++] = 0;
+                }
+            } else if (z == 19) {
+                y = getChmSection().getSyncBits(1);
+                y += 4;
+                // the repeated value comes from one more pre-tree symbol
+                z = pretreetable[getChmSection().peekBits(
+                        ChmConstants.LZX_PRETREE_TABLEBITS)];
+                if (z >= ChmConstants.LZX_PRETREE_MAXSYMBOLS) {
+                    x = ChmConstants.LZX_PRETREE_TABLEBITS;
+                    do {
+                        x++;
+                        z <<= 1;
+                        z += getChmSection().checkBit(x);
+                    } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+                }
+                getChmSection().getSyncBits(prelentable[z]);
+                // NOTE(review): this read of index i is not range checked
+                z = getState().mainTreeLengtsTable[i] - z;
+                if (z < 0)
+                    z = z + 17;
+                for (int j = 0; j < y; j++)
+                    if (i < getState().getMainTreeLengtsTable().length)
+                        getState().mainTreeLengtsTable[i++] = (short) z;
+            }
+        }
+    }
+
+    private void assertInRange(short[] array, int index) throws ChmParsingException {
+        if (index >= array.length)
+            throw new ChmParsingException(index + " is bigger than "
+                    + array.length);
+    }
+
+    private short[] createAlignedLenTable() {
+        int tablelen = ChmConstants.LZX_ALIGNED_NUM_ELEMENTS;//LZX_BLOCKTYPE_UNCOMPRESSED;//
+        int bits = ChmConstants.LZX_BLOCKTYPE_UNCOMPRESSED;
+        short[] tmp = new short[tablelen];
+        for (int i = 0; i < tablelen; i++) {
+            tmp[i] = (short) getChmSection().getSyncBits(bits);
+        }
+        return tmp;
+    }
+
+    private void createAlignedTreeTable() throws ChmParsingException {
+        getState().setAlignedLenTable(createAlignedLenTable());
+        getState().setAlignedTreeTable(//setAlignedLenTable(
+                createTreeTable2(getState().getAlignedLenTable(),
+                        (1 << ChmConstants.LZX_NUM_PRIMARY_LENGTHS)
+                                + (ChmConstants.LZX_ALIGNED_MAXSYMBOLS << 1),
+                        ChmConstants.LZX_NUM_PRIMARY_LENGTHS,
+                        ChmConstants.LZX_ALIGNED_MAXSYMBOLS));
+    }
+
+    /**
+     * Builds a decode table from a table of code lengths.
+     * <p>
+     * Codes whose length is at most {@code bits} are expanded into the first
+     * {@code 1 << bits} entries, so they resolve with a single lookup. Codes of
+     * up to 16 bits are stored as a binary tree behind those entries: an entry
+     * holds a node number {@code n} whose two children live at indices
+     * {@code 2n} and {@code 2n + 1}, node numbers are handed out starting at
+     * {@code 1 << (bits - 1)}, and the last entry of a path holds the symbol.
+     * <p>
+     * NOTE(review): the table is returned whether or not it ended up
+     * completely filled; both return statements at the end are identical.
+     *
+     * @param lentable code length of each symbol; symbols beyond its end are
+     *        ignored
+     * @param tablelen number of entries allocated for the returned table
+     * @param bits number of bits resolved by the direct-lookup part
+     * @param maxsymbol number of symbols to consider
+     * @return the decode table
+     * @throws ChmParsingException with the message "Table overflow" if the
+     *         code lengths claim more code space than is available
+     */
+    private short[] createTreeTable2(short[] lentable, int tablelen, int bits,
+            int maxsymbol) throws ChmParsingException {
+        short[] tmp = new short[tablelen];
+        short sym;
+        int leaf;
+        int bit_num = 1;
+        long fill;
+        int pos = 0;
+        /* the current position in the decode table */
+        long table_mask = (1 << bits);
+        long bit_mask = (table_mask >> 1);
+        long next_symbol = bit_mask;
+
+        /* fills entries for short codes for a direct mapping */
+        while (bit_num <= bits) {
+            for (sym = 0; sym < maxsymbol; sym++) {
+                if (lentable.length > sym && lentable[sym] == bit_num) {
+                    leaf = pos;
+
+                    if ((pos += bit_mask) > table_mask) {
+                        /* table overflow */
+                        throw new ChmParsingException("Table overflow");
+                    }
+
+                    // a code of bit_num bits owns bit_mask consecutive entries
+                    fill = bit_mask;
+                    while (fill-- > 0)
+                        tmp[leaf++] = sym;
+                }
+            }
+            bit_mask >>= 1;
+            bit_num++;
+        }
+
+        /* if there are any codes longer than nbits */
+        if (pos != table_mask) {
+            /* clears the remainder of the table */
+            for (leaf = pos; leaf < table_mask; leaf++)
+                tmp[leaf] = 0;
+
+            /* gives ourselves room for codes to grow by up to 16 more bits */
+            pos <<= 16;
+            table_mask <<= 16;
+            bit_mask = 1 << 15;
+
+            while (bit_num <= 16) {
+                for (sym = 0; sym < maxsymbol; sym++) {
+                    if ((lentable.length > sym) && (lentable[sym] == bit_num)) {
+                        // start at the direct-lookup entry for the code prefix
+                        leaf = pos >> 16;
+                        for (fill = 0; fill < bit_num - bits; fill++) {
+                            /*
+                             * if this path hasn't been taken yet, 'allocate'
+                             * two entries
+                             */
+                            if (tmp[leaf] == 0) {
+                                // allocation is skipped when the pair would
+                                // not fit into the table
+                                if (((next_symbol << 1) + 1) < tmp.length) {
+                                    tmp[(int) (next_symbol << 1)] = 0;
+                                    tmp[(int) (next_symbol << 1) + 1] = 0;
+                                    tmp[leaf] = (short) next_symbol++;
+                                }
+
+                            }
+                            /*
+                             * follows the path and select either left or right
+                             * for next bit
+                             */
+                            leaf = tmp[leaf] << 1;
+                            if (((pos >> (15 - fill)) & 1) != 0)
+                                leaf++;
+                        }
+                        tmp[leaf] = sym;
+
+                        if ((pos += bit_mask) > table_mask) {
+                            /* table overflow */
+                            throw new ChmParsingException("Table overflow");
+                        }
+                    }
+                }
+                bit_mask >>= 1;
+                bit_num++;
+            }
+        }
+
+        /* is it full table? */
+        if (pos == table_mask)
+            return tmp;
+
+        return tmp;
+    }
+
+    /**
+     * Returns the buffer that holds the decompressed bytes of this block.
+     *
+     * @return the internal array itself, not a copy
+     */
+    public byte[] getContent() {
+        return content;
+    }
+
+    /**
+     * Returns a copy of a range of the decompressed bytes.
+     *
+     * @param startOffset first index handed to {@code ChmCommons.copyOfRange}
+     * @param endOffset end index handed to {@code ChmCommons.copyOfRange}
+     * @return the copied range, or a one-element array if there is no content
+     */
+    public byte[] getContent(int startOffset, int endOffset) {
+        final byte[] data = getContent();
+        if (data == null) {
+            return new byte[1];
+        }
+        return ChmCommons.copyOfRange(data, startOffset, endOffset);
+    }
+
+    /**
+     * Returns a copy of the decompressed bytes from {@code start} up to the end
+     * of the content buffer.
+     *
+     * @param start first index handed to {@code ChmCommons.copyOfRange}
+     * @return the copied range, or a one-element array if there is no content
+     */
+    public byte[] getContent(int start) {
+        final byte[] data = getContent();
+        if (data == null) {
+            return new byte[1];
+        }
+        return ChmCommons.copyOfRange(data, start, data.length);
+    }
+
+    /**
+     * Replaces the content buffer with a new, zero-filled array.
+     *
+     * @param contentLength size of the new buffer in bytes
+     */
+    private void setContent(int contentLength) {
+        this.content = new byte[contentLength];
+    }
+
+    /**
+     * Initialises the LZX state of this block.
+     * <p>
+     * Without a previous block a fresh state is created for the block length.
+     * With a previous block its state is cloned, so that decoding this block
+     * does not modify the state of the previous one.
+     *
+     * @param chmPrevLzxBlock the preceding block, or null if there is none
+     * @throws TikaException if there is no previous block and the block length
+     *         does not fit into an int
+     */
+    private void checkLzxBlock(ChmLzxBlock chmPrevLzxBlock) throws TikaException {
+        if (chmPrevLzxBlock == null) {
+            // Previously an oversized block length fell through to the clone
+            // branch and dereferenced the null previous block.
+            if (getBlockLength() >= Integer.MAX_VALUE) {
+                throw new ChmParsingException("block length is too large: "
+                        + getBlockLength());
+            }
+            setState(new ChmLzxState((int) getBlockLength()));
+        } else {
+            //use clone to avoid changing a cached or to be cached block
+            setState(chmPrevLzxBlock.getState().clone());
+        }
+    }
+
+    /**
+     * Validates the arguments a block is constructed with.
+     *
+     * @param blockNumber must not be negative
+     * @param dataSegment must be neither null nor empty
+     * @param blockLength must be greater than zero
+     * @return {@code true} when every argument is acceptable; a rejected
+     *         argument is reported through an exception, never through
+     *         {@code false}
+     * @throws TikaException describing the first argument that was rejected
+     */
+    private boolean validateConstructorParams(int blockNumber,
+            byte[] dataSegment, long blockLength) throws TikaException {
+        if (blockNumber < 0) {
+            throw new ChmParsingException("block number should be possitive");
+        }
+        if (dataSegment == null || dataSegment.length == 0) {
+            throw new ChmParsingException("data segment should not be null");
+        }
+        if (blockLength <= 0) {
+            throw new ChmParsingException(
+                    "block length should be more than zero");
+        }
+        return true;
+    }
+
+    /**
+     * Returns the number of this block.
+     *
+     * @return the block number
+     */
+    public int getBlockNumber() {
+        return block_number;
+    }
+
+    /**
+     * Stores the number of this block.
+     *
+     * @param block_number the block number
+     */
+    private void setBlockNumber(int block_number) {
+        this.block_number = block_number;
+    }
+
+    /**
+     * Returns the length of this block.
+     *
+     * @return the block length
+     */
+    private long getBlockLength() {
+        return block_length;
+    }
+
+    /**
+     * Stores the length of this block.
+     *
+     * @param block_length the block length
+     */
+    private void setBlockLength(long block_length) {
+        this.block_length = block_length;
+    }
+
+    /**
+     * Returns the LZX state this block works with.
+     *
+     * @return the state object itself, not a copy
+     */
+    public ChmLzxState getState() {
+        return state;
+    }
+
+    /**
+     * Stores the LZX state this block works with.
+     *
+     * @param state the state to use
+     */
+    private void setState(ChmLzxState state) {
+        this.state = state;
+    }
+}

[27/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java
index 2319ec8..10b00ae 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java
@@ -1,548 +1,548 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.accessor;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.assertion.ChmAssert;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.apache.tika.parser.chm.exception.ChmParsingException;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-/**
- * Directory header The directory starts with a header; its format is as
- * follows: 0000: char[4] 'ITSP' 0004: DWORD Version number 1 0008: DWORD Length
- * of the directory header 000C: DWORD $0a (unknown) 0010: DWORD $1000 Directory
- * chunk size 0014: DWORD "Density" of quickref section, usually 2 0018: DWORD
- * Depth of the index tree - 1 there is no index, 2 if there is one level of
- * PMGI chunks 001C: DWORD Chunk number of root index chunk, -1 if there is none
- * (though at least one file has 0 despite there being no index chunk, probably
- * a bug) 0020: DWORD Chunk number of first PMGL (listing) chunk 0024: DWORD
- * Chunk number of last PMGL (listing) chunk 0028: DWORD -1 (unknown) 002C:
- * DWORD Number of directory chunks (total) 0030: DWORD Windows language ID
- * 0034: GUID {5D02926A-212E-11D0-9DF9-00A0C922E6EC} 0044: DWORD $54 (This is
- * the length again) 0048: DWORD -1 (unknown) 004C: DWORD -1 (unknown) 0050:
- * DWORD -1 (unknown)
- * 
- * {@link http
- * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original
- * /?show-translation-form=1}
- * 
- */
-public class ChmItspHeader implements ChmAccessor<ChmItspHeader> {
-    // TODO: refactor all unmarshals
-    private static final long serialVersionUID = 1962394421998181341L;
-    private byte[] signature;
-    private int version; /* 4 */
-    private int header_len; /* 8 */
-    private int unknown_000c; /* c */
-    private long block_len; /* 10 */
-    private int blockidx_intvl; /* 14 */
-    private int index_depth; /* 18 */
-    private int index_root; /* 1c */
-    private int index_head; /* 20 */
-    private int unknown_0024; /* 24 */
-    private long num_blocks; /* 28 */
-    private int unknown_002c; /* 2c */
-    private long lang_id; /* 30 */
-    private byte[] system_uuid = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 34 */
-    private byte[] unknown_0044 = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 44 */
-
-    /* local usage */
-    private int dataRemained;
-    private int currentPlace = 0;
-
-    public ChmItspHeader() {
-        signature = ChmConstants.ITSP.getBytes(UTF_8); /*
-                                                        * 0
-                                                        * (ITSP
-                                                        * )
-                                                        */
-    }
-
-    public String toString() {
-        StringBuilder sb = new StringBuilder();
-        sb.append("[ signature:=" + new String(getSignature(), UTF_8)
-                + System.getProperty("line.separator"));
-        sb.append("version:=\t" + getVersion()
-                + System.getProperty("line.separator"));
-        sb.append("header_len:=\t" + getHeader_len()
-                + System.getProperty("line.separator"));
-        sb.append("unknown_00c:=\t" + getUnknown_000c()
-                + System.getProperty("line.separator"));
-        sb.append("block_len:=\t" + getBlock_len() + " [directory chunk size]"
-                + System.getProperty("line.separator"));
-        sb.append("blockidx_intvl:=" + getBlockidx_intvl()
-                + ", density of quickref section, usually 2"
-                + System.getProperty("line.separator"));
-        sb.append("index_depth:=\t"
-                + getIndex_depth()
-                + ", depth of the index tree - 1 there is no index, 2 if there is one level of PMGI chunk"
-                + System.getProperty("line.separator"));
-        sb.append("index_root:=\t" + getIndex_root()
-                + ", chunk number of root index chunk, -1 if there is none"
-                + System.getProperty("line.separator"));
-        sb.append("index_head:=\t" + getIndex_head()
-                + ", chunk number of first PMGL (listing) chunk"
-                + System.getProperty("line.separator"));
-        sb.append("unknown_0024:=\t" + getUnknown_0024()
-                + ", chunk number of last PMGL (listing) chunk"
-                + System.getProperty("line.separator"));
-        sb.append("num_blocks:=\t" + getNum_blocks() + ", -1 (unknown)"
-                + System.getProperty("line.separator"));
-        sb.append("unknown_002c:=\t" + getUnknown_002c()
-                + ", number of directory chunks (total)"
-                + System.getProperty("line.separator"));
-        sb.append("lang_id:=\t" + getLang_id() + " - "
-                + ChmCommons.getLanguage(getLang_id())
-                + System.getProperty("line.separator"));
-        sb.append("system_uuid:=" + getSystem_uuid()
-                + System.getProperty("line.separator"));
-        sb.append("unknown_0044:=" + getUnknown_0044() + " ]");
-        return sb.toString();
-    }
-
-    /**
-     * Copies 4 bits from data[]
-     * 
-     * @param data
-     * @param chmItspHeader
-     * @param count
-     * @throws TikaException 
-     */
-    private void unmarshalCharArray(byte[] data, ChmItspHeader chmItspHeader,
-            int count) throws TikaException {
-        ChmAssert.assertByteArrayNotNull(data);
-        ChmAssert.assertChmAccessorNotNull(chmItspHeader);
-        this.setDataRemained(data.length);
-        System.arraycopy(data, 0, chmItspHeader.signature, 0, count);
-        this.setCurrentPlace(this.getCurrentPlace() + count);
-        this.setDataRemained(this.getDataRemained() - count);
-    }
-
-    private int unmarshalInt32(byte[] data, int dataLenght, int dest) throws TikaException {
-        ChmAssert.assertByteArrayNotNull(data);
-        if (4 > this.getDataRemained())
-            throw new TikaException("4 > dataLenght");
-        dest = (data[this.getCurrentPlace()] & 0xff)
-                | (data[this.getCurrentPlace() + 1] & 0xff) << 8
-                | (data[this.getCurrentPlace() + 2] & 0xff) << 16
-                | (data[this.getCurrentPlace() + 3] & 0xff) << 24;
-
-        this.setCurrentPlace(this.getCurrentPlace() + 4);
-        this.setDataRemained(this.getDataRemained() - 4);
-        return dest;
-    }
-
-    private long unmarshalUInt32(byte[] data, int dataLenght, long dest) throws TikaException {
-        ChmAssert.assertByteArrayNotNull(data);
-        if (4 > dataLenght)
-            throw new TikaException("4 > dataLenght");
-        dest = (data[this.getCurrentPlace()] & 0xff)
-                | (data[this.getCurrentPlace() + 1] & 0xff) << 8
-                | (data[this.getCurrentPlace() + 2] & 0xff) << 16
-                | (data[this.getCurrentPlace() + 3] & 0xff) << 24;
-
-        setDataRemained(this.getDataRemained() - 4);
-        this.setCurrentPlace(this.getCurrentPlace() + 4);
-        return dest;
-    }
-
-    private byte[] unmarshalUuid(byte[] data, int dataLenght, byte[] dest,
-            int count) {
-        System.arraycopy(data, this.getCurrentPlace(), dest, 0, count);
-        this.setCurrentPlace(this.getCurrentPlace() + count);
-        this.setDataRemained(this.getDataRemained() - count);
-        return dest;
-    }
-
-    /**
-     * Returns how many bytes remained
-     * 
-     * @return int
-     */
-    private int getDataRemained() {
-        return dataRemained;
-    }
-
-    /**
-     * Sets how many bytes remained
-     * 
-     * @param dataRemained
-     */
-    private void setDataRemained(int dataRemained) {
-        this.dataRemained = dataRemained;
-    }
-
-    /**
-     * Returns a place holder
-     * 
-     * @return current place
-     */
-    private int getCurrentPlace() {
-        return currentPlace;
-    }
-
-    /**
-     * Sets current place
-     * 
-     * @param currentPlace
-     */
-    private void setCurrentPlace(int currentPlace) {
-        this.currentPlace = currentPlace;
-    }
-
-    /**
-     * Returns a signature of the header
-     * 
-     * @return itsp signature
-     */
-    public byte[] getSignature() {
-        return signature;
-    }
-
-    /**
-     * Sets itsp signature
-     * 
-     * @param signature
-     */
-    protected void setSignature(byte[] signature) {
-        this.signature = signature;
-    }
-
-    /**
-     * Returns version of itsp header
-     * 
-     * @return version
-     */
-    public int getVersion() {
-        return version;
-    }
-
-    /**
-     * Sets a version of itsp header
-     * 
-     * @param version
-     */
-    protected void setVersion(int version) {
-        this.version = version;
-    }
-
-    /**
-     * Returns header length
-     * 
-     * @return header length
-     */
-    public int getHeader_len() {
-        return header_len;
-    }
-
-    /**
-     * Sets itsp header length
-     * 
-     * @param header_len
-     */
-    protected void setHeader_len(int header_len) {
-        this.header_len = header_len;
-    }
-
-    /**
-     * Returns 000c unknown bytes
-     */
-    public int getUnknown_000c() {
-        return unknown_000c;
-    }
-
-    /**
-     * Sets 000c unknown bytes Unknown means here that those guys who cracked
-     * the chm format do not know what's it purposes for
-     * 
-     * @param unknown_000c
-     */
-    protected void setUnknown_000c(int unknown_000c) {
-        this.unknown_000c = unknown_000c;
-    }
-
-    /**
-     * Returns block's length
-     * 
-     * @return block_length
-     */
-    public long getBlock_len() {
-        return block_len;
-    }
-
-    /**
-     * Sets block length
-     * 
-     * @param block_len
-     */
-    protected void setBlock_len(long block_len) {
-        this.block_len = block_len;
-    }
-
-    /**
-     * Returns block index interval
-     * 
-     * @return blockidx_intvl
-     */
-    public int getBlockidx_intvl() {
-        return blockidx_intvl;
-    }
-
-    /**
-     * Sets block index interval
-     * 
-     * @param blockidx_intvl
-     */
-    protected void setBlockidx_intvl(int blockidx_intvl) {
-        this.blockidx_intvl = blockidx_intvl;
-    }
-
-    /**
-     * Returns an index depth
-     * 
-     * @return index_depth
-     */
-    public int getIndex_depth() {
-        return index_depth;
-    }
-
-    /**
-     * Sets an index depth
-     * 
-     * @param index_depth
-     */
-    protected void setIndex_depth(int index_depth) {
-        this.index_depth = index_depth;
-    }
-
-    /**
-     * Returns index root
-     * 
-     * @return index_root
-     */
-    public int getIndex_root() {
-        return index_root;
-    }
-
-    /**
-     * Sets an index root
-     * 
-     * @param index_root
-     */
-    protected void setIndex_root(int index_root) {
-        this.index_root = index_root;
-    }
-
-    /**
-     * Returns an index head
-     * 
-     * @return index_head
-     */
-    public int getIndex_head() {
-        return index_head;
-    }
-
-    /**
-     * Sets an index head
-     * 
-     * @param index_head
-     */
-    protected void setIndex_head(int index_head) {
-        this.index_head = index_head;
-    }
-
-    /**
-     * Returns 0024 unknown bytes
-     * 
-     * @return unknown_0024
-     */
-    public int getUnknown_0024() {
-        return unknown_0024;
-    }
-
-    /**
-     * Sets 0024 unknown bytes
-     * 
-     * @param unknown_0024
-     */
-    protected void setUnknown_0024(int unknown_0024) {
-        this.unknown_0024 = unknown_0024;
-    }
-
-    /**
-     * Returns number of blocks
-     * 
-     * @return num_blocks
-     */
-    public long getNum_blocks() {
-        return num_blocks;
-    }
-
-    /**
-     * Sets number of blocks containing in the chm file
-     * 
-     * @param num_blocks
-     */
-    protected void setNum_blocks(long num_blocks) {
-        this.num_blocks = num_blocks;
-    }
-
-    /**
-     * Returns 002c unknown bytes
-     * 
-     * @return unknown_002c
-     */
-    public int getUnknown_002c() {
-        return unknown_002c;
-    }
-
-    /**
-     * Sets 002c unknown bytes
-     * 
-     * @param unknown_002c
-     */
-    protected void setUnknown_002c(int unknown_002c) {
-        this.unknown_002c = unknown_002c;
-    }
-
-    /**
-     * Returns language id
-     * 
-     * @return lang_id
-     */
-    public long getLang_id() {
-        return lang_id;
-    }
-
-    /**
-     * Sets language id
-     * 
-     * @param lang_id
-     */
-    protected void setLang_id(long lang_id) {
-        this.lang_id = lang_id;
-    }
-
-    /**
-     * Returns system uuid
-     * 
-     * @return system_uuid
-     */
-    public byte[] getSystem_uuid() {
-        return system_uuid;
-    }
-
-    /**
-     * Sets system uuid
-     * 
-     * @param system_uuid
-     */
-    protected void setSystem_uuid(byte[] system_uuid) {
-        this.system_uuid = system_uuid;
-    }
-
-    /**
-     * Returns 0044 unknown bytes
-     * 
-     * @return unknown_0044
-     */
-    public byte[] getUnknown_0044() {
-        return unknown_0044;
-    }
-
-    /**
-     * Sets 0044 unknown bytes
-     * 
-     * @param unknown_0044
-     */
-    protected void setUnknown_0044(byte[] unknown_0044) {
-        this.unknown_0044 = unknown_0044;
-    }
-
-    // @Override
-    public void parse(byte[] data, ChmItspHeader chmItspHeader) throws TikaException {
-        /* we only know how to deal with the 0x58 and 0x60 byte structures */
-        if (data.length != ChmConstants.CHM_ITSP_V1_LEN)
-            throw new ChmParsingException("we only know how to deal with the 0x58 and 0x60 byte structures");
-
-        /* unmarshal common fields */
-        chmItspHeader.unmarshalCharArray(data, chmItspHeader, ChmConstants.CHM_SIGNATURE_LEN);
-        // ChmCommons.unmarshalCharArray(data, chmItspHeader,
-        // ChmConstants.CHM_SIGNATURE_LEN);
-        chmItspHeader.setVersion(chmItspHeader.unmarshalInt32(data,
-                chmItspHeader.getDataRemained(), chmItspHeader.getVersion()));
-        chmItspHeader
-                .setHeader_len(chmItspHeader.unmarshalInt32(data,
-                        chmItspHeader.getDataRemained(),
-                        chmItspHeader.getHeader_len()));
-        chmItspHeader.setUnknown_000c(chmItspHeader.unmarshalInt32(data,
-                chmItspHeader.getDataRemained(),
-                chmItspHeader.getUnknown_000c()));
-        chmItspHeader.setBlock_len(chmItspHeader.unmarshalUInt32(data,
-                chmItspHeader.getDataRemained(), chmItspHeader.getBlock_len()));
-        chmItspHeader.setBlockidx_intvl(chmItspHeader.unmarshalInt32(data,
-                chmItspHeader.getDataRemained(),
-                chmItspHeader.getBlockidx_intvl()));
-        chmItspHeader
-                .setIndex_depth(chmItspHeader.unmarshalInt32(data,
-                        chmItspHeader.getDataRemained(),
-                        chmItspHeader.getIndex_depth()));
-        chmItspHeader
-                .setIndex_root(chmItspHeader.unmarshalInt32(data,
-                        chmItspHeader.getDataRemained(),
-                        chmItspHeader.getIndex_root()));
-        chmItspHeader
-                .setIndex_head(chmItspHeader.unmarshalInt32(data,
-                        chmItspHeader.getDataRemained(),
-                        chmItspHeader.getIndex_head()));
-        chmItspHeader.setUnknown_0024(chmItspHeader.unmarshalInt32(data,
-                chmItspHeader.getDataRemained(),
-                chmItspHeader.getUnknown_0024()));
-        chmItspHeader
-                .setNum_blocks(chmItspHeader.unmarshalUInt32(data,
-                        chmItspHeader.getDataRemained(),
-                        chmItspHeader.getNum_blocks()));
-        chmItspHeader.setUnknown_002c((chmItspHeader.unmarshalInt32(data,
-                chmItspHeader.getDataRemained(),
-                chmItspHeader.getUnknown_002c())));
-        chmItspHeader.setLang_id(chmItspHeader.unmarshalUInt32(data,
-                chmItspHeader.getDataRemained(), chmItspHeader.getLang_id()));
-        chmItspHeader
-                .setSystem_uuid(chmItspHeader.unmarshalUuid(data,
-                        chmItspHeader.getDataRemained(),
-                        chmItspHeader.getSystem_uuid(),
-                        ChmConstants.BYTE_ARRAY_LENGHT));
-        chmItspHeader
-                .setUnknown_0044(chmItspHeader.unmarshalUuid(data,
-                        chmItspHeader.getDataRemained(),
-                        chmItspHeader.getUnknown_0044(),
-                        ChmConstants.BYTE_ARRAY_LENGHT));
-
-        /* Checks validity of the itsp header */
-        if (!new String(chmItspHeader.getSignature(), UTF_8).equals(ChmConstants.ITSP))
-                throw new ChmParsingException("seems not valid signature");
-
-        if (chmItspHeader.getVersion() != ChmConstants.CHM_VER_1)
-            throw new ChmParsingException("!=ChmConstants.CHM_VER_1");
-
-        if (chmItspHeader.getHeader_len() != ChmConstants.CHM_ITSP_V1_LEN)
-            throw new ChmParsingException("!= ChmConstants.CHM_ITSP_V1_LEN");
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Directory header The directory starts with a header; its format is as
+ * follows: 0000: char[4] 'ITSP' 0004: DWORD Version number 1 0008: DWORD Length
+ * of the directory header 000C: DWORD $0a (unknown) 0010: DWORD $1000 Directory
+ * chunk size 0014: DWORD "Density" of quickref section, usually 2 0018: DWORD
+ * Depth of the index tree - 1 there is no index, 2 if there is one level of
+ * PMGI chunks 001C: DWORD Chunk number of root index chunk, -1 if there is none
+ * (though at least one file has 0 despite there being no index chunk, probably
+ * a bug) 0020: DWORD Chunk number of first PMGL (listing) chunk 0024: DWORD
+ * Chunk number of last PMGL (listing) chunk 0028: DWORD -1 (unknown) 002C:
+ * DWORD Number of directory chunks (total) 0030: DWORD Windows language ID
+ * 0034: GUID {5D02926A-212E-11D0-9DF9-00A0C922E6EC} 0044: DWORD $54 (This is
+ * the length again) 0048: DWORD -1 (unknown) 004C: DWORD -1 (unknown) 0050:
+ * DWORD -1 (unknown)
+ * 
+ * @see <a href="http://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original/?show-translation-form=1">Description
+ *      of the CHM format (incomplete)</a>
+ * 
+ */
+public class ChmItspHeader implements ChmAccessor<ChmItspHeader> {
+    // TODO: refactor all unmarshals
+    private static final long serialVersionUID = 1962394421998181341L;
+    private byte[] signature; /* hex offset 0; initialised to the ITSP bytes by the constructor */
+    private int version; /* hex offset 4 */
+    private int header_len; /* hex offset 8 */
+    private int unknown_000c; /* hex offset c */
+    private long block_len; /* hex offset 10; read via unmarshalUInt32 */
+    private int blockidx_intvl; /* hex offset 14 */
+    private int index_depth; /* hex offset 18 */
+    private int index_root; /* hex offset 1c */
+    private int index_head; /* hex offset 20 */
+    private int unknown_0024; /* hex offset 24 */
+    private long num_blocks; /* hex offset 28; read via unmarshalUInt32 */
+    private int unknown_002c; /* hex offset 2c */
+    private long lang_id; /* hex offset 30; read via unmarshalUInt32 */
+    private byte[] system_uuid = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* hex offset 34 */
+    private byte[] unknown_0044 = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* hex offset 44 */
+
+    /* parsing state used only by the unmarshal helpers */
+    private int dataRemained; /* bytes of the input not yet consumed */
+    private int currentPlace = 0; /* index in the input of the next byte to read */
+
+    /**
+     * Creates a header whose signature (the field at offset 0) is pre-set to
+     * the UTF-8 bytes of {@link ChmConstants#ITSP}. Every other field keeps
+     * the value given by its declaration.
+     */
+    public ChmItspHeader() {
+        this.signature = ChmConstants.ITSP.getBytes(UTF_8);
+    }
+
+    /**
+     * Builds a multi-line, human-readable dump of every header field, one
+     * field per line, separated by the platform line separator.
+     * <p>
+     * The two byte-array fields are rendered with
+     * {@link java.util.Arrays#toString(byte[])} so that their contents are
+     * shown; plain string concatenation of an array would only print its
+     * type and identity hash.
+     *
+     * @return textual representation of this header
+     */
+    @Override
+    public String toString() {
+        // Looked up once instead of once per appended line.
+        final String eol = System.getProperty("line.separator");
+        StringBuilder sb = new StringBuilder();
+        sb.append("[ signature:=").append(new String(getSignature(), UTF_8))
+                .append(eol);
+        sb.append("version:=\t").append(getVersion()).append(eol);
+        sb.append("header_len:=\t").append(getHeader_len()).append(eol);
+        sb.append("unknown_00c:=\t").append(getUnknown_000c()).append(eol);
+        sb.append("block_len:=\t").append(getBlock_len())
+                .append(" [directory chunk size]").append(eol);
+        sb.append("blockidx_intvl:=").append(getBlockidx_intvl())
+                .append(", density of quickref section, usually 2")
+                .append(eol);
+        sb.append("index_depth:=\t").append(getIndex_depth())
+                .append(", depth of the index tree - 1 there is no index, 2 if there is one level of PMGI chunk")
+                .append(eol);
+        sb.append("index_root:=\t").append(getIndex_root())
+                .append(", chunk number of root index chunk, -1 if there is none")
+                .append(eol);
+        sb.append("index_head:=\t").append(getIndex_head())
+                .append(", chunk number of first PMGL (listing) chunk")
+                .append(eol);
+        sb.append("unknown_0024:=\t").append(getUnknown_0024())
+                .append(", chunk number of last PMGL (listing) chunk")
+                .append(eol);
+        sb.append("num_blocks:=\t").append(getNum_blocks())
+                .append(", -1 (unknown)").append(eol);
+        sb.append("unknown_002c:=\t").append(getUnknown_002c())
+                .append(", number of directory chunks (total)")
+                .append(eol);
+        // Concatenation kept here so the helper's result is stringified
+        // exactly as before, whatever type it returns.
+        sb.append("lang_id:=\t").append(getLang_id())
+                .append(" - " + ChmCommons.getLanguage(getLang_id()))
+                .append(eol);
+        sb.append("system_uuid:=")
+                .append(java.util.Arrays.toString(getSystem_uuid()))
+                .append(eol);
+        sb.append("unknown_0044:=")
+                .append(java.util.Arrays.toString(getUnknown_0044()))
+                .append(" ]");
+        return sb.toString();
+    }
+
+    /**
+     * Copies the first {@code count} bytes of {@code data} into the signature
+     * array of {@code chmItspHeader} and (re)initialises this object's
+     * parsing state: afterwards the read cursor sits at index {@code count}
+     * and the remaining-byte counter is {@code data.length - count}.
+     * <p>
+     * The bytes are always taken from index 0 of {@code data}, so the cursor
+     * is set absolutely instead of being advanced from its previous value;
+     * this keeps a second parse on the same object from starting at a stale
+     * position.
+     *
+     * @param data raw header bytes
+     * @param chmItspHeader header whose signature array receives the bytes
+     * @param count number of bytes to copy
+     * @throws TikaException propagated from the {@link ChmAssert} checks on
+     *             the two arguments
+     */
+    private void unmarshalCharArray(byte[] data, ChmItspHeader chmItspHeader,
+            int count) throws TikaException {
+        ChmAssert.assertByteArrayNotNull(data);
+        ChmAssert.assertChmAccessorNotNull(chmItspHeader);
+        System.arraycopy(data, 0, chmItspHeader.signature, 0, count);
+        this.setCurrentPlace(count);
+        this.setDataRemained(data.length - count);
+    }
+
+    private int unmarshalInt32(byte[] data, int dataLenght, int dest) throws TikaException {
+        ChmAssert.assertByteArrayNotNull(data);
+        if (4 > this.getDataRemained())
+            throw new TikaException("4 > dataLenght");
+        dest = (data[this.getCurrentPlace()] & 0xff)
+                | (data[this.getCurrentPlace() + 1] & 0xff) << 8
+                | (data[this.getCurrentPlace() + 2] & 0xff) << 16
+                | (data[this.getCurrentPlace() + 3] & 0xff) << 24;
+
+        this.setCurrentPlace(this.getCurrentPlace() + 4);
+        this.setDataRemained(this.getDataRemained() - 4);
+        return dest;
+    }
+
+    private long unmarshalUInt32(byte[] data, int dataLenght, long dest) throws TikaException {
+        ChmAssert.assertByteArrayNotNull(data);
+        if (4 > dataLenght)
+            throw new TikaException("4 > dataLenght");
+        dest = (data[this.getCurrentPlace()] & 0xff)
+                | (data[this.getCurrentPlace() + 1] & 0xff) << 8
+                | (data[this.getCurrentPlace() + 2] & 0xff) << 16
+                | (data[this.getCurrentPlace() + 3] & 0xff) << 24;
+
+        setDataRemained(this.getDataRemained() - 4);
+        this.setCurrentPlace(this.getCurrentPlace() + 4);
+        return dest;
+    }
+
+    /**
+     * Copies {@code count} bytes, starting at the current cursor, into
+     * {@code dest} (from index 0), then moves the cursor forward by
+     * {@code count} and lowers the remaining-byte counter accordingly.
+     * <p>
+     * Like the other unmarshal helpers, the input is checked first so that a
+     * truncated buffer is reported as a {@link TikaException} instead of an
+     * unchecked array-index failure.
+     *
+     * @param data raw header bytes
+     * @param dataLenght not consulted here; the length check uses this
+     *            object's own remaining-byte counter
+     * @param dest array that receives the bytes
+     * @param count number of bytes to copy
+     * @return {@code dest}
+     * @throws TikaException if fewer than {@code count} bytes remain, or
+     *             propagated from the {@link ChmAssert} check on {@code data}
+     */
+    private byte[] unmarshalUuid(byte[] data, int dataLenght, byte[] dest,
+            int count) throws TikaException {
+        ChmAssert.assertByteArrayNotNull(data);
+        if (count > this.getDataRemained()) {
+            throw new TikaException(count + " > dataRemained");
+        }
+        System.arraycopy(data, this.getCurrentPlace(), dest, 0, count);
+        this.setCurrentPlace(this.getCurrentPlace() + count);
+        this.setDataRemained(this.getDataRemained() - count);
+        return dest;
+    }
+
+    /**
+     * Reports the remaining-byte counter maintained by the unmarshal helpers.
+     *
+     * @return number of input bytes not yet consumed
+     */
+    private int getDataRemained() {
+        return this.dataRemained;
+    }
+
+    /**
+     * Overwrites the remaining-byte counter.
+     *
+     * @param dataRemained new count of unconsumed input bytes
+     */
+    private void setDataRemained(final int dataRemained) {
+        this.dataRemained = dataRemained;
+    }
+
+    /**
+     * Reports the read cursor used by the unmarshal helpers.
+     *
+     * @return index of the next input byte to read
+     */
+    private int getCurrentPlace() {
+        return this.currentPlace;
+    }
+
+    /**
+     * Moves the read cursor to the given index.
+     *
+     * @param currentPlace index of the next input byte to read
+     */
+    private void setCurrentPlace(final int currentPlace) {
+        this.currentPlace = currentPlace;
+    }
+
+    /**
+     * Gives access to the header signature bytes. The internal array itself
+     * is returned, not a copy.
+     *
+     * @return the signature array
+     */
+    public byte[] getSignature() {
+        return this.signature;
+    }
+
+    /**
+     * Replaces the header signature array. The reference is stored as given;
+     * no copy is made.
+     *
+     * @param signature array to use as the signature
+     */
+    protected void setSignature(final byte[] signature) {
+        this.signature = signature;
+    }
+
+    /**
+     * Gives the version number stored in this header.
+     *
+     * @return the {@code version} field
+     */
+    public int getVersion() {
+        return this.version;
+    }
+
+    /**
+     * Stores a version number in this header.
+     *
+     * @param version value for the {@code version} field
+     */
+    protected void setVersion(final int version) {
+        this.version = version;
+    }
+
+    /**
+     * Gives the header length stored in this header.
+     *
+     * @return the {@code header_len} field
+     */
+    public int getHeader_len() {
+        return this.header_len;
+    }
+
+    /**
+     * Stores a header length in this header.
+     *
+     * @param header_len value for the {@code header_len} field
+     */
+    protected void setHeader_len(final int header_len) {
+        this.header_len = header_len;
+    }
+
+    /**
+     * Gives the value read from the field at hex offset 000c, whose meaning
+     * is not known.
+     *
+     * @return the {@code unknown_000c} field
+     */
+    public int getUnknown_000c() {
+        return this.unknown_000c;
+    }
+
+    /**
+     * Stores the value of the field at hex offset 000c. "Unknown" here means
+     * that the people who reverse-engineered the CHM format did not
+     * determine what the field is for.
+     *
+     * @param unknown_000c value for the {@code unknown_000c} field
+     */
+    protected void setUnknown_000c(final int unknown_000c) {
+        this.unknown_000c = unknown_000c;
+    }
+
+    /**
+     * Gives the block length stored in this header.
+     *
+     * @return the {@code block_len} field
+     */
+    public long getBlock_len() {
+        return this.block_len;
+    }
+
+    /**
+     * Stores a block length in this header.
+     *
+     * @param block_len value for the {@code block_len} field
+     */
+    protected void setBlock_len(final long block_len) {
+        this.block_len = block_len;
+    }
+
+    /**
+     * Gives the block index interval stored in this header.
+     *
+     * @return the {@code blockidx_intvl} field
+     */
+    public int getBlockidx_intvl() {
+        return this.blockidx_intvl;
+    }
+
+    /**
+     * Stores a block index interval in this header.
+     *
+     * @param blockidx_intvl value for the {@code blockidx_intvl} field
+     */
+    protected void setBlockidx_intvl(final int blockidx_intvl) {
+        this.blockidx_intvl = blockidx_intvl;
+    }
+
+    /**
+     * Gives the index depth stored in this header.
+     *
+     * @return the {@code index_depth} field
+     */
+    public int getIndex_depth() {
+        return this.index_depth;
+    }
+
+    /**
+     * Stores an index depth in this header.
+     *
+     * @param index_depth value for the {@code index_depth} field
+     */
+    protected void setIndex_depth(final int index_depth) {
+        this.index_depth = index_depth;
+    }
+
+    /**
+     * Gives the index root stored in this header.
+     *
+     * @return the {@code index_root} field
+     */
+    public int getIndex_root() {
+        return this.index_root;
+    }
+
+    /**
+     * Stores an index root in this header.
+     *
+     * @param index_root value for the {@code index_root} field
+     */
+    protected void setIndex_root(final int index_root) {
+        this.index_root = index_root;
+    }
+
+    /**
+     * Gives the index head stored in this header.
+     *
+     * @return the {@code index_head} field
+     */
+    public int getIndex_head() {
+        return this.index_head;
+    }
+
+    /**
+     * Stores an index head in this header.
+     *
+     * @param index_head value for the {@code index_head} field
+     */
+    protected void setIndex_head(final int index_head) {
+        this.index_head = index_head;
+    }
+
+    /**
+     * Gives the value read from the field at hex offset 0024.
+     *
+     * @return the {@code unknown_0024} field
+     */
+    public int getUnknown_0024() {
+        return this.unknown_0024;
+    }
+
+    /**
+     * Stores the value of the field at hex offset 0024.
+     *
+     * @param unknown_0024 value for the {@code unknown_0024} field
+     */
+    protected void setUnknown_0024(final int unknown_0024) {
+        this.unknown_0024 = unknown_0024;
+    }
+
+    /**
+     * Gives the number of blocks stored in this header.
+     *
+     * @return the {@code num_blocks} field
+     */
+    public long getNum_blocks() {
+        return this.num_blocks;
+    }
+
+    /**
+     * Stores the number of blocks in this header.
+     *
+     * @param num_blocks value for the {@code num_blocks} field
+     */
+    protected void setNum_blocks(final long num_blocks) {
+        this.num_blocks = num_blocks;
+    }
+
+    /**
+     * Gives the value read from the field at hex offset 002c.
+     *
+     * @return the {@code unknown_002c} field
+     */
+    public int getUnknown_002c() {
+        return this.unknown_002c;
+    }
+
+    /**
+     * Stores the value of the field at hex offset 002c.
+     *
+     * @param unknown_002c value for the {@code unknown_002c} field
+     */
+    protected void setUnknown_002c(final int unknown_002c) {
+        this.unknown_002c = unknown_002c;
+    }
+
+    /**
+     * Gives the language id stored in this header.
+     *
+     * @return the {@code lang_id} field
+     */
+    public long getLang_id() {
+        return this.lang_id;
+    }
+
+    /**
+     * Stores a language id in this header.
+     *
+     * @param lang_id value for the {@code lang_id} field
+     */
+    protected void setLang_id(final long lang_id) {
+        this.lang_id = lang_id;
+    }
+
+    /**
+     * Gives access to the system uuid bytes. The internal array itself is
+     * returned, not a copy.
+     *
+     * @return the {@code system_uuid} array
+     */
+    public byte[] getSystem_uuid() {
+        return this.system_uuid;
+    }
+
+    /**
+     * Replaces the system uuid array. The reference is stored as given; no
+     * copy is made.
+     *
+     * @param system_uuid array to use for the {@code system_uuid} field
+     */
+    protected void setSystem_uuid(final byte[] system_uuid) {
+        this.system_uuid = system_uuid;
+    }
+
+    /**
+     * Gives access to the bytes read from hex offset 0044. The internal
+     * array itself is returned, not a copy.
+     *
+     * @return the {@code unknown_0044} array
+     */
+    public byte[] getUnknown_0044() {
+        return this.unknown_0044;
+    }
+
+    /**
+     * Sets 0044 unknown bytes; the array is stored by reference, not copied
+     * 
+     * @param unknown_0044 array to keep as the 0044 unknown bytes
+     */
+    protected void setUnknown_0044(byte[] unknown_0044) {
+        this.unknown_0044 = unknown_0044;
+    }
+
+    // Fills chmItspHeader from the raw header bytes in data (@Override is left commented out)
+    public void parse(byte[] data, ChmItspHeader chmItspHeader) throws TikaException {
+        /* only a buffer of exactly CHM_ITSP_V1_LEN bytes is accepted; data is not null-checked */
+        if (data.length != ChmConstants.CHM_ITSP_V1_LEN)
+            throw new ChmParsingException("we only know how to deal with the 0x58 and 0x60 byte structures");
+
+        /* unmarshal the signature first, then every other field in the order listed below */
+        chmItspHeader.unmarshalCharArray(data, chmItspHeader, ChmConstants.CHM_SIGNATURE_LEN);
+        // each call below hands the helper the header's current dataRemained and the
+        // field's current value, and stores whatever the helper returns in that field
+        chmItspHeader.setVersion(chmItspHeader.unmarshalInt32(data,
+                chmItspHeader.getDataRemained(), chmItspHeader.getVersion()));
+        chmItspHeader
+                .setHeader_len(chmItspHeader.unmarshalInt32(data,
+                        chmItspHeader.getDataRemained(),
+                        chmItspHeader.getHeader_len()));
+        chmItspHeader.setUnknown_000c(chmItspHeader.unmarshalInt32(data,
+                chmItspHeader.getDataRemained(),
+                chmItspHeader.getUnknown_000c()));
+        chmItspHeader.setBlock_len(chmItspHeader.unmarshalUInt32(data,
+                chmItspHeader.getDataRemained(), chmItspHeader.getBlock_len()));
+        chmItspHeader.setBlockidx_intvl(chmItspHeader.unmarshalInt32(data,
+                chmItspHeader.getDataRemained(),
+                chmItspHeader.getBlockidx_intvl()));
+        chmItspHeader
+                .setIndex_depth(chmItspHeader.unmarshalInt32(data,
+                        chmItspHeader.getDataRemained(),
+                        chmItspHeader.getIndex_depth()));
+        chmItspHeader
+                .setIndex_root(chmItspHeader.unmarshalInt32(data,
+                        chmItspHeader.getDataRemained(),
+                        chmItspHeader.getIndex_root()));
+        chmItspHeader
+                .setIndex_head(chmItspHeader.unmarshalInt32(data,
+                        chmItspHeader.getDataRemained(),
+                        chmItspHeader.getIndex_head()));
+        chmItspHeader.setUnknown_0024(chmItspHeader.unmarshalInt32(data,
+                chmItspHeader.getDataRemained(),
+                chmItspHeader.getUnknown_0024()));
+        chmItspHeader
+                .setNum_blocks(chmItspHeader.unmarshalUInt32(data,
+                        chmItspHeader.getDataRemained(),
+                        chmItspHeader.getNum_blocks()));
+        chmItspHeader.setUnknown_002c((chmItspHeader.unmarshalInt32(data,
+                chmItspHeader.getDataRemained(),
+                chmItspHeader.getUnknown_002c())));
+        chmItspHeader.setLang_id(chmItspHeader.unmarshalUInt32(data,
+                chmItspHeader.getDataRemained(), chmItspHeader.getLang_id()));
+        chmItspHeader
+                .setSystem_uuid(chmItspHeader.unmarshalUuid(data,
+                        chmItspHeader.getDataRemained(),
+                        chmItspHeader.getSystem_uuid(),
+                        ChmConstants.BYTE_ARRAY_LENGHT));
+        chmItspHeader
+                .setUnknown_0044(chmItspHeader.unmarshalUuid(data,
+                        chmItspHeader.getDataRemained(),
+                        chmItspHeader.getUnknown_0044(),
+                        ChmConstants.BYTE_ARRAY_LENGHT));
+
+        /* validate only after all fields are read: signature, then version, then header length */
+        if (!new String(chmItspHeader.getSignature(), UTF_8).equals(ChmConstants.ITSP))
+                throw new ChmParsingException("seems not valid signature");
+
+        if (chmItspHeader.getVersion() != ChmConstants.CHM_VER_1)
+            throw new ChmParsingException("!=ChmConstants.CHM_VER_1");
+
+        if (chmItspHeader.getHeader_len() != ChmConstants.CHM_ITSP_V1_LEN)
+            throw new ChmParsingException("!= ChmConstants.CHM_ITSP_V1_LEN");
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java
index 6054695..17a2e2f 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java
@@ -1,319 +1,319 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.accessor;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.assertion.ChmAssert;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.apache.tika.parser.chm.exception.ChmParsingException;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-/**
- * 
- * ::DataSpace/Storage/<SectionName>/ControlData This file contains $20 bytes of
- * information on the compression. The information is partially known: 0000:
- * DWORD 6 (unknown) 0004: ASCII 'LZXC' Compression type identifier 0008: DWORD
- * 2 (Possibly numeric code for LZX) 000C: DWORD The Huffman reset interval in
- * $8000-byte blocks 0010: DWORD The window size in $8000-byte blocks 0014:
- * DWORD unknown (sometimes 2, sometimes 1, sometimes 0) 0018: DWORD 0 (unknown)
- * 001C: DWORD 0 (unknown)
- * 
- * {@link http
- * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original
- * /?page=2 }
- * 
- */
-public class ChmLzxcControlData implements ChmAccessor<ChmLzxcControlData> {
-    private static final long serialVersionUID = -7897854774939631565L;
-    /* class' members */
-    private long size; /* 0 */
-    private byte[] signature;
-    private long version; /* 8 */
-    private long resetInterval; /* c */
-    private long windowSize; /* 10 */
-    private long windowsPerReset; /* 14 */
-    private long unknown_18; /* 18 */
-
-    /* local usage */
-    private int dataRemained;
-    private int currentPlace = 0;
-
-    public ChmLzxcControlData() {
-        signature = ChmConstants.LZXC.getBytes(UTF_8); /*
-                                                        * 4
-                                                        * (LZXC
-                                                        * )
-                                                        */
-    }
-
-    /**
-     * Returns a remained data
-     * 
-     * @return dataRemained
-     */
-    private int getDataRemained() {
-        return dataRemained;
-    }
-
-    /**
-     * Sets a remained data
-     * 
-     * @param dataRemained
-     */
-    private void setDataRemained(int dataRemained) {
-        this.dataRemained = dataRemained;
-    }
-
-    /**
-     * Returns a place holder
-     * 
-     * @return current_place
-     */
-    private int getCurrentPlace() {
-        return currentPlace;
-    }
-
-    /**
-     * Sets a place holder
-     * 
-     * @param current_place
-     */
-    private void setCurrentPlace(int currentPlace) {
-        this.currentPlace = currentPlace;
-    }
-
-    /**
-     * Returns a size of control data
-     * 
-     * @return size
-     */
-    public long getSize() {
-        return size;
-    }
-
-    /**
-     * Sets a size of control data
-     * 
-     * @param size
-     */
-    protected void setSize(long size) {
-        this.size = size;
-    }
-
-    /**
-     * Returns a signature of control data block
-     * 
-     * @return signature
-     */
-    public byte[] getSignature() {
-        return signature;
-    }
-
-    /**
-     * Sets a signature of control data block
-     * 
-     * @param signature
-     */
-    protected void setSignature(byte[] signature) {
-        this.signature = signature;
-    }
-
-    /**
-     * Returns a version of control data block
-     * 
-     * @return version
-     */
-    public long getVersion() {
-        return version;
-    }
-
-    /**
-     * Sets version of control data block
-     * 
-     * @param version
-     */
-    protected void setVersion(long version) {
-        this.version = version;
-    }
-
-    /**
-     * Returns reset interval
-     * 
-     * @return reset_interval
-     */
-    public long getResetInterval() {
-        return resetInterval;
-    }
-
-    /**
-     * Sets a reset interval
-     * 
-     * @param resetInterval
-     */
-    protected void setResetInterval(long resetInterval) {
-        this.resetInterval = resetInterval;
-    }
-
-    /**
-     * Returns a window size
-     * 
-     * @return window_size
-     */
-    public long getWindowSize() {
-        return windowSize;
-    }
-
-    /**
-     * Sets a window size
-     * 
-     * @param window_size
-     */
-    protected void setWindowSize(long windowSize) {
-        this.windowSize = windowSize;
-    }
-
-    /**
-     * Returns windows per reset
-     * 
-     * @return
-     */
-    public long getWindowsPerReset() {
-        return windowsPerReset;
-    }
-
-    /**
-     * Sets windows per reset
-     * 
-     * @param windows_per_reset
-     */
-    protected void setWindowsPerReset(long windowsPerReset) {
-        this.windowsPerReset = windowsPerReset;
-    }
-
-    /**
-     * Returns unknown 18 bytes
-     * 
-     * @return unknown_18
-     */
-    public long getUnknown_18() {
-        return unknown_18;
-    }
-
-    /**
-     * Sets unknown 18 bytes
-     * 
-     * @param unknown_18
-     */
-    protected void setUnknown_18(long unknown_18) {
-        this.unknown_18 = unknown_18;
-    }
-
-    private long unmarshalUInt32(byte[] data, long dest) throws ChmParsingException {
-        assert (data != null && data.length > 0);
-        if (4 > getDataRemained())
-            throw new ChmParsingException("4 > dataLenght");
-        dest = data[this.getCurrentPlace()]
-                | data[this.getCurrentPlace() + 1] << 8
-                | data[this.getCurrentPlace() + 2] << 16
-                | data[this.getCurrentPlace() + 3] << 24;
-
-        setDataRemained(this.getDataRemained() - 4);
-        this.setCurrentPlace(this.getCurrentPlace() + 4);
-        return dest;
-    }
-
-    private void unmarshalCharArray(byte[] data,
-            ChmLzxcControlData chmLzxcControlData, int count) throws TikaException {
-        ChmAssert.assertByteArrayNotNull(data);
-        ChmAssert.assertChmAccessorNotNull(chmLzxcControlData);
-        ChmAssert.assertPositiveInt(count);
-        System.arraycopy(data, 4, chmLzxcControlData.getSignature(), 0, count);
-        this.setCurrentPlace(this.getCurrentPlace() + count);
-        this.setDataRemained(this.getDataRemained() - count);
-    }
-
-    /**
-     * Returns textual representation of ChmLzxcControlData
-     */
-    public String toString() {
-        StringBuilder sb = new StringBuilder();
-        sb.append("size(unknown):=" + this.getSize() + ", ");
-        sb.append("signature(Compression type identifier):="
-                + new String(this.getSignature(), UTF_8) + ", ");
-        sb.append("version(Possibly numeric code for LZX):="
-                + this.getVersion() + System.getProperty("line.separator"));
-        sb.append("resetInterval(The Huffman reset interval):="
-                + this.getResetInterval() + ", ");
-        sb.append("windowSize:=" + this.getWindowSize() + ", ");
-        sb.append("windowsPerReset(unknown (sometimes 2, sometimes 1, sometimes 0):="
-                + this.getWindowsPerReset() + ", ");
-        sb.append("unknown_18:=" + this.getUnknown_18()
-                + System.getProperty("line.separator"));
-        return sb.toString();
-    }
-
-    // @Override
-    public void parse(byte[] data, ChmLzxcControlData chmLzxcControlData) throws TikaException {
-        if (data == null || (data.length < ChmConstants.CHM_LZXC_MIN_LEN))
-            throw new ChmParsingException("we want at least 0x18 bytes");
-        chmLzxcControlData.setDataRemained(data.length);
-        chmLzxcControlData.setSize(unmarshalUInt32(data, chmLzxcControlData.getSize()));
-        chmLzxcControlData.unmarshalCharArray(data, chmLzxcControlData,
-                ChmConstants.CHM_SIGNATURE_LEN);
-        chmLzxcControlData.setVersion(unmarshalUInt32(data,
-                chmLzxcControlData.getVersion()));
-        chmLzxcControlData.setResetInterval(unmarshalUInt32(data,
-                chmLzxcControlData.getResetInterval()));
-        chmLzxcControlData.setWindowSize(unmarshalUInt32(data,
-                chmLzxcControlData.getWindowSize()));
-        chmLzxcControlData.setWindowsPerReset(unmarshalUInt32(data,
-                chmLzxcControlData.getWindowsPerReset()));
-
-        if (data.length >= ChmConstants.CHM_LZXC_V2_LEN)
-            chmLzxcControlData.setUnknown_18(unmarshalUInt32(data,
-                    chmLzxcControlData.getUnknown_18()));
-        else
-            chmLzxcControlData.setUnknown_18(0);
-
-        if (chmLzxcControlData.getVersion() == 2) {
-            chmLzxcControlData.setWindowSize(getWindowSize()
-                    * ChmConstants.CHM_WINDOW_SIZE_BLOCK);
-        }
-
-        if (chmLzxcControlData.getWindowSize() == 0
-                || chmLzxcControlData.getResetInterval() == 0)
-            throw new ChmParsingException(
-                    "window size / resetInterval should be more than zero");
-
-        if (chmLzxcControlData.getWindowSize() == 1)
-            throw new ChmParsingException(
-                    "window size / resetInterval should be more than 1");
-
-        /* checks a signature */
-        if (!new String(chmLzxcControlData.getSignature(), UTF_8)
-                .equals(ChmConstants.LZXC))
-            throw new ChmParsingException(
-                    "the signature does not seem to be correct");
-    }
-
-    /**
-     * @param args
-     */
-    public static void main(String[] args) {
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * 
+ * ::DataSpace/Storage/<SectionName>/ControlData This file contains $20 bytes of
+ * information on the compression. The information is partially known: 0000:
+ * DWORD 6 (unknown) 0004: ASCII 'LZXC' Compression type identifier 0008: DWORD
+ * 2 (Possibly numeric code for LZX) 000C: DWORD The Huffman reset interval in
+ * $8000-byte blocks 0010: DWORD The window size in $8000-byte blocks 0014:
+ * DWORD unknown (sometimes 2, sometimes 1, sometimes 0) 0018: DWORD 0 (unknown)
+ * 001C: DWORD 0 (unknown)
+ * 
+ * See <a href=
+ * "http://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original/?page=2">
+ * Microsoft's HTML Help (.chm) format (incomplete), page 2</a>
+ * 
+ */
+public class ChmLzxcControlData implements ChmAccessor<ChmLzxcControlData> {
+    private static final long serialVersionUID = -7897854774939631565L;
+    /* class' members */
+    private long size; /* 0 */
+    private byte[] signature;
+    private long version; /* 8 */
+    private long resetInterval; /* c */
+    private long windowSize; /* 10 */
+    private long windowsPerReset; /* 14 */
+    private long unknown_18; /* 18 */
+
+    /* local usage */
+    private int dataRemained;
+    private int currentPlace = 0;
+
+    public ChmLzxcControlData() {
+        signature = ChmConstants.LZXC.getBytes(UTF_8); /*
+                                                        * 4
+                                                        * (LZXC
+                                                        * )
+                                                        */
+    }
+
+    /**
+     * Returns a remained data
+     * 
+     * @return dataRemained
+     */
+    private int getDataRemained() {
+        return dataRemained;
+    }
+
+    /**
+     * Sets a remained data
+     * 
+     * @param dataRemained
+     */
+    private void setDataRemained(int dataRemained) {
+        this.dataRemained = dataRemained;
+    }
+
+    /**
+     * Returns a place holder
+     * 
+     * @return current_place
+     */
+    private int getCurrentPlace() {
+        return currentPlace;
+    }
+
+    /**
+     * Sets the read position, i.e. the index into the data array that unmarshalUInt32 reads from
+     * 
+     * @param currentPlace index of the next byte to read
+     */
+    private void setCurrentPlace(int currentPlace) {
+        this.currentPlace = currentPlace;
+    }
+
+    /**
+     * Returns a size of control data
+     * 
+     * @return size
+     */
+    public long getSize() {
+        return size;
+    }
+
+    /**
+     * Sets a size of control data
+     * 
+     * @param size
+     */
+    protected void setSize(long size) {
+        this.size = size;
+    }
+
+    /**
+     * Returns a signature of control data block
+     * 
+     * @return signature
+     */
+    public byte[] getSignature() {
+        return signature;
+    }
+
+    /**
+     * Sets a signature of control data block
+     * 
+     * @param signature
+     */
+    protected void setSignature(byte[] signature) {
+        this.signature = signature;
+    }
+
+    /**
+     * Returns a version of control data block
+     * 
+     * @return version
+     */
+    public long getVersion() {
+        return version;
+    }
+
+    /**
+     * Sets version of control data block
+     * 
+     * @param version
+     */
+    protected void setVersion(long version) {
+        this.version = version;
+    }
+
+    /**
+     * Returns reset interval
+     * 
+     * @return reset_interval
+     */
+    public long getResetInterval() {
+        return resetInterval;
+    }
+
+    /**
+     * Sets a reset interval
+     * 
+     * @param resetInterval
+     */
+    protected void setResetInterval(long resetInterval) {
+        this.resetInterval = resetInterval;
+    }
+
+    /**
+     * Returns a window size
+     * 
+     * @return window_size
+     */
+    public long getWindowSize() {
+        return windowSize;
+    }
+
+    /**
+     * Sets a window size
+     * 
+     * @param windowSize the window size to store
+     */
+    protected void setWindowSize(long windowSize) {
+        this.windowSize = windowSize;
+    }
+
+    /**
+     * Returns windows per reset
+     * 
+     * @return windowsPerReset
+     */
+    public long getWindowsPerReset() {
+        return windowsPerReset;
+    }
+
+    /**
+     * Sets windows per reset
+     * 
+     * @param windowsPerReset the windows-per-reset value to store
+     */
+    protected void setWindowsPerReset(long windowsPerReset) {
+        this.windowsPerReset = windowsPerReset;
+    }
+
+    /**
+     * Returns unknown 18 bytes
+     * 
+     * @return unknown_18
+     */
+    public long getUnknown_18() {
+        return unknown_18;
+    }
+
+    /**
+     * Sets unknown 18 bytes
+     * 
+     * @param unknown_18
+     */
+    protected void setUnknown_18(long unknown_18) {
+        this.unknown_18 = unknown_18;
+    }
+
+    /* Reads four bytes at the cursor as a little-endian unsigned 32-bit value, then advances. */
+    private long unmarshalUInt32(byte[] data, long dest) throws ChmParsingException {
+        assert (data != null && data.length > 0);
+        if (4 > getDataRemained())
+            throw new ChmParsingException("4 > dataLenght");
+        int at = this.getCurrentPlace();
+        /* bytes are signed in Java: mask to long so neither they nor the result sign-extend */
+        dest = (data[at] & 0xffL) | (data[at + 1] & 0xffL) << 8
+                | (data[at + 2] & 0xffL) << 16 | (data[at + 3] & 0xffL) << 24;
+        setDataRemained(this.getDataRemained() - 4);
+        this.setCurrentPlace(at + 4);
+        return dest;
+    }
+
+    private void unmarshalCharArray(byte[] data,
+            ChmLzxcControlData chmLzxcControlData, int count) throws TikaException {
+        ChmAssert.assertByteArrayNotNull(data);
+        ChmAssert.assertChmAccessorNotNull(chmLzxcControlData);
+        ChmAssert.assertPositiveInt(count);
+        System.arraycopy(data, 4, chmLzxcControlData.getSignature(), 0, count);
+        this.setCurrentPlace(this.getCurrentPlace() + count);
+        this.setDataRemained(this.getDataRemained() - count);
+    }
+
+    /**
+     * Returns textual representation of ChmLzxcControlData
+     */
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("size(unknown):=" + this.getSize() + ", ");
+        sb.append("signature(Compression type identifier):="
+                + new String(this.getSignature(), UTF_8) + ", ");
+        sb.append("version(Possibly numeric code for LZX):="
+                + this.getVersion() + System.getProperty("line.separator"));
+        sb.append("resetInterval(The Huffman reset interval):="
+                + this.getResetInterval() + ", ");
+        sb.append("windowSize:=" + this.getWindowSize() + ", ");
+        sb.append("windowsPerReset(unknown (sometimes 2, sometimes 1, sometimes 0):="
+                + this.getWindowsPerReset() + ", ");
+        sb.append("unknown_18:=" + this.getUnknown_18()
+                + System.getProperty("line.separator"));
+        return sb.toString();
+    }
+
+    /**
+     * Parses an LZXC control data block into chmLzxcControlData. Fields are read
+     * starting at offset 0 of data; unknown_18 is read only when data is at
+     * least CHM_LZXC_V2_LEN bytes long and is set to 0 otherwise.
+     *
+     * @param data raw control data, at least CHM_LZXC_MIN_LEN bytes
+     * @param chmLzxcControlData instance that receives the parsed fields
+     * @throws TikaException if data is null or too short, the window size is 0
+     *             or 1, the reset interval is 0, or the signature is not LZXC
+     */
+    // @Override
+    public void parse(byte[] data, ChmLzxcControlData chmLzxcControlData) throws TikaException {
+        if (data == null || (data.length < ChmConstants.CHM_LZXC_MIN_LEN))
+            throw new ChmParsingException("we want at least 0x18 bytes");
+        final ChmLzxcControlData cd = chmLzxcControlData;
+        cd.setDataRemained(data.length);
+        /* rewind the cursor so that an instance that was parsed before can be parsed again */
+        cd.setCurrentPlace(0);
+        /* every read goes through cd, the same instance whose cursor was just initialised */
+        cd.setSize(cd.unmarshalUInt32(data, cd.getSize()));
+        cd.unmarshalCharArray(data, cd, ChmConstants.CHM_SIGNATURE_LEN);
+        cd.setVersion(cd.unmarshalUInt32(data, cd.getVersion()));
+        cd.setResetInterval(cd.unmarshalUInt32(data, cd.getResetInterval()));
+        cd.setWindowSize(cd.unmarshalUInt32(data, cd.getWindowSize()));
+        cd.setWindowsPerReset(cd.unmarshalUInt32(data, cd.getWindowsPerReset()));
+        if (data.length >= ChmConstants.CHM_LZXC_V2_LEN)
+            cd.setUnknown_18(cd.unmarshalUInt32(data, cd.getUnknown_18()));
+        else
+            cd.setUnknown_18(0);
+        /* for version 2 the stored window size is scaled by CHM_WINDOW_SIZE_BLOCK */
+        if (cd.getVersion() == 2)
+            cd.setWindowSize(cd.getWindowSize() * ChmConstants.CHM_WINDOW_SIZE_BLOCK);
+        if (cd.getWindowSize() == 0 || cd.getResetInterval() == 0)
+            throw new ChmParsingException(
+                    "window size / resetInterval should be more than zero");
+        if (cd.getWindowSize() == 1)
+            throw new ChmParsingException(
+                    "window size / resetInterval should be more than 1");
+        /* checks a signature */
+        if (!new String(cd.getSignature(), UTF_8).equals(ChmConstants.LZXC))
+            throw new ChmParsingException(
+                    "the signature does not seem to be correct");
+    }
+
+    /**
+     * @param args
+     */
+    public static void main(String[] args) {
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcResetTable.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcResetTable.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcResetTable.java
index d6b5328..5823f67 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcResetTable.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcResetTable.java
@@ -1,341 +1,341 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.accessor;
-
-import java.math.BigInteger;
-import java.util.Arrays;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.assertion.ChmAssert;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.apache.tika.parser.chm.exception.ChmParsingException;
-
-/**
- * LZXC reset table For ensuring a decompression. Reads the block named
- * "::DataSpace/Storage/<SectionName>/Transform/{7FC28940-9D31-11D0-9B27-00A0C91E9C7C}/InstanceData/ResetTable"
- * .
- * 
- * {@link http
- * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original
- * /?page=2 }
- * 
- */
-public class ChmLzxcResetTable implements ChmAccessor<ChmLzxcResetTable> {
-    private static final long serialVersionUID = -8209574429411707460L;
-    /* class members */
-    private long version; // 0000: DWORD 2 unknown (possibly a version number)
-    private long block_count; // 0004: DWORD Number of entries in reset table
-    private long unknown; // 0008: DWORD 8 unknown
-    private long table_offset; // 000C: DWORD $28 Length of table header (area
-                               // before table entries)
-    private long uncompressed_len; // 0010: QWORD Uncompressed Length
-    private long compressed_len; // 0018: QWORD Compressed Length
-    private long block_len; // 0020: QWORD 0x8000 block size for locations below
-    private long[] block_address;
-
-    /* local usage */
-    private int dataRemained;
-    private int currentPlace = 0;
-
-    private int getDataRemained() {
-        return dataRemained;
-    }
-
-    private void setDataRemained(int dataRemained) {
-        this.dataRemained = dataRemained;
-    }
-
-    /**
-     * Returns block addresses
-     * 
-     * @return block addresses
-     */
-    public long[] getBlockAddress() {
-        return block_address;
-    }
-
-    /**
-     * Sets block addresses
-     * 
-     * @param block_address
-     */
-    public void setBlockAddress(long[] block_address) {
-        this.block_address = block_address;
-    }
-
-    private int getCurrentPlace() {
-        return currentPlace;
-    }
-
-    private void setCurrentPlace(int currentPlace) {
-        this.currentPlace = currentPlace;
-    }
-
-    @Override
-    public String toString() {
-        StringBuilder sb = new StringBuilder();
-        sb.append("version:=" + getVersion()
-                + System.getProperty("line.separator"));
-        sb.append("block_count:=" + getBlockCount()
-                + System.getProperty("line.separator"));
-        sb.append("unknown:=" + getUnknown()
-                + System.getProperty("line.separator"));
-        sb.append("table_offset:=" + getTableOffset()
-                + System.getProperty("line.separator"));
-        sb.append("uncompressed_len:=" + getUncompressedLen()
-                + System.getProperty("line.separator"));
-        sb.append("compressed_len:=" + getCompressedLen()
-                + System.getProperty("line.separator"));
-        sb.append("block_len:=" + getBlockLen()
-                + System.getProperty("line.separator"));
-        sb.append("block_addresses:=" + Arrays.toString(getBlockAddress()));
-        return sb.toString();
-    }
-
-    /**
-     * Enumerates chm block addresses
-     * 
-     * @param data
-     * 
-     * @return byte[] of addresses
-     * @throws TikaException 
-     */
-    private long[] enumerateBlockAddresses(byte[] data) throws TikaException {
-        ChmAssert.assertByteArrayNotNull(data);
-        /* we have limit of number of blocks to be extracted */
-        if (getBlockCount() > 5000)
-            setBlockCount(5000);
-
-        if (getBlockCount() < 0 && (getDataRemained() / 8) > 0)
-            setBlockCount(getDataRemained() / 8);
-
-        long[] addresses = new long[(int) getBlockCount()];
-        int rem = getDataRemained() / 8;
-        for (int i = 0; i < rem; i++) {
-            long num = -1;
-
-            try {
-                addresses[i] = unmarshalUint64(data, num);
-            } catch (Exception e) {
-                throw new TikaException(e.getMessage());
-            }
-        }
-        return addresses;
-    }
-
-    /**
-     * Validates parameters such as byte[] and chm lzxc reset table
-     * 
-     * @param data
-     * @param chmLzxcResetTable
-     * 
-     * @return boolean
-     * @throws TikaException 
-     */
-    private boolean validateParamaters(byte[] data,
-            ChmLzxcResetTable chmLzxcResetTable) throws TikaException {
-        int goodParameter = 0;
-        ChmAssert.assertByteArrayNotNull(data);
-        ++goodParameter;
-        ChmAssert.assertChmAccessorNotNull(chmLzxcResetTable);
-        ++goodParameter;
-        return (goodParameter == 2);
-    }
-
-    private long unmarshalUInt32(byte[] data, long dest) throws TikaException {
-        ChmAssert.assertByteArrayNotNull(data);
-        dest = (data[this.getCurrentPlace()] & 0xff)
-                | (data[this.getCurrentPlace() + 1] & 0xff) << 8
-                | (data[this.getCurrentPlace() + 2] & 0xff) << 16
-                | (data[this.getCurrentPlace() + 3] & 0xff) << 24;
-
-        setDataRemained(this.getDataRemained() - 4);
-        this.setCurrentPlace(this.getCurrentPlace() + 4);
-        return dest;
-    }
-
-    private long unmarshalUint64(byte[] data, long dest) throws TikaException {
-        ChmAssert.assertByteArrayNotNull(data);
-        byte[] temp = new byte[8];
-        int i, j;// counters
-
-        for (i = 8, j = 7; i > 0; i--) {
-            if (data.length > this.getCurrentPlace()) {
-                temp[j--] = data[this.getCurrentPlace()];
-                this.setCurrentPlace(this.getCurrentPlace() + 1);
-            } else
-                throw new TikaException("data is too small to calculate address block");
-        }
-        dest = new BigInteger(temp).longValue();
-        this.setDataRemained(this.getDataRemained() - 8);
-        return dest;
-    }
-
-    /**
-     * Returns the version
-     * 
-     * @return - long
-     */
-    public long getVersion() {
-        return version;
-    }
-
-    /**
-     * Sets the version
-     * 
-     * @param version
-     *            - long
-     */
-    public void setVersion(long version) {
-        this.version = version;
-    }
-
-    /**
-     * Gets a block count
-     * 
-     * @return - int
-     */
-    public long getBlockCount() {
-        return block_count;
-    }
-
-    /**
-     * Sets a block count
-     * 
-     * @param block_count
-     *            - long
-     */
-    public void setBlockCount(long block_count) {
-        this.block_count = block_count;
-    }
-
-    /**
-     * Gets unknown
-     * 
-     * @return - long
-     */
-    public long getUnknown() {
-        return unknown;
-    }
-
-    /**
-     * Sets an unknown
-     * 
-     * @param unknown
-     *            - long
-     */
-    public void setUnknown(long unknown) {
-        this.unknown = unknown;
-    }
-
-    /**
-     * Gets a table offset
-     * 
-     * @return - long
-     */
-    public long getTableOffset() {
-        return table_offset;
-    }
-
-    /**
-     * Sets a table offset
-     * 
-     * @param table_offset
-     *            - long
-     */
-    public void setTableOffset(long table_offset) {
-        this.table_offset = table_offset;
-    }
-
-    /**
-     * Gets uncompressed length
-     * 
-     * @return - {@link BigInteger }
-     */
-    public long getUncompressedLen() {
-        return uncompressed_len;
-    }
-
-    /**
-     * Sets uncompressed length
-     * 
-     * @param uncompressed_len
-     *            - {@link BigInteger}
-     */
-    public void setUncompressedLen(long uncompressed_len) {
-        this.uncompressed_len = uncompressed_len;
-    }
-
-    /**
-     * Gets compressed length
-     * 
-     * @return - {@link BigInteger}
-     */
-    public long getCompressedLen() {
-        return compressed_len;
-    }
-
-    /**
-     * Sets compressed length
-     * 
-     * @param compressed_len
-     *            - {@link BigInteger}
-     */
-    public void setCompressedLen(long compressed_len) {
-        this.compressed_len = compressed_len;
-    }
-
-    /**
-     * Gets a block length
-     * 
-     * @return - {@link BigInteger}
-     */
-    public long getBlockLen() {
-        return block_len;
-    }
-
-    /**
-     * Sets a block length
-     * 
-     * @param block_len
-     *            - {@link BigInteger}
-     */
-    public void setBlockLlen(long block_len) {
-        this.block_len = block_len;
-    }
-
-    // @Override
-    public void parse(byte[] data, ChmLzxcResetTable chmLzxcResetTable) throws TikaException {
-        setDataRemained(data.length);
-        if (validateParamaters(data, chmLzxcResetTable)) {
-            /* unmarshal fields */
-            chmLzxcResetTable.setVersion(unmarshalUInt32(data, chmLzxcResetTable.getVersion()));
-            chmLzxcResetTable.setBlockCount(unmarshalUInt32(data, chmLzxcResetTable.getBlockCount()));
-            chmLzxcResetTable.setUnknown(unmarshalUInt32(data, chmLzxcResetTable.getUnknown()));
-            chmLzxcResetTable.setTableOffset(unmarshalUInt32(data, chmLzxcResetTable.getTableOffset()));
-            chmLzxcResetTable.setUncompressedLen(unmarshalUint64(data, chmLzxcResetTable.getUncompressedLen()));
-            chmLzxcResetTable.setCompressedLen(unmarshalUint64(data, chmLzxcResetTable.getCompressedLen()));
-            chmLzxcResetTable.setBlockLlen(unmarshalUint64(data, chmLzxcResetTable.getBlockLen()));
-            chmLzxcResetTable.setBlockAddress(enumerateBlockAddresses(data));
-        }
-
-        /* checks chmLzxcResetTable */
-        if (chmLzxcResetTable.getVersion() != ChmConstants.CHM_VER_2)
-            throw new ChmParsingException(
-                    "does not seem currect version of chmLzxcResetTable");
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.math.BigInteger;
+import java.util.Arrays;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * LZXC reset table, used to support decompression. Reads the block named
+ * "::DataSpace/Storage/&lt;SectionName&gt;/Transform/{7FC28940-9D31-11D0-9B27-00A0C91E9C7C}/InstanceData/ResetTable".
+ * <p>
+ * {@link #parse(byte[], ChmLzxcResetTable)} unmarshals four little-endian
+ * DWORD fields followed by three little-endian QWORD fields, and then reads
+ * the remaining bytes as QWORD block addresses.
+ * <p>
+ * Parsing keeps a read cursor in unsynchronized instance fields, so a single
+ * instance must not be parsed concurrently.
+ *
+ * @see <a href="http://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original/?page=2">
+ *      Microsoft's HTML Help (.chm) format (incomplete)</a>
+ */
+public class ChmLzxcResetTable implements ChmAccessor<ChmLzxcResetTable> {
+    private static final long serialVersionUID = -8209574429411707460L;
+
+    /** Upper limit on the number of block addresses that will be extracted. */
+    private static final int MAX_BLOCK_COUNT = 5000;
+    /** Size in bytes of a DWORD field. */
+    private static final int DWORD_SIZE = 4;
+    /** Size in bytes of a QWORD field. */
+    private static final int QWORD_SIZE = 8;
+
+    /* class members */
+    private long version; // 0000: DWORD 2 unknown (possibly a version number)
+    private long block_count; // 0004: DWORD Number of entries in reset table
+    private long unknown; // 0008: DWORD 8 unknown
+    private long table_offset; // 000C: DWORD $28 Length of table header (area
+                               // before table entries)
+    private long uncompressed_len; // 0010: QWORD Uncompressed Length
+    private long compressed_len; // 0018: QWORD Compressed Length
+    private long block_len; // 0020: QWORD 0x8000 block size for locations below
+    private long[] block_address;
+
+    /* local usage */
+    private int dataRemained; // bytes of the parsed buffer not yet consumed
+    private int currentPlace = 0; // read cursor into the parsed buffer
+
+    private int getDataRemained() {
+        return dataRemained;
+    }
+
+    private void setDataRemained(int dataRemained) {
+        this.dataRemained = dataRemained;
+    }
+
+    /**
+     * Returns block addresses
+     *
+     * @return block addresses; the stored array itself, not a copy
+     */
+    public long[] getBlockAddress() {
+        return block_address;
+    }
+
+    /**
+     * Sets block addresses
+     *
+     * @param block_address
+     *            the array to store; it is kept by reference, not copied
+     */
+    public void setBlockAddress(long[] block_address) {
+        this.block_address = block_address;
+    }
+
+    private int getCurrentPlace() {
+        return currentPlace;
+    }
+
+    private void setCurrentPlace(int currentPlace) {
+        this.currentPlace = currentPlace;
+    }
+
+    /**
+     * Renders every field as a "name:=value" line, followed by the block
+     * addresses.
+     *
+     * @return multi-line description of this reset table
+     */
+    @Override
+    public String toString() {
+        String nl = System.getProperty("line.separator");
+        StringBuilder sb = new StringBuilder();
+        sb.append("version:=").append(getVersion()).append(nl);
+        sb.append("block_count:=").append(getBlockCount()).append(nl);
+        sb.append("unknown:=").append(getUnknown()).append(nl);
+        sb.append("table_offset:=").append(getTableOffset()).append(nl);
+        sb.append("uncompressed_len:=").append(getUncompressedLen()).append(nl);
+        sb.append("compressed_len:=").append(getCompressedLen()).append(nl);
+        sb.append("block_len:=").append(getBlockLen()).append(nl);
+        sb.append("block_addresses:=").append(Arrays.toString(getBlockAddress()));
+        return sb.toString();
+    }
+
+    /**
+     * Enumerates chm block addresses, reading them as consecutive QWORD
+     * values starting at the current read position.
+     * <p>
+     * The block count is capped at {@link #MAX_BLOCK_COUNT}. A negative block
+     * count is replaced by the number of complete QWORD entries still
+     * available in the buffer. At most as many addresses are read as fit in
+     * the returned array, so surplus trailing data is ignored; if fewer
+     * entries are available than the block count, the remaining slots stay 0.
+     *
+     * @param data
+     *            buffer being parsed
+     *
+     * @return long[] of addresses, sized by the (possibly adjusted) block count
+     * @throws TikaException
+     *             if the byte array assertion fails or an address cannot be
+     *             read
+     */
+    private long[] enumerateBlockAddresses(byte[] data) throws TikaException {
+        ChmAssert.assertByteArrayNotNull(data);
+        /* number of complete QWORD entries left in the buffer */
+        int available = Math.max(getDataRemained() / QWORD_SIZE, 0);
+
+        /* we have limit of number of blocks to be extracted */
+        if (getBlockCount() > MAX_BLOCK_COUNT) {
+            setBlockCount(MAX_BLOCK_COUNT);
+        }
+
+        /* a negative count cannot size an array: fall back to what is available */
+        if (getBlockCount() < 0) {
+            setBlockCount(available);
+        }
+
+        long[] addresses = new long[(int) getBlockCount()];
+        /* never index past the array, even when more data than blocks remains */
+        int toRead = Math.min(available, addresses.length);
+        for (int i = 0; i < toRead; i++) {
+            addresses[i] = unmarshalUint64(data, -1);
+        }
+        return addresses;
+    }
+
+    /**
+     * Validates parameters such as byte[] and chm lzxc reset table by
+     * delegating to the {@link ChmAssert} checks.
+     *
+     * @param data
+     *            buffer to check
+     * @param chmLzxcResetTable
+     *            accessor to check
+     *
+     * @return {@code true} whenever both assertions return normally
+     * @throws TikaException
+     *             if one of the assertions fails
+     */
+    private boolean validateParamaters(byte[] data,
+            ChmLzxcResetTable chmLzxcResetTable) throws TikaException {
+        ChmAssert.assertByteArrayNotNull(data);
+        ChmAssert.assertChmAccessorNotNull(chmLzxcResetTable);
+        return true;
+    }
+
+    /**
+     * Reads a little-endian DWORD at the current read position and advances
+     * the position by four bytes.
+     * <p>
+     * NOTE(review): the four bytes are combined with int arithmetic, so a
+     * value with the top bit set comes back negative rather than as an
+     * unsigned quantity. {@link #enumerateBlockAddresses(byte[])} has a
+     * fallback for a negative block count, so this is left unchanged here.
+     *
+     * @param data
+     *            buffer being parsed
+     * @param dest
+     *            ignored on input; only used as a local for the result
+     * @return the value read
+     * @throws TikaException
+     *             if fewer than four bytes are left in {@code data}
+     */
+    private long unmarshalUInt32(byte[] data, long dest) throws TikaException {
+        ChmAssert.assertByteArrayNotNull(data);
+        int place = this.getCurrentPlace();
+        /* report truncated input as a TikaException, like unmarshalUint64 does */
+        if (place < 0 || data.length - place < DWORD_SIZE) {
+            throw new TikaException("data is too small to read a 32-bit field");
+        }
+        dest = (data[place] & 0xff)
+                | (data[place + 1] & 0xff) << 8
+                | (data[place + 2] & 0xff) << 16
+                | (data[place + 3] & 0xff) << 24;
+
+        setDataRemained(this.getDataRemained() - DWORD_SIZE);
+        this.setCurrentPlace(place + DWORD_SIZE);
+        return dest;
+    }
+
+    /**
+     * Reads a little-endian QWORD at the current read position and advances
+     * the position by eight bytes. Nothing is consumed when the buffer is too
+     * short.
+     *
+     * @param data
+     *            buffer being parsed
+     * @param dest
+     *            ignored on input; only used as a local for the result
+     * @return the value read
+     * @throws TikaException
+     *             if fewer than eight bytes are left in {@code data}
+     */
+    private long unmarshalUint64(byte[] data, long dest) throws TikaException {
+        ChmAssert.assertByteArrayNotNull(data);
+        int place = this.getCurrentPlace();
+        if (place < 0 || data.length - place < QWORD_SIZE) {
+            throw new TikaException("data is too small to calculate address block");
+        }
+        /* reverse into big-endian order, which is what BigInteger expects */
+        byte[] temp = new byte[QWORD_SIZE];
+        for (int i = 0; i < QWORD_SIZE; i++) {
+            temp[QWORD_SIZE - 1 - i] = data[place + i];
+        }
+        dest = new BigInteger(temp).longValue();
+        this.setCurrentPlace(place + QWORD_SIZE);
+        this.setDataRemained(this.getDataRemained() - QWORD_SIZE);
+        return dest;
+    }
+
+    /**
+     * Returns the version
+     *
+     * @return - long
+     */
+    public long getVersion() {
+        return version;
+    }
+
+    /**
+     * Sets the version
+     *
+     * @param version
+     *            - long
+     */
+    public void setVersion(long version) {
+        this.version = version;
+    }
+
+    /**
+     * Gets a block count
+     *
+     * @return - long
+     */
+    public long getBlockCount() {
+        return block_count;
+    }
+
+    /**
+     * Sets a block count
+     *
+     * @param block_count
+     *            - long
+     */
+    public void setBlockCount(long block_count) {
+        this.block_count = block_count;
+    }
+
+    /**
+     * Gets unknown
+     *
+     * @return - long
+     */
+    public long getUnknown() {
+        return unknown;
+    }
+
+    /**
+     * Sets an unknown
+     *
+     * @param unknown
+     *            - long
+     */
+    public void setUnknown(long unknown) {
+        this.unknown = unknown;
+    }
+
+    /**
+     * Gets a table offset
+     *
+     * @return - long
+     */
+    public long getTableOffset() {
+        return table_offset;
+    }
+
+    /**
+     * Sets a table offset
+     *
+     * @param table_offset
+     *            - long
+     */
+    public void setTableOffset(long table_offset) {
+        this.table_offset = table_offset;
+    }
+
+    /**
+     * Gets uncompressed length
+     *
+     * @return - long
+     */
+    public long getUncompressedLen() {
+        return uncompressed_len;
+    }
+
+    /**
+     * Sets uncompressed length
+     *
+     * @param uncompressed_len
+     *            - long
+     */
+    public void setUncompressedLen(long uncompressed_len) {
+        this.uncompressed_len = uncompressed_len;
+    }
+
+    /**
+     * Gets compressed length
+     *
+     * @return - long
+     */
+    public long getCompressedLen() {
+        return compressed_len;
+    }
+
+    /**
+     * Sets compressed length
+     *
+     * @param compressed_len
+     *            - long
+     */
+    public void setCompressedLen(long compressed_len) {
+        this.compressed_len = compressed_len;
+    }
+
+    /**
+     * Gets a block length
+     *
+     * @return - long
+     */
+    public long getBlockLen() {
+        return block_len;
+    }
+
+    /**
+     * Sets a block length
+     *
+     * @param block_len
+     *            - long
+     */
+    public void setBlockLlen(long block_len) {
+        this.block_len = block_len;
+    }
+
+    /**
+     * Parses a reset table out of {@code data} and stores the fields in
+     * {@code chmLzxcResetTable}.
+     * <p>
+     * The read cursor and the block count used while enumerating addresses
+     * belong to this instance, so the visible code only works as intended
+     * when {@code chmLzxcResetTable} is this same object.
+     *
+     * @param data
+     *            raw bytes of the reset table
+     * @param chmLzxcResetTable
+     *            accessor that receives the parsed fields
+     * @throws TikaException
+     *             if a parameter assertion fails, the data is too short, or
+     *             the parsed version is not {@link ChmConstants#CHM_VER_2}
+     */
+    // @Override
+    public void parse(byte[] data, ChmLzxcResetTable chmLzxcResetTable) throws TikaException {
+        /* validate before touching data.length, so bad input is reported by the assertions */
+        if (validateParamaters(data, chmLzxcResetTable)) {
+            /* start from the beginning of the buffer, even if this instance parsed before */
+            setDataRemained(data.length);
+            setCurrentPlace(0);
+
+            /* unmarshal fields */
+            chmLzxcResetTable.setVersion(unmarshalUInt32(data, chmLzxcResetTable.getVersion()));
+            chmLzxcResetTable.setBlockCount(unmarshalUInt32(data, chmLzxcResetTable.getBlockCount()));
+            chmLzxcResetTable.setUnknown(unmarshalUInt32(data, chmLzxcResetTable.getUnknown()));
+            chmLzxcResetTable.setTableOffset(unmarshalUInt32(data, chmLzxcResetTable.getTableOffset()));
+            chmLzxcResetTable.setUncompressedLen(unmarshalUint64(data, chmLzxcResetTable.getUncompressedLen()));
+            chmLzxcResetTable.setCompressedLen(unmarshalUint64(data, chmLzxcResetTable.getCompressedLen()));
+            chmLzxcResetTable.setBlockLlen(unmarshalUint64(data, chmLzxcResetTable.getBlockLen()));
+            chmLzxcResetTable.setBlockAddress(enumerateBlockAddresses(data));
+        }
+
+        /* checks chmLzxcResetTable */
+        if (chmLzxcResetTable.getVersion() != ChmConstants.CHM_VER_2)
+            throw new ChmParsingException(
+                    "does not seem currect version of chmLzxcResetTable");
+    }
+}

[04/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
index f43fdc0..4d5cc46 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
@@ -1,347 +1,347 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import java.io.Writer;
-import java.util.ArrayList;
-import java.util.BitSet;
-import java.util.List;
-import java.util.Locale;
-
-import de.l3s.boilerpipe.BoilerpipeExtractor;
-import de.l3s.boilerpipe.BoilerpipeProcessingException;
-import de.l3s.boilerpipe.document.TextBlock;
-import de.l3s.boilerpipe.document.TextDocument;
-import de.l3s.boilerpipe.extractors.ArticleExtractor;
-import de.l3s.boilerpipe.extractors.DefaultExtractor;
-import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.WriteOutContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-/**
- * Uses the <a href="http://code.google.com/p/boilerpipe/">boilerpipe</a>
- * library to automatically extract the main content from a web page.
- * <p/>
- * Use this as a {@link ContentHandler} object passed to
- * {@link HtmlParser#parse(java.io.InputStream, ContentHandler, Metadata, org.apache.tika.parser.ParseContext)}
- */
-public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler {
-
-    /**
-     * The newline character that gets inserted after block elements.
-     */
-    private static final char[] NL = new char[]{'\n'};
-    private ContentHandler delegate;
-    private BoilerpipeExtractor extractor;
-    private boolean includeMarkup;
-    private boolean inHeader;
-    private boolean inFooter;
-    private int headerCharOffset;
-    private List<RecordedElement> elements;
-    private TextDocument td;
-    /**
-     * Creates a new boilerpipe-based content extractor, using the
-     * {@link DefaultExtractor} extraction rules and "delegate" as the content handler.
-     *
-     * @param delegate The {@link ContentHandler} object
-     */
-    public BoilerpipeContentHandler(ContentHandler delegate) {
-        this(delegate, DefaultExtractor.INSTANCE);
-    }
-
-    /**
-     * Creates a content handler that writes XHTML body character events to
-     * the given writer.
-     *
-     * @param writer writer
-     */
-    public BoilerpipeContentHandler(Writer writer) {
-        this(new WriteOutContentHandler(writer));
-    }
-
-    /**
-     * Creates a new boilerpipe-based content extractor, using the given
-     * extraction rules. The extracted main content will be passed to the
-     * <delegate> content handler.
-     *
-     * @param delegate  The {@link ContentHandler} object
-     * @param extractor Extraction rules to use, e.g. {@link ArticleExtractor}
-     */
-    public BoilerpipeContentHandler(ContentHandler delegate, BoilerpipeExtractor extractor) {
-        this.td = null;
-        this.delegate = delegate;
-        this.extractor = extractor;
-    }
-
-    public boolean isIncludeMarkup() {
-        return includeMarkup;
-    }
-
-    public void setIncludeMarkup(boolean includeMarkup) {
-        this.includeMarkup = includeMarkup;
-    }
-
-    /**
-     * Retrieves the built TextDocument
-     *
-     * @return TextDocument
-     */
-    public TextDocument getTextDocument() {
-        return td;
-    }
-
-    @Override
-    public void startDocument() throws SAXException {
-        super.startDocument();
-
-        delegate.startDocument();
-
-        inHeader = true;
-        inFooter = false;
-        headerCharOffset = 0;
-
-        if (includeMarkup) {
-            elements = new ArrayList<RecordedElement>();
-        }
-    }
-
-    @Override
-    public void startPrefixMapping(String prefix, String uri) throws SAXException {
-        super.startPrefixMapping(prefix, uri);
-        delegate.startPrefixMapping(prefix, uri);
-    }
-
-    ;
-
-    @Override
-    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
-        super.startElement(uri, localName, qName, atts);
-
-        if (inHeader) {
-            delegate.startElement(uri, localName, qName, atts);
-        } else if (inFooter) {
-            // Do nothing
-        } else if (includeMarkup) {
-            elements.add(new RecordedElement(uri, localName, qName, atts));
-        } else {
-            // This happens for the <body> element, if we're not doing markup.
-            delegate.startElement(uri, localName, qName, atts);
-        }
-    }
-
-    ;
-
-    @Override
-    public void characters(char[] chars, int offset, int length) throws SAXException {
-        super.characters(chars, offset, length);
-
-        if (inHeader) {
-            delegate.characters(chars, offset, length);
-            headerCharOffset++;
-        } else if (inFooter) {
-            // Do nothing
-        } else if (includeMarkup) {
-            RecordedElement element = elements.get(elements.size() - 1);
-
-            char[] characters = new char[length];
-            System.arraycopy(chars, offset, characters, 0, length);
-            element.getCharacters().add(characters);
-        }
-    }
-
-    ;
-
-    @Override
-    public void endElement(String uri, String localName, String qName) throws SAXException {
-        super.endElement(uri, localName, qName);
-
-        if (inHeader) {
-            delegate.endElement(uri, localName, qName);
-            inHeader = !localName.equals("head");
-        } else if (inFooter) {
-            // Do nothing
-        } else if (localName.equals("body")) {
-            inFooter = true;
-        } else if (includeMarkup) {
-            // Add the end element, and the continuation from the previous element
-            elements.add(new RecordedElement(uri, localName, qName));
-            elements.add(new RecordedElement());
-        }
-    }
-
-    ;
-
-    @Override
-    public void endDocument() throws SAXException {
-        super.endDocument();
-
-        td = toTextDocument();
-        try {
-            extractor.process(td);
-        } catch (BoilerpipeProcessingException e) {
-            throw new SAXException(e);
-        }
-
-        Attributes emptyAttrs = new AttributesImpl();
-
-        // At this point we have all the information we need to either emit N paragraphs
-        // of plain text (if not including markup), or we have to replay our recorded elements
-        // and only emit character runs that passed the boilerpipe filters.
-        if (includeMarkup) {
-            BitSet validCharacterRuns = new BitSet();
-            for (TextBlock block : td.getTextBlocks()) {
-                if (block.isContent()) {
-                    BitSet bs = block.getContainedTextElements();
-                    if (bs != null) {
-                        validCharacterRuns.or(bs);
-                    }
-                }
-            }
-
-            // Now have bits set for all valid character runs. Replay our recorded elements,
-            // but only emit character runs flagged as valid.
-            int curCharsIndex = headerCharOffset;
-
-            for (RecordedElement element : elements) {
-                switch (element.getElementType()) {
-                    case START:
-                        delegate.startElement(element.getUri(), element.getLocalName(), element.getQName(), element.getAttrs());
-                        // Fall through
-
-                    case CONTINUE:
-                        // Now emit characters that are valid. Note that boilerpipe pre-increments the character index, so
-                        // we have to follow suit.
-                        for (char[] chars : element.getCharacters()) {
-                            curCharsIndex++;
-
-                            if (validCharacterRuns.get(curCharsIndex)) {
-                                delegate.characters(chars, 0, chars.length);
-
-                                // https://issues.apache.org/jira/browse/TIKA-961
-                                if (!Character.isWhitespace(chars[chars.length - 1])) {
-                                    // Only add whitespace for certain elements
-                                    if (XHTMLContentHandler.ENDLINE.contains(element.getLocalName())) {
-                                        delegate.ignorableWhitespace(NL, 0, NL.length);
-                                    }
-                                }
-                            }
-                        }
-                        break;
-
-                    case END:
-                        delegate.endElement(element.getUri(), element.getLocalName(), element.getQName());
-                        break;
-
-                    default:
-                        throw new RuntimeException("Unhandled element type: " + element.getElementType());
-                }
-
-
-            }
-        } else {
-            for (TextBlock block : td.getTextBlocks()) {
-                if (block.isContent()) {
-                    delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", emptyAttrs);
-                    char[] chars = block.getText().toCharArray();
-                    delegate.characters(chars, 0, chars.length);
-                    delegate.endElement(XHTMLContentHandler.XHTML, "p", "p");
-                    delegate.ignorableWhitespace(NL, 0, NL.length);
-                }
-            }
-        }
-
-        delegate.endElement(XHTMLContentHandler.XHTML, "body", "body");
-        delegate.endElement(XHTMLContentHandler.XHTML, "html", "html");
-
-        // We defer ending any prefix mapping until here, which is why we don't pass this
-        // through to the delegate in an overridden method.
-        delegate.endPrefixMapping("");
-
-        delegate.endDocument();
-    }
-
-    ;
-
-    private static class RecordedElement {
-        private String uri;
-        private String localName;
-        private String qName;
-        private Attributes attrs;
-        private List<char[]> characters;
-        private ElementType elementType;
-        public RecordedElement(String uri, String localName, String qName, Attributes attrs) {
-            this(uri, localName, qName, attrs, ElementType.START);
-        }
-
-        public RecordedElement(String uri, String localName, String qName) {
-            this(uri, localName, qName, null, ElementType.END);
-        }
-
-        public RecordedElement() {
-            this(null, null, null, null, ElementType.CONTINUE);
-        }
-
-        protected RecordedElement(String uri, String localName, String qName, Attributes attrs, RecordedElement.ElementType elementType) {
-            this.uri = uri;
-            this.localName = localName;
-            this.qName = qName;
-            this.attrs = attrs;
-            this.elementType = elementType;
-            this.characters = new ArrayList<char[]>();
-        }
-
-        @Override
-        public String toString() {
-            return String.format(Locale.ROOT, "<%s> of type %s", localName, elementType);
-        }
-
-        public String getUri() {
-            return uri;
-        }
-
-        public String getLocalName() {
-            return localName;
-        }
-
-        public String getQName() {
-            return qName;
-        }
-
-        public Attributes getAttrs() {
-            return attrs;
-        }
-
-        public List<char[]> getCharacters() {
-            return characters;
-        }
-
-        public RecordedElement.ElementType getElementType() {
-            return elementType;
-        }
-
-        public enum ElementType {
-            START,
-            END,
-            CONTINUE
-        }
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.List;
+import java.util.Locale;
+
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.extractors.ArticleExtractor;
+import de.l3s.boilerpipe.extractors.DefaultExtractor;
+import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Uses the <a href="http://code.google.com/p/boilerpipe/">boilerpipe</a>
+ * library to automatically extract the main content from a web page.
+ * <p>
+ * Use this as a {@link ContentHandler} object passed to
+ * {@link HtmlParser#parse(java.io.InputStream, ContentHandler, Metadata, org.apache.tika.parser.ParseContext)}
+ */
+public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler {
+
+    /**
+     * The newline character that gets inserted after block elements.
+     */
+    private static final char[] NL = new char[]{'\n'};
+    private final ContentHandler delegate;
+    private final BoilerpipeExtractor extractor;
+    private boolean includeMarkup;
+    /** Value of includeMarkup latched by startDocument() for the document being processed. */
+    private boolean recordMarkup;
+    private boolean inHeader;
+    private boolean inFooter;
+    private int headerCharOffset;
+    private List<RecordedElement> elements;
+    private TextDocument td;
+
+    /**
+     * Creates a new boilerpipe-based content extractor, using the
+     * {@link DefaultExtractor} extraction rules and "delegate" as the content handler.
+     *
+     * @param delegate The {@link ContentHandler} object
+     */
+    public BoilerpipeContentHandler(ContentHandler delegate) {
+        this(delegate, DefaultExtractor.INSTANCE);
+    }
+
+    /**
+     * Creates a content handler that writes XHTML body character events to
+     * the given writer.
+     *
+     * @param writer writer
+     */
+    public BoilerpipeContentHandler(Writer writer) {
+        this(new WriteOutContentHandler(writer));
+    }
+
+    /**
+     * Creates a new boilerpipe-based content extractor, using the given
+     * extraction rules. The extracted main content will be passed to the
+     * {@code delegate} content handler.
+     *
+     * @param delegate  The {@link ContentHandler} object
+     * @param extractor Extraction rules to use, e.g. {@link ArticleExtractor}
+     */
+    public BoilerpipeContentHandler(ContentHandler delegate, BoilerpipeExtractor extractor) {
+        this.td = null;
+        this.delegate = delegate;
+        this.extractor = extractor;
+    }
+
+    public boolean isIncludeMarkup() {
+        return includeMarkup;
+    }
+
+    /**
+     * Sets whether recorded markup is replayed around the extracted text. Read by
+     * {@link #startDocument()}; a change made mid-document applies to the next document.
+     */
+    public void setIncludeMarkup(boolean includeMarkup) {
+        this.includeMarkup = includeMarkup;
+    }
+
+    /**
+     * Retrieves the built TextDocument
+     *
+     * @return TextDocument, or null before the first {@link #endDocument()}
+     */
+    public TextDocument getTextDocument() {
+        return td;
+    }
+
+    @Override
+    public void startDocument() throws SAXException {
+        super.startDocument();
+
+        delegate.startDocument();
+
+        inHeader = true;
+        inFooter = false;
+        headerCharOffset = 0;
+
+        // Latch the mode: every callback of this document must agree on whether "elements" exists.
+        recordMarkup = includeMarkup;
+        if (recordMarkup) {
+            elements = new ArrayList<RecordedElement>();
+        }
+    }
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri) throws SAXException {
+        super.startPrefixMapping(prefix, uri);
+        delegate.startPrefixMapping(prefix, uri);
+    }
+
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+        super.startElement(uri, localName, qName, atts);
+
+        if (inHeader) {
+            delegate.startElement(uri, localName, qName, atts);
+        } else if (inFooter) {
+            // Do nothing
+        } else if (recordMarkup) {
+            elements.add(new RecordedElement(uri, localName, qName, atts));
+        } else {
+            // This happens for the <body> element, if we're not doing markup.
+            delegate.startElement(uri, localName, qName, atts);
+        }
+    }
+
+    @Override
+    public void characters(char[] chars, int offset, int length) throws SAXException {
+        super.characters(chars, offset, length);
+
+        if (inHeader) {
+            delegate.characters(chars, offset, length);
+            headerCharOffset++;
+        } else if (inFooter) {
+            // Do nothing
+        } else if (recordMarkup) {
+            RecordedElement element = elements.get(elements.size() - 1);
+
+            // Keep a private copy of the run; it is replayed in endDocument().
+            char[] characters = new char[length];
+            System.arraycopy(chars, offset, characters, 0, length);
+            element.getCharacters().add(characters);
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+        super.endElement(uri, localName, qName);
+
+        if (inHeader) {
+            delegate.endElement(uri, localName, qName);
+            inHeader = !localName.equals("head");
+        } else if (inFooter) {
+            // Do nothing
+        } else if (localName.equals("body")) {
+            inFooter = true;
+        } else if (recordMarkup) {
+            // Add the end element, and the continuation from the previous element
+            elements.add(new RecordedElement(uri, localName, qName));
+            elements.add(new RecordedElement());
+        }
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+        super.endDocument();
+
+        td = toTextDocument();
+        try {
+            extractor.process(td);
+        } catch (BoilerpipeProcessingException e) {
+            throw new SAXException(e);
+        }
+
+        Attributes emptyAttrs = new AttributesImpl();
+
+        // At this point we have all the information we need to either emit N paragraphs
+        // of plain text (if not including markup), or we have to replay our recorded elements
+        // and only emit character runs that passed the boilerpipe filters.
+        if (recordMarkup) {
+            BitSet validCharacterRuns = new BitSet();
+            for (TextBlock block : td.getTextBlocks()) {
+                if (block.isContent()) {
+                    BitSet bs = block.getContainedTextElements();
+                    if (bs != null) {
+                        validCharacterRuns.or(bs);
+                    }
+                }
+            }
+
+            // Now have bits set for all valid character runs. Replay our recorded elements,
+            // but only emit character runs flagged as valid.
+            int curCharsIndex = headerCharOffset;
+
+            for (RecordedElement element : elements) {
+                switch (element.getElementType()) {
+                    case START:
+                        delegate.startElement(element.getUri(), element.getLocalName(), element.getQName(), element.getAttrs());
+                        // Fall through
+
+                    case CONTINUE:
+                        // Now emit characters that are valid. Note that boilerpipe pre-increments the character index, so
+                        // we have to follow suit.
+                        for (char[] chars : element.getCharacters()) {
+                            curCharsIndex++;
+
+                            if (validCharacterRuns.get(curCharsIndex)) {
+                                delegate.characters(chars, 0, chars.length);
+
+                                // https://issues.apache.org/jira/browse/TIKA-961 (an empty run has no last char to test)
+                                if (chars.length > 0 && !Character.isWhitespace(chars[chars.length - 1])) {
+                                    // Only add whitespace for certain elements
+                                    if (XHTMLContentHandler.ENDLINE.contains(element.getLocalName())) {
+                                        delegate.ignorableWhitespace(NL, 0, NL.length);
+                                    }
+                                }
+                            }
+                        }
+                        break;
+
+                    case END:
+                        delegate.endElement(element.getUri(), element.getLocalName(), element.getQName());
+                        break;
+
+                    default:
+                        throw new RuntimeException("Unhandled element type: " + element.getElementType());
+                }
+            }
+        } else {
+            for (TextBlock block : td.getTextBlocks()) {
+                if (block.isContent()) {
+                    delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", emptyAttrs);
+                    char[] chars = block.getText().toCharArray();
+                    delegate.characters(chars, 0, chars.length);
+                    delegate.endElement(XHTMLContentHandler.XHTML, "p", "p");
+                    delegate.ignorableWhitespace(NL, 0, NL.length);
+                }
+            }
+        }
+
+        delegate.endElement(XHTMLContentHandler.XHTML, "body", "body");
+        delegate.endElement(XHTMLContentHandler.XHTML, "html", "html");
+
+        // We defer ending any prefix mapping until here, which is why we don't pass this
+        // through to the delegate in an overridden method.
+        delegate.endPrefixMapping("");
+
+        delegate.endDocument();
+    }
+
+    /** One recorded SAX event: an element start, an element end, or the text continuation after an end. */
+    private static class RecordedElement {
+        private final String uri;
+        private final String localName;
+        private final String qName;
+        private final Attributes attrs;
+        private final List<char[]> characters;
+        private final ElementType elementType;
+        public RecordedElement(String uri, String localName, String qName, Attributes attrs) {
+            // Snapshot the attributes so the replay does not depend on the caller's object after the callback.
+            this(uri, localName, qName, attrs == null ? null : new AttributesImpl(attrs), ElementType.START);
+        }
+
+        public RecordedElement(String uri, String localName, String qName) {
+            this(uri, localName, qName, null, ElementType.END);
+        }
+
+        public RecordedElement() {
+            this(null, null, null, null, ElementType.CONTINUE);
+        }
+
+        protected RecordedElement(String uri, String localName, String qName, Attributes attrs, RecordedElement.ElementType elementType) {
+            this.uri = uri;
+            this.localName = localName;
+            this.qName = qName;
+            this.attrs = attrs;
+            this.elementType = elementType;
+            this.characters = new ArrayList<char[]>();
+        }
+
+        @Override
+        public String toString() {
+            return String.format(Locale.ROOT, "<%s> of type %s", localName, elementType);
+        }
+
+        public String getUri() {
+            return uri;
+        }
+
+        public String getLocalName() {
+            return localName;
+        }
+
+        public String getQName() {
+            return qName;
+        }
+
+        public Attributes getAttrs() {
+            return attrs;
+        }
+
+        public List<char[]> getCharacters() {
+            return characters;
+        }
+
+        public RecordedElement.ElementType getElementType() {
+            return elementType;
+        }
+
+        public enum ElementType {
+            START,
+            END,
+            CONTINUE
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
index 0cef05f..4217ac5 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
@@ -1,137 +1,137 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-
-/**
- * The default HTML mapping rules in Tika.
- *
- * @since Apache Tika 0.6
- */
-@SuppressWarnings("serial")
-public class DefaultHtmlMapper implements HtmlMapper {
-
-    /**
-     * @since Apache Tika 0.8
-     */
-    public static final HtmlMapper INSTANCE = new DefaultHtmlMapper();
-    // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
-    private static final Map<String, String> SAFE_ELEMENTS = new HashMap<String, String>() {{
-        put("H1", "h1");
-        put("H2", "h2");
-        put("H3", "h3");
-        put("H4", "h4");
-        put("H5", "h5");
-        put("H6", "h6");
-
-        put("P", "p");
-        put("PRE", "pre");
-        put("BLOCKQUOTE", "blockquote");
-        put("Q", "q");
-
-        put("UL", "ul");
-        put("OL", "ol");
-        put("MENU", "ul");
-        put("LI", "li");
-        put("DL", "dl");
-        put("DT", "dt");
-        put("DD", "dd");
-
-        put("TABLE", "table");
-        put("THEAD", "thead");
-        put("TBODY", "tbody");
-        put("TR", "tr");
-        put("TH", "th");
-        put("TD", "td");
-
-        put("ADDRESS", "address");
-
-        // TIKA-460 - add anchors
-        put("A", "a");
-
-        // TIKA-463 - add additional elements that contain URLs (and their sub-elements)
-        put("MAP", "map");
-        put("AREA", "area");
-        put("IMG", "img");
-        put("FRAMESET", "frameset");
-        put("FRAME", "frame");
-        put("IFRAME", "iframe");
-        put("OBJECT", "object");
-        put("PARAM", "param");
-        put("INS", "ins");
-        put("DEL", "del");
-    }};
-    private static final Set<String> DISCARDABLE_ELEMENTS = new HashSet<String>() {{
-        add("STYLE");
-        add("SCRIPT");
-    }};
-    // For information on tags & attributes, see:
-    // http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#a_dtd_XHTML-1.0-Strict
-    // http://www.w3schools.com/TAGS/
-    private static final Map<String, Set<String>> SAFE_ATTRIBUTES = new HashMap<String, Set<String>>() {{
-        put("a", attrSet("charset", "type", "name", "href", "hreflang", "rel", "rev", "shape", "coords"));
-        put("img", attrSet("src", "alt", "longdesc", "height", "width", "usemap", "ismap"));
-        put("frame", attrSet("longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling"));
-        put("iframe", attrSet("longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width"));
-        put("link", attrSet("charset", "href", "hreflang", "type", "rel", "rev", "media"));
-        put("map", attrSet("id", "class", "style", "title", "name"));
-        put("area", attrSet("shape", "coords", "href", "nohref", "alt"));
-        put("object", attrSet("declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height",
-                "width", "usemap", "name", "tabindex", "align", "border", "hspace", "vspace"));
-        put("param", attrSet("id", "name", "value", "valuetype", "type"));
-        put("blockquote", attrSet("cite"));
-        put("ins", attrSet("cite", "datetime"));
-        put("del", attrSet("cite", "datetime"));
-        put("q", attrSet("cite"));
-
-        // TODO - fill out this set. Include core, i18n, etc sets where appropriate.
-    }};
-
-    private static Set<String> attrSet(String... attrs) {
-        Set<String> result = new HashSet<String>();
-        for (String attr : attrs) {
-            result.add(attr);
-        }
-        return result;
-    }
-
-    public String mapSafeElement(String name) {
-        return SAFE_ELEMENTS.get(name);
-    }
-
-    /**
-     * Normalizes an attribute name. Assumes that the element name
-     * is valid and normalized
-     */
-    public String mapSafeAttribute(String elementName, String attributeName) {
-        Set<String> safeAttrs = SAFE_ATTRIBUTES.get(elementName);
-        if ((safeAttrs != null) && safeAttrs.contains(attributeName)) {
-            return attributeName;
-        } else {
-            return null;
-        }
-    }
-
-    public boolean isDiscardElement(String name) {
-        return DISCARDABLE_ELEMENTS.contains(name);
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * The default HTML mapping rules in Tika.
+ *
+ * @since Apache Tika 0.6
+ */
+public class DefaultHtmlMapper implements HtmlMapper {
+
+    /**
+     * @since Apache Tika 0.8
+     */
+    public static final HtmlMapper INSTANCE = new DefaultHtmlMapper();
+    // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
+    private static final Map<String, String> SAFE_ELEMENTS;
+    static {
+        Map<String, String> elements = new HashMap<String, String>();
+        elements.put("H1", "h1");
+        elements.put("H2", "h2");
+        elements.put("H3", "h3");
+        elements.put("H4", "h4");
+        elements.put("H5", "h5");
+        elements.put("H6", "h6");
+
+        elements.put("P", "p");
+        elements.put("PRE", "pre");
+        elements.put("BLOCKQUOTE", "blockquote");
+        elements.put("Q", "q");
+
+        elements.put("UL", "ul");
+        elements.put("OL", "ol");
+        elements.put("MENU", "ul");
+        elements.put("LI", "li");
+        elements.put("DL", "dl");
+        elements.put("DT", "dt");
+        elements.put("DD", "dd");
+
+        elements.put("TABLE", "table");
+        elements.put("THEAD", "thead");
+        elements.put("TBODY", "tbody");
+        elements.put("TR", "tr");
+        elements.put("TH", "th");
+        elements.put("TD", "td");
+
+        elements.put("ADDRESS", "address");
+
+        // TIKA-460 - add anchors
+        elements.put("A", "a");
+
+        // TIKA-463 - add additional elements that contain URLs (and their sub-elements)
+        elements.put("MAP", "map");
+        elements.put("AREA", "area");
+        elements.put("IMG", "img");
+        elements.put("FRAMESET", "frameset");
+        elements.put("FRAME", "frame");
+        elements.put("IFRAME", "iframe");
+        elements.put("OBJECT", "object");
+        elements.put("PARAM", "param");
+        elements.put("INS", "ins");
+        elements.put("DEL", "del");
+        SAFE_ELEMENTS = java.util.Collections.unmodifiableMap(elements);
+    }
+    private static final Set<String> DISCARDABLE_ELEMENTS = java.util.Collections.unmodifiableSet(
+            new HashSet<String>(java.util.Arrays.asList("STYLE", "SCRIPT")));
+    // For information on tags & attributes, see:
+    // http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#a_dtd_XHTML-1.0-Strict
+    // http://www.w3schools.com/TAGS/
+    private static final Map<String, Set<String>> SAFE_ATTRIBUTES;
+    static {
+        Map<String, Set<String>> attributes = new HashMap<String, Set<String>>();
+        attributes.put("a", attrSet("charset", "type", "name", "href", "hreflang", "rel", "rev", "shape", "coords"));
+        attributes.put("img", attrSet("src", "alt", "longdesc", "height", "width", "usemap", "ismap"));
+        attributes.put("frame", attrSet("longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling"));
+        attributes.put("iframe", attrSet("longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width"));
+        attributes.put("link", attrSet("charset", "href", "hreflang", "type", "rel", "rev", "media"));
+        attributes.put("map", attrSet("id", "class", "style", "title", "name"));
+        attributes.put("area", attrSet("shape", "coords", "href", "nohref", "alt"));
+        attributes.put("object", attrSet("declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height",
+                "width", "usemap", "name", "tabindex", "align", "border", "hspace", "vspace"));
+        attributes.put("param", attrSet("id", "name", "value", "valuetype", "type"));
+        attributes.put("blockquote", attrSet("cite"));
+        attributes.put("ins", attrSet("cite", "datetime"));
+        attributes.put("del", attrSet("cite", "datetime"));
+        attributes.put("q", attrSet("cite"));
+
+        // TODO - fill out this set. Include core, i18n, etc sets where appropriate.
+        SAFE_ATTRIBUTES = java.util.Collections.unmodifiableMap(attributes);
+    }
+
+    /** Collects the given attribute names into a new set. */
+    private static Set<String> attrSet(String... attrs) {
+        return new HashSet<String>(java.util.Arrays.asList(attrs));
+    }
+
+    /** Returns the mapped name for the given element name, or null when it has no entry in SAFE_ELEMENTS. */
+    @Override
+    public String mapSafeElement(String name) {
+        return SAFE_ELEMENTS.get(name);
+    }
+
+    /**
+     * Normalizes an attribute name. Assumes that the element name
+     * is valid and normalized
+     */
+    @Override
+    public String mapSafeAttribute(String elementName, String attributeName) {
+        Set<String> safeAttrs = SAFE_ATTRIBUTES.get(elementName);
+        return (safeAttrs != null && safeAttrs.contains(attributeName)) ? attributeName : null;
+    }
+
+    @Override
+    public boolean isDiscardElement(String name) {
+        return DISCARDABLE_ELEMENTS.contains(name);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
index c5bbc7a..d5dfaa6 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
@@ -1,309 +1,309 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Locale;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.sax.TextContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-class HtmlHandler extends TextContentHandler {
-
-    // List of attributes that need to be resolved.
-    private static final Set<String> URI_ATTRIBUTES =
-            new HashSet<String>(Arrays.asList("src", "href", "longdesc", "cite"));
-    private static final Pattern ICBM =
-            Pattern.compile("\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*");
-    private final HtmlMapper mapper;
-    private final XHTMLContentHandler xhtml;
-    private final Metadata metadata;
-    private final StringBuilder title = new StringBuilder();
-    private int bodyLevel = 0;
-    private int discardLevel = 0;
-    private int titleLevel = 0;
-    private boolean isTitleSetToMetadata = false;
-
-    private HtmlHandler(
-            HtmlMapper mapper, XHTMLContentHandler xhtml, Metadata metadata) {
-        super(xhtml);
-        this.mapper = mapper;
-        this.xhtml = xhtml;
-        this.metadata = metadata;
-
-        // Try to determine the default base URL, if one has not been given
-        if (metadata.get(Metadata.CONTENT_LOCATION) == null) {
-            String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
-            if (name != null) {
-                name = name.trim();
-                try {
-                    new URL(name); // test URL format
-                    metadata.set(Metadata.CONTENT_LOCATION, name);
-                } catch (MalformedURLException e) {
-                    // The resource name is not a valid URL, ignore it
-                }
-            }
-        }
-    }
-
-    public HtmlHandler(
-            HtmlMapper mapper, ContentHandler handler, Metadata metadata) {
-        this(mapper, new XHTMLContentHandler(handler, metadata), metadata);
-    }
-
-    @Override
-    public void startElement(
-            String uri, String local, String name, Attributes atts)
-            throws SAXException {
-        if ("TITLE".equals(name) || titleLevel > 0) {
-            titleLevel++;
-        }
-        if ("BODY".equals(name) || ("FRAMESET".equals(name)) || bodyLevel > 0) {
-            bodyLevel++;
-        }
-        if (mapper.isDiscardElement(name) || discardLevel > 0) {
-            discardLevel++;
-        }
-
-        if (bodyLevel == 0 && discardLevel == 0) {
-            if ("META".equals(name) && atts.getValue("content") != null) {
-                // TIKA-478: For cases where we have either a name or
-                // "http-equiv", assume that XHTMLContentHandler will emit
-                // these in the <head>, thus passing them through safely.
-                if (atts.getValue("http-equiv") != null) {
-                    addHtmlMetadata(
-                            atts.getValue("http-equiv"),
-                            atts.getValue("content"));
-                } else if (atts.getValue("name") != null) {
-                    // Record the meta tag in the metadata
-                    addHtmlMetadata(
-                            atts.getValue("name"),
-                            atts.getValue("content"));
-                } else if (atts.getValue("property") != null) {
-                    // TIKA-983: Handle <meta property="og:xxx" content="yyy" /> tags
-                    metadata.add(
-                            atts.getValue("property"),
-                            atts.getValue("content"));
-                }
-            } else if ("BASE".equals(name) && atts.getValue("href") != null) {
-                startElementWithSafeAttributes("base", atts);
-                xhtml.endElement("base");
-                metadata.set(
-                        Metadata.CONTENT_LOCATION,
-                        resolve(atts.getValue("href")));
-            } else if ("LINK".equals(name)) {
-                startElementWithSafeAttributes("link", atts);
-                xhtml.endElement("link");
-            }
-        }
-
-        if (bodyLevel > 0 && discardLevel == 0) {
-            String safe = mapper.mapSafeElement(name);
-            if (safe != null) {
-                startElementWithSafeAttributes(safe, atts);
-            }
-        }
-
-        title.setLength(0);
-    }
-
-    /**
-     * Adds a metadata setting from the HTML <head/> to the Tika metadata
-     * object. The name and value are normalized where possible.
-     */
-    private void addHtmlMetadata(String name, String value) {
-        if (name == null || value == null) {
-            // ignore
-        } else if (name.equalsIgnoreCase("ICBM")) {
-            Matcher m = ICBM.matcher(value);
-            if (m.matches()) {
-                metadata.set("ICBM", m.group(1) + ", " + m.group(2));
-                metadata.set(Metadata.LATITUDE, m.group(1));
-                metadata.set(Metadata.LONGITUDE, m.group(2));
-            } else {
-                metadata.set("ICBM", value);
-            }
-        } else if (name.equalsIgnoreCase(Metadata.CONTENT_TYPE)) {
-            //don't overwrite Metadata.CONTENT_TYPE!
-            MediaType type = MediaType.parse(value);
-            if (type != null) {
-                metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, type.toString());
-            } else {
-                metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, value);
-            }
-        } else {
-            metadata.add(name, value);
-        }
-    }
-
-    private void startElementWithSafeAttributes(String name, Attributes atts) throws SAXException {
-        if (atts.getLength() == 0) {
-            xhtml.startElement(name);
-            return;
-        }
-
-        boolean isObject = name.equals("object");
-        String codebase = null;
-        if (isObject) {
-            codebase = atts.getValue("", "codebase");
-            if (codebase != null) {
-                codebase = resolve(codebase);
-            } else {
-                codebase = metadata.get(Metadata.CONTENT_LOCATION);
-            }
-        }
-
-        AttributesImpl newAttributes = new AttributesImpl(atts);
-        for (int att = 0; att < newAttributes.getLength(); att++) {
-            String attrName = newAttributes.getLocalName(att);
-            String normAttrName = mapper.mapSafeAttribute(name, attrName);
-            if (normAttrName == null) {
-                newAttributes.removeAttribute(att);
-                att--;
-            } else {
-                // We have a remapped attribute name, so set it as it might have changed.
-                newAttributes.setLocalName(att, normAttrName);
-
-                // And resolve relative links. Eventually this should be pushed
-                // into the HtmlMapper code.
-                if (URI_ATTRIBUTES.contains(normAttrName)) {
-                    newAttributes.setValue(att, resolve(newAttributes.getValue(att)));
-                } else if (isObject && "codebase".equals(normAttrName)) {
-                    newAttributes.setValue(att, codebase);
-                } else if (isObject
-                        && ("data".equals(normAttrName)
-                        || "classid".equals(normAttrName))) {
-                    newAttributes.setValue(
-                            att,
-                            resolve(codebase, newAttributes.getValue(att)));
-                }
-            }
-        }
-
-        if ("img".equals(name) && newAttributes.getValue("", "alt") == null) {
-            newAttributes.addAttribute("", "alt", "alt", "CDATA", "");
-        }
-
-        xhtml.startElement(name, newAttributes);
-    }
-
-    @Override
-    public void endElement(
-            String uri, String local, String name) throws SAXException {
-        if (bodyLevel > 0 && discardLevel == 0) {
-            String safe = mapper.mapSafeElement(name);
-            if (safe != null) {
-                xhtml.endElement(safe);
-            } else if (XHTMLContentHandler.ENDLINE.contains(
-                    name.toLowerCase(Locale.ENGLISH))) {
-                // TIKA-343: Replace closing block tags (and <br/>) with a
-                // newline unless the HtmlMapper above has already mapped
-                // them to something else
-                xhtml.newline();
-            }
-        }
-
-        if (titleLevel > 0) {
-            titleLevel--;
-            if (titleLevel == 0 && !isTitleSetToMetadata) {
-                metadata.set(TikaCoreProperties.TITLE, title.toString().trim());
-                isTitleSetToMetadata = true;
-            }
-        }
-        if (bodyLevel > 0) {
-            bodyLevel--;
-        }
-        if (discardLevel > 0) {
-            discardLevel--;
-        }
-    }
-
-    @Override
-    public void characters(char[] ch, int start, int length)
-            throws SAXException {
-        if (titleLevel > 0 && bodyLevel == 0) {
-            title.append(ch, start, length);
-        }
-        if (bodyLevel > 0 && discardLevel == 0) {
-            super.characters(ch, start, length);
-        }
-    }
-
-    @Override
-    public void ignorableWhitespace(char[] ch, int start, int length)
-            throws SAXException {
-        if (bodyLevel > 0 && discardLevel == 0) {
-            super.ignorableWhitespace(ch, start, length);
-        }
-    }
-
-    private String resolve(String url) {
-        return resolve(metadata.get(Metadata.CONTENT_LOCATION), url);
-    }
-
-    private String resolve(String base, String url) {
-        url = url.trim();
-
-        // Return the URL as-is if no base URL is available or if the URL
-        // matches a common non-hierarchical or pseudo URI prefix
-        String lower = url.toLowerCase(Locale.ENGLISH);
-        if (base == null
-                || lower.startsWith("urn:")
-                || lower.startsWith("mailto:")
-                || lower.startsWith("tel:")
-                || lower.startsWith("data:")
-                || lower.startsWith("javascript:")
-                || lower.startsWith("about:")) {
-            return url;
-        }
-
-        try {
-            URL baseURL = new URL(base.trim());
-
-            // We need to handle one special case, where the relativeUrl is
-            // just a query string (like "?pid=1"), and the baseUrl doesn't
-            // end with a '/'. In that case, the URL class removes the last
-            // portion of the path, which we don't want.
-            String path = baseURL.getPath();
-            if (url.startsWith("?") && path.length() > 0 && !path.endsWith("/")) {
-                return new URL(
-                        baseURL.getProtocol(),
-                        baseURL.getHost(), baseURL.getPort(),
-                        baseURL.getPath() + url).toExternalForm();
-            } else {
-                return new URL(baseURL, url).toExternalForm();
-            }
-        } catch (MalformedURLException e) {
-            // Unknown or broken format; just return the URL as received.
-            return url;
-        }
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.sax.TextContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+class HtmlHandler extends TextContentHandler {
+
+    // List of attributes that need to be resolved.
+    private static final Set<String> URI_ATTRIBUTES =
+            new HashSet<String>(Arrays.asList("src", "href", "longdesc", "cite"));
+    private static final Pattern ICBM =
+            Pattern.compile("\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*");
+    private final HtmlMapper mapper;
+    private final XHTMLContentHandler xhtml;
+    private final Metadata metadata;
+    private final StringBuilder title = new StringBuilder();
+    private int bodyLevel = 0;
+    private int discardLevel = 0;
+    private int titleLevel = 0;
+    private boolean isTitleSetToMetadata = false;
+
+    private HtmlHandler(
+            HtmlMapper mapper, XHTMLContentHandler xhtml, Metadata metadata) {
+        super(xhtml);
+        this.mapper = mapper;
+        this.xhtml = xhtml;
+        this.metadata = metadata;
+
+        // Try to determine the default base URL, if one has not been given
+        if (metadata.get(Metadata.CONTENT_LOCATION) == null) {
+            String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+            if (name != null) {
+                name = name.trim();
+                try {
+                    new URL(name); // test URL format
+                    metadata.set(Metadata.CONTENT_LOCATION, name);
+                } catch (MalformedURLException e) {
+                    // The resource name is not a valid URL, ignore it
+                }
+            }
+        }
+    }
+
+    public HtmlHandler(
+            HtmlMapper mapper, ContentHandler handler, Metadata metadata) {
+        this(mapper, new XHTMLContentHandler(handler, metadata), metadata);
+    }
+
+    @Override
+    public void startElement(
+            String uri, String local, String name, Attributes atts)
+            throws SAXException {
+        if ("TITLE".equals(name) || titleLevel > 0) {
+            titleLevel++;
+        }
+        if ("BODY".equals(name) || ("FRAMESET".equals(name)) || bodyLevel > 0) {
+            bodyLevel++;
+        }
+        if (mapper.isDiscardElement(name) || discardLevel > 0) {
+            discardLevel++;
+        }
+
+        if (bodyLevel == 0 && discardLevel == 0) {
+            if ("META".equals(name) && atts.getValue("content") != null) {
+                // TIKA-478: For cases where we have either a name or
+                // "http-equiv", assume that XHTMLContentHandler will emit
+                // these in the <head>, thus passing them through safely.
+                if (atts.getValue("http-equiv") != null) {
+                    addHtmlMetadata(
+                            atts.getValue("http-equiv"),
+                            atts.getValue("content"));
+                } else if (atts.getValue("name") != null) {
+                    // Record the meta tag in the metadata
+                    addHtmlMetadata(
+                            atts.getValue("name"),
+                            atts.getValue("content"));
+                } else if (atts.getValue("property") != null) {
+                    // TIKA-983: Handle <meta property="og:xxx" content="yyy" /> tags
+                    metadata.add(
+                            atts.getValue("property"),
+                            atts.getValue("content"));
+                }
+            } else if ("BASE".equals(name) && atts.getValue("href") != null) {
+                startElementWithSafeAttributes("base", atts);
+                xhtml.endElement("base");
+                metadata.set(
+                        Metadata.CONTENT_LOCATION,
+                        resolve(atts.getValue("href")));
+            } else if ("LINK".equals(name)) {
+                startElementWithSafeAttributes("link", atts);
+                xhtml.endElement("link");
+            }
+        }
+
+        if (bodyLevel > 0 && discardLevel == 0) {
+            String safe = mapper.mapSafeElement(name);
+            if (safe != null) {
+                startElementWithSafeAttributes(safe, atts);
+            }
+        }
+
+        title.setLength(0);
+    }
+
+    /**
+     * Adds a metadata setting from the HTML <head/> to the Tika metadata
+     * object. The name and value are normalized where possible.
+     */
+    private void addHtmlMetadata(String name, String value) {
+        if (name == null || value == null) {
+            // ignore
+        } else if (name.equalsIgnoreCase("ICBM")) {
+            Matcher m = ICBM.matcher(value);
+            if (m.matches()) {
+                metadata.set("ICBM", m.group(1) + ", " + m.group(2));
+                metadata.set(Metadata.LATITUDE, m.group(1));
+                metadata.set(Metadata.LONGITUDE, m.group(2));
+            } else {
+                metadata.set("ICBM", value);
+            }
+        } else if (name.equalsIgnoreCase(Metadata.CONTENT_TYPE)) {
+            //don't overwrite Metadata.CONTENT_TYPE!
+            MediaType type = MediaType.parse(value);
+            if (type != null) {
+                metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, type.toString());
+            } else {
+                metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, value);
+            }
+        } else {
+            metadata.add(name, value);
+        }
+    }
+
+    private void startElementWithSafeAttributes(String name, Attributes atts) throws SAXException {
+        if (atts.getLength() == 0) {
+            xhtml.startElement(name);
+            return;
+        }
+
+        boolean isObject = name.equals("object");
+        String codebase = null;
+        if (isObject) {
+            codebase = atts.getValue("", "codebase");
+            if (codebase != null) {
+                codebase = resolve(codebase);
+            } else {
+                codebase = metadata.get(Metadata.CONTENT_LOCATION);
+            }
+        }
+
+        AttributesImpl newAttributes = new AttributesImpl(atts);
+        for (int att = 0; att < newAttributes.getLength(); att++) {
+            String attrName = newAttributes.getLocalName(att);
+            String normAttrName = mapper.mapSafeAttribute(name, attrName);
+            if (normAttrName == null) {
+                newAttributes.removeAttribute(att);
+                att--;
+            } else {
+                // We have a remapped attribute name, so set it as it might have changed.
+                newAttributes.setLocalName(att, normAttrName);
+
+                // And resolve relative links. Eventually this should be pushed
+                // into the HtmlMapper code.
+                if (URI_ATTRIBUTES.contains(normAttrName)) {
+                    newAttributes.setValue(att, resolve(newAttributes.getValue(att)));
+                } else if (isObject && "codebase".equals(normAttrName)) {
+                    newAttributes.setValue(att, codebase);
+                } else if (isObject
+                        && ("data".equals(normAttrName)
+                        || "classid".equals(normAttrName))) {
+                    newAttributes.setValue(
+                            att,
+                            resolve(codebase, newAttributes.getValue(att)));
+                }
+            }
+        }
+
+        if ("img".equals(name) && newAttributes.getValue("", "alt") == null) {
+            newAttributes.addAttribute("", "alt", "alt", "CDATA", "");
+        }
+
+        xhtml.startElement(name, newAttributes);
+    }
+
+    @Override
+    public void endElement(
+            String uri, String local, String name) throws SAXException {
+        if (bodyLevel > 0 && discardLevel == 0) {
+            String safe = mapper.mapSafeElement(name);
+            if (safe != null) {
+                xhtml.endElement(safe);
+            } else if (XHTMLContentHandler.ENDLINE.contains(
+                    name.toLowerCase(Locale.ENGLISH))) {
+                // TIKA-343: Replace closing block tags (and <br/>) with a
+                // newline unless the HtmlMapper above has already mapped
+                // them to something else
+                xhtml.newline();
+            }
+        }
+
+        if (titleLevel > 0) {
+            titleLevel--;
+            if (titleLevel == 0 && !isTitleSetToMetadata) {
+                metadata.set(TikaCoreProperties.TITLE, title.toString().trim());
+                isTitleSetToMetadata = true;
+            }
+        }
+        if (bodyLevel > 0) {
+            bodyLevel--;
+        }
+        if (discardLevel > 0) {
+            discardLevel--;
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length)
+            throws SAXException {
+        if (titleLevel > 0 && bodyLevel == 0) {
+            title.append(ch, start, length);
+        }
+        if (bodyLevel > 0 && discardLevel == 0) {
+            super.characters(ch, start, length);
+        }
+    }
+
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length)
+            throws SAXException {
+        if (bodyLevel > 0 && discardLevel == 0) {
+            super.ignorableWhitespace(ch, start, length);
+        }
+    }
+
+    private String resolve(String url) {
+        return resolve(metadata.get(Metadata.CONTENT_LOCATION), url);
+    }
+
+    private String resolve(String base, String url) {
+        url = url.trim();
+
+        // Return the URL as-is if no base URL is available or if the URL
+        // matches a common non-hierarchical or pseudo URI prefix
+        String lower = url.toLowerCase(Locale.ENGLISH);
+        if (base == null
+                || lower.startsWith("urn:")
+                || lower.startsWith("mailto:")
+                || lower.startsWith("tel:")
+                || lower.startsWith("data:")
+                || lower.startsWith("javascript:")
+                || lower.startsWith("about:")) {
+            return url;
+        }
+
+        try {
+            URL baseURL = new URL(base.trim());
+
+            // We need to handle one special case, where the relativeUrl is
+            // just a query string (like "?pid=1"), and the baseUrl doesn't
+            // end with a '/'. In that case, the URL class removes the last
+            // portion of the path, which we don't want.
+            String path = baseURL.getPath();
+            if (url.startsWith("?") && path.length() > 0 && !path.endsWith("/")) {
+                return new URL(
+                        baseURL.getProtocol(),
+                        baseURL.getHost(), baseURL.getPort(),
+                        baseURL.getPath() + url).toExternalForm();
+            } else {
+                return new URL(baseURL, url).toExternalForm();
+            }
+        } catch (MalformedURLException e) {
+            // Unknown or broken format; just return the URL as received.
+            return url;
+        }
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
index 947d26a..1ca7434 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
@@ -1,69 +1,69 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-/**
- * HTML mapper used to make incoming HTML documents easier to handle by
- * Tika clients. The {@link HtmlParser} looks up an optional HTML mapper from
- * the parse context and uses it to map parsed HTML to "safe" XHTML. A client
- * that wants to customize this mapping can place a custom HtmlMapper instance
- * into the parse context.
- *
- * @since Apache Tika 0.6
- */
-public interface HtmlMapper {
-
-    /**
-     * Maps "safe" HTML element names to semantic XHTML equivalents. If the
-     * given element is unknown or deemed unsafe for inclusion in the parse
-     * output, then this method returns <code>null</code> and the element
-     * will be ignored but the content inside it is still processed. See
-     * the {@link #isDiscardElement(String)} method for a way to discard
-     * the entire contents of an element.
-     *
-     * @param name HTML element name (upper case)
-     * @return XHTML element name (lower case), or
-     * <code>null</code> if the element is unsafe
-     */
-    String mapSafeElement(String name);
-
-    /**
-     * Checks whether all content within the given HTML element should be
-     * discarded instead of including it in the parse output.
-     *
-     * @param name HTML element name (upper case)
-     * @return <code>true</code> if content inside the named element
-     * should be ignored, <code>false</code> otherwise
-     */
-    boolean isDiscardElement(String name);
-
-
-    /**
-     * Maps "safe" HTML attribute names to semantic XHTML equivalents. If the
-     * given attribute is unknown or deemed unsafe for inclusion in the parse
-     * output, then this method returns <code>null</code> and the attribute
-     * will be ignored. This method assumes that the element name
-     * is valid and normalised.
-     *
-     * @param elementName   HTML element name (lower case)
-     * @param attributeName HTML attribute name (lower case)
-     * @return XHTML attribute name (lower case), or
-     * <code>null</code> if the element is unsafe
-     */
-    String mapSafeAttribute(String elementName, String attributeName);
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+/**
+ * HTML mapper used to make incoming HTML documents easier to handle by
+ * Tika clients. The {@link HtmlParser} looks up an optional HTML mapper from
+ * the parse context and uses it to map parsed HTML to "safe" XHTML. A client
+ * that wants to customize this mapping can place a custom HtmlMapper instance
+ * into the parse context.
+ *
+ * @since Apache Tika 0.6
+ */
+public interface HtmlMapper {
+
+    /**
+     * Maps "safe" HTML element names to semantic XHTML equivalents. If the
+     * given element is unknown or deemed unsafe for inclusion in the parse
+     * output, then this method returns <code>null</code> and the element
+     * will be ignored but the content inside it is still processed. See
+     * the {@link #isDiscardElement(String)} method for a way to discard
+     * the entire contents of an element.
+     *
+     * @param name HTML element name (upper case)
+     * @return XHTML element name (lower case), or
+     * <code>null</code> if the element is unsafe
+     */
+    String mapSafeElement(String name);
+
+    /**
+     * Checks whether all content within the given HTML element should be
+     * discarded instead of including it in the parse output.
+     *
+     * @param name HTML element name (upper case)
+     * @return <code>true</code> if content inside the named element
+     * should be ignored, <code>false</code> otherwise
+     */
+    boolean isDiscardElement(String name);
+
+
+    /**
+     * Maps "safe" HTML attribute names to semantic XHTML equivalents. If the
+     * given attribute is unknown or deemed unsafe for inclusion in the parse
+     * output, then this method returns <code>null</code> and the attribute
+     * will be ignored. This method assumes that the element name
+     * is valid and normalised.
+     *
+     * @param elementName   HTML element name (lower case)
+     * @param attributeName HTML attribute name (lower case)
+     * @return XHTML attribute name (lower case), or
+     * <code>null</code> if the element is unsafe
+     */
+    String mapSafeAttribute(String elementName, String attributeName);
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java
index 7d6f021..a9a8aa0 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java
@@ -1,194 +1,194 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.Charset;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Set;
-
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.config.ServiceLoader;
-import org.apache.tika.detect.AutoDetectReader;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.ccil.cowan.tagsoup.HTMLSchema;
-import org.ccil.cowan.tagsoup.Schema;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * HTML parser. Uses TagSoup to turn the input document to HTML SAX events,
- * and post-processes the events to produce XHTML and metadata expected by
- * Tika clients.
- */
-public class HtmlParser extends AbstractParser {
-
-    /**
-     * Serial version UID
-     */
-    private static final long serialVersionUID = 7895315240498733128L;
-
-    private static final MediaType XHTML = MediaType.application("xhtml+xml");
-    private static final MediaType WAP_XHTML = MediaType.application("vnd.wap.xhtml+xml");
-    private static final MediaType X_ASP = MediaType.application("x-asp");
-
-    private static final Set<MediaType> SUPPORTED_TYPES =
-            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
-                    MediaType.text("html"),
-                    XHTML,
-                    WAP_XHTML,
-                    X_ASP)));
-
-    private static final ServiceLoader LOADER =
-            new ServiceLoader(HtmlParser.class.getClassLoader());
-
-    /**
-     * HTML schema singleton used to amortise the heavy instantiation time.
-     */
-    private static final Schema HTML_SCHEMA = new HTMLSchema();
-
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-    public void parse(
-            InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-        // Automatically detect the character encoding
-        try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream),
-                metadata,context.get(ServiceLoader.class, LOADER))) {
-            Charset charset = reader.getCharset();
-            String previous = metadata.get(Metadata.CONTENT_TYPE);
-            MediaType contentType = null;
-            if (previous == null || previous.startsWith("text/html")) {
-                contentType = new MediaType(MediaType.TEXT_HTML, charset);
-            } else if (previous.startsWith("application/xhtml+xml")) {
-                contentType = new MediaType(XHTML, charset);
-            } else if (previous.startsWith("application/vnd.wap.xhtml+xml")) {
-                contentType = new MediaType(WAP_XHTML, charset);
-            } else if (previous.startsWith("application/x-asp")) {
-                contentType = new MediaType(X_ASP, charset);
-            }
-            if (contentType != null) {
-                metadata.set(Metadata.CONTENT_TYPE, contentType.toString());
-            }
-            // deprecated, see TIKA-431
-            metadata.set(Metadata.CONTENT_ENCODING, charset.name());
-
-            // Get the HTML mapper from the parse context
-            HtmlMapper mapper =
-                    context.get(HtmlMapper.class, new HtmlParserMapper());
-
-            // Parse the HTML document
-            org.ccil.cowan.tagsoup.Parser parser =
-                    new org.ccil.cowan.tagsoup.Parser();
-
-            // Use schema from context or default
-            Schema schema = context.get(Schema.class, HTML_SCHEMA);
-
-            // TIKA-528: Reuse share schema to avoid heavy instantiation
-            parser.setProperty(
-                    org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
-            // TIKA-599: Shared schema is thread-safe only if bogons are ignored
-            parser.setFeature(
-                    org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
-
-            parser.setContentHandler(new XHTMLDowngradeHandler(
-                    new HtmlHandler(mapper, handler, metadata)));
-
-            parser.parse(reader.asInputSource());
-        }
-    }
-
-    /**
-     * Maps "safe" HTML element names to semantic XHTML equivalents. If the
-     * given element is unknown or deemed unsafe for inclusion in the parse
-     * output, then this method returns <code>null</code> and the element
-     * will be ignored but the content inside it is still processed. See
-     * the {@link #isDiscardElement(String)} method for a way to discard
-     * the entire contents of an element.
-     * <p/>
-     * Subclasses can override this method to customize the default mapping.
-     *
-     * @param name HTML element name (upper case)
-     * @return XHTML element name (lower case), or
-     * <code>null</code> if the element is unsafe
-     * @since Apache Tika 0.5
-     * @deprecated Use the {@link HtmlMapper} mechanism to customize
-     * the HTML mapping. This method will be removed in Tika 1.0.
-     */
-    protected String mapSafeElement(String name) {
-        return DefaultHtmlMapper.INSTANCE.mapSafeElement(name);
-    }
-
-    /**
-     * Checks whether all content within the given HTML element should be
-     * discarded instead of including it in the parse output. Subclasses
-     * can override this method to customize the set of discarded elements.
-     *
-     * @param name HTML element name (upper case)
-     * @return <code>true</code> if content inside the named element
-     * should be ignored, <code>false</code> otherwise
-     * @since Apache Tika 0.5
-     * @deprecated Use the {@link HtmlMapper} mechanism to customize
-     * the HTML mapping. This method will be removed in Tika 1.0.
-     */
-    protected boolean isDiscardElement(String name) {
-        return DefaultHtmlMapper.INSTANCE.isDiscardElement(name);
-    }
-
-    /**
-     * @deprecated Use the {@link HtmlMapper} mechanism to customize
-     * the HTML mapping. This method will be removed in Tika 1.0.
-     */
-    public String mapSafeAttribute(String elementName, String attributeName) {
-        return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName, attributeName);
-    }
-
-    /**
-     * Adapter class that maintains backwards compatibility with the
-     * protected HtmlParser methods. Making HtmlParser implement HtmlMapper
-     * directly would require those methods to be public, which would break
-     * backwards compatibility with subclasses.
-     *
-     * @deprecated Use the {@link HtmlMapper} mechanism to customize
-     * the HTML mapping. This class will be removed in Tika 1.0.
-     */
-    private class HtmlParserMapper implements HtmlMapper {
-        public String mapSafeElement(String name) {
-            return HtmlParser.this.mapSafeElement(name);
-        }
-
-        public boolean isDiscardElement(String name) {
-            return HtmlParser.this.isDiscardElement(name);
-        }
-
-        public String mapSafeAttribute(String elementName, String attributeName) {
-            return HtmlParser.this.mapSafeAttribute(elementName, attributeName);
-        }
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.ccil.cowan.tagsoup.HTMLSchema;
+import org.ccil.cowan.tagsoup.Schema;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * HTML parser. Uses TagSoup to turn the input document to HTML SAX events,
+ * and post-processes the events to produce XHTML and metadata expected by
+ * Tika clients.
+ */
+public class HtmlParser extends AbstractParser {
+
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = 7895315240498733128L;
+
+    private static final MediaType XHTML = MediaType.application("xhtml+xml");
+    private static final MediaType WAP_XHTML = MediaType.application("vnd.wap.xhtml+xml");
+    private static final MediaType X_ASP = MediaType.application("x-asp");
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.text("html"),
+                    XHTML,
+                    WAP_XHTML,
+                    X_ASP)));
+
+    private static final ServiceLoader LOADER =
+            new ServiceLoader(HtmlParser.class.getClassLoader());
+
+    /**
+     * HTML schema singleton used to amortise the heavy instantiation time.
+     */
+    private static final Schema HTML_SCHEMA = new HTMLSchema();
+
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // The reader detects the character encoding; the shield wraps the caller's stream
+        ServiceLoader loader = context.get(ServiceLoader.class, LOADER);
+        try (AutoDetectReader reader = new AutoDetectReader(
+                new CloseShieldInputStream(stream), metadata, loader)) {
+            Charset detected = reader.getCharset();
+            String declared = metadata.get(Metadata.CONTENT_TYPE);
+
+            // Choose the base type from the declared Content-Type (HTML when absent)
+            MediaType base;
+            if (declared == null || declared.startsWith("text/html")) {
+                base = MediaType.TEXT_HTML;
+            } else if (declared.startsWith("application/xhtml+xml")) {
+                base = XHTML;
+            } else if (declared.startsWith("application/vnd.wap.xhtml+xml")) {
+                base = WAP_XHTML;
+            } else if (declared.startsWith("application/x-asp")) {
+                base = X_ASP;
+            } else {
+                base = null; // any other declaration: Content-Type is left as it was
+            }
+            if (base != null) {
+                metadata.set(Metadata.CONTENT_TYPE, new MediaType(base, detected).toString());
+            }
+            // deprecated, see TIKA-431
+            metadata.set(Metadata.CONTENT_ENCODING, detected.name());
+
+            // Mapper and schema come from the parse context when supplied there
+            HtmlMapper mapper =
+                    context.get(HtmlMapper.class, new HtmlParserMapper());
+            Schema schema = context.get(Schema.class, HTML_SCHEMA);
+
+            org.ccil.cowan.tagsoup.Parser tagSoup =
+                    new org.ccil.cowan.tagsoup.Parser();
+            // TIKA-528: Reuse shared schema to avoid heavy instantiation
+            tagSoup.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
+            // TIKA-599: Shared schema is thread-safe only if bogons are ignored
+            tagSoup.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
+
+            // Events flow TagSoup -> XHTMLDowngradeHandler -> HtmlHandler (given the caller's handler)
+            HtmlHandler htmlHandler = new HtmlHandler(mapper, handler, metadata);
+            tagSoup.setContentHandler(new XHTMLDowngradeHandler(htmlHandler));
+
+            tagSoup.parse(reader.asInputSource());
+        }
+    }
+
+    /**
+     * Maps "safe" HTML element names to semantic XHTML equivalents. If the
+     * given element is unknown or deemed unsafe for inclusion in the parse
+     * output, then this method returns <code>null</code> and the element
+     * will be ignored but the content inside it is still processed. See
+     * the {@link #isDiscardElement(String)} method for a way to discard
+     * the entire contents of an element.
+     * <p>
+     * Subclasses can override this method; the default delegates to {@link DefaultHtmlMapper}.
+     *
+     * @param name HTML element name (upper case)
+     * @return XHTML element name (lower case), or
+     * <code>null</code> if the element is unsafe
+     * @since Apache Tika 0.5
+     * @deprecated Use the {@link HtmlMapper} mechanism to customize
+     * the HTML mapping. This method will be removed in Tika 1.0.
+     */
+    protected String mapSafeElement(String name) {
+        return DefaultHtmlMapper.INSTANCE.mapSafeElement(name);
+    }
+
+    /**
+     * Checks whether all content within the given HTML element should be
+     * discarded instead of including it in the parse output. Subclasses can
+     * override this method; the default delegates to {@link DefaultHtmlMapper}.
+     *
+     * @param name HTML element name (upper case)
+     * @return <code>true</code> if content inside the named element
+     * should be ignored, <code>false</code> otherwise
+     * @since Apache Tika 0.5
+     * @deprecated Use the {@link HtmlMapper} mechanism to customize
+     * the HTML mapping. This method will be removed in Tika 1.0.
+     */
+    protected boolean isDiscardElement(String name) {
+        return DefaultHtmlMapper.INSTANCE.isDiscardElement(name);
+    }
+
+    /**
+     * Maps an attribute of the given element by delegating to {@link DefaultHtmlMapper}.
+     * @deprecated Use the {@link HtmlMapper} mechanism instead; this method will be removed in Tika 1.0.
+     */
+    public String mapSafeAttribute(String elementName, String attributeName) {
+        return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName, attributeName);
+    }
+
+    /**
+     * Adapter class that maintains backwards compatibility with the
+     * protected HtmlParser methods. Making HtmlParser implement HtmlMapper
+     * directly would require those methods to be public, which would break
+     * backwards compatibility with subclasses.
+     * Every call is forwarded to the enclosing parser instance, so subclass overrides apply.
+     * @deprecated Use the {@link HtmlMapper} mechanism to customize
+     * the HTML mapping. This class will be removed in Tika 1.0.
+     */
+    private class HtmlParserMapper implements HtmlMapper {
+        public String mapSafeElement(String name) {
+            return HtmlParser.this.mapSafeElement(name);
+        }
+
+        public boolean isDiscardElement(String name) {
+            return HtmlParser.this.isDiscardElement(name);
+        }
+
+        public String mapSafeAttribute(String elementName, String attributeName) {
+            return HtmlParser.this.mapSafeAttribute(elementName, attributeName);
+        }
+    }
+
+}

[28/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
index 1630edd..9d0a2f0 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
@@ -1,398 +1,398 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.accessor;
-
-import java.math.BigInteger;
-import java.util.ArrayList;
-import java.util.List;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.apache.tika.parser.chm.exception.ChmParsingException;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-/**
- * Holds chm listing entries
- */
-public class ChmDirectoryListingSet {
-    private List<DirectoryListingEntry> dlel;
-    private byte[] data;
-    private int placeHolder = -1;
-    private long dataOffset = -1;
-    private int controlDataIndex = -1;
-    private int resetTableIndex = -1;
-
-    private boolean isNotControlDataFound = true;
-    private boolean isNotResetTableFound = true;
-
-    /**
-     * Constructs chm directory listing set
-     * 
-     * @param data
-     *            byte[]
-     * @param chmItsHeader
-     * @param chmItspHeader
-     * @throws TikaException 
-     */
-    public ChmDirectoryListingSet(byte[] data, ChmItsfHeader chmItsHeader,
-            ChmItspHeader chmItspHeader) throws TikaException {
-        setDirectoryListingEntryList(new ArrayList<DirectoryListingEntry>());
-        ChmCommons.assertByteArrayNotNull(data);
-        setData(data);
-        enumerateChmDirectoryListingList(chmItsHeader, chmItspHeader);
-    }
-
-    public String toString() {
-        StringBuilder sb = new StringBuilder();
-        sb.append("list:=" + getDirectoryListingEntryList().toString()
-                + System.getProperty("line.separator"));
-        sb.append("number of list items:="
-                + getDirectoryListingEntryList().size());
-        return sb.toString();
-    }
-
-    /**
-     * Returns control data index that located in List
-     * 
-     * @return control data index
-     */
-    public int getControlDataIndex() {
-        return controlDataIndex;
-    }
-
-    /**
-     * Sets control data index
-     * 
-     * @param controlDataIndex
-     */
-    protected void setControlDataIndex(int controlDataIndex) {
-        this.controlDataIndex = controlDataIndex;
-    }
-
-    /**
-     * Return index of reset table
-     * 
-     * @return reset table index
-     */
-    public int getResetTableIndex() {
-        return resetTableIndex;
-    }
-
-    /**
-     * Sets reset table index
-     * 
-     * @param resetTableIndex
-     */
-    protected void setResetTableIndex(int resetTableIndex) {
-        this.resetTableIndex = resetTableIndex;
-    }
-
-    /**
-     * Sets place holder
-     * 
-     * @param placeHolder
-     */
-    private void setPlaceHolder(int placeHolder) {
-        this.placeHolder = placeHolder;
-    }
-
-    private ChmPmglHeader PMGLheader;
-    /**
-     * Enumerates chm directory listing entries
-     * 
-     * @param chmItsHeader
-     *            chm itsf PMGLheader
-     * @param chmItspHeader
-     *            chm itsp PMGLheader
-     */
-    private void enumerateChmDirectoryListingList(ChmItsfHeader chmItsHeader,
-            ChmItspHeader chmItspHeader) {
-        try {
-            int startPmgl = chmItspHeader.getIndex_head();
-            int stopPmgl = chmItspHeader.getUnknown_0024();
-            int dir_offset = (int) (chmItsHeader.getDirOffset() + chmItspHeader
-                    .getHeader_len());
-            setDataOffset(chmItsHeader.getDataOffset());
-
-            /* loops over all pmgls */
-            byte[] dir_chunk = null;
-            for (int i = startPmgl; i>=0; ) {
-                dir_chunk = new byte[(int) chmItspHeader.getBlock_len()];
-                int start = i * (int) chmItspHeader.getBlock_len() + dir_offset;
-                dir_chunk = ChmCommons
-                        .copyOfRange(getData(), start,
-                                start +(int) chmItspHeader.getBlock_len());
-
-                PMGLheader = new ChmPmglHeader();
-                PMGLheader.parse(dir_chunk, PMGLheader);
-                enumerateOneSegment(dir_chunk);
-                
-                i=PMGLheader.getBlockNext();
-                dir_chunk = null;
-            }
-        } catch (Exception e) {
-            e.printStackTrace();
-        } finally {
-            setData(null);
-        }
-    }
-
-    /**
-     * Checks control data
-     * 
-     * @param dle
-     *            chm directory listing entry
-     */
-    private void checkControlData(DirectoryListingEntry dle) {
-        if (isNotControlDataFound) {
-            if (dle.getName().contains(ChmConstants.CONTROL_DATA)) {
-                setControlDataIndex(getDirectoryListingEntryList().size());
-                isNotControlDataFound = false;
-            }
-        }
-    }
-
-    /**
-     * Checks reset table
-     * 
-     * @param dle
-     *            chm directory listing entry
-     */
-    private void checkResetTable(DirectoryListingEntry dle) {
-        if (isNotResetTableFound) {
-            if (dle.getName().contains(ChmConstants.RESET_TABLE)) {
-                setResetTableIndex(getDirectoryListingEntryList().size());
-                isNotResetTableFound = false;
-            }
-        }
-    }
-
-    public static final boolean startsWith(byte[] data, String prefix) {
-        for (int i=0; i<prefix.length(); i++) {
-            if (data[i]!=prefix.charAt(i)) {
-                return false;
-            }
-        }
-        
-        return true;
-    }
-    /**
-     * Enumerates chm directory listing entries in single chm segment
-     * 
-     * @param dir_chunk
-     */
-    private void enumerateOneSegment(byte[] dir_chunk) throws ChmParsingException {
-//        try {
-            if (dir_chunk != null) {
-                int header_len;
-                if (startsWith(dir_chunk, ChmConstants.CHM_PMGI_MARKER)) {
-                    header_len = ChmConstants.CHM_PMGI_LEN;
-                    return; //skip PMGI
-                }
-                else if (startsWith(dir_chunk, ChmConstants.PMGL)) {
-                    header_len = ChmConstants.CHM_PMGL_LEN;
-                }
-                else {
-                    throw new ChmParsingException("Bad dir entry block.");
-                }
-
-                placeHolder = header_len;
-                //setPlaceHolder(header_len);
-                while (placeHolder > 0 && placeHolder < dir_chunk.length - PMGLheader.getFreeSpace()
-                        /*&& dir_chunk[placeHolder - 1] != 115*/) 
-                {
-                    //get entry name length
-                    int strlen = 0;// = getEncint(data);
-                    byte temp;
-                    while ((temp=dir_chunk[placeHolder++]) >= 0x80)
-                    {
-                        strlen <<= 7;
-                        strlen += temp & 0x7f;
-                    }
-
-                    strlen = (strlen << 7) + temp & 0x7f;
-                    
-                    if (strlen>dir_chunk.length) {
-                        throw new ChmParsingException("Bad data of a string length.");
-                    }
-                    
-                    DirectoryListingEntry dle = new DirectoryListingEntry();
-                    dle.setNameLength(strlen);
-                    dle.setName(new String(ChmCommons.copyOfRange(
-                                dir_chunk, placeHolder,
-                                (placeHolder + dle.getNameLength())), UTF_8));
-
-                    checkControlData(dle);
-                    checkResetTable(dle);
-                    setPlaceHolder(placeHolder
-                            + dle.getNameLength());
-
-                    /* Sets entry type */
-                    if (placeHolder < dir_chunk.length
-                            && dir_chunk[placeHolder] == 0)
-                        dle.setEntryType(ChmCommons.EntryType.UNCOMPRESSED);
-                    else
-                        dle.setEntryType(ChmCommons.EntryType.COMPRESSED);
-
-                    setPlaceHolder(placeHolder + 1);
-                    dle.setOffset(getEncint(dir_chunk));
-                    dle.setLength(getEncint(dir_chunk));
-                    getDirectoryListingEntryList().add(dle);
-                }
-                
-//                int indexWorkData = ChmCommons.indexOf(dir_chunk,
-//                        "::".getBytes(UTF_8));
-//                int indexUserData = ChmCommons.indexOf(dir_chunk,
-//                        "/".getBytes(UTF_8));
-//
-//                if (indexUserData>=0 && indexUserData < indexWorkData)
-//                    setPlaceHolder(indexUserData);
-//                else if (indexWorkData>=0) {
-//                    setPlaceHolder(indexWorkData);
-//                }
-//                else {
-//                    setPlaceHolder(indexUserData);
-//                }
-//
-//                if (placeHolder > 0 && placeHolder < dir_chunk.length - PMGLheader.getFreeSpace()
-//                        && dir_chunk[placeHolder - 1] != 115) {// #{
-//                    do {
-//                        if (dir_chunk[placeHolder - 1] > 0) {
-//                            DirectoryListingEntry dle = new DirectoryListingEntry();
-//
-//                            // two cases: 1. when dir_chunk[placeHolder -
-//                            // 1] == 0x73
-//                            // 2. when dir_chunk[placeHolder + 1] == 0x2f
-//                            doNameCheck(dir_chunk, dle);
-//
-//                            // dle.setName(new
-//                            // String(Arrays.copyOfRange(dir_chunk,
-//                            // placeHolder, (placeHolder +
-//                            // dle.getNameLength()))));
-//                            dle.setName(new String(ChmCommons.copyOfRange(
-//                                    dir_chunk, placeHolder,
-//                                    (placeHolder + dle.getNameLength())), UTF_8));
-//                            checkControlData(dle);
-//                            checkResetTable(dle);
-//                            setPlaceHolder(placeHolder
-//                                    + dle.getNameLength());
-//
-//                            /* Sets entry type */
-//                            if (placeHolder < dir_chunk.length
-//                                    && dir_chunk[placeHolder] == 0)
-//                                dle.setEntryType(ChmCommons.EntryType.UNCOMPRESSED);
-//                            else
-//                                dle.setEntryType(ChmCommons.EntryType.COMPRESSED);
-//
-//                            setPlaceHolder(placeHolder + 1);
-//                            dle.setOffset(getEncint(dir_chunk));
-//                            dle.setLength(getEncint(dir_chunk));
-//                            getDirectoryListingEntryList().add(dle);
-//                        } else
-//                            setPlaceHolder(placeHolder + 1);
-//
-//                    } while (nextEntry(dir_chunk));
-//                }
-            }
-
-//        } catch (Exception e) {
-//            e.printStackTrace();
-//        }
-    }
-
-
-    /**
-     * Returns encrypted integer
-     * 
-     * @param data_chunk
-     * 
-     * @return
-     */
-    private int getEncint(byte[] data_chunk) {
-        byte ob;
-        BigInteger bi = BigInteger.ZERO;
-        byte[] nb = new byte[1];
-
-        if (placeHolder < data_chunk.length) {
-            while ((ob = data_chunk[placeHolder]) < 0) {
-                nb[0] = (byte) ((ob & 0x7f));
-                bi = bi.shiftLeft(7).add(new BigInteger(nb));
-                setPlaceHolder(placeHolder + 1);
-            }
-            nb[0] = (byte) ((ob & 0x7f));
-            bi = bi.shiftLeft(7).add(new BigInteger(nb));
-            setPlaceHolder(placeHolder + 1);
-        }
-        return bi.intValue();
-    }
-
-    /**
-     * Sets chm directory listing entry list
-     * 
-     * @param dlel
-     *            chm directory listing entry list
-     */
-    public void setDirectoryListingEntryList(List<DirectoryListingEntry> dlel) {
-        this.dlel = dlel;
-    }
-
-    /**
-     * Returns chm directory listing entry list
-     * 
-     * @return List<DirectoryListingEntry>
-     */
-    public List<DirectoryListingEntry> getDirectoryListingEntryList() {
-        return dlel;
-    }
-
-    /**
-     * Sets data
-     * 
-     * @param data
-     */
-    private void setData(byte[] data) {
-        this.data = data;
-    }
-
-    /**
-     * Returns data
-     * 
-     * @return
-     */
-    private byte[] getData() {
-        return data;
-    }
-
-    /**
-     * Sets data offset
-     * 
-     * @param dataOffset
-     */
-    private void setDataOffset(long dataOffset) {
-        this.dataOffset = dataOffset;
-    }
-
-    /**
-     * Returns data offset
-     * 
-     * @return dataOffset
-     */
-    public long getDataOffset() {
-        return dataOffset;
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.math.BigInteger;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Holds chm listing entries
+ */
+public class ChmDirectoryListingSet {
+    private List<DirectoryListingEntry> dlel;
+    private byte[] data;
+    private int placeHolder = -1;
+    private long dataOffset = -1;
+    private int controlDataIndex = -1;
+    private int resetTableIndex = -1;
+
+    private boolean isNotControlDataFound = true;
+    private boolean isNotResetTableFound = true;
+
+    /**
+     * Constructs chm directory listing set and immediately enumerates the
+     * directory entries found in {@code data}.
+     *
+     * @param data raw bytes to read the directory from; passed to ChmCommons.assertByteArrayNotNull
+     * @param chmItsHeader itsf header (source of the directory offset and data offset)
+     * @param chmItspHeader itsp header (source of index head, block length and header length)
+     * @throws TikaException presumably when {@code data} fails the not-null check - TODO confirm
+     */
+    public ChmDirectoryListingSet(byte[] data, ChmItsfHeader chmItsHeader,
+            ChmItspHeader chmItspHeader) throws TikaException {
+        setDirectoryListingEntryList(new ArrayList<DirectoryListingEntry>());
+        ChmCommons.assertByteArrayNotNull(data);
+        setData(data);
+        enumerateChmDirectoryListingList(chmItsHeader, chmItspHeader);
+    }
+
+    public String toString() {
+        // List dump, the platform line separator, then the entry count
+        String lineBreak = System.getProperty("line.separator");
+        return new StringBuilder()
+                .append("list:=").append(getDirectoryListingEntryList().toString()).append(lineBreak)
+                .append("number of list items:=").append(getDirectoryListingEntryList().size())
+                .toString();
+    }
+
+    /**
+     * Returns the position in the entry list of the control data entry;
+     * the value is -1 until such a position has been recorded
+     * @return control data index
+     */
+    public int getControlDataIndex() {
+        return controlDataIndex;
+    }
+
+    /**
+     * Sets control data index
+     *
+     * @param controlDataIndex position of the control data entry in the entry list
+     */
+    protected void setControlDataIndex(int controlDataIndex) {
+        this.controlDataIndex = controlDataIndex;
+    }
+
+    /**
+     * Returns the position in the entry list of the reset table entry;
+     * the value is -1 until such a position has been recorded
+     * @return reset table index
+     */
+    public int getResetTableIndex() {
+        return resetTableIndex;
+    }
+
+    /**
+     * Sets reset table index
+     *
+     * @param resetTableIndex position of the reset table entry in the entry list
+     */
+    protected void setResetTableIndex(int resetTableIndex) {
+        this.resetTableIndex = resetTableIndex;
+    }
+
+    /**
+     * Sets place holder, the read position used while decoding a directory chunk
+     *
+     * @param placeHolder new read position (index into the chunk being decoded)
+     */
+    private void setPlaceHolder(int placeHolder) {
+        this.placeHolder = placeHolder;
+    }
+
+    private ChmPmglHeader PMGLheader;
+    /**
+     * Enumerates chm directory listing entries by walking the chain of listing
+     * chunks from the ITSP index head until a chunk reports a negative next block.
+     * Any failure is caught and printed; the raw data reference is always dropped.
+     *
+     * @param chmItsHeader chm itsf header (directory offset, data offset)
+     * @param chmItspHeader chm itsp header (index head, block and header length)
+     */
+    private void enumerateChmDirectoryListingList(ChmItsfHeader chmItsHeader,
+            ChmItspHeader chmItspHeader) {
+        try {
+            int blockLen = (int) chmItspHeader.getBlock_len();
+            int dir_offset = (int) (chmItsHeader.getDirOffset() + chmItspHeader
+                    .getHeader_len());
+            setDataOffset(chmItsHeader.getDataOffset());
+
+            // Block numbers already walked; seeing one twice means the chain loops
+            java.util.Set<Integer> visited = new java.util.HashSet<Integer>();
+            /* loops over all pmgls */
+            for (int i = chmItspHeader.getIndex_head(); i >= 0; ) {
+                if (!visited.add(i)) {
+                    throw new ChmParsingException("Cyclic directory listing chain.");
+                }
+                int start = i * blockLen + dir_offset;
+                byte[] dir_chunk = ChmCommons.copyOfRange(getData(), start, start + blockLen);
+
+                PMGLheader = new ChmPmglHeader();
+                PMGLheader.parse(dir_chunk, PMGLheader);
+                enumerateOneSegment(dir_chunk);
+
+                i = PMGLheader.getBlockNext();
+            }
+        } catch (Exception e) {
+            // Best effort: entries gathered so far are kept instead of failing construction
+            e.printStackTrace();
+        } finally {
+            setData(null);
+        }
+    }
+
+    /**
+     * Records the listing position of the first entry whose name contains
+     * the control data marker.
+     * @param dle chm directory listing entry about to be appended
+     */
+    private void checkControlData(DirectoryListingEntry dle) {
+        if (!isNotControlDataFound) {
+            return; // already located; later entries are not inspected
+        }
+        if (dle.getName().contains(ChmConstants.CONTROL_DATA)) {
+            setControlDataIndex(getDirectoryListingEntryList().size());
+            isNotControlDataFound = false;
+        }
+    }
+
+    /**
+     * Records the listing position of the first entry whose name contains
+     * the reset table marker.
+     * @param dle chm directory listing entry about to be appended
+     */
+    private void checkResetTable(DirectoryListingEntry dle) {
+        if (!isNotResetTableFound) {
+            return; // already located; later entries are not inspected
+        }
+        if (dle.getName().contains(ChmConstants.RESET_TABLE)) {
+            setResetTableIndex(getDirectoryListingEntryList().size());
+            isNotResetTableFound = false;
+        }
+    }
+
+    public static final boolean startsWith(byte[] data, String prefix) {
+        for (int i=0; i<prefix.length(); i++) {
+            if (data[i]!=prefix.charAt(i)) {
+                return false;
+            }
+        }
+        
+        return true;
+    }
+    /**
+     * Enumerates chm directory listing entries in single chm segment
+     * (PMGI chunks are skipped, only PMGL chunks are decoded entry by entry).
+     * @param dir_chunk one directory chunk; null is ignored
+     */
+    private void enumerateOneSegment(byte[] dir_chunk) throws ChmParsingException {
+//        try {
+            if (dir_chunk != null) {
+                int header_len;
+                if (startsWith(dir_chunk, ChmConstants.CHM_PMGI_MARKER)) {
+                    // PMGI chunks are not enumerated here
+                    return; //skip PMGI
+                }
+                else if (startsWith(dir_chunk, ChmConstants.PMGL)) {
+                    header_len = ChmConstants.CHM_PMGL_LEN;
+                }
+                else {
+                    throw new ChmParsingException("Bad dir entry block.");
+                }
+
+                placeHolder = header_len;
+                // Entries run from the end of the header up to the chunk's free space
+                while (placeHolder > 0 && placeHolder < dir_chunk.length - PMGLheader.getFreeSpace()
+                        /*&& dir_chunk[placeHolder - 1] != 115*/) 
+                {
+                    // Name length: 7 bits per byte, most significant group first, high bit = more bytes
+                    int strlen = 0;
+                    byte temp;
+                    while ((temp=dir_chunk[placeHolder++]) < 0) // byte is signed: "< 0" tests bit 7
+                    {
+                        strlen <<= 7;
+                        strlen += temp & 0x7f;
+                    }
+                    // Parenthesised: "+" binds tighter than "&" and would mask the whole sum
+                    strlen = (strlen << 7) + (temp & 0x7f);
+                    
+                    if (strlen < 0 || strlen>dir_chunk.length) {
+                        throw new ChmParsingException("Bad data of a string length.");
+                    }
+                    
+                    DirectoryListingEntry dle = new DirectoryListingEntry();
+                    dle.setNameLength(strlen);
+                    dle.setName(new String(ChmCommons.copyOfRange(
+                                dir_chunk, placeHolder,
+                                (placeHolder + dle.getNameLength())), UTF_8));
+
+                    checkControlData(dle);
+                    checkResetTable(dle);
+                    setPlaceHolder(placeHolder
+                            + dle.getNameLength());
+
+                    /* Sets entry type */
+                    if (placeHolder < dir_chunk.length
+                            && dir_chunk[placeHolder] == 0)
+                        dle.setEntryType(ChmCommons.EntryType.UNCOMPRESSED);
+                    else
+                        dle.setEntryType(ChmCommons.EntryType.COMPRESSED);
+
+                    setPlaceHolder(placeHolder + 1);
+                    dle.setOffset(getEncint(dir_chunk));
+                    dle.setLength(getEncint(dir_chunk));
+                    getDirectoryListingEntryList().add(dle);
+                }
+                
+//                int indexWorkData = ChmCommons.indexOf(dir_chunk,
+//                        "::".getBytes(UTF_8));
+//                int indexUserData = ChmCommons.indexOf(dir_chunk,
+//                        "/".getBytes(UTF_8));
+//
+//                if (indexUserData>=0 && indexUserData < indexWorkData)
+//                    setPlaceHolder(indexUserData);
+//                else if (indexWorkData>=0) {
+//                    setPlaceHolder(indexWorkData);
+//                }
+//                else {
+//                    setPlaceHolder(indexUserData);
+//                }
+//
+//                if (placeHolder > 0 && placeHolder < dir_chunk.length - PMGLheader.getFreeSpace()
+//                        && dir_chunk[placeHolder - 1] != 115) {// #{
+//                    do {
+//                        if (dir_chunk[placeHolder - 1] > 0) {
+//                            DirectoryListingEntry dle = new DirectoryListingEntry();
+//
+//                            // two cases: 1. when dir_chunk[placeHolder -
+//                            // 1] == 0x73
+//                            // 2. when dir_chunk[placeHolder + 1] == 0x2f
+//                            doNameCheck(dir_chunk, dle);
+//
+//                            // dle.setName(new
+//                            // String(Arrays.copyOfRange(dir_chunk,
+//                            // placeHolder, (placeHolder +
+//                            // dle.getNameLength()))));
+//                            dle.setName(new String(ChmCommons.copyOfRange(
+//                                    dir_chunk, placeHolder,
+//                                    (placeHolder + dle.getNameLength())), UTF_8));
+//                            checkControlData(dle);
+//                            checkResetTable(dle);
+//                            setPlaceHolder(placeHolder
+//                                    + dle.getNameLength());
+//
+//                            /* Sets entry type */
+//                            if (placeHolder < dir_chunk.length
+//                                    && dir_chunk[placeHolder] == 0)
+//                                dle.setEntryType(ChmCommons.EntryType.UNCOMPRESSED);
+//                            else
+//                                dle.setEntryType(ChmCommons.EntryType.COMPRESSED);
+//
+//                            setPlaceHolder(placeHolder + 1);
+//                            dle.setOffset(getEncint(dir_chunk));
+//                            dle.setLength(getEncint(dir_chunk));
+//                            getDirectoryListingEntryList().add(dle);
+//                        } else
+//                            setPlaceHolder(placeHolder + 1);
+//
+//                    } while (nextEntry(dir_chunk));
+//                }
+            }
+
+//        } catch (Exception e) {
+//            e.printStackTrace();
+//        }
+    }
+
+
+    /**
+     * Reads one variable-length ("encoded") integer at the current place holder
+     * and advances past it. Each byte contributes its low 7 bits, most
+     * significant group first; a set high bit means another byte follows.
+     *
+     * @param data_chunk chunk being decoded
+     * @return decoded value (low 32 bits), or 0 if the place holder is already past the end
+     */
+    private int getEncint(byte[] data_chunk) {
+        int value = 0;
+
+        if (placeHolder < data_chunk.length) {
+            byte ob;
+            // int arithmetic wraps to the same low 32 bits the former BigInteger.intValue() gave
+            while ((ob = data_chunk[placeHolder]) < 0) {
+                value = (value << 7) + (ob & 0x7f);
+                setPlaceHolder(placeHolder + 1);
+            }
+            // final byte of the number: its high bit is clear
+            value = (value << 7) + (ob & 0x7f);
+            setPlaceHolder(placeHolder + 1);
+        }
+        return value;
+    }
+
+    /**
+     * Sets chm directory listing entry list
+     * (the reference is stored as given, no copy is made)
+     * @param dlel
+     *            chm directory listing entry list
+     */
+    public void setDirectoryListingEntryList(List<DirectoryListingEntry> dlel) {
+        this.dlel = dlel;
+    }
+
+    /**
+     * Returns chm directory listing entry list
+     * (the internal list itself, not a copy)
+     * @return {@code List<DirectoryListingEntry>}
+     */
+    public List<DirectoryListingEntry> getDirectoryListingEntryList() {
+        return dlel;
+    }
+
+    /**
+     * Sets data
+     *
+     * @param data raw bytes to enumerate, or null to drop the reference
+     */
+    private void setData(byte[] data) {
+        this.data = data;
+    }
+
+    /**
+     * Returns data
+     *
+     * @return the raw bytes; null once enumeration has finished
+     */
+    private byte[] getData() {
+        return data;
+    }
+
+    /**
+     * Sets data offset
+     *
+     * @param dataOffset data offset as reported by the itsf header
+     */
+    private void setDataOffset(long dataOffset) {
+        this.dataOffset = dataOffset;
+    }
+
+    /**
+     * Returns data offset
+     *
+     * @return dataOffset copied from the itsf header; -1 until it has been set
+     */
+    public long getDataOffset() {
+        return dataOffset;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java
index a231e14..2c4dc4e 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java
@@ -1,492 +1,492 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.accessor;
-
-import java.math.BigInteger;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.assertion.ChmAssert;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.apache.tika.parser.chm.exception.ChmParsingException;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-/**
- * The Header 0000: char[4] 'ITSF' 0004: DWORD 3 (Version number) 0008: DWORD
- * Total header length, including header section table and following data. 000C:
- * DWORD 1 (unknown) 0010: DWORD a timestamp 0014: DWORD Windows Language ID
- * 0018: GUID {7C01FD10-7BAA-11D0-9E0C-00A0-C922-E6EC} 0028: GUID
- * {7C01FD11-7BAA-11D0-9E0C-00A0-C922-E6EC} Note: a GUID is $10 bytes, arranged
- * as 1 DWORD, 2 WORDs, and 8 BYTEs. 0000: QWORD Offset of section from
- * beginning of file 0008: QWORD Length of section Following the header section
- * table is 8 bytes of additional header data. In Version 2 files, this data is
- * not there and the content section starts immediately after the directory.
- * 
- * {@link http
- * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original
- * /?show-translation-form=1}
- * 
- */
-/* structure of ITSF headers */
-public class ChmItsfHeader implements ChmAccessor<ChmItsfHeader> {
-    private static final long serialVersionUID = 2215291838533213826L;
-    private byte[] signature;
-    private int version; /* 4 */
-    private int header_len; /* 8 */
-    private int unknown_000c; /* c */
-    private long last_modified; /* 10 */
-    private long lang_id; /* 14 */
-    private byte[] dir_uuid = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 18 */
-    private byte[] stream_uuid = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 28 */
-    private long unknown_offset; /* 38 */
-    private long unknown_len; /* 40 */
-    private long dir_offset; /* 48 */
-    private long dir_len; /* 50 */
-    private long data_offset; /* 58 (Not present before V3) */
-
-    /* local usage */
-    private int dataRemained;
-    private int currentPlace = 0;
-
-    public ChmItsfHeader() {
-        signature = ChmConstants.ITSF.getBytes(UTF_8); /* 0 (ITSF) */
-    }
-
-    /**
-     * Prints the values of ChmfHeader
-     */
-    public String toString() {
-        StringBuilder sb = new StringBuilder();
-        sb.append(new String(getSignature(), UTF_8) + " ");
-        sb.append(getVersion() + " ");
-        sb.append(getHeaderLen() + " ");
-        sb.append(getUnknown_000c() + " ");
-        sb.append(getLastModified() + " ");
-        sb.append(getLangId() + " ");
-        sb.append(getDir_uuid() + " ");
-        sb.append(getStream_uuid() + " ");
-        sb.append(getUnknownOffset() + " ");
-        sb.append(getUnknownLen() + " ");
-        sb.append(getDirOffset() + " ");
-        sb.append(getDirLen() + " ");
-        sb.append(getDataOffset() + " ");
-        return sb.toString();
-    }
-
-    /**
-     * Returns a signature of itsf header
-     * 
-     * @return itsf header
-     */
-    public byte[] getSignature() {
-        return signature;
-    }
-
-    /**
-     * Sets itsf header signature
-     * 
-     * @param signature
-     */
-    protected void setSignature(byte[] signature) {
-        this.signature = signature;
-    }
-
-    /**
-     * Returns itsf header version
-     * 
-     * @return itsf version
-     */
-    public int getVersion() {
-        return version;
-    }
-
-    /**
-     * Sets itsf version
-     * 
-     * @param version
-     */
-    protected void setVersion(int version) {
-        this.version = version;
-    }
-
-    /**
-     * Returns itsf header length
-     * 
-     * @return length
-     */
-    public int getHeaderLen() {
-        return header_len;
-    }
-
-    /**
-     * Sets itsf header length
-     * 
-     * @param header_len
-     */
-    protected void setHeaderLen(int header_len) {
-        this.header_len = header_len;
-    }
-
-    /**
-     * Returns unknown_00c value
-     * 
-     * @return unknown_00c
-     */
-    public int getUnknown_000c() {
-        return unknown_000c;
-    }
-
-    /**
-     * Sets unknown_00c
-     * 
-     * @param unknown_000c
-     */
-    protected void setUnknown_000c(int unknown_000c) {
-        this.unknown_000c = unknown_000c;
-    }
-
-    /**
-     * Returns last modified date of the chm file
-     * 
-     * @return last modified date as long
-     */
-    public long getLastModified() {
-        return last_modified;
-    }
-
-    /**
-     * Sets last modified date of the chm file
-     * 
-     * @param last_modified
-     */
-    protected void setLastModified(long last_modified) {
-        this.last_modified = last_modified;
-    }
-
-    /**
-     * Returns language ID
-     * 
-     * @return language_id
-     */
-    public long getLangId() {
-        return lang_id;
-    }
-
-    /**
-     * Sets language_id
-     * 
-     * @param lang_id
-     */
-    protected void setLangId(long lang_id) {
-        this.lang_id = lang_id;
-    }
-
-    /**
-     * Returns directory uuid
-     * 
-     * @return dir_uuid
-     */
-    public byte[] getDir_uuid() {
-        return dir_uuid;
-    }
-
-    /**
-     * Sets directory uuid
-     * 
-     * @param dir_uuid
-     */
-    protected void setDir_uuid(byte[] dir_uuid) {
-        this.dir_uuid = dir_uuid;
-    }
-
-    /**
-     * Returns stream uuid
-     * 
-     * @return stream_uuid
-     */
-    public byte[] getStream_uuid() {
-        return stream_uuid;
-    }
-
-    /**
-     * Sets stream uuid
-     * 
-     * @param stream_uuid
-     */
-    protected void setStream_uuid(byte[] stream_uuid) {
-        this.stream_uuid = stream_uuid;
-    }
-
-    /**
-     * Returns unknown offset
-     * 
-     * @return unknown_offset
-     */
-    public long getUnknownOffset() {
-        return unknown_offset;
-    }
-
-    /**
-     * Sets unknown offset
-     * 
-     * @param unknown_offset
-     */
-    protected void setUnknownOffset(long unknown_offset) {
-        this.unknown_offset = unknown_offset;
-    }
-
-    /**
-     * Returns unknown length
-     * 
-     * @return unknown_length
-     */
-    public long getUnknownLen() {
-        return unknown_len;
-    }
-
-    /**
-     * Sets unknown length
-     * 
-     * @param unknown_len
-     */
-    protected void setUnknownLen(long unknown_len) {
-        this.unknown_len = unknown_len;
-    }
-
-    /**
-     * Returns directory offset
-     * 
-     * @return directory_offset
-     */
-    public long getDirOffset() {
-        return dir_offset;
-    }
-
-    /**
-     * Sets directory offset
-     * 
-     * @param dir_offset
-     */
-    protected void setDirOffset(long dir_offset) {
-        this.dir_offset = dir_offset;
-    }
-
-    /**
-     * Returns directory length
-     * 
-     * @return directory_offset
-     */
-    public long getDirLen() {
-        return dir_len;
-    }
-
-    /**
-     * Sets directory length
-     * 
-     * @param dir_len
-     */
-    protected void setDirLen(long dir_len) {
-        this.dir_len = dir_len;
-    }
-
-    /**
-     * Returns data offset
-     * 
-     * @return data_offset
-     */
-    public long getDataOffset() {
-        return data_offset;
-    }
-
-    /**
-     * Sets data offset
-     * 
-     * @param data_offset
-     */
-    protected void setDataOffset(long data_offset) {
-        this.data_offset = data_offset;
-    }
-
-    /**
-     * Copies 4 first bytes of the byte[]
-     * 
-     * @param data
-     * @param chmItsfHeader
-     * @param count
-     * @throws TikaException 
-     */
-    private void unmarshalCharArray(byte[] data, ChmItsfHeader chmItsfHeader,
-            int count) throws TikaException {
-        ChmAssert.assertChmAccessorParameters(data, chmItsfHeader, count);
-        System.arraycopy(data, 0, chmItsfHeader.signature, 0, count);
-        this.setCurrentPlace(this.getCurrentPlace() + count);
-        this.setDataRemained(this.getDataRemained() - count);
-    }
-
-    /**
-     * Copies X bytes of source byte[] to the dest byte[]
-     * 
-     * @param data
-     * @param dest
-     * @param count
-     * @return
-     */
-    private byte[] unmarshalUuid(byte[] data, byte[] dest, int count) {
-        System.arraycopy(data, this.getCurrentPlace(), dest, 0, count);
-        this.setCurrentPlace(this.getCurrentPlace() + count);
-        this.setDataRemained(this.getDataRemained() - count);
-        return dest;
-    }
-
-    /**
-     * Takes 8 bytes and reverses them
-     * 
-     * @param data
-     * @param dest
-     * @return
-     * @throws TikaException 
-     */
-    private long unmarshalUint64(byte[] data, long dest) throws TikaException{
-        byte[] temp = new byte[8];
-        int i, j;
-
-        if (8 > this.getDataRemained())
-            throw new TikaException("8 > this.getDataRemained()");
-
-        for (i = 8, j = 7; i > 0; i--) {
-            temp[j--] = data[this.getCurrentPlace()];
-            this.setCurrentPlace(this.getCurrentPlace() + 1);
-        }
-
-        dest = new BigInteger(temp).longValue();
-        this.setDataRemained(this.getDataRemained() - 8);
-        return dest;
-    }
-
-    private int unmarshalInt32(byte[] data, int dest) throws TikaException{
-        ChmAssert.assertByteArrayNotNull(data);
-
-        if (4 > this.getDataRemained())
-            throw new TikaException("4 > dataLenght");
-        dest = (data[this.getCurrentPlace()] & 0xff)
-                | (data[this.getCurrentPlace() + 1] & 0xff) << 8
-                | (data[this.getCurrentPlace() + 2] & 0xff) << 16
-                | (data[this.getCurrentPlace() + 3] & 0xff) << 24;
-
-        this.setCurrentPlace(this.getCurrentPlace() + 4);
-        this.setDataRemained(this.getDataRemained() - 4);
-        return dest;
-    }
-
-    private long unmarshalUInt32(byte[] data, long dest) throws TikaException{
-        ChmAssert.assertByteArrayNotNull(data);
-        if (4 > getDataRemained())
-            throw new TikaException("4 > dataLenght");
-        dest = data[this.getCurrentPlace()]
-                | data[this.getCurrentPlace() + 1] << 8
-                | data[this.getCurrentPlace() + 2] << 16
-                | data[this.getCurrentPlace() + 3] << 24;
-
-        setDataRemained(this.getDataRemained() - 4);
-        this.setCurrentPlace(this.getCurrentPlace() + 4);
-        return dest;
-    }
-
-    public static void main(String[] args) {
-    }
-
-    /**
-     * Sets data remained to be processed
-     * 
-     * @param dataRemained
-     */
-    private void setDataRemained(int dataRemained) {
-        this.dataRemained = dataRemained;
-    }
-
-    /**
-     * Returns data remained
-     * 
-     * @return data_remainned
-     */
-    private int getDataRemained() {
-        return dataRemained;
-    }
-
-    /**
-     * Sets current place in the byte[]
-     * 
-     * @param currentPlace
-     */
-    private void setCurrentPlace(int currentPlace) {
-        this.currentPlace = currentPlace;
-    }
-
-    /**
-     * Returns current place in the byte[]
-     * 
-     * @return current place
-     */
-    private int getCurrentPlace() {
-        return currentPlace;
-    }
-
-    // @Override
-    public void parse(byte[] data, ChmItsfHeader chmItsfHeader) throws TikaException {
-        if (data.length < ChmConstants.CHM_ITSF_V2_LEN
-                || data.length > ChmConstants.CHM_ITSF_V3_LEN)
-            throw new TikaException("we only know how to deal with the 0x58 and 0x60 byte structures");
-
-        chmItsfHeader.setDataRemained(data.length);
-        chmItsfHeader.unmarshalCharArray(data, chmItsfHeader, ChmConstants.CHM_SIGNATURE_LEN);
-        chmItsfHeader.setVersion(chmItsfHeader.unmarshalInt32(data, chmItsfHeader.getVersion()));
-        chmItsfHeader.setHeaderLen(chmItsfHeader.unmarshalInt32(data, chmItsfHeader.getHeaderLen()));
-        chmItsfHeader.setUnknown_000c(chmItsfHeader.unmarshalInt32(data, chmItsfHeader.getUnknown_000c()));
-        chmItsfHeader.setLastModified(chmItsfHeader.unmarshalUInt32(data, chmItsfHeader.getLastModified()));
-        chmItsfHeader.setLangId(chmItsfHeader.unmarshalUInt32(data, chmItsfHeader.getLangId()));
-        chmItsfHeader.setDir_uuid(chmItsfHeader.unmarshalUuid(data, chmItsfHeader.getDir_uuid(), 16));
-        chmItsfHeader.setStream_uuid(chmItsfHeader.unmarshalUuid(data, chmItsfHeader.getStream_uuid(), 16));
-        chmItsfHeader.setUnknownOffset(chmItsfHeader.unmarshalUint64(data, chmItsfHeader.getUnknownOffset()));
-        chmItsfHeader.setUnknownLen(chmItsfHeader.unmarshalUint64(data, chmItsfHeader.getUnknownLen()));
-        chmItsfHeader.setDirOffset(chmItsfHeader.unmarshalUint64(data, chmItsfHeader.getDirOffset()));
-        chmItsfHeader.setDirLen(chmItsfHeader.unmarshalUint64(data, chmItsfHeader.getDirLen()));
-        if (!new String(chmItsfHeader.getSignature(), UTF_8).equals(ChmConstants.ITSF))
-            throw new TikaException("seems not valid file");
-        if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_2) {
-            if (chmItsfHeader.getHeaderLen() < ChmConstants.CHM_ITSF_V2_LEN)
-                throw new TikaException("something wrong with header");
-        } else if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_3) {
-            if (chmItsfHeader.getHeaderLen() < ChmConstants.CHM_ITSF_V3_LEN)
-                throw new TikaException("unknown v3 header lenght");
-        } else
-            throw new ChmParsingException("unsupported chm format");
-
-        /*
-         * now, if we have a V3 structure, unmarshal the rest, otherwise,
-         * compute it
-         */
-        if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_3) {
-            if (chmItsfHeader.getDataRemained() >= 0)
-                chmItsfHeader.setDataOffset(chmItsfHeader.getDirOffset()
-                        + chmItsfHeader.getDirLen());
-            else
-                throw new TikaException("cannot set data offset, no data remained");
-        } else
-            chmItsfHeader.setDataOffset(chmItsfHeader.getDirOffset()
-                    + chmItsfHeader.getDirLen());
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.math.BigInteger;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * The Header 0000: char[4] 'ITSF' 0004: DWORD 3 (Version number) 0008: DWORD
+ * Total header length, including header section table and following data. 000C:
+ * DWORD 1 (unknown) 0010: DWORD a timestamp 0014: DWORD Windows Language ID
+ * 0018: GUID {7C01FD10-7BAA-11D0-9E0C-00A0-C922-E6EC} 0028: GUID
+ * {7C01FD11-7BAA-11D0-9E0C-00A0-C922-E6EC} Note: a GUID is $10 bytes, arranged
+ * as 1 DWORD, 2 WORDs, and 8 BYTEs. 0000: QWORD Offset of section from
+ * beginning of file 0008: QWORD Length of section Following the header section
+ * table is 8 bytes of additional header data. In Version 2 files, this data is
+ * not there and the content section starts immediately after the directory.
+ * 
+ * {@link http
+ * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original
+ * /?show-translation-form=1}
+ * 
+ */
+/* structure of ITSF headers */
+public class ChmItsfHeader implements ChmAccessor<ChmItsfHeader> {
+    private static final long serialVersionUID = 2215291838533213826L;
+    private byte[] signature;
+    private int version; /* 4 */
+    private int header_len; /* 8 */
+    private int unknown_000c; /* c */
+    private long last_modified; /* 10 */
+    private long lang_id; /* 14 */
+    private byte[] dir_uuid = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 18 */
+    private byte[] stream_uuid = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 28 */
+    private long unknown_offset; /* 38 */
+    private long unknown_len; /* 40 */
+    private long dir_offset; /* 48 */
+    private long dir_len; /* 50 */
+    private long data_offset; /* 58 (Not present before V3) */
+
+    /* local usage */
+    private int dataRemained;
+    private int currentPlace = 0;
+
+    public ChmItsfHeader() {
+        signature = ChmConstants.ITSF.getBytes(UTF_8); /* 0 (ITSF) */
+    }
+
+    /**
+     * Prints the values of ChmfHeader
+     */
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append(new String(getSignature(), UTF_8) + " ");
+        sb.append(getVersion() + " ");
+        sb.append(getHeaderLen() + " ");
+        sb.append(getUnknown_000c() + " ");
+        sb.append(getLastModified() + " ");
+        sb.append(getLangId() + " ");
+        sb.append(getDir_uuid() + " ");
+        sb.append(getStream_uuid() + " ");
+        sb.append(getUnknownOffset() + " ");
+        sb.append(getUnknownLen() + " ");
+        sb.append(getDirOffset() + " ");
+        sb.append(getDirLen() + " ");
+        sb.append(getDataOffset() + " ");
+        return sb.toString();
+    }
+
+    /**
+     * Returns a signature of itsf header
+     * 
+     * @return itsf header
+     */
+    public byte[] getSignature() {
+        return signature;
+    }
+
+    /**
+     * Sets itsf header signature
+     * 
+     * @param signature
+     */
+    protected void setSignature(byte[] signature) {
+        this.signature = signature;
+    }
+
+    /**
+     * Returns itsf header version
+     * 
+     * @return itsf version
+     */
+    public int getVersion() {
+        return version;
+    }
+
+    /**
+     * Sets itsf version
+     * 
+     * @param version
+     */
+    protected void setVersion(int version) {
+        this.version = version;
+    }
+
+    /**
+     * Returns itsf header length
+     * 
+     * @return length
+     */
+    public int getHeaderLen() {
+        return header_len;
+    }
+
+    /**
+     * Sets itsf header length
+     * 
+     * @param header_len
+     */
+    protected void setHeaderLen(int header_len) {
+        this.header_len = header_len;
+    }
+
+    /**
+     * Returns unknown_00c value
+     * 
+     * @return unknown_00c
+     */
+    public int getUnknown_000c() {
+        return unknown_000c;
+    }
+
+    /**
+     * Sets unknown_00c
+     * 
+     * @param unknown_000c
+     */
+    protected void setUnknown_000c(int unknown_000c) {
+        this.unknown_000c = unknown_000c;
+    }
+
+    /**
+     * Returns last modified date of the chm file
+     * 
+     * @return last modified date as long
+     */
+    public long getLastModified() {
+        return last_modified;
+    }
+
+    /**
+     * Sets last modified date of the chm file
+     * 
+     * @param last_modified
+     */
+    protected void setLastModified(long last_modified) {
+        this.last_modified = last_modified;
+    }
+
+    /**
+     * Returns language ID
+     * 
+     * @return language_id
+     */
+    public long getLangId() {
+        return lang_id;
+    }
+
+    /**
+     * Sets language_id
+     * 
+     * @param lang_id
+     */
+    protected void setLangId(long lang_id) {
+        this.lang_id = lang_id;
+    }
+
+    /**
+     * Returns directory uuid
+     * 
+     * @return dir_uuid
+     */
+    public byte[] getDir_uuid() {
+        return dir_uuid;
+    }
+
+    /**
+     * Sets directory uuid
+     * 
+     * @param dir_uuid
+     */
+    protected void setDir_uuid(byte[] dir_uuid) {
+        this.dir_uuid = dir_uuid;
+    }
+
+    /**
+     * Returns stream uuid
+     * 
+     * @return stream_uuid
+     */
+    public byte[] getStream_uuid() {
+        return stream_uuid;
+    }
+
+    /**
+     * Sets stream uuid
+     * 
+     * @param stream_uuid
+     */
+    protected void setStream_uuid(byte[] stream_uuid) {
+        this.stream_uuid = stream_uuid;
+    }
+
+    /**
+     * Returns unknown offset
+     * 
+     * @return unknown_offset
+     */
+    public long getUnknownOffset() {
+        return unknown_offset;
+    }
+
+    /**
+     * Sets unknown offset
+     * 
+     * @param unknown_offset
+     */
+    protected void setUnknownOffset(long unknown_offset) {
+        this.unknown_offset = unknown_offset;
+    }
+
+    /**
+     * Returns unknown length
+     * 
+     * @return unknown_length
+     */
+    public long getUnknownLen() {
+        return unknown_len;
+    }
+
+    /**
+     * Sets unknown length
+     * 
+     * @param unknown_len
+     */
+    protected void setUnknownLen(long unknown_len) {
+        this.unknown_len = unknown_len;
+    }
+
+    /**
+     * Returns directory offset
+     * 
+     * @return directory_offset
+     */
+    public long getDirOffset() {
+        return dir_offset;
+    }
+
+    /**
+     * Sets directory offset
+     * 
+     * @param dir_offset
+     */
+    protected void setDirOffset(long dir_offset) {
+        this.dir_offset = dir_offset;
+    }
+
+    /**
+     * Returns directory length
+     * 
+     * @return directory_offset
+     */
+    public long getDirLen() {
+        return dir_len;
+    }
+
+    /**
+     * Sets directory length
+     * 
+     * @param dir_len
+     */
+    protected void setDirLen(long dir_len) {
+        this.dir_len = dir_len;
+    }
+
+    /**
+     * Returns data offset
+     * 
+     * @return data_offset
+     */
+    public long getDataOffset() {
+        return data_offset;
+    }
+
+    /**
+     * Sets data offset
+     * 
+     * @param data_offset
+     */
+    protected void setDataOffset(long data_offset) {
+        this.data_offset = data_offset;
+    }
+
+    /**
+     * Copies 4 first bytes of the byte[]
+     * 
+     * @param data
+     * @param chmItsfHeader
+     * @param count
+     * @throws TikaException 
+     */
+    private void unmarshalCharArray(byte[] data, ChmItsfHeader chmItsfHeader,
+            int count) throws TikaException {
+        ChmAssert.assertChmAccessorParameters(data, chmItsfHeader, count);
+        System.arraycopy(data, 0, chmItsfHeader.signature, 0, count);
+        this.setCurrentPlace(this.getCurrentPlace() + count);
+        this.setDataRemained(this.getDataRemained() - count);
+    }
+
+    /**
+     * Copies X bytes of source byte[] to the dest byte[]
+     * 
+     * @param data
+     * @param dest
+     * @param count
+     * @return
+     */
+    private byte[] unmarshalUuid(byte[] data, byte[] dest, int count) {
+        System.arraycopy(data, this.getCurrentPlace(), dest, 0, count);
+        this.setCurrentPlace(this.getCurrentPlace() + count);
+        this.setDataRemained(this.getDataRemained() - count);
+        return dest;
+    }
+
+    /**
+     * Takes 8 bytes and reverses them
+     * 
+     * @param data
+     * @param dest
+     * @return
+     * @throws TikaException 
+     */
+    private long unmarshalUint64(byte[] data, long dest) throws TikaException{
+        byte[] temp = new byte[8];
+        int i, j;
+
+        if (8 > this.getDataRemained())
+            throw new TikaException("8 > this.getDataRemained()");
+
+        for (i = 8, j = 7; i > 0; i--) {
+            temp[j--] = data[this.getCurrentPlace()];
+            this.setCurrentPlace(this.getCurrentPlace() + 1);
+        }
+
+        dest = new BigInteger(temp).longValue();
+        this.setDataRemained(this.getDataRemained() - 8);
+        return dest;
+    }
+
+    private int unmarshalInt32(byte[] data, int dest) throws TikaException{
+        ChmAssert.assertByteArrayNotNull(data);
+
+        if (4 > this.getDataRemained())
+            throw new TikaException("4 > dataLenght");
+        dest = (data[this.getCurrentPlace()] & 0xff)
+                | (data[this.getCurrentPlace() + 1] & 0xff) << 8
+                | (data[this.getCurrentPlace() + 2] & 0xff) << 16
+                | (data[this.getCurrentPlace() + 3] & 0xff) << 24;
+
+        this.setCurrentPlace(this.getCurrentPlace() + 4);
+        this.setDataRemained(this.getDataRemained() - 4);
+        return dest;
+    }
+
+    private long unmarshalUInt32(byte[] data, long dest) throws TikaException{
+        ChmAssert.assertByteArrayNotNull(data);
+        if (4 > getDataRemained())
+            throw new TikaException("4 > dataLenght");
+        dest = data[this.getCurrentPlace()]
+                | data[this.getCurrentPlace() + 1] << 8
+                | data[this.getCurrentPlace() + 2] << 16
+                | data[this.getCurrentPlace() + 3] << 24;
+
+        setDataRemained(this.getDataRemained() - 4);
+        this.setCurrentPlace(this.getCurrentPlace() + 4);
+        return dest;
+    }
+
+    public static void main(String[] args) {
+    }
+
+    /**
+     * Sets data remained to be processed
+     * 
+     * @param dataRemained
+     */
+    private void setDataRemained(int dataRemained) {
+        this.dataRemained = dataRemained;
+    }
+
+    /**
+     * Returns data remained
+     * 
+     * @return data_remainned
+     */
+    private int getDataRemained() {
+        return dataRemained;
+    }
+
+    /**
+     * Sets current place in the byte[]
+     * 
+     * @param currentPlace
+     */
+    private void setCurrentPlace(int currentPlace) {
+        this.currentPlace = currentPlace;
+    }
+
+    /**
+     * Returns current place in the byte[]
+     * 
+     * @return current place
+     */
+    private int getCurrentPlace() {
+        return currentPlace;
+    }
+
+    // @Override
+    public void parse(byte[] data, ChmItsfHeader chmItsfHeader) throws TikaException {
+        if (data.length < ChmConstants.CHM_ITSF_V2_LEN
+                || data.length > ChmConstants.CHM_ITSF_V3_LEN)
+            throw new TikaException("we only know how to deal with the 0x58 and 0x60 byte structures");
+
+        chmItsfHeader.setDataRemained(data.length);
+        chmItsfHeader.unmarshalCharArray(data, chmItsfHeader, ChmConstants.CHM_SIGNATURE_LEN);
+        chmItsfHeader.setVersion(chmItsfHeader.unmarshalInt32(data, chmItsfHeader.getVersion()));
+        chmItsfHeader.setHeaderLen(chmItsfHeader.unmarshalInt32(data, chmItsfHeader.getHeaderLen()));
+        chmItsfHeader.setUnknown_000c(chmItsfHeader.unmarshalInt32(data, chmItsfHeader.getUnknown_000c()));
+        chmItsfHeader.setLastModified(chmItsfHeader.unmarshalUInt32(data, chmItsfHeader.getLastModified()));
+        chmItsfHeader.setLangId(chmItsfHeader.unmarshalUInt32(data, chmItsfHeader.getLangId()));
+        chmItsfHeader.setDir_uuid(chmItsfHeader.unmarshalUuid(data, chmItsfHeader.getDir_uuid(), 16));
+        chmItsfHeader.setStream_uuid(chmItsfHeader.unmarshalUuid(data, chmItsfHeader.getStream_uuid(), 16));
+        chmItsfHeader.setUnknownOffset(chmItsfHeader.unmarshalUint64(data, chmItsfHeader.getUnknownOffset()));
+        chmItsfHeader.setUnknownLen(chmItsfHeader.unmarshalUint64(data, chmItsfHeader.getUnknownLen()));
+        chmItsfHeader.setDirOffset(chmItsfHeader.unmarshalUint64(data, chmItsfHeader.getDirOffset()));
+        chmItsfHeader.setDirLen(chmItsfHeader.unmarshalUint64(data, chmItsfHeader.getDirLen()));
+        if (!new String(chmItsfHeader.getSignature(), UTF_8).equals(ChmConstants.ITSF))
+            throw new TikaException("seems not valid file");
+        if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_2) {
+            if (chmItsfHeader.getHeaderLen() < ChmConstants.CHM_ITSF_V2_LEN)
+                throw new TikaException("something wrong with header");
+        } else if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_3) {
+            if (chmItsfHeader.getHeaderLen() < ChmConstants.CHM_ITSF_V3_LEN)
+                throw new TikaException("unknown v3 header lenght");
+        } else
+            throw new ChmParsingException("unsupported chm format");
+
+        /*
+         * now, if we have a V3 structure, unmarshal the rest, otherwise,
+         * compute it
+         */
+        if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_3) {
+            if (chmItsfHeader.getDataRemained() >= 0)
+                chmItsfHeader.setDataOffset(chmItsfHeader.getDirOffset()
+                        + chmItsfHeader.getDirLen());
+            else
+                throw new TikaException("cannot set data offset, no data remained");
+        } else
+            chmItsfHeader.setDataOffset(chmItsfHeader.getDirOffset()
+                    + chmItsfHeader.getDirLen());
+    }
+}

[38/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/test/java/org/apache/tika/utils/ConcurrentUtilsTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/utils/ConcurrentUtilsTest.java b/tika-core/src/test/java/org/apache/tika/utils/ConcurrentUtilsTest.java
index ea0d195..24deb86 100644
--- a/tika-core/src/test/java/org/apache/tika/utils/ConcurrentUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/utils/ConcurrentUtilsTest.java
@@ -1,63 +1,63 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.utils;
-
-import static org.junit.Assert.*;
-
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Future;
-
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.parser.ParseContext;
-import org.junit.Before;
-import org.junit.Test;
-
-public class ConcurrentUtilsTest {
-
-    @Test
-    public void testExecuteThread() throws Exception {
-        ParseContext context = new ParseContext();
-        Future result = ConcurrentUtils.execute(context, new Runnable() {
-            
-            @Override
-            public void run() {
-                //Do nothing
-                
-            }
-        });
-        
-        assertNull(result.get());
-    }
-    
-    @Test
-    public void testExecuteExecutor() throws Exception {
-        TikaConfig config = TikaConfig.getDefaultConfig();
-        ParseContext context = new ParseContext();
-        context.set(ExecutorService.class, config.getExecutorService());
-        Future result = ConcurrentUtils.execute(context, new Runnable() {
-            
-            @Override
-            public void run() {
-                //Do nothing
-                
-            }
-        });
-        
-        assertNull(result.get());
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.utils;
+
+import static org.junit.Assert.*;
+
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.parser.ParseContext;
+import org.junit.Before;
+import org.junit.Test;
+
+public class ConcurrentUtilsTest {
+
+    @Test
+    public void testExecuteThread() throws Exception {
+        ParseContext context = new ParseContext();
+        Future result = ConcurrentUtils.execute(context, new Runnable() {
+            
+            @Override
+            public void run() {
+                //Do nothing
+                
+            }
+        });
+        
+        assertNull(result.get());
+    }
+    
+    @Test
+    public void testExecuteExecutor() throws Exception {
+        TikaConfig config = TikaConfig.getDefaultConfig();
+        ParseContext context = new ParseContext();
+        context.set(ExecutorService.class, config.getExecutorService());
+        Future result = ConcurrentUtils.execute(context, new Runnable() {
+            
+            @Override
+            public void run() {
+                //Do nothing
+                
+            }
+        });
+        
+        assertNull(result.get());
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/test/resources/META-INF/services/org.apache.tika.parser.Parser
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-core/src/test/resources/META-INF/services/org.apache.tika.parser.Parser
index a389f33..9f69aed 100644
--- a/tika-core/src/test/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-core/src/test/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -1,17 +1,17 @@
-#  Licensed to the Apache Software Foundation (ASF) under one or more
-#  contributor license agreements.  See the NOTICE file distributed with
-#  this work for additional information regarding copyright ownership.
-#  The ASF licenses this file to You under the Apache License, Version 2.0
-#  (the "License"); you may not use this file except in compliance with
-#  the License.  You may obtain a copy of the License at
-#
-#       http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-org.apache.tika.parser.external.CompositeExternalParser
-org.apache.tika.parser.mock.MockParser
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+org.apache.tika.parser.external.CompositeExternalParser
+org.apache.tika.parser.mock.MockParser

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/test/resources/org/apache/tika/config/TIKA-1762-executors.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-1762-executors.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1762-executors.xml
index 2db72d1..15551f3 100644
--- a/tika-core/src/test/resources/org/apache/tika/config/TIKA-1762-executors.xml
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1762-executors.xml
@@ -1,28 +1,28 @@
-<?xml version="1.0" encoding="UTF-8"?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-<properties>
-  <service-loader dynamic="true"/>
-  <parsers>
-    <parser class="org.apache.tika.config.DummyParser"/>
-  </parsers>
-  <executor-service class="org.apache.tika.config.DummyExecutor">
-    <core-threads>3</core-threads>
-    <max-threads>10</max-threads>
-  </executor-service>
-</properties>
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+<properties>
+  <service-loader dynamic="true"/>
+  <parsers>
+    <parser class="org.apache.tika.config.DummyParser"/>
+  </parsers>
+  <executor-service class="org.apache.tika.config.DummyExecutor">
+    <core-threads>3</core-threads>
+    <max-threads>10</max-threads>
+  </executor-service>
+</properties>

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/pom.xml b/tika-parser-bundles/pom.xml
index 4da0c98..bcaf4d1 100644
--- a/tika-parser-bundles/pom.xml
+++ b/tika-parser-bundles/pom.xml
@@ -1,176 +1,176 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
-  Licensed to the Apache Software Foundation (ASF) under one
-  or more contributor license agreements.  See the NOTICE file
-  distributed with this work for additional information
-  regarding copyright ownership.  The ASF licenses this file
-  to you under the Apache License, Version 2.0 (the
-  "License"); you may not use this file except in compliance
-  with the License.  You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing,
-  software distributed under the License is distributed on an
-  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-  KIND, either express or implied.  See the License for the
-  specific language governing permissions and limitations
-  under the License.
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0"
-    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-  
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parent</artifactId>
-    <version>2.0-SNAPSHOT</version>
-    <relativePath>../tika-parent/pom.xml</relativePath>
-  </parent>
-
-  <artifactId>tika-parser-bundles</artifactId>
-  <packaging>pom</packaging>
-  <name>Apache Tika Parser Bundles</name>
-  <url>http://tika.apache.org/</url>
-  
-  <properties>
-    <poi.version>3.13</poi.version>
-    <!-- NOTE: sync codec version with POI -->
-    <codec.version>1.9</codec.version>
-    <pdfbox.version>1.8.10</pdfbox.version>
-  </properties>
-  
-  <modules>
-    <module>tika-parser-advanced-bundle</module>
-    <module>tika-parser-cad-bundle</module>
-    <module>tika-parser-code-bundle</module>
-    <module>tika-parser-crypto-bundle</module>
-    <module>tika-parser-database-bundle</module>
-    <module>tika-parser-ebook-bundle</module>
-    <module>tika-parser-journal-bundle</module>
-    <module>tika-parser-multimedia-bundle</module>
-    <module>tika-parser-office-bundle</module>
-    <module>tika-parser-package-bundle</module>
-    <module>tika-parser-pdf-bundle</module>
-    <module>tika-parser-scientific-bundle</module>
-    <module>tika-parser-text-bundle</module>
-    <module>tika-parser-web-bundle</module>
-  </modules>
-  
-  <dependencies>
-    <!-- Optional OSGi dependencies, used only when running within OSGi -->
-    <dependency>
-      <groupId>org.osgi</groupId>
-      <artifactId>org.osgi.core</artifactId>
-      <scope>provided</scope>
-      <optional>true</optional>
-    </dependency>
-    <dependency>
-      <groupId>org.osgi</groupId>
-      <artifactId>org.osgi.compendium</artifactId>
-      <scope>provided</scope>
-      <optional>true</optional>
-    </dependency>
-    <!-- Test dependencies -->
-    <dependency>
-      <groupId>junit</groupId>
-      <artifactId>junit</artifactId>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.mockito</groupId>
-      <artifactId>mockito-core</artifactId>
-      <version>1.7</version>
-      <scope>test</scope>
-    </dependency>
-     <dependency>
-      <groupId>org.ops4j.pax.exam</groupId>
-      <artifactId>pax-exam-junit4</artifactId>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.ops4j.pax.exam</groupId>
-      <artifactId>pax-exam-container-native</artifactId>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.felix</groupId>
-      <artifactId>org.apache.felix.framework</artifactId>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.ops4j.pax.exam</groupId>
-      <artifactId>pax-exam-link-assembly</artifactId>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.ops4j.pax.url</groupId>
-      <artifactId>pax-url-aether</artifactId>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.slf4j</groupId>
-      <artifactId>slf4j-simple</artifactId>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>javax.inject</groupId>
-      <artifactId>javax.inject</artifactId>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.tika</groupId>
-      <artifactId>tika-test-resources</artifactId>
-      <version>${project.version}</version>
-      <type>test-jar</type>
-      <scope>test</scope>
-    </dependency>
-  </dependencies> 
-  <build>
-    <pluginManagement>
-        <plugins>
-          <plugin>
-          <artifactId>maven-failsafe-plugin</artifactId>
-          <executions>
-            <execution>
-              <goals>
-                <goal>integration-test</goal>
-                <goal>verify</goal>
-              </goals>
-            </execution>
-          </executions>
-          <configuration>
-            <systemPropertyVariables>
-              <org.ops4j.pax.logging.DefaultServiceLog.level>
-                WARN
-              </org.ops4j.pax.logging.DefaultServiceLog.level>
-            </systemPropertyVariables>
-            <systemProperties>
-              <property>
-                <name>project.bundle.file</name>
-                <value>target/${project.build.finalName}.jar</value>
-              </property>
-            </systemProperties>
-          </configuration>
-        </plugin>
-        <plugin>
-          <artifactId>maven-assembly-plugin</artifactId>
-          <executions>
-            <execution>
-              <phase>pre-integration-test</phase>
-              <goals>
-                <goal>single</goal>
-              </goals>
-              <configuration>
-                <descriptor>test-bundles.xml</descriptor>
-                <finalName>test</finalName>
-                <attach>false</attach>
-              </configuration>
-            </execution>
-          </executions>
-        </plugin>
-      </plugins>
-    </pluginManagement>
-  </build>
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parent</artifactId>
+    <version>2.0-SNAPSHOT</version>
+    <relativePath>../tika-parent/pom.xml</relativePath>
+  </parent>
+
+  <artifactId>tika-parser-bundles</artifactId>
+  <packaging>pom</packaging>
+  <name>Apache Tika Parser Bundles</name>
+  <url>http://tika.apache.org/</url>
+  
+  <properties>
+    <poi.version>3.13</poi.version>
+    <!-- NOTE: sync codec version with POI -->
+    <codec.version>1.9</codec.version>
+    <pdfbox.version>1.8.10</pdfbox.version>
+  </properties>
+  
+  <modules>
+    <module>tika-parser-advanced-bundle</module>
+    <module>tika-parser-cad-bundle</module>
+    <module>tika-parser-code-bundle</module>
+    <module>tika-parser-crypto-bundle</module>
+    <module>tika-parser-database-bundle</module>
+    <module>tika-parser-ebook-bundle</module>
+    <module>tika-parser-journal-bundle</module>
+    <module>tika-parser-multimedia-bundle</module>
+    <module>tika-parser-office-bundle</module>
+    <module>tika-parser-package-bundle</module>
+    <module>tika-parser-pdf-bundle</module>
+    <module>tika-parser-scientific-bundle</module>
+    <module>tika-parser-text-bundle</module>
+    <module>tika-parser-web-bundle</module>
+  </modules>
+  
+  <dependencies>
+    <!-- Optional OSGi dependencies, used only when running within OSGi -->
+    <dependency>
+      <groupId>org.osgi</groupId>
+      <artifactId>org.osgi.core</artifactId>
+      <scope>provided</scope>
+      <optional>true</optional>
+    </dependency>
+    <dependency>
+      <groupId>org.osgi</groupId>
+      <artifactId>org.osgi.compendium</artifactId>
+      <scope>provided</scope>
+      <optional>true</optional>
+    </dependency>
+    <!-- Test dependencies -->
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.mockito</groupId>
+      <artifactId>mockito-core</artifactId>
+      <version>1.7</version>
+      <scope>test</scope>
+    </dependency>
+     <dependency>
+      <groupId>org.ops4j.pax.exam</groupId>
+      <artifactId>pax-exam-junit4</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.ops4j.pax.exam</groupId>
+      <artifactId>pax-exam-container-native</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.felix</groupId>
+      <artifactId>org.apache.felix.framework</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.ops4j.pax.exam</groupId>
+      <artifactId>pax-exam-link-assembly</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.ops4j.pax.url</groupId>
+      <artifactId>pax-url-aether</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-simple</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>javax.inject</groupId>
+      <artifactId>javax.inject</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika-test-resources</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
+  </dependencies> 
+  <build>
+    <pluginManagement>
+        <plugins>
+          <plugin>
+          <artifactId>maven-failsafe-plugin</artifactId>
+          <executions>
+            <execution>
+              <goals>
+                <goal>integration-test</goal>
+                <goal>verify</goal>
+              </goals>
+            </execution>
+          </executions>
+          <configuration>
+            <systemPropertyVariables>
+              <org.ops4j.pax.logging.DefaultServiceLog.level>
+                WARN
+              </org.ops4j.pax.logging.DefaultServiceLog.level>
+            </systemPropertyVariables>
+            <systemProperties>
+              <property>
+                <name>project.bundle.file</name>
+                <value>target/${project.build.finalName}.jar</value>
+              </property>
+            </systemProperties>
+          </configuration>
+        </plugin>
+        <plugin>
+          <artifactId>maven-assembly-plugin</artifactId>
+          <executions>
+            <execution>
+              <phase>pre-integration-test</phase>
+              <goals>
+                <goal>single</goal>
+              </goals>
+              <configuration>
+                <descriptor>test-bundles.xml</descriptor>
+                <finalName>test</finalName>
+                <attach>false</attach>
+              </configuration>
+            </execution>
+          </executions>
+        </plugin>
+      </plugins>
+    </pluginManagement>
+  </build>
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-advanced-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-advanced-bundle/pom.xml b/tika-parser-bundles/tika-parser-advanced-bundle/pom.xml
index 28713fa..9fd0c77 100644
--- a/tika-parser-bundles/tika-parser-advanced-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-advanced-bundle/pom.xml
@@ -1,82 +1,82 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-bundles</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-advanced-bundle</artifactId>
-  <packaging>bundle</packaging>
-  <name>Apache Tika parser advanced bundle</name>
-  <url>http://tika.apache.org/</url>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-advanced-module</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.felix</groupId>
-        <artifactId>maven-bundle-plugin</artifactId>
-        <extensions>true</extensions>
-        <configuration>
-          <instructions>
-            <Bundle-Activator>org.apache.tika.module.advanced.internal.Activator</Bundle-Activator>
-            <Embed-Dependency>
-              tika-parser-advanced-module;inline=true,
-              opennlp-tools;inline=true,
-              opennlp-maxent;inline=true,
-              commons-io;inline=true,
-              jwnl;inline=true
-            </Embed-Dependency> 
-            <Embed-Transitive>true</Embed-Transitive>
-            <Export-Package>
-              org.apache.tika.parser.ner.*,
-              org.apache.tika.parser.ner.corenlp.*,
-              org.apache.tika.parser.ner.opennlp.*,
-              org.apache.tika.parser.ner.regex.*
-            </Export-Package>
-            <Import-Package>
-              *,
-              opennlp.maxent;resolution:=optional,
-              opennlp.tools.namefind;resolution:=optional,
-              org.json;resolution:=optional,
-              org.osgi.framework;resolution:=optional,
-              net.didion.jwnl;resolution:=optional
-            </Import-Package>
-          </instructions>
-        </configuration>
-      </plugin>
-      <plugin>
-        <artifactId>maven-failsafe-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <artifactId>maven-assembly-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-bundles</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-advanced-bundle</artifactId>
+  <packaging>bundle</packaging>
+  <name>Apache Tika parser advanced bundle</name>
+  <url>http://tika.apache.org/</url>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-advanced-module</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-bundle-plugin</artifactId>
+        <extensions>true</extensions>
+        <configuration>
+          <instructions>
+            <Bundle-Activator>org.apache.tika.module.advanced.internal.Activator</Bundle-Activator>
+            <Embed-Dependency>
+              tika-parser-advanced-module;inline=true,
+              opennlp-tools;inline=true,
+              opennlp-maxent;inline=true,
+              commons-io;inline=true,
+              jwnl;inline=true
+            </Embed-Dependency> 
+            <Embed-Transitive>true</Embed-Transitive>
+            <Export-Package>
+              org.apache.tika.parser.ner.*,
+              org.apache.tika.parser.ner.corenlp.*,
+              org.apache.tika.parser.ner.opennlp.*,
+              org.apache.tika.parser.ner.regex.*
+            </Export-Package>
+            <Import-Package>
+              *,
+              opennlp.maxent;resolution:=optional,
+              opennlp.tools.namefind;resolution:=optional,
+              org.json;resolution:=optional,
+              org.osgi.framework;resolution:=optional,
+              net.didion.jwnl;resolution:=optional
+            </Import-Package>
+          </instructions>
+        </configuration>
+      </plugin>
+      <plugin>
+        <artifactId>maven-failsafe-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <artifactId>maven-assembly-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-cad-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-cad-bundle/pom.xml b/tika-parser-bundles/tika-parser-cad-bundle/pom.xml
index 8570abe..3bd09cc 100644
--- a/tika-parser-bundles/tika-parser-cad-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-cad-bundle/pom.xml
@@ -1,73 +1,73 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-bundles</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-cad-bundle</artifactId>
-  <packaging>bundle</packaging>
-  <name>Apache Tika parser cad bundle</name>
-  <url>http://tika.apache.org/</url>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-cad-module</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.felix</groupId>
-        <artifactId>maven-bundle-plugin</artifactId>
-        <extensions>true</extensions>
-        <configuration>
-          <instructions>
-            <Bundle-Activator>org.apache.tika.module.cad.internal.Activator</Bundle-Activator>
-            <Embed-Dependency>
-              tika-parser-cad-module;inline=true,
-              commons-io;inline=true,
-              commons-codec;inline=true
-            </Embed-Dependency> 
-            <Embed-Transitive>true</Embed-Transitive>
-            <Export-Package>
-              org.apache.tika.parser.dwg.*,
-              org.apache.tika.parser.prt.*
-            </Export-Package>
-            <Import-Package>
-              *
-            </Import-Package>
-          </instructions>
-        </configuration>
-      </plugin>
-      <plugin>
-        <artifactId>maven-failsafe-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <artifactId>maven-assembly-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-bundles</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-cad-bundle</artifactId>
+  <packaging>bundle</packaging>
+  <name>Apache Tika parser cad bundle</name>
+  <url>http://tika.apache.org/</url>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-cad-module</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-bundle-plugin</artifactId>
+        <extensions>true</extensions>
+        <configuration>
+          <instructions>
+            <Bundle-Activator>org.apache.tika.module.cad.internal.Activator</Bundle-Activator>
+            <Embed-Dependency>
+              tika-parser-cad-module;inline=true,
+              commons-io;inline=true,
+              commons-codec;inline=true
+            </Embed-Dependency> 
+            <Embed-Transitive>true</Embed-Transitive>
+            <Export-Package>
+              org.apache.tika.parser.dwg.*,
+              org.apache.tika.parser.prt.*
+            </Export-Package>
+            <Import-Package>
+              *
+            </Import-Package>
+          </instructions>
+        </configuration>
+      </plugin>
+      <plugin>
+        <artifactId>maven-failsafe-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <artifactId>maven-assembly-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-code-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-code-bundle/pom.xml b/tika-parser-bundles/tika-parser-code-bundle/pom.xml
index fcf4757..efbcf9a 100644
--- a/tika-parser-bundles/tika-parser-code-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-code-bundle/pom.xml
@@ -1,75 +1,75 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-bundles</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-code-bundle</artifactId>
-  <packaging>bundle</packaging>
-  <name>Apache Tika parser code bundle</name>
-  <url>http://tika.apache.org/</url>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-code-module</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.felix</groupId>
-        <artifactId>maven-bundle-plugin</artifactId>
-        <extensions>true</extensions>
-        <configuration>
-          <instructions>
-            <Bundle-Activator>org.apache.tika.module.code.internal.Activator</Bundle-Activator>
-            <Embed-Dependency>
-              tika-parser-code-module;inline=true,
-              asm;inline=true,
-              tagsoup;inline=true,
-              jhighlight;inline=true,
-              commons-io;inline=true,
-              commons-codec;inline=true
-            </Embed-Dependency> 
-            <Embed-Transitive>true</Embed-Transitive>
-            <Export-Package>
-              org.apache.tika.parser.asm.*,
-              org.apache.tika.parser.code.*,
-              org.apache.tika.parser.executable.*
-            </Export-Package>
-            <Import-Package>
-              *,
-              javax.servlet;resolution:=optional,
-              javax.servlet.http;resolution:=optional
-            </Import-Package>
-          </instructions>
-        </configuration>
-      </plugin>
-      <plugin>
-        <artifactId>maven-failsafe-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <artifactId>maven-assembly-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-bundles</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-code-bundle</artifactId>
+  <packaging>bundle</packaging>
+  <name>Apache Tika parser code bundle</name>
+  <url>http://tika.apache.org/</url>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-code-module</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-bundle-plugin</artifactId>
+        <extensions>true</extensions>
+        <configuration>
+          <instructions>
+            <Bundle-Activator>org.apache.tika.module.code.internal.Activator</Bundle-Activator>
+            <Embed-Dependency>
+              tika-parser-code-module;inline=true,
+              asm;inline=true,
+              tagsoup;inline=true,
+              jhighlight;inline=true,
+              commons-io;inline=true,
+              commons-codec;inline=true
+            </Embed-Dependency> 
+            <Embed-Transitive>true</Embed-Transitive>
+            <Export-Package>
+              org.apache.tika.parser.asm.*,
+              org.apache.tika.parser.code.*,
+              org.apache.tika.parser.executable.*
+            </Export-Package>
+            <Import-Package>
+              *,
+              javax.servlet;resolution:=optional,
+              javax.servlet.http;resolution:=optional
+            </Import-Package>
+          </instructions>
+        </configuration>
+      </plugin>
+      <plugin>
+        <artifactId>maven-failsafe-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <artifactId>maven-assembly-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-crypto-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-crypto-bundle/pom.xml b/tika-parser-bundles/tika-parser-crypto-bundle/pom.xml
index 64203fe..034b1fe 100644
--- a/tika-parser-bundles/tika-parser-crypto-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-crypto-bundle/pom.xml
@@ -1,79 +1,79 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-bundles</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-crypto-bundle</artifactId>
-  <packaging>bundle</packaging>
-  <name>Apache Tika parser crypto bundle</name>
-  <url>http://tika.apache.org/</url>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-crypto-module</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.felix</groupId>
-        <artifactId>maven-bundle-plugin</artifactId>
-        <extensions>true</extensions>
-        <configuration>
-          <instructions>
-            <Bundle-Activator>org.apache.tika.module.crypto.internal.Activator</Bundle-Activator>
-            <Embed-Dependency>
-              tika-parser-crypto-module;inline=true,
-              bcmail-jdk15on;inline=true,
-              bcprov-jdk15on;inline=true,
-              bcpkix-jdk15on;inline=true,
-              commons-io;inline=true
-            </Embed-Dependency> 
-            <Embed-Transitive>true</Embed-Transitive>
-            <Export-Package>
-              org.apache.tika.parser.crypto.*,
-            </Export-Package>
-            <Import-Package>
-              *,
-              javax.mail;resolution:=optional,
-              javax.mail.internet;resolution:=optional,
-              org.bouncycastle.cert;resolution:=optional,
-              org.bouncycastle.cert.jcajce;resolution:=optional,
-              org.bouncycastle.cert.ocsp;resolution:=optional,
-              org.bouncycastle.cms.bc;resolution:=optional,
-              org.bouncycastle.operator;resolution:=optional,
-              org.bouncycastle.operator.bc;resolution:=optional,
-              org.bouncycastle.tsp;resolution:=optional
-            </Import-Package>
-          </instructions>
-        </configuration>
-      </plugin>
-      <plugin>
-        <artifactId>maven-failsafe-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <artifactId>maven-assembly-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-bundles</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-crypto-bundle</artifactId>
+  <packaging>bundle</packaging>
+  <name>Apache Tika parser crypto bundle</name>
+  <url>http://tika.apache.org/</url>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-crypto-module</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-bundle-plugin</artifactId>
+        <extensions>true</extensions>
+        <configuration>
+          <instructions>
+            <Bundle-Activator>org.apache.tika.module.crypto.internal.Activator</Bundle-Activator>
+            <Embed-Dependency>
+              tika-parser-crypto-module;inline=true,
+              bcmail-jdk15on;inline=true,
+              bcprov-jdk15on;inline=true,
+              bcpkix-jdk15on;inline=true,
+              commons-io;inline=true
+            </Embed-Dependency> 
+            <Embed-Transitive>true</Embed-Transitive>
+            <Export-Package>
+              org.apache.tika.parser.crypto.*,
+            </Export-Package>
+            <Import-Package>
+              *,
+              javax.mail;resolution:=optional,
+              javax.mail.internet;resolution:=optional,
+              org.bouncycastle.cert;resolution:=optional,
+              org.bouncycastle.cert.jcajce;resolution:=optional,
+              org.bouncycastle.cert.ocsp;resolution:=optional,
+              org.bouncycastle.cms.bc;resolution:=optional,
+              org.bouncycastle.operator;resolution:=optional,
+              org.bouncycastle.operator.bc;resolution:=optional,
+              org.bouncycastle.tsp;resolution:=optional
+            </Import-Package>
+          </instructions>
+        </configuration>
+      </plugin>
+      <plugin>
+        <artifactId>maven-failsafe-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <artifactId>maven-assembly-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-database-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-database-bundle/pom.xml b/tika-parser-bundles/tika-parser-database-bundle/pom.xml
index 972dce3..75f1dc0 100644
--- a/tika-parser-bundles/tika-parser-database-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-database-bundle/pom.xml
@@ -1,68 +1,68 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-bundles</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-database-bundle</artifactId>
-  <packaging>bundle</packaging>
-  <name>Apache Tika parser database bundle</name>
-  <url>http://tika.apache.org/</url>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-database-module</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.felix</groupId>
-        <artifactId>maven-bundle-plugin</artifactId>
-        <extensions>true</extensions>
-        <configuration>
-          <instructions>
-            <Bundle-Activator>org.apache.tika.module.database.internal.Activator</Bundle-Activator>
-            <Embed-Dependency>
-              tika-parser-database-module;inline=true,
-              commons-io;inline=true,
-            </Embed-Dependency> 
-            <Embed-Transitive>true</Embed-Transitive>
-            <Export-Package>
-              org.apache.tika.parser.jdbc.*
-            </Export-Package>
-            <Import-Package>
-              *,
-              org.sqlite;resolution:=optional
-            </Import-Package>
-          </instructions>
-        </configuration>
-      </plugin>
-      <plugin>
-        <artifactId>maven-failsafe-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <artifactId>maven-assembly-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-bundles</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-database-bundle</artifactId>
+  <packaging>bundle</packaging>
+  <name>Apache Tika parser database bundle</name>
+  <url>http://tika.apache.org/</url>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-database-module</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-bundle-plugin</artifactId>
+        <extensions>true</extensions>
+        <configuration>
+          <instructions>
+            <Bundle-Activator>org.apache.tika.module.database.internal.Activator</Bundle-Activator>
+            <Embed-Dependency>
+              tika-parser-database-module;inline=true,
+              commons-io;inline=true,
+            </Embed-Dependency> 
+            <Embed-Transitive>true</Embed-Transitive>
+            <Export-Package>
+              org.apache.tika.parser.jdbc.*
+            </Export-Package>
+            <Import-Package>
+              *,
+              org.sqlite;resolution:=optional
+            </Import-Package>
+          </instructions>
+        </configuration>
+      </plugin>
+      <plugin>
+        <artifactId>maven-failsafe-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <artifactId>maven-assembly-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-ebook-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-ebook-bundle/pom.xml b/tika-parser-bundles/tika-parser-ebook-bundle/pom.xml
index 742ec99..b7dfa7f 100644
--- a/tika-parser-bundles/tika-parser-ebook-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-ebook-bundle/pom.xml
@@ -1,72 +1,72 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-bundles</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-ebook-bundle</artifactId>
-  <packaging>bundle</packaging>
-  <name>Apache Tika parser ebook bundle</name>
-  <url>http://tika.apache.org/</url>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-ebook-module</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-text-bundle</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.felix</groupId>
-        <artifactId>maven-bundle-plugin</artifactId>
-        <extensions>true</extensions>
-        <configuration>
-          <instructions>
-            <Bundle-Activator>org.apache.tika.module.ebook.internal.Activator</Bundle-Activator>
-            <Embed-Dependency>
-              tika-parser-ebook-module;inline=true,
-              commons-io;inline=true,
-            </Embed-Dependency> 
-            <Embed-Transitive>true</Embed-Transitive>
-            <Export-Package>
-              org.apache.tika.parser.epub.*
-            </Export-Package>
-            <Import-Package>
-              *
-            </Import-Package>
-          </instructions>
-        </configuration>
-      </plugin>
-      <plugin>
-        <artifactId>maven-failsafe-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <artifactId>maven-assembly-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-bundles</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-ebook-bundle</artifactId>
+  <packaging>bundle</packaging>
+  <name>Apache Tika parser ebook bundle</name>
+  <url>http://tika.apache.org/</url>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-ebook-module</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-text-bundle</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-bundle-plugin</artifactId>
+        <extensions>true</extensions>
+        <configuration>
+          <instructions>
+            <Bundle-Activator>org.apache.tika.module.ebook.internal.Activator</Bundle-Activator>
+            <Embed-Dependency>
+              tika-parser-ebook-module;inline=true,
+              commons-io;inline=true,
+            </Embed-Dependency> 
+            <Embed-Transitive>true</Embed-Transitive>
+            <Export-Package>
+              org.apache.tika.parser.epub.*
+            </Export-Package>
+            <Import-Package>
+              *
+            </Import-Package>
+          </instructions>
+        </configuration>
+      </plugin>
+      <plugin>
+        <artifactId>maven-failsafe-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <artifactId>maven-assembly-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-journal-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-journal-bundle/pom.xml b/tika-parser-bundles/tika-parser-journal-bundle/pom.xml
index c03cb4b..b918a7a 100644
--- a/tika-parser-bundles/tika-parser-journal-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-journal-bundle/pom.xml
@@ -1,80 +1,80 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-bundles</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-journal-bundle</artifactId>
-  <packaging>bundle</packaging>
-  <name>Apache Tika parser journal bundle</name>
-  <url>http://tika.apache.org/</url>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-journal-module</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-pdf-bundle</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.felix</groupId>
-        <artifactId>maven-bundle-plugin</artifactId>
-        <extensions>true</extensions>
-        <configuration>
-          <instructions>
-            <Bundle-Activator>org.apache.tika.module.journal.internal.Activator</Bundle-Activator>
-            <Embed-Dependency>
-              tika-parser-journal-module;inline=true,
-              commons-io;inline=true,
-            </Embed-Dependency> 
-            <Embed-Transitive>true</Embed-Transitive>
-            <Export-Package>
-              org.apache.tika.parser.journal.*
-            </Export-Package>
-            <Import-Package>
-              *,
-              javax.ws.rs.core;resolution:=optional,
-              org.apache.cxf.jaxrs.client;resolution:=optional,
-              org.apache.cxf.jaxrs.ext.multipart;resolution:=optional,
-              org.json;resolution:=optional
-            </Import-Package>
-          </instructions>
-        </configuration>
-      </plugin>
-      <plugin>
-        <artifactId>maven-failsafe-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <artifactId>maven-assembly-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-bundles</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-journal-bundle</artifactId>
+  <packaging>bundle</packaging>
+  <name>Apache Tika parser journal bundle</name>
+  <url>http://tika.apache.org/</url>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-journal-module</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-pdf-bundle</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-bundle-plugin</artifactId>
+        <extensions>true</extensions>
+        <configuration>
+          <instructions>
+            <Bundle-Activator>org.apache.tika.module.journal.internal.Activator</Bundle-Activator>
+            <Embed-Dependency>
+              tika-parser-journal-module;inline=true,
+              commons-io;inline=true,
+            </Embed-Dependency> 
+            <Embed-Transitive>true</Embed-Transitive>
+            <Export-Package>
+              org.apache.tika.parser.journal.*
+            </Export-Package>
+            <Import-Package>
+              *,
+              javax.ws.rs.core;resolution:=optional,
+              org.apache.cxf.jaxrs.client;resolution:=optional,
+              org.apache.cxf.jaxrs.ext.multipart;resolution:=optional,
+              org.json;resolution:=optional
+            </Import-Package>
+          </instructions>
+        </configuration>
+      </plugin>
+      <plugin>
+        <artifactId>maven-failsafe-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <artifactId>maven-assembly-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml b/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml
index 7b596c9..ab1d1b4 100644
--- a/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml
@@ -1,85 +1,85 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-bundles</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-multimedia-bundle</artifactId>
-  <packaging>bundle</packaging>
-  <name>Apache Tika parser multimedia bundle</name>
-  <url>http://tika.apache.org/</url>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-multimedia-module</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.felix</groupId>
-        <artifactId>maven-bundle-plugin</artifactId>
-        <extensions>true</extensions>
-        <configuration>
-          <instructions>
-            <Bundle-Activator>org.apache.tika.module.multimedia.internal.Activator</Bundle-Activator>
-            <_runsystempackages>com.sun.xml.bind.marshaller,
-              com.sun.xml.internal.bind.marshaller</_runsystempackages>
-            <Embed-Dependency>
-              tika-parser-multimedia-module;inline=true,
-              tika-parser-xmp-commons;inline=true,
-              metadata-extractor;inline=true,
-              xmpcore;inline=true,
-              commons-codec;inline=true,
-              commons-io;inline=true,
-              commons-exec;inline=true,
-              jempbox;inline=true,
-              fontbox;inline=true,
-              isoparser;inline=true,
-            </Embed-Dependency>
-            <Embed-Transitive>true</Embed-Transitive>
-            <Export-Package>
-              org.apache.tika.parser.image.*,
-              org.apache.tika.parser.jpeg.*,
-              org.apache.tika.parser.audio.*,
-              org.apache.tika.parser.video.*,
-              org.apache.tika.parser.mp3.*,
-              org.apache.tika.parser.mp4.*
-            </Export-Package>
-            <Import-Package>
-              *,
-              com.adobe.xmp;resolution:=optional,
-              com.adobe.xmp.properties;resolution:=optional,
-              android.util;resolution:=optional
-            </Import-Package>
-          </instructions>
-        </configuration>
-      </plugin>
-      <plugin>
-        <artifactId>maven-failsafe-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <artifactId>maven-assembly-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-bundles</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-multimedia-bundle</artifactId>
+  <packaging>bundle</packaging>
+  <name>Apache Tika parser multimedia bundle</name>
+  <url>http://tika.apache.org/</url>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-multimedia-module</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-bundle-plugin</artifactId>
+        <extensions>true</extensions>
+        <configuration>
+          <instructions>
+            <Bundle-Activator>org.apache.tika.module.multimedia.internal.Activator</Bundle-Activator>
+            <_runsystempackages>com.sun.xml.bind.marshaller,
+              com.sun.xml.internal.bind.marshaller</_runsystempackages>
+            <Embed-Dependency>
+              tika-parser-multimedia-module;inline=true,
+              tika-parser-xmp-commons;inline=true,
+              metadata-extractor;inline=true,
+              xmpcore;inline=true,
+              commons-codec;inline=true,
+              commons-io;inline=true,
+              commons-exec;inline=true,
+              jempbox;inline=true,
+              fontbox;inline=true,
+              isoparser;inline=true,
+            </Embed-Dependency>
+            <Embed-Transitive>true</Embed-Transitive>
+            <Export-Package>
+              org.apache.tika.parser.image.*,
+              org.apache.tika.parser.jpeg.*,
+              org.apache.tika.parser.audio.*,
+              org.apache.tika.parser.video.*,
+              org.apache.tika.parser.mp3.*,
+              org.apache.tika.parser.mp4.*
+            </Export-Package>
+            <Import-Package>
+              *,
+              com.adobe.xmp;resolution:=optional,
+              com.adobe.xmp.properties;resolution:=optional,
+              android.util;resolution:=optional
+            </Import-Package>
+          </instructions>
+        </configuration>
+      </plugin>
+      <plugin>
+        <artifactId>maven-failsafe-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <artifactId>maven-assembly-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
\ No newline at end of file

[32/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java
index 98278e2..8d94c0b 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java
@@ -1,159 +1,159 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mp3;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
-import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
-import org.xml.sax.SAXException;
-
-/**
- * This is used to parse ID3 Version 2.2 Tag information from an MP3 file,
- * if available.
- *
- * @see <a href="http://id3lib.sourceforge.net/id3/id3v2-00.txt">MP3 ID3 Version 2.2 specification</a>
- */
-public class ID3v22Handler implements ID3Tags {
-    private String title;
-    private String artist;
-    private String album;
-    private String year;
-    private String composer;
-    private String genre;
-    private String trackNumber;
-    private String albumArtist;
-    private String disc;
-    private List<ID3Comment> comments = new ArrayList<ID3Comment>();
-
-    public ID3v22Handler(ID3v2Frame frame)
-            throws IOException, SAXException, TikaException {
-        RawTagIterator tags = new RawV22TagIterator(frame);
-        while (tags.hasNext()) {
-            RawTag tag = tags.next();
-            if (tag.name.equals("TT2")) {
-                title = getTagString(tag.data, 0, tag.data.length); 
-            } else if (tag.name.equals("TP1")) {
-                artist = getTagString(tag.data, 0, tag.data.length); 
-            } else if (tag.name.equals("TP2")) {
-                albumArtist = getTagString(tag.data, 0, tag.data.length); 
-            } else if (tag.name.equals("TAL")) {
-                album = getTagString(tag.data, 0, tag.data.length); 
-            } else if (tag.name.equals("TYE")) {
-                year = getTagString(tag.data, 0, tag.data.length); 
-            } else if (tag.name.equals("TCM")) {
-                composer = getTagString(tag.data, 0, tag.data.length); 
-            } else if (tag.name.equals("COM")) {
-                comments.add( getComment(tag.data, 0, tag.data.length) ); 
-            } else if (tag.name.equals("TRK")) {
-                trackNumber = getTagString(tag.data, 0, tag.data.length); 
-            } else if (tag.name.equals("TPA")) {
-                disc = getTagString(tag.data, 0, tag.data.length); 
-            } else if (tag.name.equals("TCO")) {
-                genre = extractGenre( getTagString(tag.data, 0, tag.data.length) );
-            }
-        }
-    }
-
-    private String getTagString(byte[] data, int offset, int length) {
-        return ID3v2Frame.getTagString(data, offset, length);
-    }
-    private ID3Comment getComment(byte[] data, int offset, int length) {
-        return ID3v2Frame.getComment(data, offset, length);
-    }
-    
-    protected static String extractGenre(String rawGenre) {
-       int open = rawGenre.indexOf("(");
-       int close = rawGenre.indexOf(")");
-       if (open == -1 && close == -1) {
-          return rawGenre;
-       } else if (open < close) {
-           String genreStr = rawGenre.substring(0, open).trim();
-           try {
-               int genreID = Integer.parseInt(rawGenre.substring(open+1, close));
-               return ID3Tags.GENRES[genreID];
-           } catch(ArrayIndexOutOfBoundsException invalidNum) {
-              return genreStr;
-           } catch(NumberFormatException notANum) {
-              return genreStr;
-           }
-       } else {
-          return null;
-       }
-    }
-
-    public boolean getTagsPresent() {
-        return true;
-    }
-
-    public String getTitle() {
-        return title;
-    }
-
-    public String getArtist() {
-        return artist;
-    }
-
-    public String getAlbum() {
-        return album;
-    }
-
-    public String getYear() {
-        return year;
-    }
-    
-    public String getComposer() {
-        return composer;
-    }
-
-    public List<ID3Comment> getComments() {
-        return comments;
-    }
-
-    public String getGenre() {
-        return genre;
-    }
-
-    public String getTrackNumber() {
-        return trackNumber;
-    }
-
-    public String getAlbumArtist() {
-        return albumArtist;
-    }
-
-    public String getDisc() {
-        return disc;
-    }
-
-    /**
-     * ID3v22 doesn't have compilations,
-     *  so returns null;
-     */
-    public String getCompilation() {
-        return null;
-    }
-
-    private class RawV22TagIterator extends RawTagIterator {
-        private RawV22TagIterator(ID3v2Frame frame) {
-            frame.super(3, 3, 1, 0);
-        }
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
+import org.xml.sax.SAXException;
+
+/**
+ * This is used to parse ID3 Version 2.2 Tag information from an MP3 file,
+ * if available.
+ *
+ * @see <a href="http://id3lib.sourceforge.net/id3/id3v2-00.txt">MP3 ID3 Version 2.2 specification</a>
+ */
+public class ID3v22Handler implements ID3Tags {
+    private String title;
+    private String artist;
+    private String album;
+    private String year;
+    private String composer;
+    private String genre;
+    private String trackNumber;
+    private String albumArtist;
+    private String disc;
+    private List<ID3Comment> comments = new ArrayList<ID3Comment>();
+
+    public ID3v22Handler(ID3v2Frame frame)
+            throws IOException, SAXException, TikaException {
+        RawTagIterator tags = new RawV22TagIterator(frame);
+        while (tags.hasNext()) {
+            RawTag tag = tags.next();
+            if (tag.name.equals("TT2")) {
+                title = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TP1")) {
+                artist = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TP2")) {
+                albumArtist = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TAL")) {
+                album = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TYE")) {
+                year = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TCM")) {
+                composer = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("COM")) {
+                comments.add( getComment(tag.data, 0, tag.data.length) ); 
+            } else if (tag.name.equals("TRK")) {
+                trackNumber = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TPA")) {
+                disc = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TCO")) {
+                genre = extractGenre( getTagString(tag.data, 0, tag.data.length) );
+            }
+        }
+    }
+
+    private String getTagString(byte[] data, int offset, int length) {
+        return ID3v2Frame.getTagString(data, offset, length);
+    }
+    private ID3Comment getComment(byte[] data, int offset, int length) {
+        return ID3v2Frame.getComment(data, offset, length);
+    }
+    
+    protected static String extractGenre(String rawGenre) {
+       int open = rawGenre.indexOf("(");
+       int close = rawGenre.indexOf(")");
+       if (open == -1 && close == -1) {
+          return rawGenre;
+       } else if (open < close) {
+           String genreStr = rawGenre.substring(0, open).trim();
+           try {
+               int genreID = Integer.parseInt(rawGenre.substring(open+1, close));
+               return ID3Tags.GENRES[genreID];
+           } catch(ArrayIndexOutOfBoundsException invalidNum) {
+              return genreStr;
+           } catch(NumberFormatException notANum) {
+              return genreStr;
+           }
+       } else {
+          return null;
+       }
+    }
+
+    public boolean getTagsPresent() {
+        return true;
+    }
+
+    public String getTitle() {
+        return title;
+    }
+
+    public String getArtist() {
+        return artist;
+    }
+
+    public String getAlbum() {
+        return album;
+    }
+
+    public String getYear() {
+        return year;
+    }
+    
+    public String getComposer() {
+        return composer;
+    }
+
+    public List<ID3Comment> getComments() {
+        return comments;
+    }
+
+    public String getGenre() {
+        return genre;
+    }
+
+    public String getTrackNumber() {
+        return trackNumber;
+    }
+
+    public String getAlbumArtist() {
+        return albumArtist;
+    }
+
+    public String getDisc() {
+        return disc;
+    }
+
+    /**
+     * ID3v22 doesn't have compilations,
+     *  so returns null;
+     */
+    public String getCompilation() {
+        return null;
+    }
+
+    private class RawV22TagIterator extends RawTagIterator {
+        private RawV22TagIterator(ID3v2Frame frame) {
+            frame.super(3, 3, 1, 0);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java
index 8c5386d..4b67eda 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java
@@ -1,138 +1,138 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mp3;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
-import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
-import org.xml.sax.SAXException;
-
-/**
- * This is used to parse ID3 Version 2.3 Tag information from an MP3 file,
- * if available.
- *
- * @see <a href="http://id3lib.sourceforge.net/id3/id3v2.3.0.html">MP3 ID3 Version 2.3 specification</a>
- */
-public class ID3v23Handler implements ID3Tags {
-    private String title;
-    private String artist;
-    private String album;
-    private String year;
-    private String composer;
-    private String genre;
-    private String trackNumber;
-    private String albumArtist;
-    private String disc;
-    private String compilation;
-    private List<ID3Comment> comments = new ArrayList<ID3Comment>();
-
-    public ID3v23Handler(ID3v2Frame frame)
-            throws IOException, SAXException, TikaException {
-        RawTagIterator tags = new RawV23TagIterator(frame);
-        while (tags.hasNext()) {
-            RawTag tag = tags.next();
-            if (tag.name.equals("TIT2")) {
-                title = getTagString(tag.data, 0, tag.data.length); 
-            } else if (tag.name.equals("TPE1")) {
-                artist = getTagString(tag.data, 0, tag.data.length); 
-            } else if (tag.name.equals("TPE2")) {
-                albumArtist = getTagString(tag.data, 0, tag.data.length); 
-            } else if (tag.name.equals("TALB")) {
-                album = getTagString(tag.data, 0, tag.data.length); 
-            } else if (tag.name.equals("TYER")) {
-                year = getTagString(tag.data, 0, tag.data.length); 
-            } else if (tag.name.equals("TCOM")) {
-                composer = getTagString(tag.data, 0, tag.data.length); 
-            } else if (tag.name.equals("COMM")) {
-                comments.add( getComment(tag.data, 0, tag.data.length) ); 
-            } else if (tag.name.equals("TRCK")) {
-                trackNumber = getTagString(tag.data, 0, tag.data.length); 
-            } else if (tag.name.equals("TPOS")) {
-                disc = getTagString(tag.data, 0, tag.data.length); 
-            } else if (tag.name.equals("TCMP")) {
-                compilation = getTagString(tag.data, 0, tag.data.length); 
-            } else if (tag.name.equals("TCON")) {
-                genre = ID3v22Handler.extractGenre( getTagString(tag.data, 0, tag.data.length) );
-            }
-        }
-    }
-
-    private String getTagString(byte[] data, int offset, int length) {
-        return ID3v2Frame.getTagString(data, offset, length);
-    }
-    private ID3Comment getComment(byte[] data, int offset, int length) {
-       return ID3v2Frame.getComment(data, offset, length);
-    }
-
-    public boolean getTagsPresent() {
-        return true;
-    }
-
-    public String getTitle() {
-        return title;
-    }
-
-    public String getArtist() {
-        return artist;
-    }
-
-    public String getAlbum() {
-        return album;
-    }
-
-    public String getYear() {
-        return year;
-    }
-
-    public String getComposer() {
-        return composer;
-    }
-
-    public List<ID3Comment> getComments() {
-        return comments;
-    }
-
-    public String getGenre() {
-        return genre;
-    }
-
-    public String getTrackNumber() {
-        return trackNumber;
-    }
-
-    public String getAlbumArtist() {
-        return albumArtist;
-    }
-
-    public String getDisc() {
-        return disc;
-    }
-
-    public String getCompilation() {
-        return compilation;
-    }
-
-    private class RawV23TagIterator extends RawTagIterator {
-        private RawV23TagIterator(ID3v2Frame frame) {
-            frame.super(4, 4, 1, 2);
-        }
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
+import org.xml.sax.SAXException;
+
+/**
+ * This is used to parse ID3 Version 2.3 Tag information from an MP3 file,
+ * if available.
+ *
+ * @see <a href="http://id3lib.sourceforge.net/id3/id3v2.3.0.html">MP3 ID3 Version 2.3 specification</a>
+ */
+public class ID3v23Handler implements ID3Tags {
+    private String title;
+    private String artist;
+    private String album;
+    private String year;
+    private String composer;
+    private String genre;
+    private String trackNumber;
+    private String albumArtist;
+    private String disc;
+    private String compilation;
+    private List<ID3Comment> comments = new ArrayList<ID3Comment>();
+
+    public ID3v23Handler(ID3v2Frame frame)
+            throws IOException, SAXException, TikaException {
+        RawTagIterator tags = new RawV23TagIterator(frame);
+        while (tags.hasNext()) {
+            RawTag tag = tags.next();
+            if (tag.name.equals("TIT2")) {
+                title = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TPE1")) {
+                artist = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TPE2")) {
+                albumArtist = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TALB")) {
+                album = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TYER")) {
+                year = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TCOM")) {
+                composer = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("COMM")) {
+                comments.add( getComment(tag.data, 0, tag.data.length) ); 
+            } else if (tag.name.equals("TRCK")) {
+                trackNumber = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TPOS")) {
+                disc = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TCMP")) {
+                compilation = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TCON")) {
+                genre = ID3v22Handler.extractGenre( getTagString(tag.data, 0, tag.data.length) );
+            }
+        }
+    }
+
+    private String getTagString(byte[] data, int offset, int length) {
+        return ID3v2Frame.getTagString(data, offset, length);
+    }
+    private ID3Comment getComment(byte[] data, int offset, int length) {
+       return ID3v2Frame.getComment(data, offset, length);
+    }
+
+    public boolean getTagsPresent() {
+        return true;
+    }
+
+    public String getTitle() {
+        return title;
+    }
+
+    public String getArtist() {
+        return artist;
+    }
+
+    public String getAlbum() {
+        return album;
+    }
+
+    public String getYear() {
+        return year;
+    }
+
+    public String getComposer() {
+        return composer;
+    }
+
+    public List<ID3Comment> getComments() {
+        return comments;
+    }
+
+    public String getGenre() {
+        return genre;
+    }
+
+    public String getTrackNumber() {
+        return trackNumber;
+    }
+
+    public String getAlbumArtist() {
+        return albumArtist;
+    }
+
+    public String getDisc() {
+        return disc;
+    }
+
+    public String getCompilation() {
+        return compilation;
+    }
+
+    private class RawV23TagIterator extends RawTagIterator {
+        private RawV23TagIterator(ID3v2Frame frame) {
+            frame.super(4, 4, 1, 2);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java
index 5c16937..caba928 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java
@@ -1,143 +1,143 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mp3;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
-import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
-import org.xml.sax.SAXException;
-
-/**
- * This is used to parse ID3 Version 2.4 Tag information from an MP3 file,
- * if available.
- *
- * @see <a href="http://www.id3.org/id3v2.4.0-structure">MP3 ID3 Version 2.4 specification</a>
- * @see <a href="http://www.id3.org/id3v2.4.0-frames">MP3 ID3 Version 2.4 frames/tags</a>
- */
-public class ID3v24Handler implements ID3Tags {
-    private String title;
-    private String artist;
-    private String album;
-    private String year;
-    private String composer;
-    private String genre;
-    private String trackNumber;
-    private String albumArtist;
-    private String disc;
-    private String compilation;
-    private List<ID3Comment> comments = new ArrayList<ID3Comment>();
-
-    public ID3v24Handler(ID3v2Frame frame)
-            throws IOException, SAXException, TikaException {
-        RawTagIterator tags = new RawV24TagIterator(frame);
-        while (tags.hasNext()) {
-            RawTag tag = tags.next();
-            if (tag.name.equals("TIT2")) {
-                title = getTagString(tag.data, 0, tag.data.length); 
-            } else if (tag.name.equals("TPE1")) {
-                artist = getTagString(tag.data, 0, tag.data.length); 
-            } else if (tag.name.equals("TPE2")) {
-                albumArtist = getTagString(tag.data, 0, tag.data.length); 
-            } else if (tag.name.equals("TALB")) {
-                album = getTagString(tag.data, 0, tag.data.length); 
-            } else if (tag.name.equals("TYER")) {
-                year = getTagString(tag.data, 0, tag.data.length); 
-            } else if (tag.name.equals("TDRC")) {
-               if(year == null) {
-                  year = getTagString(tag.data, 0, tag.data.length);
-               }
-            } else if (tag.name.equals("TCOM")) {
-                composer = getTagString(tag.data, 0, tag.data.length); 
-            } else if (tag.name.equals("COMM")) {
-                comments.add( getComment(tag.data, 0, tag.data.length) ); 
-            } else if (tag.name.equals("TRCK")) {
-                trackNumber = getTagString(tag.data, 0, tag.data.length); 
-            } else if (tag.name.equals("TPOS")) {
-                disc = getTagString(tag.data, 0, tag.data.length); 
-            } else if (tag.name.equals("TCMP")) {
-                compilation = getTagString(tag.data, 0, tag.data.length); 
-            } else if (tag.name.equals("TCON")) {
-               genre = ID3v22Handler.extractGenre( getTagString(tag.data, 0, tag.data.length) );
-            }
-        }
-    }
-
-    private String getTagString(byte[] data, int offset, int length) {
-        return ID3v2Frame.getTagString(data, offset, length);
-    }
-    private ID3Comment getComment(byte[] data, int offset, int length) {
-        return ID3v2Frame.getComment(data, offset, length);
-    }
-
-    public boolean getTagsPresent() {
-        return true;
-    }
-
-    public String getTitle() {
-        return title;
-    }
-
-    public String getArtist() {
-        return artist;
-    }
-
-    public String getAlbum() {
-        return album;
-    }
-
-    public String getYear() {
-        return year;
-    }
-
-    public String getComposer() {
-        return composer;
-    }
-
-    public List<ID3Comment> getComments() {
-        return comments;
-    }
-
-    public String getGenre() {
-        return genre;
-    }
-
-    public String getTrackNumber() {
-        return trackNumber;
-    }
-
-    public String getAlbumArtist() {
-        return albumArtist;
-    }
-
-    public String getDisc() {
-        return disc;
-    }
-
-    public String getCompilation() {
-        return compilation;
-    }
-
-    private class RawV24TagIterator extends RawTagIterator {
-        private RawV24TagIterator(ID3v2Frame frame) {
-            frame.super(4, 4, 1, 2);
-        }
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
+import org.xml.sax.SAXException;
+
+/**
+ * This is used to parse ID3 Version 2.4 Tag information from an MP3 file,
+ * if available.
+ *
+ * @see <a href="http://www.id3.org/id3v2.4.0-structure">MP3 ID3 Version 2.4 specification</a>
+ * @see <a href="http://www.id3.org/id3v2.4.0-frames">MP3 ID3 Version 2.4 frames/tags</a>
+ */
+public class ID3v24Handler implements ID3Tags {
+    private String title;
+    private String artist;
+    private String album;
+    private String year;
+    private String composer;
+    private String genre;
+    private String trackNumber;
+    private String albumArtist;
+    private String disc;
+    private String compilation;
+    private List<ID3Comment> comments = new ArrayList<ID3Comment>();
+
+    public ID3v24Handler(ID3v2Frame frame)
+            throws IOException, SAXException, TikaException {
+        RawTagIterator tags = new RawV24TagIterator(frame);
+        while (tags.hasNext()) {
+            RawTag tag = tags.next();
+            if (tag.name.equals("TIT2")) {
+                title = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TPE1")) {
+                artist = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TPE2")) {
+                albumArtist = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TALB")) {
+                album = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TYER")) {
+                year = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TDRC")) {
+               if(year == null) {
+                  year = getTagString(tag.data, 0, tag.data.length);
+               }
+            } else if (tag.name.equals("TCOM")) {
+                composer = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("COMM")) {
+                comments.add( getComment(tag.data, 0, tag.data.length) ); 
+            } else if (tag.name.equals("TRCK")) {
+                trackNumber = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TPOS")) {
+                disc = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TCMP")) {
+                compilation = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TCON")) {
+               genre = ID3v22Handler.extractGenre( getTagString(tag.data, 0, tag.data.length) );
+            }
+        }
+    }
+
+    private String getTagString(byte[] data, int offset, int length) {
+        return ID3v2Frame.getTagString(data, offset, length);
+    }
+    private ID3Comment getComment(byte[] data, int offset, int length) {
+        return ID3v2Frame.getComment(data, offset, length);
+    }
+
+    public boolean getTagsPresent() {
+        return true;
+    }
+
+    public String getTitle() {
+        return title;
+    }
+
+    public String getArtist() {
+        return artist;
+    }
+
+    public String getAlbum() {
+        return album;
+    }
+
+    public String getYear() {
+        return year;
+    }
+
+    public String getComposer() {
+        return composer;
+    }
+
+    public List<ID3Comment> getComments() {
+        return comments;
+    }
+
+    public String getGenre() {
+        return genre;
+    }
+
+    public String getTrackNumber() {
+        return trackNumber;
+    }
+
+    public String getAlbumArtist() {
+        return albumArtist;
+    }
+
+    public String getDisc() {
+        return disc;
+    }
+
+    public String getCompilation() {
+        return compilation;
+    }
+
+    private class RawV24TagIterator extends RawTagIterator {
+        private RawV24TagIterator(ID3v2Frame frame) {
+            frame.super(4, 4, 1, 2);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
index 458c5e2..41298dd 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
@@ -1,424 +1,424 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mp3;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.PushbackInputStream;
-import java.io.UnsupportedEncodingException;
-import java.util.Iterator;
-
-import org.apache.tika.parser.mp3.ID3Tags.ID3Comment;
-
-import static java.nio.charset.StandardCharsets.ISO_8859_1;
-
-/**
- * A frame of ID3v2 data, which is then passed to a handler to 
- * be turned into useful data.
- */
-public class ID3v2Frame implements MP3Frame {
-    private int majorVersion;
-    private int minorVersion;
-    private int flags;
-    private int length;
-    /** Excludes the header size part */
-    private byte[] extendedHeader;
-    private byte[] data;
-
-    public int getMajorVersion() {
-        return majorVersion;
-    }
-
-    public int getMinorVersion() {
-        return minorVersion;
-    }
-
-    public int getFlags() {
-        return flags;
-    }
-
-    public int getLength() {
-        return length;
-    }
-
-    public byte[] getExtendedHeader() {
-        return extendedHeader;
-    }
-
-    public byte[] getData() {
-        return data;
-    }
-
-    /**
-     * Returns the next ID3v2 Frame in
-     *  the file, or null if the next batch of data
-     *  doesn't correspond to either an ID3v2 header.
-     * If no ID3v2 frame could be detected and the passed in input stream is a
-     * {@code PushbackInputStream}, the bytes read so far are pushed back so
-     * that they can be read again.
-     * ID3v2 Frames should come before all Audio ones.
-     */
-    public static MP3Frame createFrameIfPresent(InputStream inp)
-            throws IOException {
-        int h1 = inp.read();
-        int h2 = inp.read();
-        int h3 = inp.read();
-        
-        // Is it an ID3v2 Frame? 
-        if (h1 == (int)'I' && h2 == (int)'D' && h3 == (int)'3') {
-            int majorVersion = inp.read();
-            int minorVersion = inp.read();
-            if (majorVersion == -1 || minorVersion == -1) {
-                pushBack(inp, h1, h2, h3, majorVersion, minorVersion);
-                return null;
-            }
-            return new ID3v2Frame(majorVersion, minorVersion, inp);
-        }
-
-        // Not a frame header
-        pushBack(inp, h1, h2, h3);
-        return null;
-    }
-
-    /**
-     * Pushes bytes back into the stream if possible. This method is called if
-     * no ID3v2 header could be found at the current stream position.
-     * 
-     * @param inp the input stream
-     * @param bytes the bytes to be pushed back
-     * @throws IOException if an error occurs
-     */
-    private static void pushBack(InputStream inp, int... bytes)
-            throws IOException
-    {
-        if (inp instanceof PushbackInputStream)
-        {
-            byte[] buf = new byte[bytes.length];
-            for (int i = 0; i < bytes.length; i++)
-            {
-                buf[i] = (byte) bytes[i];
-            }
-            ((PushbackInputStream) inp).unread(buf);
-        }
-    }
-
-    private ID3v2Frame(int majorVersion, int minorVersion, InputStream inp)
-            throws IOException {
-        this.majorVersion = majorVersion;
-        this.minorVersion = minorVersion;
-
-        // Get the flags and the length
-        flags = inp.read();
-        length = get7BitsInt(readFully(inp, 4), 0);
-
-        // Do we have an extended header?
-        if ((flags & 0x02) == 0x02) {
-            int size = getInt(readFully(inp, 4));
-            extendedHeader = readFully(inp, size);
-        }
-
-        // Get the frame's data, or at least as much
-        //  of it as we could do
-        data = readFully(inp, length, false);
-    }
-
-    protected static int getInt(byte[] data) {
-        return getInt(data, 0);
-    }
-
-    protected static int getInt(byte[] data, int offset) {
-        int b0 = data[offset+0] & 0xFF;
-        int b1 = data[offset+1] & 0xFF;
-        int b2 = data[offset+2] & 0xFF;
-        int b3 = data[offset+3] & 0xFF;
-        return (b0 << 24) + (b1 << 16) + (b2 << 8) + (b3 << 0);
-    }
-
-    protected static int getInt3(byte[] data, int offset) {
-        int b0 = data[offset+0] & 0xFF;
-        int b1 = data[offset+1] & 0xFF;
-        int b2 = data[offset+2] & 0xFF;
-        return (b0 << 16) + (b1 << 8) + (b2 << 0);
-    }
-
-    protected static int getInt2(byte[] data, int offset) {
-        int b0 = data[offset+0] & 0xFF;
-        int b1 = data[offset+1] & 0xFF;
-        return (b0 << 8) + (b1 << 0);
-    }
-
-    /**
-     * AKA a Synchsafe integer.
-     * 4 bytes hold a 28 bit number. The highest
-     *  bit in each byte is always 0 and always ignored.
-     */
-    protected static int get7BitsInt(byte[] data, int offset) {
-        int b0 = data[offset+0] & 0x7F;
-        int b1 = data[offset+1] & 0x7F;
-        int b2 = data[offset+2] & 0x7F;
-        int b3 = data[offset+3] & 0x7F;
-        return (b0 << 21) + (b1 << 14) + (b2 << 7) + (b3 << 0);
-    }
-
-    protected static byte[] readFully(InputStream inp, int length)
-            throws IOException {
-       return readFully(inp, length, true);
-    }
-    protected static byte[] readFully(InputStream inp, int length, boolean shortDataIsFatal)
-            throws IOException {
-        byte[] b = new byte[length];
-
-        int pos = 0;
-        int read;
-        while (pos < length) {
-            read = inp.read(b, pos, length-pos);
-            if (read == -1) {
-                if(shortDataIsFatal) {
-                   throw new IOException("Tried to read " + length + " bytes, but only " + pos + " bytes present");
-                } else {
-                   // Give them what we found
-                   // TODO Log the short read
-                   return b;
-                }
-            }
-            pos += read;
-        }
-
-        return b;
-    }
-    
-    protected static class TextEncoding {
-       public final boolean doubleByte;
-       public final String encoding;
-       private TextEncoding(String encoding, boolean doubleByte) {
-          this.doubleByte = doubleByte;
-          this.encoding = encoding;
-       }
-    }
-    protected static final TextEncoding[] encodings = new TextEncoding[] {
-          new TextEncoding("ISO-8859-1", false),
-          new TextEncoding("UTF-16", true), // With BOM
-          new TextEncoding("UTF-16BE", true), // Without BOM
-          new TextEncoding("UTF-8", false)
-    };
-
-    /**
-     * Returns the (possibly null padded) String at the given offset and
-     * length. String encoding is held in the first byte; 
-     */
-    protected static String getTagString(byte[] data, int offset, int length) {
-        int actualLength = length;
-        if (actualLength == 0) {
-            return "";
-        }
-        if (actualLength == 1 && data[offset] == 0) {
-            return "";
-        }
-
-        // Does it have an encoding flag?
-        // Detect by the first byte being sub 0x20
-        TextEncoding encoding = encodings[0];
-        byte maybeEncodingFlag = data[offset];
-        if (maybeEncodingFlag >= 0 && maybeEncodingFlag < encodings.length) {
-            offset++;
-            actualLength--;
-            encoding = encodings[maybeEncodingFlag];
-        }
-        
-        // Trim off null termination / padding (as present) 
-        while (encoding.doubleByte && actualLength >= 2 && data[offset+actualLength-1] == 0 && data[offset+actualLength-2] == 0) {
-           actualLength -= 2;
-        } 
-        while (!encoding.doubleByte && actualLength >= 1 && data[offset+actualLength-1] == 0) {
-           actualLength--;
-        }
-        if (actualLength == 0) {
-           return "";
-        }
-
-        // TIKA-1024: If it's UTF-16 (with BOM) and all we
-        // have is a naked BOM then short-circuit here
-        // (return empty string), because new String(..)
-        // gives different results on different JVMs
-        if (encoding.encoding.equals("UTF-16") && actualLength == 2 &&
-            ((data[offset] == (byte) 0xff && data[offset+1] == (byte) 0xfe) ||
-             (data[offset] == (byte) 0xfe && data[offset+1] == (byte) 0xff))) {
-          return "";
-        }
-
-        try {
-            // Build the base string
-            return new String(data, offset, actualLength, encoding.encoding);
-        } catch (UnsupportedEncodingException e) {
-            throw new RuntimeException(
-                    "Core encoding " + encoding.encoding + " is not available", e);
-        }
-    }
-    /**
-     * Builds up the ID3 comment, by parsing and extracting
-     *  the comment string parts from the given data. 
-     */
-    protected static ID3Comment getComment(byte[] data, int offset, int length) {
-       // Comments must have an encoding
-       int encodingFlag = data[offset];
-       if (encodingFlag >= 0 && encodingFlag < encodings.length) {
-          // Good, valid flag
-       } else {
-          // Invalid string
-          return null;
-       }
-       
-       TextEncoding encoding = encodings[encodingFlag];
-       
-       // First is a 3 byte language
-       String lang = getString(data, offset+1, 3);
-       
-       // After that we have [Desc]\0(\0)[Text]
-       int descStart = offset+4;
-       int textStart = -1;
-       String description = null;
-       String text = null;
-       
-       // Find where the description ends
-       try {
-          for (int i=descStart; i<offset+length; i++) {
-             if (encoding.doubleByte && data[i]==0 && data[i+1] == 0) {
-                // Handle LE vs BE on low byte text
-                if (i+2 < offset+length && data[i+1] == 0 && data[i+2] == 0) {
-                   i++;
-                }
-                textStart = i+2;
-                description = new String(data, descStart, i-descStart, encoding.encoding);
-                break;
-             }
-             if (!encoding.doubleByte && data[i]==0) {
-                textStart = i+1;
-                description = new String(data, descStart, i-descStart, encoding.encoding);
-                break;
-             }
-          }
-          
-          // Did we find the end?
-          if (textStart > -1) {
-             text = new String(data, textStart, offset+length-textStart, encoding.encoding);
-          } else {
-             // Assume everything is the text
-             text = new String(data, descStart, offset+length-descStart, encoding.encoding);
-          }
-          
-          // Return
-          return new ID3Comment(lang, description, text);
-       } catch (UnsupportedEncodingException e) {
-          throw new RuntimeException(
-                  "Core encoding " + encoding.encoding + " is not available", e);
-       }
-    }
-
-    /**
-     * Returns the String at the given
-     *  offset and length. Strings are ISO-8859-1 
-     */
-    protected static String getString(byte[] data, int offset, int length) {
-        return new String(data, offset, length, ISO_8859_1);
-    }
-
-
-    /**
-     * Iterates over id3v2 raw tags.
-     * Create an instance of this that configures the
-     *  various length and multipliers.
-     */
-    protected class RawTagIterator implements Iterator<RawTag> {
-        private int nameLength;
-        private int sizeLength;
-        private int sizeMultiplier;
-        private int flagLength;
-
-        private int offset = 0;
-
-        protected RawTagIterator(
-                int nameLength, int sizeLength, int sizeMultiplier,
-                int flagLength) {
-            this.nameLength = nameLength;
-            this.sizeLength = sizeLength;
-            this.sizeMultiplier = sizeMultiplier;
-            this.flagLength = flagLength;
-        }
-
-        public boolean hasNext() {
-            // Check for padding at the end
-            return offset < data.length && data[offset] != 0;
-        }
-
-        public RawTag next() {
-            RawTag tag = new RawTag(nameLength, sizeLength, sizeMultiplier,
-                    flagLength, data, offset);
-            offset += tag.getSize();
-            return tag;
-        }
-
-        public void remove() {
-        }
-
-    }
-
-    protected static class RawTag {
-        private int headerSize;
-        protected String name;
-        protected int flag;
-        protected byte[] data;
-
-        private RawTag(
-                int nameLength, int sizeLength, int sizeMultiplier,
-                int flagLength, byte[] frameData, int offset) {
-            headerSize = nameLength + sizeLength + flagLength;
-
-            // Name, normally 3 or 4 bytes
-            name = getString(frameData, offset, nameLength);
-
-            // Size
-            int rawSize;
-            if (sizeLength == 3) {
-                rawSize = getInt3(frameData, offset+nameLength);
-            } else {
-                rawSize = getInt(frameData, offset+nameLength);
-            }
-            int size = rawSize * sizeMultiplier;
-
-            // Flag
-            if (flagLength > 0) {
-                if (flagLength == 1) {
-                    flag = (int)frameData[offset+nameLength+sizeLength];
-                } else {
-                    flag = getInt2(frameData, offset+nameLength+sizeLength);
-                }
-            }
-
-            // Now data
-            int copyFrom = offset+nameLength+sizeLength+flagLength;
-            size = Math.max(0, Math.min(size, frameData.length-copyFrom)); // TIKA-1218, prevent negative size for malformed files.
-            data = new byte[size];
-            System.arraycopy(frameData, copyFrom, data, 0, size);
-        }
-
-        protected int getSize() {
-            return headerSize + data.length;
-        }
-
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Iterator;
+
+import org.apache.tika.parser.mp3.ID3Tags.ID3Comment;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+
+/**
+ * A frame of ID3v2 data, which is then passed to a handler to 
+ * be turned into useful data.
+ */
+public class ID3v2Frame implements MP3Frame {
+    private int majorVersion;
+    private int minorVersion;
+    private int flags;
+    private int length;
+    /** Excludes the header size part */
+    private byte[] extendedHeader;
+    private byte[] data;
+
+    public int getMajorVersion() {
+        return majorVersion;
+    }
+
+    public int getMinorVersion() {
+        return minorVersion;
+    }
+
+    public int getFlags() {
+        return flags;
+    }
+
+    public int getLength() {
+        return length;
+    }
+
+    public byte[] getExtendedHeader() {
+        return extendedHeader;
+    }
+
+    public byte[] getData() {
+        return data;
+    }
+
+    /**
+     * Returns the next ID3v2 Frame in
+     *  the file, or null if the next batch of data
+     *  doesn't correspond to either an ID3v2 header.
+     * If no ID3v2 frame could be detected and the passed in input stream is a
+     * {@code PushbackInputStream}, the bytes read so far are pushed back so
+     * that they can be read again.
+     * ID3v2 Frames should come before all Audio ones.
+     */
+    public static MP3Frame createFrameIfPresent(InputStream inp)
+            throws IOException {
+        int h1 = inp.read();
+        int h2 = inp.read();
+        int h3 = inp.read();
+        
+        // Is it an ID3v2 Frame? 
+        if (h1 == (int)'I' && h2 == (int)'D' && h3 == (int)'3') {
+            int majorVersion = inp.read();
+            int minorVersion = inp.read();
+            if (majorVersion == -1 || minorVersion == -1) {
+                pushBack(inp, h1, h2, h3, majorVersion, minorVersion);
+                return null;
+            }
+            return new ID3v2Frame(majorVersion, minorVersion, inp);
+        }
+
+        // Not a frame header
+        pushBack(inp, h1, h2, h3);
+        return null;
+    }
+
+    /**
+     * Pushes bytes back into the stream if possible. This method is called if
+     * no ID3v2 header could be found at the current stream position.
+     * 
+     * @param inp the input stream
+     * @param bytes the bytes to be pushed back
+     * @throws IOException if an error occurs
+     */
+    private static void pushBack(InputStream inp, int... bytes)
+            throws IOException
+    {
+        if (inp instanceof PushbackInputStream)
+        {
+            byte[] buf = new byte[bytes.length];
+            for (int i = 0; i < bytes.length; i++)
+            {
+                buf[i] = (byte) bytes[i];
+            }
+            ((PushbackInputStream) inp).unread(buf);
+        }
+    }
+
+    private ID3v2Frame(int majorVersion, int minorVersion, InputStream inp)
+            throws IOException {
+        this.majorVersion = majorVersion;
+        this.minorVersion = minorVersion;
+
+        // Get the flags and the length
+        flags = inp.read();
+        length = get7BitsInt(readFully(inp, 4), 0);
+
+        // Do we have an extended header?
+        if ((flags & 0x02) == 0x02) {
+            int size = getInt(readFully(inp, 4));
+            extendedHeader = readFully(inp, size);
+        }
+
+        // Get the frame's data, or at least as much
+        //  of it as is available
+        data = readFully(inp, length, false);
+    }
+
+    protected static int getInt(byte[] data) {
+        return getInt(data, 0);
+    }
+
+    protected static int getInt(byte[] data, int offset) {
+        int b0 = data[offset+0] & 0xFF;
+        int b1 = data[offset+1] & 0xFF;
+        int b2 = data[offset+2] & 0xFF;
+        int b3 = data[offset+3] & 0xFF;
+        return (b0 << 24) + (b1 << 16) + (b2 << 8) + (b3 << 0);
+    }
+
+    protected static int getInt3(byte[] data, int offset) {
+        int b0 = data[offset+0] & 0xFF;
+        int b1 = data[offset+1] & 0xFF;
+        int b2 = data[offset+2] & 0xFF;
+        return (b0 << 16) + (b1 << 8) + (b2 << 0);
+    }
+
+    protected static int getInt2(byte[] data, int offset) {
+        int b0 = data[offset+0] & 0xFF;
+        int b1 = data[offset+1] & 0xFF;
+        return (b0 << 8) + (b1 << 0);
+    }
+
+    /**
+     * AKA a Synchsafe integer.
+     * 4 bytes hold a 28 bit number. The highest
+     *  bit in each byte is always 0 and always ignored.
+     */
+    protected static int get7BitsInt(byte[] data, int offset) {
+        int b0 = data[offset+0] & 0x7F;
+        int b1 = data[offset+1] & 0x7F;
+        int b2 = data[offset+2] & 0x7F;
+        int b3 = data[offset+3] & 0x7F;
+        return (b0 << 21) + (b1 << 14) + (b2 << 7) + (b3 << 0);
+    }
+
+    protected static byte[] readFully(InputStream inp, int length)
+            throws IOException {
+       return readFully(inp, length, true);
+    }
+    protected static byte[] readFully(InputStream inp, int length, boolean shortDataIsFatal)
+            throws IOException {
+        byte[] b = new byte[length];
+
+        int pos = 0;
+        int read;
+        while (pos < length) {
+            read = inp.read(b, pos, length-pos);
+            if (read == -1) {
+                if(shortDataIsFatal) {
+                   throw new IOException("Tried to read " + length + " bytes, but only " + pos + " bytes present");
+                } else {
+                   // Give them what we found
+                   // TODO Log the short read
+                   return b;
+                }
+            }
+            pos += read;
+        }
+
+        return b;
+    }
+    
+    protected static class TextEncoding {
+       public final boolean doubleByte;
+       public final String encoding;
+       private TextEncoding(String encoding, boolean doubleByte) {
+          this.doubleByte = doubleByte;
+          this.encoding = encoding;
+       }
+    }
+    protected static final TextEncoding[] encodings = new TextEncoding[] {
+          new TextEncoding("ISO-8859-1", false),
+          new TextEncoding("UTF-16", true), // With BOM
+          new TextEncoding("UTF-16BE", true), // Without BOM
+          new TextEncoding("UTF-8", false)
+    };
+
+    /**
+     * Returns the (possibly null padded) String at the given offset and
+     * length. The string encoding flag, if present, is held in the first byte.
+     */
+    protected static String getTagString(byte[] data, int offset, int length) {
+        int actualLength = length;
+        if (actualLength == 0) {
+            return "";
+        }
+        if (actualLength == 1 && data[offset] == 0) {
+            return "";
+        }
+
+        // Does it have an encoding flag?
+        // Detect by the first byte being a valid index into encodings (0-3)
+        TextEncoding encoding = encodings[0];
+        byte maybeEncodingFlag = data[offset];
+        if (maybeEncodingFlag >= 0 && maybeEncodingFlag < encodings.length) {
+            offset++;
+            actualLength--;
+            encoding = encodings[maybeEncodingFlag];
+        }
+        
+        // Trim off null termination / padding (as present) 
+        while (encoding.doubleByte && actualLength >= 2 && data[offset+actualLength-1] == 0 && data[offset+actualLength-2] == 0) {
+           actualLength -= 2;
+        } 
+        while (!encoding.doubleByte && actualLength >= 1 && data[offset+actualLength-1] == 0) {
+           actualLength--;
+        }
+        if (actualLength == 0) {
+           return "";
+        }
+
+        // TIKA-1024: If it's UTF-16 (with BOM) and all we
+        // have is a naked BOM then short-circuit here
+        // (return empty string), because new String(..)
+        // gives different results on different JVMs
+        if (encoding.encoding.equals("UTF-16") && actualLength == 2 &&
+            ((data[offset] == (byte) 0xff && data[offset+1] == (byte) 0xfe) ||
+             (data[offset] == (byte) 0xfe && data[offset+1] == (byte) 0xff))) {
+          return "";
+        }
+
+        try {
+            // Build the base string
+            return new String(data, offset, actualLength, encoding.encoding);
+        } catch (UnsupportedEncodingException e) {
+            throw new RuntimeException(
+                    "Core encoding " + encoding.encoding + " is not available", e);
+        }
+    }
+    /**
+     * Builds up the ID3 comment, by parsing and extracting
+     *  the comment string parts from the given data. 
+     */
+    protected static ID3Comment getComment(byte[] data, int offset, int length) {
+       // Comments must have an encoding
+       int encodingFlag = data[offset];
+       if (encodingFlag >= 0 && encodingFlag < encodings.length) {
+          // Good, valid flag
+       } else {
+          // Invalid string
+          return null;
+       }
+       
+       TextEncoding encoding = encodings[encodingFlag];
+       
+       // First is a 3 byte language
+       String lang = getString(data, offset+1, 3);
+       
+       // After that we have [Desc]\0(\0)[Text]
+       int descStart = offset+4;
+       int textStart = -1;
+       String description = null;
+       String text = null;
+       
+       // Find where the description ends
+       try {
+          for (int i=descStart; i<offset+length; i++) {
+             if (encoding.doubleByte && data[i]==0 && data[i+1] == 0) {
+                // Handle LE vs BE on low byte text
+                if (i+2 < offset+length && data[i+1] == 0 && data[i+2] == 0) {
+                   i++;
+                }
+                textStart = i+2;
+                description = new String(data, descStart, i-descStart, encoding.encoding);
+                break;
+             }
+             if (!encoding.doubleByte && data[i]==0) {
+                textStart = i+1;
+                description = new String(data, descStart, i-descStart, encoding.encoding);
+                break;
+             }
+          }
+          
+          // Did we find the end?
+          if (textStart > -1) {
+             text = new String(data, textStart, offset+length-textStart, encoding.encoding);
+          } else {
+             // Assume everything is the text
+             text = new String(data, descStart, offset+length-descStart, encoding.encoding);
+          }
+          
+          // Return
+          return new ID3Comment(lang, description, text);
+       } catch (UnsupportedEncodingException e) {
+          throw new RuntimeException(
+                  "Core encoding " + encoding.encoding + " is not available", e);
+       }
+    }
+
+    /**
+     * Returns the String at the given
+     *  offset and length. Strings are ISO-8859-1 
+     */
+    protected static String getString(byte[] data, int offset, int length) {
+        return new String(data, offset, length, ISO_8859_1);
+    }
+
+
+    /**
+     * Iterates over id3v2 raw tags.
+     * Create an instance of this that configures the
+     *  various lengths and multipliers.
+     */
+    protected class RawTagIterator implements Iterator<RawTag> {
+        private int nameLength;
+        private int sizeLength;
+        private int sizeMultiplier;
+        private int flagLength;
+
+        private int offset = 0;
+
+        protected RawTagIterator(
+                int nameLength, int sizeLength, int sizeMultiplier,
+                int flagLength) {
+            this.nameLength = nameLength;
+            this.sizeLength = sizeLength;
+            this.sizeMultiplier = sizeMultiplier;
+            this.flagLength = flagLength;
+        }
+
+        public boolean hasNext() {
+            // Check for padding at the end
+            return offset < data.length && data[offset] != 0;
+        }
+
+        public RawTag next() {
+            RawTag tag = new RawTag(nameLength, sizeLength, sizeMultiplier,
+                    flagLength, data, offset);
+            offset += tag.getSize();
+            return tag;
+        }
+
+        public void remove() {
+        }
+
+    }
+
+    protected static class RawTag {
+        private int headerSize;
+        protected String name;
+        protected int flag;
+        protected byte[] data;
+
+        private RawTag(
+                int nameLength, int sizeLength, int sizeMultiplier,
+                int flagLength, byte[] frameData, int offset) {
+            headerSize = nameLength + sizeLength + flagLength;
+
+            // Name, normally 3 or 4 bytes
+            name = getString(frameData, offset, nameLength);
+
+            // Size
+            int rawSize;
+            if (sizeLength == 3) {
+                rawSize = getInt3(frameData, offset+nameLength);
+            } else {
+                rawSize = getInt(frameData, offset+nameLength);
+            }
+            int size = rawSize * sizeMultiplier;
+
+            // Flag
+            if (flagLength > 0) {
+                if (flagLength == 1) {
+                    flag = (int)frameData[offset+nameLength+sizeLength];
+                } else {
+                    flag = getInt2(frameData, offset+nameLength+sizeLength);
+                }
+            }
+
+            // Now data
+            int copyFrom = offset+nameLength+sizeLength+flagLength;
+            size = Math.max(0, Math.min(size, frameData.length-copyFrom)); // TIKA-1218, prevent negative size for malformed files.
+            data = new byte[size];
+            System.arraycopy(frameData, copyFrom, data, 0, size);
+        }
+
+        protected int getSize() {
+            return headerSize + data.length;
+        }
+
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java
index 54b9ae9..12d0f2d 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java
@@ -1,156 +1,156 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mp3;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.tika.exception.TikaException;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import static java.nio.charset.StandardCharsets.US_ASCII;
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-/**
- * This is used to parse Lyrics3 tag information
- *  from an MP3 file, if available.
- * Handles lyrics tags of up to 10kb in size.
- * Will process any ID3v1 tag data if present.
- * Ignores extended ID3v1 data in the lyrics block
- *
- * @see <a href="http://www.id3.org/Lyrics3v2">Lyrics3 v2.0 specification</a>
- */
-public class LyricsHandler {
-    boolean foundLyrics = false;
-    String lyricsText = null;
-    ID3v1Handler id3v1 = null;
-
-    public LyricsHandler(InputStream stream, ContentHandler handler)
-            throws IOException, SAXException, TikaException {
-        this(getSuffix(stream, 10240+128));
-    }
-
-    /**
-     * Looks for the Lyrics data, which will be
-     *  just before the ID3v1 data (if present),
-     *  and process it.
-     * Also sets things up for the ID3v1
-     *  processing if required.
-     * Creates from the last 128 bytes of a stream.
-     */
-    protected LyricsHandler(byte[] tagData)
-            throws IOException, SAXException, TikaException {
-        if(tagData.length < 128) {
-            return;
-        }
-
-        // Is there ID3v1 data?
-        byte[] last128 = new byte[128];
-        System.arraycopy(tagData, tagData.length-128, last128, 0, 128);
-        id3v1 = new ID3v1Handler(last128);
-
-        if(tagData.length < 137) {
-            return;
-        }
-
-        // Are there lyrics? Look for the closing Lyrics tag
-        //  at the end to decide if there is any
-        int lookat = tagData.length - 9;
-        if(id3v1.found) {
-            lookat -= 128;
-        }
-        if(tagData[lookat+0] == 'L' && tagData[lookat+1] == 'Y' && 
-                tagData[lookat+2] == 'R' && tagData[lookat+3] == 'I' &&
-                tagData[lookat+4] == 'C' && tagData[lookat+5] == 'S' &&
-                tagData[lookat+6] == '2' && tagData[lookat+7] == '0' &&
-                tagData[lookat+8] == '0') {
-            foundLyrics = true;
-
-            // The length (6 bytes) comes just before LYRICS200, and is the
-            //  size including the LYRICSBEGIN but excluding the 
-            //  length+LYRICS200 at the end.
-            int length = Integer.parseInt(
-                    new String(tagData, lookat-6, 6, UTF_8)
-            );
-
-            String lyrics = new String(
-                    tagData, lookat-length+5, length-11,
-                    US_ASCII
-            );
-
-            // Tags are a 3 letter code, 5 digit length, then data
-            int pos = 0;
-            while(pos < lyrics.length()-8) {
-                String tagName = lyrics.substring(pos, pos+3);
-                int tagLen = Integer.parseInt(
-                        lyrics.substring(pos+3, pos+8)
-                );
-                int startPos = pos + 8;
-                int endPos = startPos + tagLen;
-
-                if(tagName.equals("LYR")) {
-                    lyricsText = lyrics.substring(startPos, endPos);
-                }
-
-                pos = endPos;
-            }
-        }
-    }
-
-    public boolean hasID3v1() {
-        if(id3v1 == null || id3v1.found == false) {
-            return false;
-        }
-        return true;
-    }
-    public boolean hasLyrics() {
-        return lyricsText != null && lyricsText.length() > 0;
-    }
-
-    /**
-     * Reads and returns the last <code>length</code> bytes from the
-     * given stream.
-     * @param stream input stream
-     * @param length number of bytes from the end to read and return
-     * @return stream the <code>InputStream</code> to read from.
-     * @throws IOException if the stream could not be read from.
-     */
-    protected static byte[] getSuffix(InputStream stream, int length)
-            throws IOException {
-        byte[] buffer = new byte[2 * length];
-        int bytesInBuffer = 0;
-
-        int n = stream.read(buffer);
-        while (n != -1) {
-            bytesInBuffer += n;
-            if (bytesInBuffer == buffer.length) {
-                System.arraycopy(buffer, bytesInBuffer - length, buffer, 0, length);
-                bytesInBuffer = length;
-            }
-            n = stream.read(buffer, bytesInBuffer, buffer.length - bytesInBuffer);
-        }
-
-        if (bytesInBuffer < length) {
-            length = bytesInBuffer;
-        }
-
-        byte[] result = new byte[length];
-        System.arraycopy(buffer, bytesInBuffer - length, result, 0, length);
-        return result;
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * This is used to parse Lyrics3 tag information
+ *  from an MP3 file, if available.
+ * Handles lyrics tags of up to 10kb in size.
+ * Will process any ID3v1 tag data if present.
+ * Ignores extended ID3v1 data in the lyrics block
+ *
+ * @see <a href="http://www.id3.org/Lyrics3v2">Lyrics3 v2.0 specification</a>
+ */
+public class LyricsHandler {
+    boolean foundLyrics = false;
+    String lyricsText = null;
+    ID3v1Handler id3v1 = null;
+
+    public LyricsHandler(InputStream stream, ContentHandler handler)
+            throws IOException, SAXException, TikaException {
+        this(getSuffix(stream, 10240+128));
+    }
+
+    /**
+     * Looks for the Lyrics data, which will be
+     *  just before the ID3v1 data (if present),
+     *  and process it.
+     * Also sets things up for the ID3v1
+     *  processing if required.
+     * Creates from the trailing bytes of a stream (at least the last 128).
+     */
+    protected LyricsHandler(byte[] tagData)
+            throws IOException, SAXException, TikaException {
+        if(tagData.length < 128) {
+            return;
+        }
+
+        // Is there ID3v1 data?
+        byte[] last128 = new byte[128];
+        System.arraycopy(tagData, tagData.length-128, last128, 0, 128);
+        id3v1 = new ID3v1Handler(last128);
+
+        if(tagData.length < 137) {
+            return;
+        }
+
+        // Are there lyrics? Look for the closing Lyrics tag
+        //  at the end to decide if there is any
+        int lookat = tagData.length - 9;
+        if(id3v1.found) {
+            lookat -= 128;
+        }
+        if(tagData[lookat+0] == 'L' && tagData[lookat+1] == 'Y' && 
+                tagData[lookat+2] == 'R' && tagData[lookat+3] == 'I' &&
+                tagData[lookat+4] == 'C' && tagData[lookat+5] == 'S' &&
+                tagData[lookat+6] == '2' && tagData[lookat+7] == '0' &&
+                tagData[lookat+8] == '0') {
+            foundLyrics = true;
+
+            // The length (6 bytes) comes just before LYRICS200, and is the
+            //  size including the LYRICSBEGIN but excluding the 
+            //  length+LYRICS200 at the end.
+            int length = Integer.parseInt(
+                    new String(tagData, lookat-6, 6, UTF_8)
+            );
+
+            String lyrics = new String(
+                    tagData, lookat-length+5, length-11,
+                    US_ASCII
+            );
+
+            // Tags are a 3 letter code, 5 digit length, then data
+            int pos = 0;
+            while(pos < lyrics.length()-8) {
+                String tagName = lyrics.substring(pos, pos+3);
+                int tagLen = Integer.parseInt(
+                        lyrics.substring(pos+3, pos+8)
+                );
+                int startPos = pos + 8;
+                int endPos = startPos + tagLen;
+
+                if(tagName.equals("LYR")) {
+                    lyricsText = lyrics.substring(startPos, endPos);
+                }
+
+                pos = endPos;
+            }
+        }
+    }
+
+    public boolean hasID3v1() {
+        if(id3v1 == null || id3v1.found == false) {
+            return false;
+        }
+        return true;
+    }
+    public boolean hasLyrics() {
+        return lyricsText != null && lyricsText.length() > 0;
+    }
+
+    /**
+     * Reads and returns the last <code>length</code> bytes from the
+     * given stream.
+     * @param stream input stream
+     * @param length number of bytes from the end to read and return
+     * @return the last <code>length</code> bytes of the stream, or fewer if the stream is shorter.
+     * @throws IOException if the stream could not be read from.
+     */
+    protected static byte[] getSuffix(InputStream stream, int length)
+            throws IOException {
+        byte[] buffer = new byte[2 * length];
+        int bytesInBuffer = 0;
+
+        int n = stream.read(buffer);
+        while (n != -1) {
+            bytesInBuffer += n;
+            if (bytesInBuffer == buffer.length) {
+                System.arraycopy(buffer, bytesInBuffer - length, buffer, 0, length);
+                bytesInBuffer = length;
+            }
+            n = stream.read(buffer, bytesInBuffer, buffer.length - bytesInBuffer);
+        }
+
+        if (bytesInBuffer < length) {
+            length = bytesInBuffer;
+        }
+
+        byte[] result = new byte[length];
+        System.arraycopy(buffer, bytesInBuffer - length, result, 0, length);
+        return result;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java
index a88265f..923be8a 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java
@@ -1,25 +1,25 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mp3;
-
-
-/**
- * A frame in an MP3 file, such as ID3v2 Tags or some
- *  audio.
- */
-public interface MP3Frame {
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+
+/**
+ * A frame in an MP3 file, such as ID3v2 Tags or some
+ *  audio.
+ */
+public interface MP3Frame {
+}

[26/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java
index 226a447..97eaf46 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java
@@ -1,176 +1,176 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.accessor;
-
-import java.util.Arrays;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.assertion.ChmAssert;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.apache.tika.parser.chm.exception.ChmParsingException;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-/**
- * Description Note: not always exists An index chunk has the following format:
- * 0000: char[4] 'PMGI' 0004: DWORD Length of quickref/free area at end of
- * directory chunk 0008: Directory index entries (to quickref/free area) The
- * quickref area in an PMGI is the same as in an PMGL The format of a directory
- * index entry is as follows: BYTE: length of name BYTEs: name (UTF-8 encoded)
- * ENCINT: directory listing chunk which starts with name Encoded Integers aka
- * ENCINT An ENCINT is a variable-length integer. The high bit of each byte
- * indicates "continued to the next byte". Bytes are stored most significant to
- * least significant. So, for example, $EA $15 is (((0xEA&0x7F)<<7)|0x15) =
- * 0x3515.
- * 
- * <p>
- * Note: This class is not in use
- * 
- * {@link http://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original/?show-translation-form=1 }
- * 
- * 
- */
-public class ChmPmgiHeader implements ChmAccessor<ChmPmgiHeader> {
-    private static final long serialVersionUID = -2092282339894303701L;
-    private byte[] signature;
-    private long free_space; /* 4 */
-
-    /* local usage */
-    private int dataRemained;
-    private int currentPlace = 0;
-
-    public ChmPmgiHeader() {
-        signature = ChmConstants.CHM_PMGI_MARKER.getBytes(UTF_8); /* 0 (PMGI) */
-    }
-
-    private int getDataRemained() {
-        return dataRemained;
-    }
-
-    private void setDataRemained(int dataRemained) {
-        this.dataRemained = dataRemained;
-    }
-
-    private int getCurrentPlace() {
-        return currentPlace;
-    }
-
-    private void setCurrentPlace(int currentPlace) {
-        this.currentPlace = currentPlace;
-    }
-
-    private void unmarshalCharArray(byte[] data, ChmPmgiHeader chmPmgiHeader,
-            int count) throws ChmParsingException {
-        int index = -1;
-        ChmAssert.assertByteArrayNotNull(data);
-        ChmAssert.assertChmAccessorNotNull(chmPmgiHeader);
-        ChmAssert.assertPositiveInt(count);
-        this.setDataRemained(data.length);
-            index = ChmCommons.indexOf(data,
-                    ChmConstants.CHM_PMGI_MARKER.getBytes(UTF_8));
-
-        if (index >= 0)
-            System.arraycopy(data, index, chmPmgiHeader.getSignature(), 0, count);
-        else{
-            //Some chm documents (actually most of them) do not contain
-            //PMGI header, in this case, we just notice about it.
-        }
-        this.setCurrentPlace(this.getCurrentPlace() + count);
-        this.setDataRemained(this.getDataRemained() - count);
-    }
-
-    private long unmarshalUInt32(byte[] data, long dest) throws ChmParsingException {
-        ChmAssert.assertByteArrayNotNull(data);
-
-        if (4 > getDataRemained())
-            throw new ChmParsingException("4 > dataLenght");
-        dest = (data[this.getCurrentPlace()] & 0xff)
-                | (data[this.getCurrentPlace() + 1] & 0xff) << 8
-                | (data[this.getCurrentPlace() + 2] & 0xff) << 16
-                | (data[this.getCurrentPlace() + 3] & 0xff) << 24;
-
-        setDataRemained(this.getDataRemained() - 4);
-        this.setCurrentPlace(this.getCurrentPlace() + 4);
-        return dest;
-    }
-
-    /**
-     * Returns pmgi signature if exists
-     * 
-     * @return signature
-     */
-    public byte[] getSignature() {
-        return signature;
-    }
-
-    /**
-     * Sets pmgi signature
-     * 
-     * @param signature
-     */
-    protected void setSignature(byte[] signature) {
-        this.signature = signature;
-    }
-
-    /**
-     * Returns pmgi free space
-     * 
-     * @return free_space
-     */
-    public long getFreeSpace() {
-        return free_space;
-    }
-
-    /**
-     * Sets pmgi free space
-     * 
-     * @param free_space
-     */
-    protected void setFreeSpace(long free_space) {
-        this.free_space = free_space;
-    }
-
-    /**
-     * Returns textual representation of the pmgi header
-     */
-    public String toString() {
-        StringBuilder sb = new StringBuilder();
-        sb.append("signature:=" + new String(getSignature(), UTF_8) + ", ");
-        sb.append("free space:=" + getFreeSpace()
-                + System.getProperty("line.separator"));
-        return sb.toString();
-    }
-
-    // @Override
-    public void parse(byte[] data, ChmPmgiHeader chmPmgiHeader) throws TikaException {
-        /* we only know how to deal with a 0x8 byte structures */
-        if (data.length < ChmConstants.CHM_PMGI_LEN)
-            throw new TikaException("we only know how to deal with a 0x8 byte structures");
-
-        /* unmarshal fields */
-        chmPmgiHeader.unmarshalCharArray(data, chmPmgiHeader, ChmConstants.CHM_SIGNATURE_LEN);
-        chmPmgiHeader.setFreeSpace(chmPmgiHeader.unmarshalUInt32(data, chmPmgiHeader.getFreeSpace()));
-
-        /* check structure */
-        if (!Arrays.equals(chmPmgiHeader.getSignature(),
-                ChmConstants.CHM_PMGI_MARKER.getBytes(UTF_8)))
-            throw new TikaException(
-                    "it does not seem to be valid a PMGI signature, check ChmItsp index_root if it was -1, means no PMGI, use PMGL insted");
-
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.util.Arrays;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Description Note: not always exists An index chunk has the following format:
+ * 0000: char[4] 'PMGI' 0004: DWORD Length of quickref/free area at end of
+ * directory chunk 0008: Directory index entries (to quickref/free area) The
+ * quickref area in an PMGI is the same as in an PMGL The format of a directory
+ * index entry is as follows: BYTE: length of name BYTEs: name (UTF-8 encoded)
+ * ENCINT: directory listing chunk which starts with name Encoded Integers aka
+ * ENCINT An ENCINT is a variable-length integer. The high bit of each byte
+ * indicates "continued to the next byte". Bytes are stored most significant to
+ * least significant. So, for example, $EA $15 is (((0xEA&0x7F)<<7)|0x15) =
+ * 0x3515.
+ * 
+ * <p>
+ * Note: This class is not in use
+ * 
+ * {@link http://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original/?show-translation-form=1 }
+ * 
+ * 
+ */
+public class ChmPmgiHeader implements ChmAccessor<ChmPmgiHeader> {
+    private static final long serialVersionUID = -2092282339894303701L;
+    private byte[] signature;
+    private long free_space; /* 4 */
+
+    /* local usage */
+    private int dataRemained;
+    private int currentPlace = 0;
+
+    public ChmPmgiHeader() {
+        signature = ChmConstants.CHM_PMGI_MARKER.getBytes(UTF_8); /* 0 (PMGI) */
+    }
+
+    private int getDataRemained() {
+        return dataRemained;
+    }
+
+    private void setDataRemained(int dataRemained) {
+        this.dataRemained = dataRemained;
+    }
+
+    private int getCurrentPlace() {
+        return currentPlace;
+    }
+
+    private void setCurrentPlace(int currentPlace) {
+        this.currentPlace = currentPlace;
+    }
+
+    private void unmarshalCharArray(byte[] data, ChmPmgiHeader chmPmgiHeader,
+            int count) throws ChmParsingException {
+        int index = -1;
+        ChmAssert.assertByteArrayNotNull(data);
+        ChmAssert.assertChmAccessorNotNull(chmPmgiHeader);
+        ChmAssert.assertPositiveInt(count);
+        this.setDataRemained(data.length);
+            index = ChmCommons.indexOf(data,
+                    ChmConstants.CHM_PMGI_MARKER.getBytes(UTF_8));
+
+        if (index >= 0)
+            System.arraycopy(data, index, chmPmgiHeader.getSignature(), 0, count);
+        else{
+            //Some chm documents (actually most of them) do not contain
+            //PMGI header, in this case, we just notice about it.
+        }
+        this.setCurrentPlace(this.getCurrentPlace() + count);
+        this.setDataRemained(this.getDataRemained() - count);
+    }
+
+    private long unmarshalUInt32(byte[] data, long dest) throws ChmParsingException {
+        ChmAssert.assertByteArrayNotNull(data);
+
+        if (4 > getDataRemained())
+            throw new ChmParsingException("4 > dataLenght");
+        dest = (data[this.getCurrentPlace()] & 0xff)
+                | (data[this.getCurrentPlace() + 1] & 0xff) << 8
+                | (data[this.getCurrentPlace() + 2] & 0xff) << 16
+                | (data[this.getCurrentPlace() + 3] & 0xff) << 24;
+
+        setDataRemained(this.getDataRemained() - 4);
+        this.setCurrentPlace(this.getCurrentPlace() + 4);
+        return dest;
+    }
+
+    /**
+     * Returns pmgi signature if exists
+     * 
+     * @return signature
+     */
+    public byte[] getSignature() {
+        return signature;
+    }
+
+    /**
+     * Sets pmgi signature
+     * 
+     * @param signature
+     */
+    protected void setSignature(byte[] signature) {
+        this.signature = signature;
+    }
+
+    /**
+     * Returns pmgi free space
+     * 
+     * @return free_space
+     */
+    public long getFreeSpace() {
+        return free_space;
+    }
+
+    /**
+     * Sets pmgi free space
+     * 
+     * @param free_space
+     */
+    protected void setFreeSpace(long free_space) {
+        this.free_space = free_space;
+    }
+
+    /**
+     * Returns textual representation of the pmgi header
+     */
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("signature:=" + new String(getSignature(), UTF_8) + ", ");
+        sb.append("free space:=" + getFreeSpace()
+                + System.getProperty("line.separator"));
+        return sb.toString();
+    }
+
+    // @Override
+    public void parse(byte[] data, ChmPmgiHeader chmPmgiHeader) throws TikaException {
+        /* we only know how to deal with a 0x8 byte structures */
+        if (data.length < ChmConstants.CHM_PMGI_LEN)
+            throw new TikaException("we only know how to deal with a 0x8 byte structures");
+
+        /* unmarshal fields */
+        chmPmgiHeader.unmarshalCharArray(data, chmPmgiHeader, ChmConstants.CHM_SIGNATURE_LEN);
+        chmPmgiHeader.setFreeSpace(chmPmgiHeader.unmarshalUInt32(data, chmPmgiHeader.getFreeSpace()));
+
+        /* check structure */
+        if (!Arrays.equals(chmPmgiHeader.getSignature(),
+                ChmConstants.CHM_PMGI_MARKER.getBytes(UTF_8)))
+            throw new TikaException(
+                    "it does not seem to be valid a PMGI signature, check ChmItsp index_root if it was -1, means no PMGI, use PMGL insted");
+
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java
index 7c8a5cd..abb7175 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java
@@ -1,206 +1,206 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.accessor;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.assertion.ChmAssert;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.apache.tika.parser.chm.exception.ChmParsingException;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-/**
- * Description There are two types of directory chunks -- index chunks, and
- * listing chunks. The index chunk will be omitted if there is only one listing
- * chunk. A listing chunk has the following format: 0000: char[4] 'PMGL' 0004:
- * DWORD Length of free space and/or quickref area at end of directory chunk
- * 0008: DWORD Always 0 000C: DWORD Chunk number of previous listing chunk when
- * reading directory in sequence (-1 if this is the first listing chunk) 0010:
- * DWORD Chunk number of next listing chunk when reading directory in sequence
- * (-1 if this is the last listing chunk) 0014: Directory listing entries (to
- * quickref area) Sorted by filename; the sort is case-insensitive The quickref
- * area is written backwards from the end of the chunk. One quickref entry
- * exists for every n entries in the file, where n is calculated as 1 + (1 <<
- * quickref density). So for density = 2, n = 5 Chunklen-0002: WORD Number of
- * entries in the chunk Chunklen-0004: WORD Offset of entry n from entry 0
- * Chunklen-0008: WORD Offset of entry 2n from entry 0 Chunklen-000C: WORD
- * Offset of entry 3n from entry 0 ... The format of a directory listing entry
- * is as follows BYTE: length of name BYTEs: name (UTF-8 encoded) ENCINT:
- * content section ENCINT: offset ENCINT: length The offset is from the
- * beginning of the content section the file is in, after the section has been
- * decompressed (if appropriate). The length also refers to length of the file
- * in the section after decompression. There are two kinds of file represented
- * in the directory: user data and format related files. The files which are
- * format-related have names which begin with '::', the user data files have
- * names which begin with "/".
- * 
- * {@link http
- * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original
- * /?show-translation-form=1 }
- * 
- * @author olegt
- * 
- */
-public class ChmPmglHeader implements ChmAccessor<ChmPmglHeader> {
-    private static final long serialVersionUID = -6139486487475923593L;
-    private byte[] signature;
-    private long free_space; /* 4 */
-    private long unknown_0008; /* 8 */
-    private int block_prev; /* c */
-    private int block_next; /* 10 */
-
-    /* local usage */
-    private int dataRemained;
-    private int currentPlace = 0;
-
-    public ChmPmglHeader() {
-            signature = ChmConstants.PMGL.getBytes(UTF_8); /*
-                                                            * 0
-                                                            * (PMGL
-                                                            * )
-                                                            */
-    }
-
-    private int getDataRemained() {
-        return dataRemained;
-    }
-
-    private void setDataRemained(int dataRemained) {
-        this.dataRemained = dataRemained;
-    }
-
-    private int getCurrentPlace() {
-        return currentPlace;
-    }
-
-    private void setCurrentPlace(int currentPlace) {
-        this.currentPlace = currentPlace;
-    }
-
-    public long getFreeSpace() {
-        return free_space;
-    }
-
-    public void setFreeSpace(long free_space) throws TikaException {
-        if (free_space < 0) {
-            throw new TikaException("Bad PMGLheader.FreeSpace="+free_space);
-        }
-        this.free_space = free_space;
-    }
-
-    public String toString() {
-        StringBuilder sb = new StringBuilder();
-        sb.append("signatute:=" + new String(getSignature(), UTF_8) + ", ");
-        sb.append("free space:=" + getFreeSpace() + ", ");
-        sb.append("unknown0008:=" + getUnknown0008() + ", ");
-        sb.append("prev block:=" + getBlockPrev() + ", ");
-        sb.append("next block:=" + getBlockNext()
-                + System.getProperty("line.separator"));
-        return sb.toString();
-    }
-
-    protected void unmarshalCharArray(byte[] data, ChmPmglHeader chmPmglHeader,
-            int count) throws TikaException {
-        ChmAssert.assertByteArrayNotNull(data);
-        this.setDataRemained(data.length);
-        System.arraycopy(data, 0, chmPmglHeader.signature, 0, count);
-        this.setCurrentPlace(this.getCurrentPlace() + count);
-        this.setDataRemained(this.getDataRemained() - count);
-    }
-
-    private int unmarshalInt32(byte[] data) throws TikaException {
-        ChmAssert.assertByteArrayNotNull(data);
-        int dest;
-        if (4 > this.getDataRemained())
-            throw new TikaException("4 > dataLenght");
-        dest = (data[this.getCurrentPlace()] & 0xff)
-                | (data[this.getCurrentPlace() + 1] & 0xff) << 8
-                | (data[this.getCurrentPlace() + 2] & 0xff) << 16
-                | (data[this.getCurrentPlace() + 3] & 0xff) << 24;
-
-        this.setCurrentPlace(this.getCurrentPlace() + 4);
-        this.setDataRemained(this.getDataRemained() - 4);
-        return dest;
-    }
-
-    private long unmarshalUInt32(byte[] data) throws ChmParsingException {
-        ChmAssert.assertByteArrayNotNull(data);
-        long dest;
-        if (4 > getDataRemained())
-            throw new ChmParsingException("4 > dataLenght");
-        dest = (data[this.getCurrentPlace()] & 0xff)
-                | (data[this.getCurrentPlace() + 1] & 0xff) << 8
-                | (data[this.getCurrentPlace() + 2] & 0xff) << 16
-                | (data[this.getCurrentPlace() + 3] & 0xff) << 24;
-
-        setDataRemained(this.getDataRemained() - 4);
-        this.setCurrentPlace(this.getCurrentPlace() + 4);
-        return dest;
-    }
-
-    // @Override
-    public void parse(byte[] data, ChmPmglHeader chmPmglHeader) throws TikaException {
-        if (data.length < ChmConstants.CHM_PMGL_LEN)
-            throw new TikaException(ChmPmglHeader.class.getName()
-                    + " we only know how to deal with a 0x14 byte structures");
-
-        /* unmarshal fields */
-        chmPmglHeader.unmarshalCharArray(data, chmPmglHeader,
-                ChmConstants.CHM_SIGNATURE_LEN);
-        chmPmglHeader.setFreeSpace(chmPmglHeader.unmarshalUInt32(data));
-        chmPmglHeader.setUnknown0008(chmPmglHeader.unmarshalUInt32(data));
-        chmPmglHeader.setBlockPrev(chmPmglHeader.unmarshalInt32(data));
-        chmPmglHeader.setBlockNext(chmPmglHeader.unmarshalInt32(data));
-
-        /* check structure */
-        if (!new String(chmPmglHeader.getSignature(), UTF_8).equals(ChmConstants.PMGL))
-            throw new ChmParsingException(ChmPmglHeader.class.getName()
-                    + " pmgl != pmgl.signature");
-    }
-
-    public byte[] getSignature() {
-        return signature;
-    }
-
-    protected void setSignature(byte[] signature) {
-        this.signature = signature;
-    }
-
-    public long getUnknown0008() {
-        return unknown_0008;
-    }
-
-    protected void setUnknown0008(long unknown_0008) {
-        this.unknown_0008 = unknown_0008;
-    }
-
-    public int getBlockPrev() {
-        return block_prev;
-    }
-
-    protected void setBlockPrev(int block_prev) {
-        this.block_prev = block_prev;
-    }
-
-    public int getBlockNext() {
-        return block_next;
-    }
-
-    protected void setBlockNext(int block_next) {
-        this.block_next = block_next;
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Description There are two types of directory chunks -- index chunks, and
+ * listing chunks. The index chunk will be omitted if there is only one listing
+ * chunk. A listing chunk has the following format: 0000: char[4] 'PMGL' 0004:
+ * DWORD Length of free space and/or quickref area at end of directory chunk
+ * 0008: DWORD Always 0 000C: DWORD Chunk number of previous listing chunk when
+ * reading directory in sequence (-1 if this is the first listing chunk) 0010:
+ * DWORD Chunk number of next listing chunk when reading directory in sequence
+ * (-1 if this is the last listing chunk) 0014: Directory listing entries (to
+ * quickref area) Sorted by filename; the sort is case-insensitive The quickref
+ * area is written backwards from the end of the chunk. One quickref entry
+ * exists for every n entries in the file, where n is calculated as 1 + (1 <<
+ * quickref density). So for density = 2, n = 5 Chunklen-0002: WORD Number of
+ * entries in the chunk Chunklen-0004: WORD Offset of entry n from entry 0
+ * Chunklen-0008: WORD Offset of entry 2n from entry 0 Chunklen-000C: WORD
+ * Offset of entry 3n from entry 0 ... The format of a directory listing entry
+ * is as follows BYTE: length of name BYTEs: name (UTF-8 encoded) ENCINT:
+ * content section ENCINT: offset ENCINT: length The offset is from the
+ * beginning of the content section the file is in, after the section has been
+ * decompressed (if appropriate). The length also refers to length of the file
+ * in the section after decompression. There are two kinds of file represented
+ * in the directory: user data and format related files. The files which are
+ * format-related have names which begin with '::', the user data files have
+ * names which begin with "/".
+ * 
+ * {@link http
+ * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original
+ * /?show-translation-form=1 }
+ * 
+ * @author olegt
+ * 
+ */
+public class ChmPmglHeader implements ChmAccessor<ChmPmglHeader> {
+    private static final long serialVersionUID = -6139486487475923593L;
+    private byte[] signature;
+    private long free_space; /* 4 */
+    private long unknown_0008; /* 8 */
+    private int block_prev; /* c */
+    private int block_next; /* 10 */
+
+    /* local usage */
+    private int dataRemained;
+    private int currentPlace = 0;
+
+    public ChmPmglHeader() {
+            signature = ChmConstants.PMGL.getBytes(UTF_8); /*
+                                                            * 0
+                                                            * (PMGL
+                                                            * )
+                                                            */
+    }
+
+    private int getDataRemained() {
+        return dataRemained;
+    }
+
+    private void setDataRemained(int dataRemained) {
+        this.dataRemained = dataRemained;
+    }
+
+    private int getCurrentPlace() {
+        return currentPlace;
+    }
+
+    private void setCurrentPlace(int currentPlace) {
+        this.currentPlace = currentPlace;
+    }
+
+    public long getFreeSpace() {
+        return free_space;
+    }
+
+    public void setFreeSpace(long free_space) throws TikaException {
+        if (free_space < 0) {
+            throw new TikaException("Bad PMGLheader.FreeSpace="+free_space);
+        }
+        this.free_space = free_space;
+    }
+
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("signatute:=" + new String(getSignature(), UTF_8) + ", ");
+        sb.append("free space:=" + getFreeSpace() + ", ");
+        sb.append("unknown0008:=" + getUnknown0008() + ", ");
+        sb.append("prev block:=" + getBlockPrev() + ", ");
+        sb.append("next block:=" + getBlockNext()
+                + System.getProperty("line.separator"));
+        return sb.toString();
+    }
+
+    protected void unmarshalCharArray(byte[] data, ChmPmglHeader chmPmglHeader,
+            int count) throws TikaException {
+        ChmAssert.assertByteArrayNotNull(data);
+        this.setDataRemained(data.length);
+        System.arraycopy(data, 0, chmPmglHeader.signature, 0, count);
+        this.setCurrentPlace(this.getCurrentPlace() + count);
+        this.setDataRemained(this.getDataRemained() - count);
+    }
+
+    private int unmarshalInt32(byte[] data) throws TikaException {
+        ChmAssert.assertByteArrayNotNull(data);
+        int dest;
+        if (4 > this.getDataRemained())
+            throw new TikaException("4 > dataLenght");
+        dest = (data[this.getCurrentPlace()] & 0xff)
+                | (data[this.getCurrentPlace() + 1] & 0xff) << 8
+                | (data[this.getCurrentPlace() + 2] & 0xff) << 16
+                | (data[this.getCurrentPlace() + 3] & 0xff) << 24;
+
+        this.setCurrentPlace(this.getCurrentPlace() + 4);
+        this.setDataRemained(this.getDataRemained() - 4);
+        return dest;
+    }
+
+    private long unmarshalUInt32(byte[] data) throws ChmParsingException {
+        ChmAssert.assertByteArrayNotNull(data);
+        long dest;
+        if (4 > getDataRemained())
+            throw new ChmParsingException("4 > dataLenght");
+        dest = (data[this.getCurrentPlace()] & 0xff)
+                | (data[this.getCurrentPlace() + 1] & 0xff) << 8
+                | (data[this.getCurrentPlace() + 2] & 0xff) << 16
+                | (data[this.getCurrentPlace() + 3] & 0xff) << 24;
+
+        setDataRemained(this.getDataRemained() - 4);
+        this.setCurrentPlace(this.getCurrentPlace() + 4);
+        return dest;
+    }
+
+    // @Override
+    public void parse(byte[] data, ChmPmglHeader chmPmglHeader) throws TikaException {
+        if (data.length < ChmConstants.CHM_PMGL_LEN)
+            throw new TikaException(ChmPmglHeader.class.getName()
+                    + " we only know how to deal with a 0x14 byte structures");
+
+        /* unmarshal fields */
+        chmPmglHeader.unmarshalCharArray(data, chmPmglHeader,
+                ChmConstants.CHM_SIGNATURE_LEN);
+        chmPmglHeader.setFreeSpace(chmPmglHeader.unmarshalUInt32(data));
+        chmPmglHeader.setUnknown0008(chmPmglHeader.unmarshalUInt32(data));
+        chmPmglHeader.setBlockPrev(chmPmglHeader.unmarshalInt32(data));
+        chmPmglHeader.setBlockNext(chmPmglHeader.unmarshalInt32(data));
+
+        /* check structure */
+        if (!new String(chmPmglHeader.getSignature(), UTF_8).equals(ChmConstants.PMGL))
+            throw new ChmParsingException(ChmPmglHeader.class.getName()
+                    + " pmgl != pmgl.signature");
+    }
+
+    public byte[] getSignature() {
+        return signature;
+    }
+
+    protected void setSignature(byte[] signature) {
+        this.signature = signature;
+    }
+
+    public long getUnknown0008() {
+        return unknown_0008;
+    }
+
+    protected void setUnknown0008(long unknown_0008) {
+        this.unknown_0008 = unknown_0008;
+    }
+
+    public int getBlockPrev() {
+        return block_prev;
+    }
+
+    protected void setBlockPrev(int block_prev) {
+        this.block_prev = block_prev;
+    }
+
+    public int getBlockNext() {
+        return block_next;
+    }
+
+    protected void setBlockNext(int block_next) {
+        this.block_next = block_next;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/DirectoryListingEntry.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/DirectoryListingEntry.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/DirectoryListingEntry.java
index 05aa411..c413e07 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/DirectoryListingEntry.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/DirectoryListingEntry.java
@@ -1,151 +1,151 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.accessor;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.assertion.ChmAssert;
-import org.apache.tika.parser.chm.core.ChmCommons;
-
-/**
- * The format of a directory listing entry is as follows: BYTE: length of name
- * BYTEs: name (UTF-8 encoded) ENCINT: content section ENCINT: offset ENCINT:
- * length The offset is from the beginning of the content section the file is
- * in, after the section has been decompressed (if appropriate). The length also
- * refers to length of the file in the section after decompression. There are
- * two kinds of file represented in the directory: user data and format related
- * files. The files which are format-related have names which begin with '::',
- * the user data files have names which begin with "/".
- * 
- */
-public class DirectoryListingEntry {
-    /* Length of the entry name */
-    private int name_length;
-    /* Entry name or directory name */
-    private String name;
-    /* Entry type */
-    private ChmCommons.EntryType entryType;
-    /* Entry offset */
-    private int offset;
-    /* Entry size */
-    private int length;
-
-    public DirectoryListingEntry() {
-
-    }
-
-    /**
-     * Constructs directoryListingEntry
-     * 
-     * @param name_length
-     *            int
-     * @param name
-     *            String
-     * @param isCompressed
-     *            ChmCommons.EntryType
-     * @param offset
-     *            int
-     * @param length
-     *            int
-     * @throws TikaException 
-     */
-    public DirectoryListingEntry(int name_length, String name,
-            ChmCommons.EntryType isCompressed, int offset, int length) throws TikaException {
-        ChmAssert.assertDirectoryListingEntry(name_length, name, isCompressed, offset, length);
-        setNameLength(name_length);
-        setName(name);
-        setEntryType(isCompressed);
-        setOffset(offset);
-        setLength(length);
-    }
-
-    public String toString() {
-        StringBuilder sb = new StringBuilder();
-        sb.append("name_length:=" + getNameLength() + System.getProperty("line.separator"));
-        sb.append("name:=" + getName() + System.getProperty("line.separator"));
-        sb.append("entryType:=" + getEntryType() + System.getProperty("line.separator"));
-        sb.append("offset:=" + getOffset() + System.getProperty("line.separator"));
-        sb.append("length:=" + getLength());
-        return sb.toString();
-    }
-    
-    /**
-     * Returns an entry name length
-     * 
-     * @return int
-     */
-    public int getNameLength() {
-        return name_length;
-    }
-
-    /**
-     * Sets an entry name length
-     * 
-     * @param name_length
-     *            int
-     */
-    protected void setNameLength(int name_length) {
-        this.name_length = name_length;
-    }
-
-    /**
-     * Returns an entry name
-     * 
-     * @return String
-     */
-    public String getName() {
-        return name;
-    }
-
-    /**
-     * Sets entry name
-     * 
-     * @param name
-     *            String
-     */
-    protected void setName(String name) {
-        this.name = name;
-    }
-
-    /**
-     * Returns ChmCommons.EntryType (COMPRESSED or UNCOMPRESSED)
-     * 
-     * @return ChmCommons.EntryType
-     */
-    public ChmCommons.EntryType getEntryType() {
-        return entryType;
-    }
-
-    protected void setEntryType(ChmCommons.EntryType entryType) {
-        this.entryType = entryType;
-    }
-
-    public int getOffset() {
-        return offset;
-    }
-
-    protected void setOffset(int offset) {
-        this.offset = offset;
-    }
-
-    public int getLength() {
-        return length;
-    }
-
-    protected void setLength(int length) {
-        this.length = length;
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmCommons;
+
+/**
+ * The format of a directory listing entry is as follows: BYTE: length of name
+ * BYTEs: name (UTF-8 encoded) ENCINT: content section ENCINT: offset ENCINT:
+ * length The offset is from the beginning of the content section the file is
+ * in, after the section has been decompressed (if appropriate). The length also
+ * refers to length of the file in the section after decompression. There are
+ * two kinds of file represented in the directory: user data and format related
+ * files. The files which are format-related have names which begin with '::',
+ * the user data files have names which begin with "/".
+ * 
+ */
+public class DirectoryListingEntry {
+    /* Length of the entry name */
+    private int name_length;
+    /* Entry name or directory name */
+    private String name;
+    /* Entry type */
+    private ChmCommons.EntryType entryType;
+    /* Entry offset */
+    private int offset;
+    /* Entry size */
+    private int length;
+
+    public DirectoryListingEntry() {
+
+    }
+
+    /**
+     * Constructs directoryListingEntry
+     * 
+     * @param name_length
+     *            int
+     * @param name
+     *            String
+     * @param isCompressed
+     *            ChmCommons.EntryType
+     * @param offset
+     *            int
+     * @param length
+     *            int
+     * @throws TikaException 
+     */
+    public DirectoryListingEntry(int name_length, String name,
+            ChmCommons.EntryType isCompressed, int offset, int length) throws TikaException {
+        ChmAssert.assertDirectoryListingEntry(name_length, name, isCompressed, offset, length);
+        setNameLength(name_length);
+        setName(name);
+        setEntryType(isCompressed);
+        setOffset(offset);
+        setLength(length);
+    }
+
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("name_length:=" + getNameLength() + System.getProperty("line.separator"));
+        sb.append("name:=" + getName() + System.getProperty("line.separator"));
+        sb.append("entryType:=" + getEntryType() + System.getProperty("line.separator"));
+        sb.append("offset:=" + getOffset() + System.getProperty("line.separator"));
+        sb.append("length:=" + getLength());
+        return sb.toString();
+    }
+    
+    /**
+     * Returns an entry name length
+     * 
+     * @return int
+     */
+    public int getNameLength() {
+        return name_length;
+    }
+
+    /**
+     * Sets an entry name length
+     * 
+     * @param name_length
+     *            int
+     */
+    protected void setNameLength(int name_length) {
+        this.name_length = name_length;
+    }
+
+    /**
+     * Returns an entry name
+     * 
+     * @return String
+     */
+    public String getName() {
+        return name;
+    }
+
+    /**
+     * Sets entry name
+     * 
+     * @param name
+     *            String
+     */
+    protected void setName(String name) {
+        this.name = name;
+    }
+
+    /**
+     * Returns ChmCommons.EntryType (COMPRESSED or UNCOMPRESSED)
+     * 
+     * @return ChmCommons.EntryType
+     */
+    public ChmCommons.EntryType getEntryType() {
+        return entryType;
+    }
+
+    protected void setEntryType(ChmCommons.EntryType entryType) {
+        this.entryType = entryType;
+    }
+
+    public int getOffset() {
+        return offset;
+    }
+
+    protected void setOffset(int offset) {
+        this.offset = offset;
+    }
+
+    public int getLength() {
+        return length;
+    }
+
+    protected void setLength(int length) {
+        this.length = length;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/assertion/ChmAssert.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/assertion/ChmAssert.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/assertion/ChmAssert.java
index a332690..cdedc3e 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/assertion/ChmAssert.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/assertion/ChmAssert.java
@@ -1,169 +1,169 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.assertion;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.accessor.ChmAccessor;
-import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.exception.ChmParsingException;
-
-/**
- * Contains chm extractor assertions
- */
-public class ChmAssert {
-    /**
-     * Checks a validity of the chmBlockSegment parameters
-     * 
-     * @param data
-     *            byte[]
-     * @param resetTable
-     *            ChmLzxcResetTable
-     * @param blockNumber
-     *            int
-     * @param lzxcBlockOffset
-     *            int
-     * @param lzxcBlockLength
-     *            int
-     * @throws TikaException 
-     */
-    public static final void assertChmBlockSegment(byte[] data,
-            ChmLzxcResetTable resetTable, int blockNumber, int lzxcBlockOffset,
-            int lzxcBlockLength) throws TikaException {
-        if ((data == null))
-            throw new TikaException("data[] is null");
-
-        if ((data.length <= 0))
-            throw new TikaException("data[] length should be greater than zero");
-
-        if (resetTable == null)
-            throw new TikaException("resetTable is null");
-
-        if (resetTable.getBlockAddress().length <= 1)
-            throw new TikaException("resetTable.getBlockAddress().length should be greater than zero");
-
-        if (blockNumber < 0)
-            throw new TikaException("blockNumber should be positive number");
-
-        if (lzxcBlockOffset < 0)
-            throw new TikaException("lzxcBlockOffset should be positive number");
-
-        if (lzxcBlockLength < 0)
-            throw new TikaException("lzxcBlockLength should be positive number");
-    }
-
-    /**
-     * Checks if InputStream is not null
-     * 
-     * @param is
-     *            InputStream
-     * @throws ChmParsingException 
-     * @throws IOException 
-     */
-    public static final void assertInputStreamNotNull(InputStream is) throws IOException {
-        if (is == null)
-            throw new IOException("input sream is null");
-    }
-
-    /**
-     * Checks validity of ChmAccessor parameters
-     * 
-     * @param data
-     * @param chmItsfHeader
-     * @param count
-     * @throws ChmParsingException 
-     */
-    public static final void assertChmAccessorParameters(byte[] data,
-            ChmAccessor<?> chmAccessor, int count) throws ChmParsingException {
-        assertByteArrayNotNull(data);
-        assertChmAccessorNotNull(chmAccessor);
-    }
-
-    /**
-     * Checks if byte[] is not null
-     * 
-     * @param data
-     * @throws ChmParsingException 
-     */
-    public static final void assertByteArrayNotNull(byte[] data) throws ChmParsingException {
-        if (data == null)
-            throw new ChmParsingException("byte[] data is null");
-    }
-
-    /**
-     * Checks if ChmAccessor is not null In case of null throws exception
-     * 
-     * @param ChmAccessor
-     * @throws ChmParsingException 
-     */
-    public static final void assertChmAccessorNotNull(ChmAccessor<?> chmAccessor) throws ChmParsingException {
-        if (chmAccessor == null)
-            throw new ChmParsingException("chm header is null");
-    }
-
-    /**
-     * Checks validity of the DirectoryListingEntry's parameters In case of
-     * invalid parameter(s) throws an exception
-     * 
-     * @param name_length
-     *            length of the chm entry name
-     * @param name
-     *            chm entry name
-     * @param entryType
-     *            EntryType
-     * @param offset
-     * @param length
-     * @throws ChmParsingException 
-     */
-    public static final void assertDirectoryListingEntry(int name_length,
-            String name, ChmCommons.EntryType entryType, int offset, int length) throws ChmParsingException {
-        if (name_length < 0)
-            throw new ChmParsingException("invalid name length");
-        if (name == null)
-            throw new ChmParsingException("invalid name");
-
-        if ((entryType != ChmCommons.EntryType.COMPRESSED)
-                && (entryType != ChmCommons.EntryType.UNCOMPRESSED))
-            throw new ChmParsingException("invalid compressed type, should be EntryType.COMPRESSED | EntryType.UNCOMPRESSED");
-
-        if (offset < 0)
-            throw new ChmParsingException("invalid offset");
-
-        if (length < 0)
-            throw new ChmParsingException("invalid length");
-    }
-
-    public static void assertCopyingDataIndex(int index, int dataLength) throws ChmParsingException {
-        if (index >= dataLength)
-            throw new ChmParsingException("cannot parse chm file index > data.length");
-    }
-
-    /**
-     * Checks if int param is greater than zero In case param <=0 throws an
-     * exception
-     * 
-     * @param param
-     * @throws ChmParsingException 
-     */
-    public static void assertPositiveInt(int param) throws ChmParsingException {
-        if (param <= 0)
-            throw new ChmParsingException("resetTable.getBlockAddress().length should be greater than zero");
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.assertion;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.accessor.ChmAccessor;
+import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * Contains chm extractor assertions
+ */
+public class ChmAssert {
+    /**
+     * Checks a validity of the chmBlockSegment parameters
+     * 
+     * @param data
+     *            byte[]
+     * @param resetTable
+     *            ChmLzxcResetTable
+     * @param blockNumber
+     *            int
+     * @param lzxcBlockOffset
+     *            int
+     * @param lzxcBlockLength
+     *            int
+     * @throws TikaException 
+     */
+    public static final void assertChmBlockSegment(byte[] data,
+            ChmLzxcResetTable resetTable, int blockNumber, int lzxcBlockOffset,
+            int lzxcBlockLength) throws TikaException {
+        if ((data == null))
+            throw new TikaException("data[] is null");
+
+        if ((data.length <= 0))
+            throw new TikaException("data[] length should be greater than zero");
+
+        if (resetTable == null)
+            throw new TikaException("resetTable is null");
+
+        if (resetTable.getBlockAddress().length <= 1)
+            throw new TikaException("resetTable.getBlockAddress().length should be greater than zero");
+
+        if (blockNumber < 0)
+            throw new TikaException("blockNumber should be positive number");
+
+        if (lzxcBlockOffset < 0)
+            throw new TikaException("lzxcBlockOffset should be positive number");
+
+        if (lzxcBlockLength < 0)
+            throw new TikaException("lzxcBlockLength should be positive number");
+    }
+
+    /**
+     * Checks if InputStream is not null
+     * 
+     * @param is
+     *            InputStream
+     * @throws ChmParsingException 
+     * @throws IOException 
+     */
+    public static final void assertInputStreamNotNull(InputStream is) throws IOException {
+        if (is == null)
+            throw new IOException("input sream is null");
+    }
+
+    /**
+     * Checks validity of ChmAccessor parameters
+     * 
+     * @param data
+     * @param chmItsfHeader
+     * @param count
+     * @throws ChmParsingException 
+     */
+    public static final void assertChmAccessorParameters(byte[] data,
+            ChmAccessor<?> chmAccessor, int count) throws ChmParsingException {
+        assertByteArrayNotNull(data);
+        assertChmAccessorNotNull(chmAccessor);
+    }
+
+    /**
+     * Checks if byte[] is not null
+     * 
+     * @param data
+     * @throws ChmParsingException 
+     */
+    public static final void assertByteArrayNotNull(byte[] data) throws ChmParsingException {
+        if (data == null)
+            throw new ChmParsingException("byte[] data is null");
+    }
+
+    /**
+     * Checks if ChmAccessor is not null In case of null throws exception
+     * 
+     * @param ChmAccessor
+     * @throws ChmParsingException 
+     */
+    public static final void assertChmAccessorNotNull(ChmAccessor<?> chmAccessor) throws ChmParsingException {
+        if (chmAccessor == null)
+            throw new ChmParsingException("chm header is null");
+    }
+
+    /**
+     * Checks validity of the DirectoryListingEntry's parameters In case of
+     * invalid parameter(s) throws an exception
+     * 
+     * @param name_length
+     *            length of the chm entry name
+     * @param name
+     *            chm entry name
+     * @param entryType
+     *            EntryType
+     * @param offset
+     * @param length
+     * @throws ChmParsingException 
+     */
+    public static final void assertDirectoryListingEntry(int name_length,
+            String name, ChmCommons.EntryType entryType, int offset, int length) throws ChmParsingException {
+        if (name_length < 0)
+            throw new ChmParsingException("invalid name length");
+        if (name == null)
+            throw new ChmParsingException("invalid name");
+
+        if ((entryType != ChmCommons.EntryType.COMPRESSED)
+                && (entryType != ChmCommons.EntryType.UNCOMPRESSED))
+            throw new ChmParsingException("invalid compressed type, should be EntryType.COMPRESSED | EntryType.UNCOMPRESSED");
+
+        if (offset < 0)
+            throw new ChmParsingException("invalid offset");
+
+        if (length < 0)
+            throw new ChmParsingException("invalid length");
+    }
+
+    public static void assertCopyingDataIndex(int index, int dataLength) throws ChmParsingException {
+        if (index >= dataLength)
+            throw new ChmParsingException("cannot parse chm file index > data.length");
+    }
+
+    /**
+     * Checks if int param is greater than zero In case param <=0 throws an
+     * exception
+     * 
+     * @param param
+     * @throws ChmParsingException 
+     */
+    public static void assertPositiveInt(int param) throws ChmParsingException {
+        if (param <= 0)
+            throw new ChmParsingException("resetTable.getBlockAddress().length should be greater than zero");
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java
index a7fdf60..cded7f2 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java
@@ -1,361 +1,361 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.core;
-
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.util.List;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
-import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
-import org.apache.tika.parser.chm.assertion.ChmAssert;
-import org.apache.tika.parser.chm.exception.ChmParsingException;
-
-public class ChmCommons {
-    /* Prevents initialization */
-    private ChmCommons() {
-    }
-
-    public static void assertByteArrayNotNull(byte[] data) throws TikaException {
-        if (data == null)
-            throw new TikaException("byte[] is null");
-    }
-
-    /**
-     * Represents entry types: uncompressed, compressed
-     */
-    public enum EntryType {
-        UNCOMPRESSED, COMPRESSED
-    }
-
-    /**
-     * Represents lzx states: started decoding, not started decoding
-     */
-    public enum LzxState {
-        STARTED_DECODING, NOT_STARTED_DECODING
-    }
-
-    /**
-     * Represents intel file states during decompression
-     */
-    public enum IntelState {
-        STARTED, NOT_STARTED
-    }
-
-    /**
-     * Represents lzx block types in order to decompress differently
-     */
-    public final static int UNDEFINED = 0;
-    public final static int VERBATIM = 1;
-    public final static int ALIGNED_OFFSET = 2;
-    public final static int UNCOMPRESSED = 3;
-
-    /**
-     * LZX supports window sizes of 2^15 (32Kb) through 2^21 (2Mb) Returns X,
-     * i.e 2^X
-     * 
-     * @param window
-     *            chmLzxControlData.getWindowSize()
-     * 
-     * @return window size
-     */
-    public static int getWindowSize(int window) {
-        int win = 0;
-        while (window > 1) {
-            window >>>= 1;
-            win++;
-        }
-        return win;
-    }
-
-    public static byte[] getChmBlockSegment(byte[] data,
-            ChmLzxcResetTable resetTable, int blockNumber, int lzxcBlockOffset,
-            int lzxcBlockLength) throws TikaException {
-        ChmAssert.assertChmBlockSegment(data, resetTable, blockNumber,
-                lzxcBlockOffset, lzxcBlockLength);
-        int blockLength = -1;
-        // TODO add int_max_value checking
-        if (blockNumber < (resetTable.getBlockAddress().length - 1)) {
-            blockLength = (int) (resetTable.getBlockAddress()[blockNumber + 1] - resetTable
-                    .getBlockAddress()[blockNumber]);
-        } else {
-            /* new code */
-            if (blockNumber >= resetTable.getBlockAddress().length)
-                blockLength = 0;
-            else
-                /* end new code */
-                blockLength = (int) (lzxcBlockLength - resetTable
-                        .getBlockAddress()[blockNumber]);
-        }
-        byte[] t = ChmCommons
-                .copyOfRange(
-                        data,
-                        (int) (lzxcBlockOffset + resetTable.getBlockAddress()[blockNumber]),
-                        (int) (lzxcBlockOffset
-                                + resetTable.getBlockAddress()[blockNumber] + blockLength));
-        return (t != null) ? t : new byte[1];
-    }
-
-    /**
-     * Returns textual representation of LangID
-     * 
-     * @param langID
-     * 
-     * @return language name
-     */
-    public static String getLanguage(long langID) {
-        /* Potential problem with casting */
-        switch ((int) langID) {
-        case 1025:
-            return "Arabic";
-        case 1069:
-            return "Basque";
-        case 1027:
-            return "Catalan";
-        case 2052:
-            return "Chinese (Simplified)";
-        case 1028:
-            return "Chinese (Traditional)";
-        case 1029:
-            return "Czech";
-        case 1030:
-            return "Danish";
-        case 1043:
-            return "Dutch";
-        case 1033:
-            return "English (United States)";
-        case 1035:
-            return "Finnish";
-        case 1036:
-            return "French";
-        case 1031:
-            return "German";
-        case 1032:
-            return "Greek";
-        case 1037:
-            return "Hebrew";
-        case 1038:
-            return "Hungarian";
-        case 1040:
-            return "Italian";
-        case 1041:
-            return "Japanese";
-        case 1042:
-            return "Korean";
-        case 1044:
-            return "Norwegian";
-        case 1045:
-            return "Polish";
-        case 2070:
-            return "Portuguese";
-        case 1046:
-            return "Portuguese (Brazil)";
-        case 1049:
-            return "Russian";
-        case 1051:
-            return "Slovakian";
-        case 1060:
-            return "Slovenian";
-        case 3082:
-            return "Spanish";
-        case 1053:
-            return "Swedish";
-        case 1055:
-            return "Turkish";
-        default:
-            return "unknown - http://msdn.microsoft.com/en-us/library/bb165625%28VS.80%29.aspx";
-        }
-    }
-
-    /**
-     * Checks skippable patterns
-     * 
-     * @param directoryListingEntry
-     * 
-     * @return boolean
-     */
-    public static boolean hasSkip(DirectoryListingEntry directoryListingEntry) {
-        return (directoryListingEntry.getName().startsWith("/$")
-                || directoryListingEntry.getName().startsWith("/#") || directoryListingEntry
-                .getName().startsWith("::")) ? true : false;
-    }
-
-    /**
-     * Writes byte[][] to the file
-     * 
-     * @param buffer
-     * @param fileToBeSaved
-     *            file name
-     * @throws TikaException 
-     */
-    public static void writeFile(byte[][] buffer, String fileToBeSaved) throws TikaException {
-        FileOutputStream output = null;
-        if (buffer != null && fileToBeSaved != null
-                && !ChmCommons.isEmpty(fileToBeSaved)) {
-            try {
-                output = new FileOutputStream(fileToBeSaved);
-                for (byte[] bufferEntry : buffer) {
-                    output.write(bufferEntry);
-                }
-            } catch (FileNotFoundException e) {
-                throw new TikaException(e.getMessage());
-            } catch (IOException e) {
-                e.printStackTrace();
-            } finally {
-                if (output != null)
-                    try {
-                        output.flush();
-                        output.close();
-                    } catch (IOException e) {
-                        e.printStackTrace();
-                    }
-            }
-        }
-    }
-
-    /**
-     * Reverses the order of given array
-     * 
-     * @param array
-     */
-    public static void reverse(byte[] array) {
-        if (array == null) {
-            return;
-        }
-        int i = 0;
-        int j = array.length - 1;
-        byte tmp;
-        while (j > i) {
-            tmp = array[j];
-            array[j] = array[i];
-            array[i] = tmp;
-            j--;
-            i++;
-        }
-    }
-
-    /**
-     * Returns an index of the reset table
-     * 
-     * @param text
-     * @param pattern
-     * @return index of the reset table
-     * @throws ChmParsingException 
-     */
-    public static final int indexOfResetTableBlock(byte[] text, byte[] pattern) throws ChmParsingException {
-        return (indexOf(text, pattern)) - 4;
-    }
-
-    /**
-     * Searches some pattern in byte[]
-     * 
-     * @param text
-     *            byte[]
-     * @param pattern
-     *            byte[]
-     * @return an index, if nothing found returns -1
-     * @throws ChmParsingException 
-     */
-    public static int indexOf(byte[] text, byte[] pattern) throws ChmParsingException {
-        int[] next = null;
-        int i = 0, j = -1;
-
-        /* Preprocessing */
-        if (pattern != null && text != null) {
-            next = new int[pattern.length];
-            next[0] = -1;
-        } else
-            throw new ChmParsingException("pattern and/or text should not be null");
-
-        /* Computes a failure function */
-        while (i < pattern.length - 1) {
-            if (j == -1 || pattern[i] == pattern[j]) {
-                i++;
-                j++;
-                if (pattern[i] != pattern[j])
-                    next[i] = j;
-                else
-                    next[i] = next[j];
-            } else
-                j = next[j];
-        }
-
-        /* Reinitializes local variables */
-        i = j = 0;
-
-        /* Matching */
-        while (i < text.length && j < pattern.length) {
-            if (j == -1 || pattern[j] == text[i]) {
-                i++;
-                j++;
-            } else
-                j = next[j];
-        }
-        if (j == pattern.length)
-            return (i - j); // match found at offset i - M
-        else
-            return -1; // not found
-    }
-
-    /**
-     * Searches for some pattern in the directory listing entry list
-     * 
-     * @param list
-     * @param pattern
-     * @return an index, if nothing found returns -1
-     */
-    public static int indexOf(List<DirectoryListingEntry> list, String pattern) {
-        int place = 0;
-        for (DirectoryListingEntry directoryListingEntry : list) {
-            if (directoryListingEntry.toString().contains(pattern)) return place;
-            ++place;
-        }
-        return -1;// not found
-    }
-
-    /*
-     * This method is added because of supporting of Java 5
-     */
-    public static byte[] copyOfRange(byte[] original, int from, int to) {
-        checkCopyOfRangeParams(original, from, to);
-        int newLength = to - from;
-        if (newLength < 0)
-            throw new IllegalArgumentException(from + " > " + to);
-        byte[] copy = new byte[newLength];
-        System.arraycopy(original, from, copy, 0, Math.min(original.length - from, newLength));
-        return copy;
-    }
-
-    private static void checkCopyOfRangeParams(byte[] original, int from, int to) {
-        if (original == null)
-            throw new NullPointerException("array is null");
-        if (from < 0)
-            throw new IllegalArgumentException(from + " should be > 0");
-        if (to < 0)
-            throw new IllegalArgumentException(to + " should be > 0");
-    }
-
-    /*
-     * This method is added because of supporting of Java 5
-     */
-    public static boolean isEmpty(String str) {
-        return str == null || str.length() == 0;
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.core;
+
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+public class ChmCommons {
+    /* Prevents initialization */
+    private ChmCommons() {
+    }
+
+    public static void assertByteArrayNotNull(byte[] data) throws TikaException {
+        if (data == null)
+            throw new TikaException("byte[] is null");
+    }
+
+    /**
+     * Represents entry types: uncompressed, compressed
+     */
+    public enum EntryType {
+        UNCOMPRESSED, COMPRESSED
+    }
+
+    /**
+     * Represents lzx states: started decoding, not started decoding
+     */
+    public enum LzxState {
+        STARTED_DECODING, NOT_STARTED_DECODING
+    }
+
+    /**
+     * Represents intel file states during decompression
+     */
+    public enum IntelState {
+        STARTED, NOT_STARTED
+    }
+
+    /**
+     * Represents lzx block types in order to decompress differently
+     */
+    public final static int UNDEFINED = 0;
+    public final static int VERBATIM = 1;
+    public final static int ALIGNED_OFFSET = 2;
+    public final static int UNCOMPRESSED = 3;
+
+    /**
+     * LZX supports window sizes of 2^15 (32Kb) through 2^21 (2Mb) Returns X,
+     * i.e 2^X
+     * 
+     * @param window
+     *            chmLzxControlData.getWindowSize()
+     * 
+     * @return window size
+     */
+    public static int getWindowSize(int window) {
+        int win = 0;
+        while (window > 1) {
+            window >>>= 1;
+            win++;
+        }
+        return win;
+    }
+
+    public static byte[] getChmBlockSegment(byte[] data,
+            ChmLzxcResetTable resetTable, int blockNumber, int lzxcBlockOffset,
+            int lzxcBlockLength) throws TikaException {
+        ChmAssert.assertChmBlockSegment(data, resetTable, blockNumber,
+                lzxcBlockOffset, lzxcBlockLength);
+        int blockLength = -1;
+        // TODO add int_max_value checking
+        if (blockNumber < (resetTable.getBlockAddress().length - 1)) {
+            blockLength = (int) (resetTable.getBlockAddress()[blockNumber + 1] - resetTable
+                    .getBlockAddress()[blockNumber]);
+        } else {
+            /* new code */
+            if (blockNumber >= resetTable.getBlockAddress().length)
+                blockLength = 0;
+            else
+                /* end new code */
+                blockLength = (int) (lzxcBlockLength - resetTable
+                        .getBlockAddress()[blockNumber]);
+        }
+        byte[] t = ChmCommons
+                .copyOfRange(
+                        data,
+                        (int) (lzxcBlockOffset + resetTable.getBlockAddress()[blockNumber]),
+                        (int) (lzxcBlockOffset
+                                + resetTable.getBlockAddress()[blockNumber] + blockLength));
+        return (t != null) ? t : new byte[1];
+    }
+
+    /**
+     * Returns textual representation of LangID
+     * 
+     * @param langID
+     * 
+     * @return language name
+     */
+    public static String getLanguage(long langID) {
+        /* Potential problem with casting */
+        switch ((int) langID) {
+        case 1025:
+            return "Arabic";
+        case 1069:
+            return "Basque";
+        case 1027:
+            return "Catalan";
+        case 2052:
+            return "Chinese (Simplified)";
+        case 1028:
+            return "Chinese (Traditional)";
+        case 1029:
+            return "Czech";
+        case 1030:
+            return "Danish";
+        case 1043:
+            return "Dutch";
+        case 1033:
+            return "English (United States)";
+        case 1035:
+            return "Finnish";
+        case 1036:
+            return "French";
+        case 1031:
+            return "German";
+        case 1032:
+            return "Greek";
+        case 1037:
+            return "Hebrew";
+        case 1038:
+            return "Hungarian";
+        case 1040:
+            return "Italian";
+        case 1041:
+            return "Japanese";
+        case 1042:
+            return "Korean";
+        case 1044:
+            return "Norwegian";
+        case 1045:
+            return "Polish";
+        case 2070:
+            return "Portuguese";
+        case 1046:
+            return "Portuguese (Brazil)";
+        case 1049:
+            return "Russian";
+        case 1051:
+            return "Slovakian";
+        case 1060:
+            return "Slovenian";
+        case 3082:
+            return "Spanish";
+        case 1053:
+            return "Swedish";
+        case 1055:
+            return "Turkish";
+        default:
+            return "unknown - http://msdn.microsoft.com/en-us/library/bb165625%28VS.80%29.aspx";
+        }
+    }
+
+    /**
+     * Checks skippable patterns
+     * 
+     * @param directoryListingEntry
+     * 
+     * @return boolean
+     */
+    public static boolean hasSkip(DirectoryListingEntry directoryListingEntry) {
+        return (directoryListingEntry.getName().startsWith("/$")
+                || directoryListingEntry.getName().startsWith("/#") || directoryListingEntry
+                .getName().startsWith("::")) ? true : false;
+    }
+
+    /**
+     * Writes byte[][] to the file
+     * 
+     * @param buffer
+     * @param fileToBeSaved
+     *            file name
+     * @throws TikaException 
+     */
+    public static void writeFile(byte[][] buffer, String fileToBeSaved) throws TikaException {
+        FileOutputStream output = null;
+        if (buffer != null && fileToBeSaved != null
+                && !ChmCommons.isEmpty(fileToBeSaved)) {
+            try {
+                output = new FileOutputStream(fileToBeSaved);
+                for (byte[] bufferEntry : buffer) {
+                    output.write(bufferEntry);
+                }
+            } catch (FileNotFoundException e) {
+                throw new TikaException(e.getMessage());
+            } catch (IOException e) {
+                e.printStackTrace();
+            } finally {
+                if (output != null)
+                    try {
+                        output.flush();
+                        output.close();
+                    } catch (IOException e) {
+                        e.printStackTrace();
+                    }
+            }
+        }
+    }
+
+    /**
+     * Reverses the order of given array
+     * 
+     * @param array
+     */
+    public static void reverse(byte[] array) {
+        if (array == null) {
+            return;
+        }
+        int i = 0;
+        int j = array.length - 1;
+        byte tmp;
+        while (j > i) {
+            tmp = array[j];
+            array[j] = array[i];
+            array[i] = tmp;
+            j--;
+            i++;
+        }
+    }
+
+    /**
+     * Returns an index of the reset table
+     * 
+     * @param text
+     * @param pattern
+     * @return index of the reset table
+     * @throws ChmParsingException 
+     */
+    public static final int indexOfResetTableBlock(byte[] text, byte[] pattern) throws ChmParsingException {
+        return (indexOf(text, pattern)) - 4;
+    }
+
+    /**
+     * Searches some pattern in byte[]
+     * 
+     * @param text
+     *            byte[]
+     * @param pattern
+     *            byte[]
+     * @return an index, if nothing found returns -1
+     * @throws ChmParsingException 
+     */
+    public static int indexOf(byte[] text, byte[] pattern) throws ChmParsingException {
+        int[] next = null;
+        int i = 0, j = -1;
+
+        /* Preprocessing */
+        if (pattern != null && text != null) {
+            next = new int[pattern.length];
+            next[0] = -1;
+        } else
+            throw new ChmParsingException("pattern and/or text should not be null");
+
+        /* Computes a failure function */
+        while (i < pattern.length - 1) {
+            if (j == -1 || pattern[i] == pattern[j]) {
+                i++;
+                j++;
+                if (pattern[i] != pattern[j])
+                    next[i] = j;
+                else
+                    next[i] = next[j];
+            } else
+                j = next[j];
+        }
+
+        /* Reinitializes local variables */
+        i = j = 0;
+
+        /* Matching */
+        while (i < text.length && j < pattern.length) {
+            if (j == -1 || pattern[j] == text[i]) {
+                i++;
+                j++;
+            } else
+                j = next[j];
+        }
+        if (j == pattern.length)
+            return (i - j); // match found at offset i - M
+        else
+            return -1; // not found
+    }
+
+    /**
+     * Searches for some pattern in the directory listing entry list
+     * 
+     * @param list
+     * @param pattern
+     * @return an index, if nothing found returns -1
+     */
+    public static int indexOf(List<DirectoryListingEntry> list, String pattern) {
+        int place = 0;
+        for (DirectoryListingEntry directoryListingEntry : list) {
+            if (directoryListingEntry.toString().contains(pattern)) return place;
+            ++place;
+        }
+        return -1;// not found
+    }
+
+    /*
+     * This method is added because of supporting of Java 5
+     */
+    public static byte[] copyOfRange(byte[] original, int from, int to) {
+        checkCopyOfRangeParams(original, from, to);
+        int newLength = to - from;
+        if (newLength < 0)
+            throw new IllegalArgumentException(from + " > " + to);
+        byte[] copy = new byte[newLength];
+        System.arraycopy(original, from, copy, 0, Math.min(original.length - from, newLength));
+        return copy;
+    }
+
+    private static void checkCopyOfRangeParams(byte[] original, int from, int to) {
+        if (original == null)
+            throw new NullPointerException("array is null");
+        if (from < 0)
+            throw new IllegalArgumentException(from + " should be > 0");
+        if (to < 0)
+            throw new IllegalArgumentException(to + " should be > 0");
+    }
+
+    /*
+     * This method is added because of supporting of Java 5
+     */
+    public static boolean isEmpty(String str) {
+        return str == null || str.length() == 0;
+    }
+
+}

[37/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-office-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-office-bundle/pom.xml b/tika-parser-bundles/tika-parser-office-bundle/pom.xml
index c9db0da..f743aa2 100644
--- a/tika-parser-bundles/tika-parser-office-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-office-bundle/pom.xml
@@ -1,141 +1,141 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-bundles</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-office-bundle</artifactId>
-  <packaging>bundle</packaging>
-  <name>Apache Tika parser office bundle</name>
-  <url>http://tika.apache.org/</url>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-office-module</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-package-bundle</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-web-bundle</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-text-bundle</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.felix</groupId>
-        <artifactId>maven-bundle-plugin</artifactId>
-        <extensions>true</extensions>
-        <configuration>
-          <instructions>
-            <Bundle-Activator>org.apache.tika.module.office.internal.Activator</Bundle-Activator>
-            <Embed-Dependency>
-              tika-parser-office-module;inline=true,
-              commons-lang;inline=true,
-              commons-io;inline=true,
-              commons-codec;inline=true,
-              poi;inline=true,
-              poi-scratchpad;inline=true,
-              poi-ooxml;inline=true,
-              poi-ooxml-schemas;inline=true;
-              jackcess;inline=true,
-              jackcess-encrypt;inline=true,
-              java-libpst;inline=true,
-              curvesapi;inline=true,
-              xmlbeans;inline=true,
-              bcprov-jdk15on;inline=true,
-            </Embed-Dependency> 
-            <Embed-Transitive>true</Embed-Transitive>
-            <Export-Package>
-              org.apache.tika.parser.chm.*,
-              org.apache.tika.parser.mbox.*,
-              org.apache.tika.parser.microsoft.*,
-              org.apache.tika.parser.microsoft.ooxml.*,
-              org.apache.tika.parser.opc.*,
-              org.apache.tika.parser.odf.*,
-              org.apache.tika.parser.opendocument.*,
-              org.apache.tika.parser.rtf.*
-            </Export-Package>
-            <Import-Package>
-              !org.junit,
-              !org.junit.*,
-              !junit.*,
-              *,
-              com.microsoft.schemas.office.powerpoint;resolution:=optional,
-              com.microsoft.schemas.office.word;resolution:=optional,
-              com.sun.javadoc;resolution:=optional,
-              com.sun.xml.bind.marshaller;resolution:=optional,
-              com.sun.xml.internal.bind.marshaller;resolution:=optional,
-              com.sun.msv.datatype;resolution:=optional,
-              com.sun.msv.datatype.xsd;resolution:=optional,
-              com.sun.tools.javadoc;resolution:=optional,
-              org.apache.crimson.jaxp;resolution:=optional,
-              org.apache.jcp.xml.dsig.internal.dom;resolution:=optional,
-              org.apache.tools.ant;resolution:=optional,
-              org.apache.tools.ant.taskdefs;resolution:=optional,
-              org.apache.tools.ant.types;resolution:=optional,
-              org.apache.xml.resolver;resolution:=optional,
-              org.apache.xml.resolver.tools;resolution:=optional,
-              org.apache.xml.security;resolution:=optional,
-              org.apache.xml.security.c14n;resolution:=optional,
-              org.apache.xml.security.utils;resolution:=optional,
-              org.apache.xmlbeans.impl.xpath.saxon;resolution:=optional,
-              org.apache.xmlbeans.impl.xquery.saxon;resolution:=optional,
-              org.bouncycastle.cert;resolution:=optional,
-              org.bouncycastle.cert.jcajce;resolution:=optional,
-              org.bouncycastle.cert.ocsp;resolution:=optional,
-              org.bouncycastle.cms;resolution:=optional,
-              org.bouncycastle.cms.bc;resolution:=optional,
-              org.bouncycastle.operator;resolution:=optional,
-              org.bouncycastle.operator.bc;resolution:=optional,
-              org.bouncycastle.tsp;resolution:=optional,
-              org.etsi.uri.x01903.v14;resolution:=optional,
-              org.openxmlformats.schemas.officeDocument.x2006.math;resolution:=optional,
-              org.openxmlformats.schemas.schemaLibrary.x2006.main;resolution:=optional,
-              org.apache.tika.parser.html.HtmlParser;resolution:=optional,
-              org.apache.tika.parser.pkg.ZipContainerDetector;resolution:=optional
-            </Import-Package>
-          </instructions>
-        </configuration>
-      </plugin>
-      <plugin>
-        <artifactId>maven-failsafe-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <artifactId>maven-assembly-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-bundles</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-office-bundle</artifactId>
+  <packaging>bundle</packaging>
+  <name>Apache Tika parser office bundle</name>
+  <url>http://tika.apache.org/</url>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-office-module</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-package-bundle</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-web-bundle</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-text-bundle</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-bundle-plugin</artifactId>
+        <extensions>true</extensions>
+        <configuration>
+          <instructions>
+            <Bundle-Activator>org.apache.tika.module.office.internal.Activator</Bundle-Activator>
+            <Embed-Dependency>
+              tika-parser-office-module;inline=true,
+              commons-lang;inline=true,
+              commons-io;inline=true,
+              commons-codec;inline=true,
+              poi;inline=true,
+              poi-scratchpad;inline=true,
+              poi-ooxml;inline=true,
+              poi-ooxml-schemas;inline=true;
+              jackcess;inline=true,
+              jackcess-encrypt;inline=true,
+              java-libpst;inline=true,
+              curvesapi;inline=true,
+              xmlbeans;inline=true,
+              bcprov-jdk15on;inline=true,
+            </Embed-Dependency> 
+            <Embed-Transitive>true</Embed-Transitive>
+            <Export-Package>
+              org.apache.tika.parser.chm.*,
+              org.apache.tika.parser.mbox.*,
+              org.apache.tika.parser.microsoft.*,
+              org.apache.tika.parser.microsoft.ooxml.*,
+              org.apache.tika.parser.opc.*,
+              org.apache.tika.parser.odf.*,
+              org.apache.tika.parser.opendocument.*,
+              org.apache.tika.parser.rtf.*
+            </Export-Package>
+            <Import-Package>
+              !org.junit,
+              !org.junit.*,
+              !junit.*,
+              *,
+              com.microsoft.schemas.office.powerpoint;resolution:=optional,
+              com.microsoft.schemas.office.word;resolution:=optional,
+              com.sun.javadoc;resolution:=optional,
+              com.sun.xml.bind.marshaller;resolution:=optional,
+              com.sun.xml.internal.bind.marshaller;resolution:=optional,
+              com.sun.msv.datatype;resolution:=optional,
+              com.sun.msv.datatype.xsd;resolution:=optional,
+              com.sun.tools.javadoc;resolution:=optional,
+              org.apache.crimson.jaxp;resolution:=optional,
+              org.apache.jcp.xml.dsig.internal.dom;resolution:=optional,
+              org.apache.tools.ant;resolution:=optional,
+              org.apache.tools.ant.taskdefs;resolution:=optional,
+              org.apache.tools.ant.types;resolution:=optional,
+              org.apache.xml.resolver;resolution:=optional,
+              org.apache.xml.resolver.tools;resolution:=optional,
+              org.apache.xml.security;resolution:=optional,
+              org.apache.xml.security.c14n;resolution:=optional,
+              org.apache.xml.security.utils;resolution:=optional,
+              org.apache.xmlbeans.impl.xpath.saxon;resolution:=optional,
+              org.apache.xmlbeans.impl.xquery.saxon;resolution:=optional,
+              org.bouncycastle.cert;resolution:=optional,
+              org.bouncycastle.cert.jcajce;resolution:=optional,
+              org.bouncycastle.cert.ocsp;resolution:=optional,
+              org.bouncycastle.cms;resolution:=optional,
+              org.bouncycastle.cms.bc;resolution:=optional,
+              org.bouncycastle.operator;resolution:=optional,
+              org.bouncycastle.operator.bc;resolution:=optional,
+              org.bouncycastle.tsp;resolution:=optional,
+              org.etsi.uri.x01903.v14;resolution:=optional,
+              org.openxmlformats.schemas.officeDocument.x2006.math;resolution:=optional,
+              org.openxmlformats.schemas.schemaLibrary.x2006.main;resolution:=optional,
+              org.apache.tika.parser.html.HtmlParser;resolution:=optional,
+              org.apache.tika.parser.pkg.ZipContainerDetector;resolution:=optional
+            </Import-Package>
+          </instructions>
+        </configuration>
+      </plugin>
+      <plugin>
+        <artifactId>maven-failsafe-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <artifactId>maven-assembly-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-package-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-package-bundle/pom.xml b/tika-parser-bundles/tika-parser-package-bundle/pom.xml
index 4d292d7..d2b55d7 100644
--- a/tika-parser-bundles/tika-parser-package-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-package-bundle/pom.xml
@@ -1,80 +1,80 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-bundles</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-package-bundle</artifactId>
-  <packaging>bundle</packaging>
-  <name>Apache Tika parser package bundle</name>
-  <url>http://tika.apache.org/</url>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-package-module</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.felix</groupId>
-        <artifactId>maven-bundle-plugin</artifactId>
-        <extensions>true</extensions>
-        <configuration>
-          <instructions>
-            <Bundle-Activator>org.apache.tika.module.pkg.internal.Activator</Bundle-Activator>
-            <Embed-Dependency>
-              tika-parser-package-module;inline=true,
-              commons-io;inline=true,
-              commons-codec;inline=true,
-              xz;inline=true,
-              commons-compress;inline=true,
-              junrar;inline=true
-            </Embed-Dependency>
-            <Embed-Transitive>true</Embed-Transitive>
-            <Export-Package>
-              org.apache.tika.parser.pkg.*,
-              org.apache.tika.parser.iwork.*
-            </Export-Package>
-            <Import-Package>
-              *,
-              org.apache.commons.vfs2;resolution:=optional,
-              org.apache.commons.vfs2.provider;resolution:=optional,
-              org.apache.commons.vfs2.util;resolution:=optional,
-              
-            </Import-Package>
-          </instructions>
-        </configuration>
-      </plugin>
-      <plugin>
-        <artifactId>maven-failsafe-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <artifactId>maven-assembly-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-bundles</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-package-bundle</artifactId>
+  <packaging>bundle</packaging>
+  <name>Apache Tika parser package bundle</name>
+  <url>http://tika.apache.org/</url>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-package-module</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-bundle-plugin</artifactId>
+        <extensions>true</extensions>
+        <configuration>
+          <instructions>
+            <Bundle-Activator>org.apache.tika.module.pkg.internal.Activator</Bundle-Activator>
+            <Embed-Dependency>
+              tika-parser-package-module;inline=true,
+              commons-io;inline=true,
+              commons-codec;inline=true,
+              xz;inline=true,
+              commons-compress;inline=true,
+              junrar;inline=true
+            </Embed-Dependency>
+            <Embed-Transitive>true</Embed-Transitive>
+            <Export-Package>
+              org.apache.tika.parser.pkg.*,
+              org.apache.tika.parser.iwork.*
+            </Export-Package>
+            <Import-Package>
+              *,
+              org.apache.commons.vfs2;resolution:=optional,
+              org.apache.commons.vfs2.provider;resolution:=optional,
+              org.apache.commons.vfs2.util;resolution:=optional,
+              
+            </Import-Package>
+          </instructions>
+        </configuration>
+      </plugin>
+      <plugin>
+        <artifactId>maven-failsafe-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <artifactId>maven-assembly-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml b/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml
index 25eef2e..fe1a269 100644
--- a/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml
@@ -1,109 +1,109 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-bundles</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-pdf-bundle</artifactId>
-  <packaging>bundle</packaging>
-  <name>Apache Tika parser pdf bundle</name>
-  <url>http://tika.apache.org/</url>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-pdf-module</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.felix</groupId>
-        <artifactId>maven-bundle-plugin</artifactId>
-        <extensions>true</extensions>
-        <configuration>
-          <instructions>
-            <Bundle-Activator>org.apache.tika.module.pdf.internal.Activator</Bundle-Activator>
-            <Embed-Dependency>
-              tika-parser-pdf-module;inline=true,
-              tika-parser-multimedia-module;inline=true,
-              tika-parser-xmp-commons;inline=true,
-              commons-io;inline=true,
-              pdfbox;inline=true,
-              pdfbox-tools;inline=true,
-              pdfbox-debugger;inline=true,
-              bcmail-jdk15on;inline=true,
-              bcprov-jdk15on;inline=true,
-              fontbox;inline=true,
-              jempbox;inline=true,
-              bcpkix-jdk15on;inline=true
-            </Embed-Dependency> 
-            <Embed-Transitive>true</Embed-Transitive>
-            <Export-Package>
-              org.apache.tika.parser.pdf.*
-            </Export-Package>
-            <Import-Package>
-              *,
-              com.ibm.icu.text;resolution:=optional,
-              com.coremedia.iso;resolution:=optional,
-              com.coremedia.iso.boxes;resolution:=optional,
-              com.coremedia.iso.boxes.apple;resolution:=optional,
-              com.coremedia.iso.boxes.sampleentry;resolution:=optional,
-              com.drew.imaging.jpeg;resolution:=optional,
-              com.drew.imaging.riff;resolution:=optional,
-              com.drew.imaging.tiff;resolution:=optional,
-              com.drew.imaging.webp;resolution:=optional,
-              com.drew.lang;resolution:=optional,
-              com.drew.metadata;resolution:=optional,
-              com.drew.metadata.exif;resolution:=optional,
-              com.drew.metadata.iptc;resolution:=optional,
-              com.drew.metadata.jpeg;resolution:=optional,
-              com.googlecode.mp4parser;resolution:=optional,
-              com.googlecode.mp4parser.boxes.apple;resolution:=optional,
-              com.googlecode.mp4parser.util;resolution:=optional,
-              javax.mail;resolution:=optional,
-              javax.mail.internet;resolution:=optional,
-              org.bouncycastle.cert;resolution:=optional,
-              org.bouncycastle.cert.jcajce;resolution:=optional,
-              org.bouncycastle.cert.ocsp;resolution:=optional,
-              org.bouncycastle.cms.bc;resolution:=optional,
-              org.bouncycastle.operator;resolution:=optional,
-              org.bouncycastle.operator.bc;resolution:=optional,
-              org.bouncycastle.tsp;resolution:=optional,
-              org.apache.commons.exec;resolution:=optional,
-              org.apache.commons.exec.environment;resolution:=optional
-            </Import-Package>
-          </instructions>
-        </configuration>
-      </plugin>
-      <plugin>
-        <artifactId>maven-failsafe-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <artifactId>maven-assembly-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-bundles</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-pdf-bundle</artifactId>
+  <packaging>bundle</packaging>
+  <name>Apache Tika parser pdf bundle</name>
+  <url>http://tika.apache.org/</url>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-pdf-module</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-bundle-plugin</artifactId>
+        <extensions>true</extensions>
+        <configuration>
+          <instructions>
+            <Bundle-Activator>org.apache.tika.module.pdf.internal.Activator</Bundle-Activator>
+            <Embed-Dependency>
+              tika-parser-pdf-module;inline=true,
+              tika-parser-multimedia-module;inline=true,
+              tika-parser-xmp-commons;inline=true,
+              commons-io;inline=true,
+              pdfbox;inline=true,
+              pdfbox-tools;inline=true,
+              pdfbox-debugger;inline=true,
+              bcmail-jdk15on;inline=true,
+              bcprov-jdk15on;inline=true,
+              fontbox;inline=true,
+              jempbox;inline=true,
+              bcpkix-jdk15on;inline=true
+            </Embed-Dependency> 
+            <Embed-Transitive>true</Embed-Transitive>
+            <Export-Package>
+              org.apache.tika.parser.pdf.*
+            </Export-Package>
+            <Import-Package>
+              *,
+              com.ibm.icu.text;resolution:=optional,
+              com.coremedia.iso;resolution:=optional,
+              com.coremedia.iso.boxes;resolution:=optional,
+              com.coremedia.iso.boxes.apple;resolution:=optional,
+              com.coremedia.iso.boxes.sampleentry;resolution:=optional,
+              com.drew.imaging.jpeg;resolution:=optional,
+              com.drew.imaging.riff;resolution:=optional,
+              com.drew.imaging.tiff;resolution:=optional,
+              com.drew.imaging.webp;resolution:=optional,
+              com.drew.lang;resolution:=optional,
+              com.drew.metadata;resolution:=optional,
+              com.drew.metadata.exif;resolution:=optional,
+              com.drew.metadata.iptc;resolution:=optional,
+              com.drew.metadata.jpeg;resolution:=optional,
+              com.googlecode.mp4parser;resolution:=optional,
+              com.googlecode.mp4parser.boxes.apple;resolution:=optional,
+              com.googlecode.mp4parser.util;resolution:=optional,
+              javax.mail;resolution:=optional,
+              javax.mail.internet;resolution:=optional,
+              org.bouncycastle.cert;resolution:=optional,
+              org.bouncycastle.cert.jcajce;resolution:=optional,
+              org.bouncycastle.cert.ocsp;resolution:=optional,
+              org.bouncycastle.cms.bc;resolution:=optional,
+              org.bouncycastle.operator;resolution:=optional,
+              org.bouncycastle.operator.bc;resolution:=optional,
+              org.bouncycastle.tsp;resolution:=optional,
+              org.apache.commons.exec;resolution:=optional,
+              org.apache.commons.exec.environment;resolution:=optional
+            </Import-Package>
+          </instructions>
+        </configuration>
+      </plugin>
+      <plugin>
+        <artifactId>maven-failsafe-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <artifactId>maven-assembly-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-scientific-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-scientific-bundle/pom.xml b/tika-parser-bundles/tika-parser-scientific-bundle/pom.xml
index 578ecab..9408859 100644
--- a/tika-parser-bundles/tika-parser-scientific-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-scientific-bundle/pom.xml
@@ -1,202 +1,202 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-bundles</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-scientific-bundle</artifactId>
-  <packaging>bundle</packaging>
-  <name>Apache Tika parser scientific bundle</name>
-  <url>http://tika.apache.org/</url>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-scientific-module</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.ctakes</groupId>
-      <artifactId>ctakes-core</artifactId>
-      <version>3.2.2</version>
-      <scope>provided</scope>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.felix</groupId>
-        <artifactId>maven-bundle-plugin</artifactId>
-        <extensions>true</extensions>
-        <configuration>
-          <instructions>
-            <Bundle-Activator>org.apache.tika.module.scientific.internal.Activator</Bundle-Activator>
-            <Embed-Dependency>
-              tika-parser-scientific-module;inline=true,
-              commons-csv;inline=true,
-              commons-exec;inline=true,
-              commons-codec;inline=true,
-              commons-io;inline=true,
-              json-simple;inline=true,
-              sis-utility;inline=true,
-              sis-netcdf;inline=true,
-              sis-metadata;inline=true,
-              sis-storage;inline=true,
-              netcdf4;inline=true,
-              grib;inline=true,
-              cdm;inline=true,
-              httpservices;inline=true,
-              jmatio;inline=true,
-              jsr-275;inline=true,
-              jcip-annotations;inline=true,
-              opennlp-tools;inline=true,
-              opennlp-maxent;inline=true,
-              jwnl;inline=true,
-              geoapi;inline=true
-            </Embed-Dependency> 
-            <Embed-Transitive>true</Embed-Transitive>
-            <Export-Package>
-              org.apache.tika.parser.ctakes.*,
-              org.apache.tika.parser.dif.*,
-              org.apache.tika.parser.envi.*,
-              org.apache.tika.parser.gdal.*
-            </Export-Package>
-            <Import-Package>
-              !org.apache.ctakes.*,
-              !org.apache.uima.*,
-              *,
-              colorspace;resolution:=optional,
-              org.apache.sis;resolution:=optional,
-              org.apache.sis.distance;resolution:=optional,
-              org.apache.sis.geometry;resolution:=optional,
-              com.beust.jcommander;resolution:=optional,
-              com.google.common.base;resolution:=optional,
-              com.google.common.math;resolution:=optional,
-              com.google.protobuf;resolution:=optional,
-              ucar.units;resolution:=optional,
-              ucar.httpservices;resolution:=optional,
-              ucar.nc2.util;resolution:=optional,
-              ucar.nc2.util.cache;resolution:=optional,
-              ucar.nc2.dataset;resolution:=optional,
-              ucar.nc2;resolution:=optional,
-              ucar.nc2.constants;resolution:=optional,
-              ucar.nc2.dt;resolution:=optional,
-              ucar.nc2.dt.grid;resolution:=optional,
-              ucar.nc2.ft;resolution:=optional,
-              ucar.nc2.iosp;resolution:=optional,
-              ucar.nc2.iosp.hdf4;resolution:=optional,
-              ucar.nc2.ncml;resolution:=optional,
-              ucar.nc2.stream;resolution:=optional,
-              ucar.nc2.time;resolution:=optional,
-              ucar.nc2.units;resolution:=optional,
-              ucar.nc2.wmo;resolution:=optional,
-              ucar.nc2.write;resolution:=optional,
-              ucar.ma2;resolution:=optional,
-              ucar.grib;resolution:=optional,
-              ucar.grib.grib1;resolution:=optional,
-              ucar.grib.grib2;resolution:=optional,
-              ucar.grid;resolution:=optional,
-              ucar.unidata.geoloc;resolution:=optional,
-              ucar.unidata.geoloc.projection;resolution:=optional,
-              ucar.unidata.geoloc.projection.proj4;resolution:=optional,
-              ucar.unidata.geoloc.projection.sat;resolution:=optional,
-              ucar.unidata.io;resolution:=optional,
-              ucar.unidata.util;resolution:=optional,
-              com.jmatio.io;resolution:=optional,
-              com.sun.jna;resolution:=optional,
-              com.sun.jna.ptr;resolution:=optional,
-              com.sun.xml.bind.marshaller;resolution:=optional,
-              com.sun.xml.internal.bind.marshaller;resolution:=optional,
-              com.sun.msv.datatype;resolution:=optional,
-              com.sun.msv.datatype.xsd;resolution:=optional,
-              com.sun.tools.javadoc;resolution:=optional,
-              sun.misc;resolution:=optional,
-              sun.reflect.generics.reflectiveObjects;resolution:=optional,
-              org.quartz;resolution:=optional,
-              org.quartz.impl;resolution:=optional,
-              icc;resolution:=optional,
-              org.jdom;resolution:=optional,
-              org.jdom.input;resolution:=optional,
-              org.jdom.output;resolution:=optional,
-              org.jdom2;resolution:=optional,
-              org.jdom2.input;resolution:=optional,
-              org.jdom2.input.sax;resolution:=optional,
-              org.jdom2.output;resolution:=optional,
-              org.jdom2.filter;resolution:=optional,
-              javax.measure.converter;resolution:=optional,
-              javax.servlet.annotation;resolution:=optional,
-              javax.servlet;resolution:=optional,
-              javax.servlet.http;resolution:=optional,
-              jj2000.j2k.codestream;resolution:=optional,
-              jj2000.j2k.codestream.reader;resolution:=optional,
-              jj2000.j2k.decoder;resolution:=optional,
-              jj2000.j2k.entropy.decoder;resolution:=optional,
-              jj2000.j2k.fileformat.reader;resolution:=optional,
-              jj2000.j2k.image;resolution:=optional,
-              jj2000.j2k.image.invcomptransf;resolution:=optional,
-              jj2000.j2k.image.output;resolution:=optional,
-              jj2000.j2k.io;resolution:=optional,
-              jj2000.j2k.quantization.dequantizer;resolution:=optional,
-              jj2000.j2k.roi;resolution:=optional,
-              jj2000.j2k.util;resolution:=optional,
-              jj2000.j2k.wavelet.synthesis;resolution:=optional,
-              org.itadaki.bzip2;resolution:=optional,
-              org.jsoup;resolution:=optional,
-              org.jsoup.nodes;resolution:=optional,
-              org.jsoup.select;resolution:=optional,
-              opennlp.maxent;resolution:=optional,
-              opennlp.tools.namefind;resolution:=optional,
-              net.didion.jwnl;resolution:=optional,
-              org.joda.time;resolution:=optional,
-              org.joda.time.chrono;resolution:=optional,
-              org.joda.time.field;resolution:=optional,
-              org.joda.time.format;resolution:=optional,
-              org.apache.http;resolution:=optional,
-              org.apache.http.auth;resolution:=optional,
-              org.apache.http.client;resolution:=optional,
-              org.apache.http.client.entity;resolution:=optional,
-              org.apache.http.client.methods;resolution:=optional,
-              org.apache.http.conn;resolution:=optional,
-              org.apache.http.conn.scheme;resolution:=optional,
-              org.apache.http.cookie;resolution:=optional,
-              org.apache.http.entity;resolution:=optional,
-              org.apache.http.impl.client;resolution:=optional,
-              org.apache.http.impl.conn;resolution:=optional,
-              org.apache.http.message;resolution:=optional,
-              org.apache.http.params;resolution:=optional,
-              org.apache.http.protocol;resolution:=optional,
-              org.apache.http.util;resolution:=optional
-            </Import-Package>
-          </instructions>
-        </configuration>
-      </plugin>
-      <plugin>
-        <artifactId>maven-failsafe-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <artifactId>maven-assembly-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-bundles</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-scientific-bundle</artifactId>
+  <packaging>bundle</packaging>
+  <name>Apache Tika parser scientific bundle</name>
+  <url>http://tika.apache.org/</url>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-scientific-module</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.ctakes</groupId>
+      <artifactId>ctakes-core</artifactId>
+      <version>3.2.2</version>
+      <scope>provided</scope>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-bundle-plugin</artifactId>
+        <extensions>true</extensions>
+        <configuration>
+          <instructions>
+            <Bundle-Activator>org.apache.tika.module.scientific.internal.Activator</Bundle-Activator>
+            <Embed-Dependency>
+              tika-parser-scientific-module;inline=true,
+              commons-csv;inline=true,
+              commons-exec;inline=true,
+              commons-codec;inline=true,
+              commons-io;inline=true,
+              json-simple;inline=true,
+              sis-utility;inline=true,
+              sis-netcdf;inline=true,
+              sis-metadata;inline=true,
+              sis-storage;inline=true,
+              netcdf4;inline=true,
+              grib;inline=true,
+              cdm;inline=true,
+              httpservices;inline=true,
+              jmatio;inline=true,
+              jsr-275;inline=true,
+              jcip-annotations;inline=true,
+              opennlp-tools;inline=true,
+              opennlp-maxent;inline=true,
+              jwnl;inline=true,
+              geoapi;inline=true
+            </Embed-Dependency> 
+            <Embed-Transitive>true</Embed-Transitive>
+            <Export-Package>
+              org.apache.tika.parser.ctakes.*,
+              org.apache.tika.parser.dif.*,
+              org.apache.tika.parser.envi.*,
+              org.apache.tika.parser.gdal.*
+            </Export-Package>
+            <Import-Package>
+              !org.apache.ctakes.*,
+              !org.apache.uima.*,
+              *,
+              colorspace;resolution:=optional,
+              org.apache.sis;resolution:=optional,
+              org.apache.sis.distance;resolution:=optional,
+              org.apache.sis.geometry;resolution:=optional,
+              com.beust.jcommander;resolution:=optional,
+              com.google.common.base;resolution:=optional,
+              com.google.common.math;resolution:=optional,
+              com.google.protobuf;resolution:=optional,
+              ucar.units;resolution:=optional,
+              ucar.httpservices;resolution:=optional,
+              ucar.nc2.util;resolution:=optional,
+              ucar.nc2.util.cache;resolution:=optional,
+              ucar.nc2.dataset;resolution:=optional,
+              ucar.nc2;resolution:=optional,
+              ucar.nc2.constants;resolution:=optional,
+              ucar.nc2.dt;resolution:=optional,
+              ucar.nc2.dt.grid;resolution:=optional,
+              ucar.nc2.ft;resolution:=optional,
+              ucar.nc2.iosp;resolution:=optional,
+              ucar.nc2.iosp.hdf4;resolution:=optional,
+              ucar.nc2.ncml;resolution:=optional,
+              ucar.nc2.stream;resolution:=optional,
+              ucar.nc2.time;resolution:=optional,
+              ucar.nc2.units;resolution:=optional,
+              ucar.nc2.wmo;resolution:=optional,
+              ucar.nc2.write;resolution:=optional,
+              ucar.ma2;resolution:=optional,
+              ucar.grib;resolution:=optional,
+              ucar.grib.grib1;resolution:=optional,
+              ucar.grib.grib2;resolution:=optional,
+              ucar.grid;resolution:=optional,
+              ucar.unidata.geoloc;resolution:=optional,
+              ucar.unidata.geoloc.projection;resolution:=optional,
+              ucar.unidata.geoloc.projection.proj4;resolution:=optional,
+              ucar.unidata.geoloc.projection.sat;resolution:=optional,
+              ucar.unidata.io;resolution:=optional,
+              ucar.unidata.util;resolution:=optional,
+              com.jmatio.io;resolution:=optional,
+              com.sun.jna;resolution:=optional,
+              com.sun.jna.ptr;resolution:=optional,
+              com.sun.xml.bind.marshaller;resolution:=optional,
+              com.sun.xml.internal.bind.marshaller;resolution:=optional,
+              com.sun.msv.datatype;resolution:=optional,
+              com.sun.msv.datatype.xsd;resolution:=optional,
+              com.sun.tools.javadoc;resolution:=optional,
+              sun.misc;resolution:=optional,
+              sun.reflect.generics.reflectiveObjects;resolution:=optional,
+              org.quartz;resolution:=optional,
+              org.quartz.impl;resolution:=optional,
+              icc;resolution:=optional,
+              org.jdom;resolution:=optional,
+              org.jdom.input;resolution:=optional,
+              org.jdom.output;resolution:=optional,
+              org.jdom2;resolution:=optional,
+              org.jdom2.input;resolution:=optional,
+              org.jdom2.input.sax;resolution:=optional,
+              org.jdom2.output;resolution:=optional,
+              org.jdom2.filter;resolution:=optional,
+              javax.measure.converter;resolution:=optional,
+              javax.servlet.annotation;resolution:=optional,
+              javax.servlet;resolution:=optional,
+              javax.servlet.http;resolution:=optional,
+              jj2000.j2k.codestream;resolution:=optional,
+              jj2000.j2k.codestream.reader;resolution:=optional,
+              jj2000.j2k.decoder;resolution:=optional,
+              jj2000.j2k.entropy.decoder;resolution:=optional,
+              jj2000.j2k.fileformat.reader;resolution:=optional,
+              jj2000.j2k.image;resolution:=optional,
+              jj2000.j2k.image.invcomptransf;resolution:=optional,
+              jj2000.j2k.image.output;resolution:=optional,
+              jj2000.j2k.io;resolution:=optional,
+              jj2000.j2k.quantization.dequantizer;resolution:=optional,
+              jj2000.j2k.roi;resolution:=optional,
+              jj2000.j2k.util;resolution:=optional,
+              jj2000.j2k.wavelet.synthesis;resolution:=optional,
+              org.itadaki.bzip2;resolution:=optional,
+              org.jsoup;resolution:=optional,
+              org.jsoup.nodes;resolution:=optional,
+              org.jsoup.select;resolution:=optional,
+              opennlp.maxent;resolution:=optional,
+              opennlp.tools.namefind;resolution:=optional,
+              net.didion.jwnl;resolution:=optional,
+              org.joda.time;resolution:=optional,
+              org.joda.time.chrono;resolution:=optional,
+              org.joda.time.field;resolution:=optional,
+              org.joda.time.format;resolution:=optional,
+              org.apache.http;resolution:=optional,
+              org.apache.http.auth;resolution:=optional,
+              org.apache.http.client;resolution:=optional,
+              org.apache.http.client.entity;resolution:=optional,
+              org.apache.http.client.methods;resolution:=optional,
+              org.apache.http.conn;resolution:=optional,
+              org.apache.http.conn.scheme;resolution:=optional,
+              org.apache.http.cookie;resolution:=optional,
+              org.apache.http.entity;resolution:=optional,
+              org.apache.http.impl.client;resolution:=optional,
+              org.apache.http.impl.conn;resolution:=optional,
+              org.apache.http.message;resolution:=optional,
+              org.apache.http.params;resolution:=optional,
+              org.apache.http.protocol;resolution:=optional,
+              org.apache.http.util;resolution:=optional
+            </Import-Package>
+          </instructions>
+        </configuration>
+      </plugin>
+      <plugin>
+        <artifactId>maven-failsafe-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <artifactId>maven-assembly-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-text-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-text-bundle/pom.xml b/tika-parser-bundles/tika-parser-text-bundle/pom.xml
index bf4e14a..31d06ac 100644
--- a/tika-parser-bundles/tika-parser-text-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-text-bundle/pom.xml
@@ -1,79 +1,79 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-bundles</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-text-bundle</artifactId>
-  <packaging>bundle</packaging>
-  <name>Apache Tika parser text bundle</name>
-  <url>http://tika.apache.org/</url>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-text-module</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.felix</groupId>
-        <artifactId>maven-bundle-plugin</artifactId>
-        <extensions>true</extensions>
-        <configuration>
-          <instructions>
-            <Bundle-Activator>org.apache.tika.module.text.internal.Activator</Bundle-Activator>
-            <Embed-Dependency>
-              tika-parser-text-module;inline=true,
-              juniversalchardet;inline=true,
-              commons-codec;inline=true,
-              commons-io;inline=true
-            </Embed-Dependency> 
-            <Embed-Transitive>true</Embed-Transitive>
-            <Export-Package>
-              org.apache.tika.parser.strings.*,
-              org.apache.tika.parser.txt.*,
-              org.apache.tika.parser.audio.*,
-              org.apache.tika.parser.xml.*
-            </Export-Package>
-            <Import-Package>
-              *,
-              javax.servlet.annotation;resolution:=optional,
-              javax.servlet;resolution:=optional,
-              javax.servlet.http;resolution:=optional
-            </Import-Package>
-          </instructions>
-        </configuration>
-      </plugin>
-      <plugin>
-        <artifactId>maven-failsafe-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <artifactId>maven-assembly-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-bundles</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-text-bundle</artifactId>
+  <packaging>bundle</packaging>
+  <name>Apache Tika parser text bundle</name>
+  <url>http://tika.apache.org/</url>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-text-module</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-bundle-plugin</artifactId>
+        <extensions>true</extensions>
+        <configuration>
+          <instructions>
+            <Bundle-Activator>org.apache.tika.module.text.internal.Activator</Bundle-Activator>
+            <Embed-Dependency>
+              tika-parser-text-module;inline=true,
+              juniversalchardet;inline=true,
+              commons-codec;inline=true,
+              commons-io;inline=true
+            </Embed-Dependency> 
+            <Embed-Transitive>true</Embed-Transitive>
+            <Export-Package>
+              org.apache.tika.parser.strings.*,
+              org.apache.tika.parser.txt.*,
+              org.apache.tika.parser.audio.*,
+              org.apache.tika.parser.xml.*
+            </Export-Package>
+            <Import-Package>
+              *,
+              javax.servlet.annotation;resolution:=optional,
+              javax.servlet;resolution:=optional,
+              javax.servlet.http;resolution:=optional
+            </Import-Package>
+          </instructions>
+        </configuration>
+      </plugin>
+      <plugin>
+        <artifactId>maven-failsafe-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <artifactId>maven-assembly-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-web-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-web-bundle/pom.xml b/tika-parser-bundles/tika-parser-web-bundle/pom.xml
index 72d22da..a23267d 100644
--- a/tika-parser-bundles/tika-parser-web-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-web-bundle/pom.xml
@@ -1,93 +1,93 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-bundles</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-web-bundle</artifactId>
-  <packaging>bundle</packaging>
-  <name>Apache Tika parser web bundle</name>
-  <url>http://tika.apache.org/</url>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-web-module</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.felix</groupId>
-        <artifactId>maven-bundle-plugin</artifactId>
-        <extensions>true</extensions>
-        <configuration>
-          <instructions>
-            <Bundle-Activator>org.apache.tika.module.web.internal.Activator</Bundle-Activator>
-            <Embed-Dependency>
-              tika-parser-web-module;inline=true,
-              tagsoup;inline=true,
-              boilerpipe;inline=true,
-              rome;inline=true,
-              rome-utils;inline=true,
-              apache-mime4j-core;inline=true,
-              apache-mime4j-dom;inline=true,
-              commons-io;inline=true
-            </Embed-Dependency> 
-            <Embed-Transitive>true</Embed-Transitive>
-            <Export-Package>
-              org.apache.tika.parser.feed.*,
-              org.apache.tika.parser.html.*,
-              org.apache.tika.parser.iptc.*,
-              org.apache.tika.parser.mail.*
-            </Export-Package>
-            <Import-Package>
-              *,
-              org.apache.xerces.parsers;resolution:=optional,
-              org.apache.xerces.util;resolution:=optional,
-              org.apache.xerces.xni;resolution:=optional,
-              org.apache.xerces.xni.parser;resolution:=optional,
-              org.cyberneko.html.xercesbridge;resolution:=optional,
-              org.jdom;resolution:=optional,
-              org.jdom.input;resolution:=optional,
-              org.jdom.output;resolution:=optional,
-              org.jdom2;resolution:=optional,
-              org.jdom2.input;resolution:=optional,
-              org.jdom2.input.sax;resolution:=optional,
-              org.jdom2.output;resolution:=optional,
-              org.jdom2.filter;resolution:=optional,
-            </Import-Package>
-          </instructions>
-        </configuration>
-      </plugin>
-      <plugin>
-        <artifactId>maven-failsafe-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <artifactId>maven-assembly-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-bundles</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-web-bundle</artifactId>
+  <packaging>bundle</packaging>
+  <name>Apache Tika parser web bundle</name>
+  <url>http://tika.apache.org/</url>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-web-module</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-bundle-plugin</artifactId>
+        <extensions>true</extensions>
+        <configuration>
+          <instructions>
+            <Bundle-Activator>org.apache.tika.module.web.internal.Activator</Bundle-Activator>
+            <Embed-Dependency>
+              tika-parser-web-module;inline=true,
+              tagsoup;inline=true,
+              boilerpipe;inline=true,
+              rome;inline=true,
+              rome-utils;inline=true,
+              apache-mime4j-core;inline=true,
+              apache-mime4j-dom;inline=true,
+              commons-io;inline=true
+            </Embed-Dependency> 
+            <Embed-Transitive>true</Embed-Transitive>
+            <Export-Package>
+              org.apache.tika.parser.feed.*,
+              org.apache.tika.parser.html.*,
+              org.apache.tika.parser.iptc.*,
+              org.apache.tika.parser.mail.*
+            </Export-Package>
+            <Import-Package>
+              *,
+              org.apache.xerces.parsers;resolution:=optional,
+              org.apache.xerces.util;resolution:=optional,
+              org.apache.xerces.xni;resolution:=optional,
+              org.apache.xerces.xni.parser;resolution:=optional,
+              org.cyberneko.html.xercesbridge;resolution:=optional,
+              org.jdom;resolution:=optional,
+              org.jdom.input;resolution:=optional,
+              org.jdom.output;resolution:=optional,
+              org.jdom2;resolution:=optional,
+              org.jdom2.input;resolution:=optional,
+              org.jdom2.input.sax;resolution:=optional,
+              org.jdom2.output;resolution:=optional,
+              org.jdom2.filter;resolution:=optional,
+            </Import-Package>
+          </instructions>
+        </configuration>
+      </plugin>
+      <plugin>
+        <artifactId>maven-failsafe-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <artifactId>maven-assembly-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/pom.xml b/tika-parser-modules/pom.xml
index eca38f1..6912f8b 100644
--- a/tika-parser-modules/pom.xml
+++ b/tika-parser-modules/pom.xml
@@ -1,206 +1,206 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
-  Licensed to the Apache Software Foundation (ASF) under one
-  or more contributor license agreements.  See the NOTICE file
-  distributed with this work for additional information
-  regarding copyright ownership.  The ASF licenses this file
-  to you under the Apache License, Version 2.0 (the
-  "License"); you may not use this file except in compliance
-  with the License.  You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing,
-  software distributed under the License is distributed on an
-  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-  KIND, either express or implied.  See the License for the
-  specific language governing permissions and limitations
-  under the License.
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0"
-    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-  
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parent</artifactId>
-    <version>2.0-SNAPSHOT</version>
-     <relativePath>../tika-parent/pom.xml</relativePath>
-  </parent>
-
-  <artifactId>tika-parser-modules</artifactId>
-  <packaging>pom</packaging>
-  <name>Apache Tika parser modules</name>
-  <url>http://tika.apache.org/</url>
-  
-  <properties>
-    <poi.version>3.15-beta1</poi.version>
-    <!-- NOTE: sync codec version with POI -->
-    <codec.version>1.10</codec.version>
-    <pdfbox.version>2.0.2</pdfbox.version>
-    <jempbox.version>1.8.12</jempbox.version>
-    <!-- used by POI, PDFBox and Jackcess ...try to sync -->
-    <bouncycastle.version>1.54</bouncycastle.version>
-    <commons.exec>1.3</commons.exec>
-  </properties>
-
-  <modules>
-    <module>tika-parser-advanced-module</module>
-    <module>tika-parser-cad-module</module>
-    <module>tika-parser-code-module</module>
-    <module>tika-parser-crypto-module</module>
-    <module>tika-parser-database-module</module>
-    <module>tika-parser-ebook-module</module>
-    <module>tika-parser-journal-module</module>
-    <module>tika-parser-multimedia-module</module>
-    <module>tika-parser-office-module</module>
-    <module>tika-parser-package-module</module>
-    <module>tika-parser-pdf-module</module>
-    <module>tika-parser-scientific-module</module>
-    <module>tika-parser-text-module</module>
-    <module>tika-parser-web-module</module>
-    <module>tika-parser-xmp-commons</module>
-  </modules>
-
-  <dependencies>
-      <dependency>
-      <groupId>org.osgi</groupId>
-      <artifactId>org.osgi.core</artifactId>
-      <scope>provided</scope>
-      <optional>true</optional>
-    </dependency>
-    <dependency>
-      <groupId>org.osgi</groupId>
-      <artifactId>org.osgi.compendium</artifactId>
-      <scope>provided</scope>
-      <optional>true</optional>
-    </dependency>
-    <!-- Test dependencies -->
-    <dependency>
-      <groupId>org.apache.tika</groupId>
-      <artifactId>tika-core</artifactId>
-      <version>${project.version}</version>
-      <type>test-jar</type>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-test-resources</artifactId>
-      <version>${project.version}</version>
-      <type>test-jar</type>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>junit</groupId>
-      <artifactId>junit</artifactId>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.mockito</groupId>
-      <artifactId>mockito-core</artifactId>
-      <version>1.7</version>
-      <scope>test</scope>
-    </dependency>
-     <dependency>
-      <groupId>org.ops4j.pax.exam</groupId>
-      <artifactId>pax-exam-junit4</artifactId>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.ops4j.pax.exam</groupId>
-      <artifactId>pax-exam-container-native</artifactId>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.felix</groupId>
-      <artifactId>org.apache.felix.framework</artifactId>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.ops4j.pax.exam</groupId>
-      <artifactId>pax-exam-link-assembly</artifactId>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.ops4j.pax.url</groupId>
-      <artifactId>pax-url-aether</artifactId>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.slf4j</groupId>
-      <artifactId>slf4j-simple</artifactId>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>javax.inject</groupId>
-      <artifactId>javax.inject</artifactId>
-      <scope>test</scope>
-    </dependency>
-  </dependencies> 
-  <build>
-    <pluginManagement>
-      <plugins>
-        <plugin>
-          <groupId>org.apache.maven.plugins</groupId>
-          <artifactId>maven-jar-plugin</artifactId>
-          <executions>
-            <execution>
-              <phase>package</phase>
-              <goals>
-                <goal>jar</goal>
-              </goals>
-              <configuration>
-                <useDefaultManifestFile>true</useDefaultManifestFile>
-                <includes>
-                  <include>org/apache/tika/**</include>
-                  <include>META-INF/**</include>
-                </includes>
-              </configuration>
-            </execution>
-          </executions>
-        </plugin>
-        <plugin>
-          <artifactId>maven-failsafe-plugin</artifactId>
-          <executions>
-            <execution>
-              <goals>
-                <goal>integration-test</goal>
-                <goal>verify</goal>
-              </goals>
-            </execution>
-          </executions>
-          <configuration>
-            <systemPropertyVariables>
-              <org.ops4j.pax.logging.DefaultServiceLog.level>
-                WARN
-              </org.ops4j.pax.logging.DefaultServiceLog.level>
-            </systemPropertyVariables>
-            <systemProperties>
-              <property>
-                <name>project.bundle.file</name>
-                <value>target/${project.build.finalName}-bundle.jar</value>
-              </property>
-            </systemProperties>
-          </configuration>
-        </plugin>
-        <plugin>
-          <artifactId>maven-assembly-plugin</artifactId>
-          <executions>
-            <execution>
-              <phase>pre-integration-test</phase>
-              <goals>
-                <goal>single</goal>
-              </goals>
-              <configuration>
-                <descriptor>test-bundles.xml</descriptor>
-                <finalName>test</finalName>
-                <attach>false</attach>
-              </configuration>
-            </execution>
-          </executions>
-        </plugin>
-      </plugins>
-    </pluginManagement>
-  </build>
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parent</artifactId>
+    <version>2.0-SNAPSHOT</version>
+     <relativePath>../tika-parent/pom.xml</relativePath>
+  </parent>
+
+  <artifactId>tika-parser-modules</artifactId>
+  <packaging>pom</packaging>
+  <name>Apache Tika parser modules</name>
+  <url>http://tika.apache.org/</url>
+  
+  <properties>
+    <poi.version>3.15-beta1</poi.version>
+    <!-- NOTE: sync codec version with POI -->
+    <codec.version>1.10</codec.version>
+    <pdfbox.version>2.0.2</pdfbox.version>
+    <jempbox.version>1.8.12</jempbox.version>
+    <!-- used by POI, PDFBox and Jackcess ...try to sync -->
+    <bouncycastle.version>1.54</bouncycastle.version>
+    <commons.exec>1.3</commons.exec>
+  </properties>
+
+  <modules>
+    <module>tika-parser-advanced-module</module>
+    <module>tika-parser-cad-module</module>
+    <module>tika-parser-code-module</module>
+    <module>tika-parser-crypto-module</module>
+    <module>tika-parser-database-module</module>
+    <module>tika-parser-ebook-module</module>
+    <module>tika-parser-journal-module</module>
+    <module>tika-parser-multimedia-module</module>
+    <module>tika-parser-office-module</module>
+    <module>tika-parser-package-module</module>
+    <module>tika-parser-pdf-module</module>
+    <module>tika-parser-scientific-module</module>
+    <module>tika-parser-text-module</module>
+    <module>tika-parser-web-module</module>
+    <module>tika-parser-xmp-commons</module>
+  </modules>
+
+  <dependencies>
+      <dependency>
+      <groupId>org.osgi</groupId>
+      <artifactId>org.osgi.core</artifactId>
+      <scope>provided</scope>
+      <optional>true</optional>
+    </dependency>
+    <dependency>
+      <groupId>org.osgi</groupId>
+      <artifactId>org.osgi.compendium</artifactId>
+      <scope>provided</scope>
+      <optional>true</optional>
+    </dependency>
+    <!-- Test dependencies -->
+    <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-test-resources</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.mockito</groupId>
+      <artifactId>mockito-core</artifactId>
+      <version>1.7</version>
+      <scope>test</scope>
+    </dependency>
+     <dependency>
+      <groupId>org.ops4j.pax.exam</groupId>
+      <artifactId>pax-exam-junit4</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.ops4j.pax.exam</groupId>
+      <artifactId>pax-exam-container-native</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.felix</groupId>
+      <artifactId>org.apache.felix.framework</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.ops4j.pax.exam</groupId>
+      <artifactId>pax-exam-link-assembly</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.ops4j.pax.url</groupId>
+      <artifactId>pax-url-aether</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-simple</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>javax.inject</groupId>
+      <artifactId>javax.inject</artifactId>
+      <scope>test</scope>
+    </dependency>
+  </dependencies> 
+  <build>
+    <pluginManagement>
+      <plugins>
+        <plugin>
+          <groupId>org.apache.maven.plugins</groupId>
+          <artifactId>maven-jar-plugin</artifactId>
+          <executions>
+            <execution>
+              <phase>package</phase>
+              <goals>
+                <goal>jar</goal>
+              </goals>
+              <configuration>
+                <useDefaultManifestFile>true</useDefaultManifestFile>
+                <includes>
+                  <include>org/apache/tika/**</include>
+                  <include>META-INF/**</include>
+                </includes>
+              </configuration>
+            </execution>
+          </executions>
+        </plugin>
+        <plugin>
+          <artifactId>maven-failsafe-plugin</artifactId>
+          <executions>
+            <execution>
+              <goals>
+                <goal>integration-test</goal>
+                <goal>verify</goal>
+              </goals>
+            </execution>
+          </executions>
+          <configuration>
+            <systemPropertyVariables>
+              <org.ops4j.pax.logging.DefaultServiceLog.level>
+                WARN
+              </org.ops4j.pax.logging.DefaultServiceLog.level>
+            </systemPropertyVariables>
+            <systemProperties>
+              <property>
+                <name>project.bundle.file</name>
+                <value>target/${project.build.finalName}-bundle.jar</value>
+              </property>
+            </systemProperties>
+          </configuration>
+        </plugin>
+        <plugin>
+          <artifactId>maven-assembly-plugin</artifactId>
+          <executions>
+            <execution>
+              <phase>pre-integration-test</phase>
+              <goals>
+                <goal>single</goal>
+              </goals>
+              <configuration>
+                <descriptor>test-bundles.xml</descriptor>
+                <finalName>test</finalName>
+                <attach>false</attach>
+              </configuration>
+            </execution>
+          </executions>
+        </plugin>
+      </plugins>
+    </pluginManagement>
+  </build>
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-advanced-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-advanced-module/pom.xml b/tika-parser-modules/tika-parser-advanced-module/pom.xml
index 2e02904..3263fab 100644
--- a/tika-parser-modules/tika-parser-advanced-module/pom.xml
+++ b/tika-parser-modules/tika-parser-advanced-module/pom.xml
@@ -1,69 +1,69 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-modules</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-advanced-module</artifactId>
-  <name>Apache Tika parser advanced module</name>
-  <url>http://tika.apache.org/</url>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-core</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>commons-io</groupId>
-      <artifactId>commons-io</artifactId>
-      <version>${commons.io.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.opennlp</groupId>
-      <artifactId>opennlp-tools</artifactId>
-      <version>1.5.3</version>
-    </dependency>
-    <dependency>
-      <groupId>org.json</groupId>
-      <artifactId>json</artifactId>
-      <version>20140107</version>
-    </dependency>
-    <!-- Apache cTAKES -->
-    <dependency>
-      <groupId>org.apache.ctakes</groupId>
-      <artifactId>ctakes-core</artifactId>
-      <version>3.2.2</version>
-      <scope>provided</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.slf4j</groupId>
-      <artifactId>slf4j-log4j12</artifactId>
-    </dependency>
-  </dependencies>
-  
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-modules</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-advanced-module</artifactId>
+  <name>Apache Tika parser advanced module</name>
+  <url>http://tika.apache.org/</url>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <version>${commons.io.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.opennlp</groupId>
+      <artifactId>opennlp-tools</artifactId>
+      <version>1.5.3</version>
+    </dependency>
+    <dependency>
+      <groupId>org.json</groupId>
+      <artifactId>json</artifactId>
+      <version>20140107</version>
+    </dependency>
+    <!-- Apache cTAKES -->
+    <dependency>
+      <groupId>org.apache.ctakes</groupId>
+      <artifactId>ctakes-core</artifactId>
+      <version>3.2.2</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-log4j12</artifactId>
+    </dependency>
+  </dependencies>
+  
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
\ No newline at end of file

[02/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index fadb6e9..3adaeee 100644
--- a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -1,1131 +1,1131 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import static java.nio.charset.StandardCharsets.ISO_8859_1;
-import static java.nio.charset.StandardCharsets.US_ASCII;
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-
-import javax.xml.transform.OutputKeys;
-import javax.xml.transform.sax.SAXTransformerFactory;
-import javax.xml.transform.sax.TransformerHandler;
-import javax.xml.transform.stream.StreamResult;
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.StringWriter;
-import java.io.Writer;
-import java.nio.charset.StandardCharsets;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.regex.Pattern;
-
-import org.apache.tika.Tika;
-import org.apache.tika.TikaTest;
-import org.apache.tika.detect.EncodingDetector;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Geographic;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.LinkContentHandler;
-import org.apache.tika.sax.TeeContentHandler;
-import org.ccil.cowan.tagsoup.HTMLSchema;
-import org.ccil.cowan.tagsoup.Schema;
-import org.junit.Ignore;
-import org.junit.Test;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.Locator;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-public class HtmlParserTest extends TikaTest {
-
-    @Test
-    public void testParseAscii() throws Exception {
-        String path = "/test-documents/testHTML.html";
-        final StringWriter href = new StringWriter();
-        final StringWriter name = new StringWriter();
-        ContentHandler body = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-        try (InputStream stream = HtmlParserTest.class.getResourceAsStream(path)) {
-            ContentHandler link = new DefaultHandler() {
-                @Override
-                public void startElement(
-                        String u, String l, String n, Attributes a)
-                        throws SAXException {
-                    if ("a".equals(l)) {
-                        if (a.getValue("href") != null) {
-                            href.append(a.getValue("href"));
-                        } else if (a.getValue("name") != null) {
-                            name.append(a.getValue("name"));
-                        }
-                    }
-                }
-            };
-            new HtmlParser().parse(
-                    stream, new TeeContentHandler(body, link),
-                    metadata, new ParseContext());
-        }
-
-        assertEquals(
-                "Title : Test Indexation Html", metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("Tika Developers", metadata.get("Author"));
-        assertEquals("5", metadata.get("refresh"));
-
-        assertEquals("51.2312", metadata.get(Geographic.LATITUDE));
-        assertEquals("-5.1987", metadata.get(Geographic.LONGITUDE));
-
-        assertEquals("http://www.apache.org/", href.toString());
-        assertEquals("test-anchor", name.toString());
-
-        String content = body.toString();
-        assertTrue(
-                "Did not contain expected text:" + "Test Indexation Html",
-                content.contains("Test Indexation Html"));
-        assertTrue(
-                "Did not contain expected text:" + "Indexation du fichier",
-                content.contains("Indexation du fichier"));
-    }
-
-    @Test
-    @Ignore("The file 'testXHTML_utf8.html' is not available fo testing")
-    public void XtestParseUTF8() throws IOException, SAXException, TikaException {
-        String path = "/test-documents/testXHTML_utf8.html";
-        Metadata metadata = new Metadata();
-        String content = new Tika().parseToString(
-                HtmlParserTest.class.getResourceAsStream(path), metadata);
-
-        assertTrue("Did not contain expected text:"
-                + "Title : Tilte with UTF-8 chars \u221a\u2202\u221a�\u221a\u2022", content
-                .contains("Title : Tilte with UTF-8 chars \u221a\u2202\u221a�\u221a\u2022"));
-
-        assertTrue("Did not contain expected text:"
-                + "Content with UTF-8 chars", content
-                .contains("Content with UTF-8 chars"));
-
-        assertTrue("Did not contain expected text:" + "\u221a\u2022\u221a�\u221a\u2202", content
-                .contains("\u221a\u2022\u221a�\u221a\u2202"));
-    }
-
-    @Test
-    public void testXhtmlParsing() throws Exception {
-        String path = "/test-documents/testXHTML.html";
-        Metadata metadata = new Metadata();
-        String content = new Tika().parseToString(
-                HtmlParserTest.class.getResourceAsStream(path), metadata);
-
-        //can't specify charset because default differs between OS's
-        assertTrue(metadata.get(Metadata.CONTENT_TYPE).startsWith("application/xhtml+xml; charset="));
-        assertEquals("XHTML test document", metadata.get(TikaCoreProperties.TITLE));
-
-        assertEquals("Tika Developers", metadata.get("Author"));
-        assertEquals("5", metadata.get("refresh"));
-        assertContains("ability of Apache Tika", content);
-        assertContains("extract content", content);
-        assertContains("an XHTML document", content);
-    }
-
-    @Test
-    public void testParseEmpty() throws Exception {
-        ContentHandler handler = new BodyContentHandler();
-        new HtmlParser().parse(
-                new ByteArrayInputStream(new byte[0]),
-                handler, new Metadata(), new ParseContext());
-        assertEquals("", handler.toString());
-    }
-
-    /**
-     * Test case for TIKA-210
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a>
-     */
-    @Test
-    public void testCharactersDirectlyUnderBodyElement() throws Exception {
-        String test = "<html><body>test</body></html>";
-        String content = new Tika().parseToString(
-                new ByteArrayInputStream(test.getBytes(UTF_8)));
-        assertEquals("test", content);
-    }
-
-    /**
-     * Test case for TIKA-287
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-287">TIKA-287</a>
-     */
-    @Test
-    public void testBaseHref() throws Exception {
-        assertRelativeLink(
-                "http://lucene.apache.org/tika/",
-                "http://lucene.apache.org/", "tika/");
-
-        assertRelativeLink(
-                "http://domain.com/?pid=1",
-                "http://domain.com", "?pid=1");
-        assertRelativeLink(
-                "http://domain.com/?pid=2",
-                "http://domain.com?pid=1", "?pid=2");
-
-        assertRelativeLink(
-                "http://domain.com/file.html",
-                "http://domain.com/path/", "/file.html");
-        assertRelativeLink(
-                "http://domain.com/path/file.html",
-                "http://domain.com/path/", "./file.html");
-        assertRelativeLink(
-                "http://domain.com/path/file.html",
-                "http://domain.com/path/", "file.html");
-
-        assertRelativeLink(
-                "http://domain2.com/newpath",
-                "http://domain.com/path/to/file", "http://domain2.com/newpath");
-
-        // See http://www.communities.hp.com/securitysoftware/blogs/jeff/archive/2007/12/19/RFC-1808-vs-2396-vs-3986_3A00_-Browsers-vs.-programing-languages.aspx
-        // Also http://www.ietf.org/rfc/rfc3986.txt
-        // Also http://issues.apache.org/jira/browse/NUTCH-566
-        // Also http://issues.apache.org/jira/browse/NUTCH-436
-        assertRelativeLink(
-                "http://domain.com/path/?pid=1",
-                "http://domain.com/path/", "?pid=1");
-        assertRelativeLink(
-                "http://domain.com/file?pid=1",
-                "http://domain.com/file", "?pid=1");
-        assertRelativeLink(
-                "http://domain.com/path/d;p?pid=1",
-                "http://domain.com/path/d;p?q#f", "?pid=1");
-    }
-
-    private void assertRelativeLink(String url, String base, String relative)
-            throws Exception {
-        String test =
-                "<html><head><base href=\"" + base + "\"></head>"
-                        + "<body><a href=\"" + relative + "\">test</a></body></html>";
-        final List<String> links = new ArrayList<String>();
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test.getBytes(UTF_8)),
-                new DefaultHandler() {
-                    @Override
-                    public void startElement(
-                            String u, String l, String name, Attributes atts) {
-                        if (name.equals("a") && atts.getValue("", "href") != null) {
-                            links.add(atts.getValue("", "href"));
-                        }
-                    }
-                },
-                new Metadata(),
-                new ParseContext());
-        assertEquals(1, links.size());
-        assertEquals(url, links.get(0));
-    }
-
-    /**
-     * Test case for TIKA-268
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-268">TIKA-268</a>
-     */
-    @Test
-    public void testWhitespaceBetweenTableCells() throws Exception {
-        String test =
-                "<html><body><table><tr><td>a</td><td>b</td></table></body></html>";
-        String content = new Tika().parseToString(
-                new ByteArrayInputStream(test.getBytes(UTF_8)));
-        assertContains("a", content);
-        assertContains("b", content);
-        assertFalse(content.contains("ab"));
-    }
-
-    /**
-     * Test case for TIKA-332
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-332">TIKA-332</a>
-     */
-    @Test
-    public void testHttpEquivCharset() throws Exception {
-        String test =
-                "<html><head><meta http-equiv=\"content-type\""
-                        + " content=\"text/html; charset=ISO-8859-1\" />"
-                        + "<title>the name is \u00e1ndre</title>"
-                        + "</head><body></body></html>";
-        Metadata metadata = new Metadata();
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test.getBytes(ISO_8859_1)),
-                new BodyContentHandler(), metadata, new ParseContext());
-        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
-    }
-
-    /**
-     * Test case for TIKA-892
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-892">TIKA-892</a>
-     */
-    @Test
-    public void testHtml5Charset() throws Exception {
-        String test =
-                "<html><head><meta charset=\"ISO-8859-15\" />"
-                        + "<title>the name is \u00e1ndre</title>"
-                        + "</head><body></body></html>";
-        Metadata metadata = new Metadata();
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test.getBytes(ISO_8859_1)),
-                new BodyContentHandler(), metadata, new ParseContext());
-        assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
-    }
-
-    /**
-     * Test case for TIKA-334
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-334">TIKA-334</a>
-     */
-    @Test
-    public void testDetectOfCharset() throws Exception {
-        String test =
-                "<html><head><title>\u017d</title></head><body></body></html>";
-        Metadata metadata = new Metadata();
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test.getBytes(UTF_8)),
-                new BodyContentHandler(), metadata, new ParseContext());
-        assertEquals("\u017d", metadata.get(TikaCoreProperties.TITLE));
-    }
-
-    /**
-     * Test case for TIKA-341
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
-     */
-    @Test
-    public void testUsingCharsetInContentTypeHeader() throws Exception {
-        final String test =
-                "<html><head><title>the name is \u00e1ndre</title></head>"
-                        + "<body></body></html>";
-
-        Metadata metadata = new Metadata();
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test.getBytes(UTF_8)),
-                new BodyContentHandler(), metadata, new ParseContext());
-        assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
-
-        metadata = new Metadata();
-        metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test.getBytes(ISO_8859_1)),
-                new BodyContentHandler(), metadata, new ParseContext());
-        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
-    }
-
-    /**
-     * Test case for HTML content like
-     * "&lt;div&gt;foo&lt;br&gt;bar&lt;/div&gt;baz" that should result
-     * in three whitespace-separated tokens "foo", "bar" and "baz" instead
-     * of a single token "foobarbaz".
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-343">TIKA-343</a>
-     */
-    @Test
-    public void testLineBreak() throws Exception {
-        String test = "<html><body><div>foo<br>bar</div>baz</body></html>";
-        String text = new Tika().parseToString(
-                new ByteArrayInputStream(test.getBytes(US_ASCII)));
-        String[] parts = text.trim().split("\\s+");
-        assertEquals(3, parts.length);
-        assertEquals("foo", parts[0]);
-        assertEquals("bar", parts[1]);
-        assertEquals("baz", parts[2]);
-    }
-
-    /**
-     * Test case for TIKA-339: Don't use language returned by CharsetDetector
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-339">TIKA-339</a>
-     */
-    @Test
-    public void testIgnoreCharsetDetectorLanguage() throws Exception {
-        String test = "<html><title>Simple Content</title><body></body></html>";
-        Metadata metadata = new Metadata();
-        metadata.add(Metadata.CONTENT_LANGUAGE, "en");
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test.getBytes(UTF_8)),
-                new BodyContentHandler(), metadata, new ParseContext());
-
-        assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
-    }
-
-    /**
-     * Test case for TIKA-349
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-349">TIKA-349</a>
-     */
-    @Test
-    public void testHttpEquivCharsetFunkyAttributes() throws Exception {
-        String test1 =
-                "<html><head><meta http-equiv=\"content-type\""
-                        + " content=\"text/html; charset=ISO-8859-15; charset=iso-8859-15\" />"
-                        + "<title>the name is \u00e1ndre</title>"
-                        + "</head><body></body></html>";
-        Metadata metadata = new Metadata();
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test1.getBytes(ISO_8859_1)),
-                new BodyContentHandler(), metadata, new ParseContext());
-        assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
-
-        // Some HTML pages have errors like ';;' versus '; ' as separator
-        String test2 =
-                "<html><head><meta http-equiv=\"content-type\""
-                        + " content=\"text/html;;charset=ISO-8859-15\" />"
-                        + "<title>the name is \u00e1ndre</title>"
-                        + "</head><body></body></html>";
-        metadata = new Metadata();
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
-                new BodyContentHandler(), metadata, new ParseContext());
-        assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
-    }
-
-    /**
-     * Test case for TIKA-350
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-350">TIKA-350</a>
-     */
-    @Test
-    public void testUsingFunkyCharsetInContentTypeHeader() throws Exception {
-        final String test =
-                "<html><head><title>the name is \u00e1ndre</title></head>"
-                        + "<body></body></html>";
-
-        Metadata metadata = new Metadata();
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test.getBytes(UTF_8)),
-                new BodyContentHandler(), metadata, new ParseContext());
-        assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
-
-        metadata = new Metadata();
-        metadata.set(Metadata.CONTENT_TYPE, "charset=ISO-8859-1;text/html");
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test.getBytes(ISO_8859_1)),
-                new BodyContentHandler(), metadata, new ParseContext());
-        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
-    }
-
-
-    /**
-     * Test case for TIKA-357
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-357">TIKA-357</a>
-     */
-    @Test
-    public void testMetaHttpEquivWithLotsOfPreambleText() throws Exception {
-        String path = "/test-documents/big-preamble.html";
-        Metadata metadata = new Metadata();
-        new HtmlParser().parse(
-                HtmlParserTest.class.getResourceAsStream(path),
-                new BodyContentHandler(), metadata, new ParseContext());
-
-        assertEquals("windows-1251", metadata.get(Metadata.CONTENT_ENCODING));
-    }
-
-    /**
-     * Test case for TIKA-420
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-420">TIKA-420</a>
-     */
-    @Test
-    public void testBoilerplateRemoval() throws Exception {
-        String path = "/test-documents/boilerplate.html";
-
-        Metadata metadata = new Metadata();
-        BodyContentHandler handler = new BodyContentHandler();
-        new HtmlParser().parse(
-                HtmlParserTest.class.getResourceAsStream(path),
-                new BoilerpipeContentHandler(handler), metadata, new ParseContext());
-
-        String content = handler.toString();
-        assertTrue(content.startsWith("This is the real meat"));
-        assertTrue(content.endsWith("This is the end of the text.\n"));
-        assertFalse(content.contains("boilerplate"));
-        assertFalse(content.contains("footer"));
-    }
-
-    /**
-     * Test case for TIKA-478. Don't emit <head> sub-elements inside of <body>.
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-478">TIKA-478</a>
-     */
-    @Test
-    public void testElementOrdering() throws Exception {
-        final String test = "<html><head><title>Title</title>" +
-                "<meta http-equiv=\"content-type\" content=\"text/html\">" +
-                "<link rel=\"next\" href=\"next.html\" />" +
-                "</head><body><p>Simple Content</p></body></html>";
-
-        StringWriter sw = new StringWriter();
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test.getBytes(UTF_8)),
-                makeHtmlTransformer(sw), new Metadata(), new ParseContext());
-
-        String result = sw.toString();
-
-        // Title element in <head> section
-        assertTrue(Pattern.matches("(?s)<html.*<head>.*<title>Title</title>.*</head>.*$", result));
-
-        // No meta elements in body
-        assertFalse(Pattern.matches("(?s).*<body>.*<meta. *</body>.*$", result));
-
-        // meta elements should show up in <head> section
-        assertTrue(Pattern.matches("(?s)<html.*<head>.*<meta .*</head>.*$", result));
-
-        // No link elements in body
-        assertFalse(Pattern.matches("(?s).*<body>.*<link .*</body>.*$", result));
-
-        // link element should be in <head> section
-        assertTrue(Pattern.matches("(?s)<html.*<head>.*<link .*</head>.*$", result));
-
-        // There should be ending elements.
-        assertTrue(Pattern.matches("(?s).*</body>.*</html>$", result));
-
-    }
-
-    /**
-     * Test case for TIKA-463. Don't skip elements that have URLs.
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
-     */
-    @Test
-    public void testImgUrlExtraction() throws Exception {
-        final String test = "<html><head><title>Title</title>" +
-                "<base href=\"http://domain.com\" />" +
-                "</head><body><img src=\"image.jpg\" /></body></html>";
-
-        StringWriter sw = new StringWriter();
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test.getBytes(UTF_8)),
-                makeHtmlTransformer(sw), new Metadata(), new ParseContext());
-
-        String result = sw.toString();
-
-        // <img> tag should exist, with fully resolved URL
-        assertTrue(Pattern.matches("(?s).*src=\"http://domain.com/image.jpg\".*$", result));
-    }
-
-    /**
-     * Test case for TIKA-463. Don't skip elements that have URLs.
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
-     */
-    @Test
-    public void testFrameSrcExtraction() throws Exception {
-        final String test = "<html><head><title>Title</title>" +
-                "<base href=\"http://domain.com\" />" +
-                "</head><frameset><frame src=\"frame.html\" /></frameset></html>";
-
-        StringWriter sw = new StringWriter();
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test.getBytes(UTF_8)),
-                makeHtmlTransformer(sw), new Metadata(), new ParseContext());
-
-        String result = sw.toString();
-
-        // <frame> tag should exist, with fully resolved URL
-        assertTrue(Pattern.matches("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$", result));
-    }
-
-    /**
-     * Test case for TIKA-463. Don't skip elements that have URLs.
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
-     */
-    @Test
-    public void testIFrameSrcExtraction() throws Exception {
-        final String test = "<html><head><title>Title</title>" +
-                "<base href=\"http://domain.com\" />" +
-                "</head><body><iframe src =\"framed.html\" width=\"100%\" height=\"300\">" +
-                "<p>Your browser doesn't support iframes!</p></body></html>";
-
-        StringWriter sw = new StringWriter();
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test.getBytes(UTF_8)),
-                makeHtmlTransformer(sw), new Metadata(), new ParseContext());
-
-        String result = sw.toString();
-
-        // <iframe> tag should exist, with fully resolved URL
-        assertTrue(Pattern.matches("(?s).*<iframe .* src=\"http://domain.com/framed.html\".*$", result));
-    }
-
-    /**
-     * Test case for TIKA-463. Don't skip elements that have URLs.
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
-     */
-    @Test
-    public void testAreaExtraction() throws Exception {
-        final String test = "<html><head><title>Title</title>" +
-                "<base href=\"http://domain.com\" />" +
-                "</head><body><p><map name=\"map\" id=\"map\">" +
-                "<area shape=\"rect\" href=\"map.html\" alt=\"\" />" +
-                "</map></p></body></html>";
-
-        StringWriter sw = new StringWriter();
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test.getBytes(UTF_8)),
-                makeHtmlTransformer(sw), new Metadata(), new ParseContext());
-
-        String result = sw.toString();
-
-        // <map> tag should exist, with <area> tag with fully resolved URL
-        assertTrue(Pattern.matches("(?s).*<map .*<area .* href=\"http://domain.com/map.html\".*</map>.*$", result));
-    }
-
-    /**
-     * Test case for TIKA-463. Don't skip elements that have URLs.
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
-     */
-    @Test
-    public void testObjectExtraction() throws Exception {
-        final String test = "<html><head><title>Title</title>" +
-                "<base href=\"http://domain.com\" />" +
-                "</head><body><p><object data=\"object.data\" type=\"text/html\">" +
-                "<param name=\"name\" value=\"value\" />" +
-                "</object></p></body></html>";
-
-        StringWriter sw = new StringWriter();
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test.getBytes(UTF_8)),
-                makeHtmlTransformer(sw), new Metadata(), new ParseContext());
-
-        String result = sw.toString();
-
-        // <object> tag should exist with fully resolved URLs
-        assertTrue(
-                "<object> tag not correctly found in:\n" + result,
-                Pattern.matches("(?s).*<object data=\"http://domain.com/object.data\".*<param .* name=\"name\" value=\"value\"/>.*</object>.*$", result)
-        );
-    }
-
-    /**
-     * Test case for change related to TIKA-463. Verify proper handling of <meta> tags.
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
-     */
-    @Test
-    public void testMetaTagHandling() throws Exception {
-        final String test = "<html><body><h1>header</h1><p>some text</p></body></html>";
-
-        Metadata metadata = new Metadata();
-        metadata.add("Content-Type", "text/html; charset=utf-8");
-        metadata.add("Language", null);
-
-        StringWriter sw = new StringWriter();
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test.getBytes(UTF_8)),
-                makeHtmlTransformer(sw), metadata, new ParseContext());
-
-        String result = sw.toString();
-
-        // <meta> tag for Content-Type should exist, but nothing for Language
-        assertTrue(Pattern.matches("(?s).*<meta name=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>.*$", result));
-        assertFalse(Pattern.matches("(?s).*<meta name=\"Language\".*$", result));
-    }
-
-    /**
-     * Test case for TIKA-457. Better handling for broken HTML that has <frameset> inside of <body>.
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-457">TIKA-457</a>
-     */
-    @Test
-    public void testBrokenFrameset() throws Exception {
-        final String test1 = "<html><head><title>Title</title>" +
-                "<base href=\"http://domain.com\" />" +
-                "</head><body><frameset><frame src=\"frame.html\" /></frameset></body></html>";
-
-        StringWriter sw1 = new StringWriter();
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test1.getBytes(UTF_8)),
-                makeHtmlTransformer(sw1), new Metadata(), new ParseContext());
-
-        String result = sw1.toString();
-
-        // <frame> tag should exist, with fully resolved URL
-        assertTrue(Pattern.matches("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$", result));
-
-        // <body> tag should not exist.
-        assertFalse(Pattern.matches("(?s).*<body>.*$", result));
-
-        // Test the example from the Nutch project.
-        final String test2 = "<html><head><title> my title </title></head><body>" +
-                "<frameset rows=\"20,*\"><frame src=\"top.html\"></frame>" +
-                "<frameset cols=\"20,*\"><frame src=\"left.html\"></frame>" +
-                "<frame src=\"invalid.html\"/></frame>" +
-                "<frame src=\"right.html\"></frame>" +
-                "</frameset></frameset></body></html>";
-
-        StringWriter sw2 = new StringWriter();
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test2.getBytes(UTF_8)),
-                makeHtmlTransformer(sw2), new Metadata(), new ParseContext());
-
-        result = sw2.toString();
-
-        // <frame> tags should exist, with relative URL (no base element specified)
-        assertTrue(Pattern.matches("(?s).*<frame .* src=\"top.html\"/>.*$", result));
-        assertTrue(Pattern.matches("(?s).*<frame .* src=\"left.html\"/>.*$", result));
-        assertTrue(Pattern.matches("(?s).*<frame .* src=\"invalid.html\"/>.*$", result));
-        assertTrue(Pattern.matches("(?s).*<frame .* src=\"right.html\"/>.*$", result));
-
-        // <body> tag should not exist.
-        assertFalse(Pattern.matches("(?s).*<body>.*$", result));
-    }
-
-    /**
-     * Test case for TIKA-480: fix NPE when using BodyContentHandler or HtmlTransformer
-     * as delegate for BoilerpipeContentHandler
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-480">TIKA-480</a>
-     */
-    @Test
-    public void testBoilerplateDelegation() throws Exception {
-        String path = "/test-documents/boilerplate.html";
-
-        Metadata metadata = new Metadata();
-        StringWriter sw = new StringWriter();
-        new HtmlParser().parse(
-                HtmlParserTest.class.getResourceAsStream(path),
-                makeHtmlTransformer(sw), metadata, new ParseContext());
-
-        String content = sw.toString();
-
-        // Should have <html>, <head>, <title>, <body> elements
-        assertTrue(Pattern.matches("(?s).*<html xmlns=\"http://www.w3.org/1999/xhtml\">.*</html>.*$", content));
-        assertTrue(Pattern.matches("(?s).*<head>.*</head>.*$", content));
-        assertTrue(Pattern.matches("(?s).*<title>Title</title>.*$", content));
-        assertTrue(Pattern.matches("(?s).*<body>.*</body>.*$", content));
-    }
-
-    /**
-     * Test case for TIKA-481. Verify href in <link> is resolved.
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-481">TIKA-481</a>
-     */
-    @Test
-    public void testLinkHrefResolution() throws Exception {
-        final String test = "<html><head><title>Title</title>" +
-                "<base href=\"http://domain.com\" />" +
-                "<link rel=\"next\" href=\"next.html\" />" +
-                "</head><body></body></html>";
-
-        StringWriter sw = new StringWriter();
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test.getBytes(UTF_8)),
-                makeHtmlTransformer(sw), new Metadata(), new ParseContext());
-
-        String result = sw.toString();
-
-        // <link> tag should exist in <head>, with fully resolved URL
-        assertTrue(Pattern.matches("(?s).*<head>.*<link rel=\"next\" href=\"http://domain.com/next.html\"/>.*</head>.*$", result));
-    }
-
-
-    /**
-     * Create ContentHandler that transforms SAX events into textual HTML output,
-     * and writes it out to <writer> - typically this is a StringWriter.
-     *
-     * @param writer Where to write resulting HTML text.
-     * @return ContentHandler suitable for passing to parse() methods.
-     * @throws Exception
-     */
-    private ContentHandler makeHtmlTransformer(Writer writer) throws Exception {
-        SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
-        TransformerHandler handler = factory.newTransformerHandler();
-        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
-        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
-        handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8");
-        handler.setResult(new StreamResult(writer));
-        return handler;
-    }
-
-    /**
-     * Test case for TIKA-564. Support returning markup from BoilerpipeContentHandler.
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-564">TIKA-564</a>
-     */
-    @Test
-    public void testBoilerplateWithMarkup() throws Exception {
-        String path = "/test-documents/boilerplate.html";
-
-        Metadata metadata = new Metadata();
-        StringWriter sw = new StringWriter();
-        ContentHandler ch = makeHtmlTransformer(sw);
-        BoilerpipeContentHandler bpch = new BoilerpipeContentHandler(ch);
-        bpch.setIncludeMarkup(true);
-
-        new HtmlParser().parse(
-                HtmlParserTest.class.getResourceAsStream(path),
-                bpch, metadata, new ParseContext());
-
-        String content = sw.toString();
-        assertTrue("Has empty table elements", content.contains("<body><table><tr><td><table><tr><td>"));
-        assertTrue("Has empty a element", content.contains("<a shape=\"rect\" href=\"Main.php\"/>"));
-        assertTrue("Has real content", content.contains("<p>This is the real meat"));
-        assertTrue("Ends with appropriate HTML", content.endsWith("</p></body></html>"));
-        assertFalse(content.contains("boilerplate"));
-        assertFalse(content.contains("footer"));
-    }
-
-    /**
-     * Test case for TIKA-434 - Pushback buffer overflow in TagSoup
-     */
-    @Test
-    public void testPushback() throws IOException, TikaException {
-        String content = new Tika().parseToString(
-                HtmlParserTest.class.getResourceAsStream("/test-documents/tika434.html"), new Metadata());
-        assertNotNull(content);
-    }
-
-    /**
-     * Test case for TIKA-869
-     * IdentityHtmlMapper needs to lower-case tag names.
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-869">TIKA-869</a>
-     */
-    @Test
-    public void testIdentityMapper() throws Exception {
-        final String html = "<html><head><title>Title</title></head>" +
-                "<body></body></html>";
-        Metadata metadata = new Metadata();
-        ParseContext parseContext = new ParseContext();
-        parseContext.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
-
-        StringWriter sw = new StringWriter();
-
-        new HtmlParser().parse(
-                new ByteArrayInputStream(html.getBytes(UTF_8)),
-                makeHtmlTransformer(sw), metadata, parseContext);
-
-        String result = sw.toString();
-        // Make sure we don't get <body><BODY/></body>
-        assertTrue(Pattern.matches("(?s).*<body/>.*$", result));
-    }
-
-    /**
-     * Test case for TIKA-889
-     * XHTMLContentHandler wont emit newline when html element matches ENDLINE set.
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-889">TIKA-889</a>
-     */
-    @Test
-    public void testNewlineAndIndent() throws Exception {
-        final String html = "<html><head><title>Title</title></head>" +
-                "<body><ul><li>one</li></ul></body></html>";
-
-        BodyContentHandler handler = new BodyContentHandler();
-        new HtmlParser().parse(
-                new ByteArrayInputStream(html.getBytes(UTF_8)),
-                handler, new Metadata(), new ParseContext());
-
-        // Make sure we get <tab>, "one", newline, newline
-        String result = handler.toString();
-
-        assertTrue(Pattern.matches("\tone\n\n", result));
-    }
-
-    /**
-     * Test case for TIKA-961
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-961">TIKA-961</a>
-     */
-    @Test
-    public void testBoilerplateWhitespace() throws Exception {
-        String path = "/test-documents/boilerplate-whitespace.html";
-
-        Metadata metadata = new Metadata();
-        BodyContentHandler handler = new BodyContentHandler();
-
-        BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(handler);
-        bpHandler.setIncludeMarkup(true);
-
-        new HtmlParser().parse(
-                HtmlParserTest.class.getResourceAsStream(path),
-                bpHandler, metadata, new ParseContext());
-
-        String content = handler.toString();
-
-        // Should not contain item_aitem_b
-        assertFalse(content.contains("item_aitem_b"));
-
-        // Should contain the two list items with a newline in between.
-        assertContains("item_a\nitem_b", content);
-
-        // Should contain \u6709\u4ec0\u4e48\u9700\u8981\u6211\u5e2e\u4f60\u7684 (can i help you) without whitespace
-        assertContains("\u6709\u4ec0\u4e48\u9700\u8981\u6211\u5e2e\u4f60\u7684", content);
-    }
-
-    /**
-     * Test case for TIKA-983:  HTML parser should add Open Graph meta tag data to Metadata returned by parser
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-983">TIKA-983</a>
-     */
-    @Test
-    public void testOpenGraphMetadata() throws Exception {
-        String test1 =
-                "<html><head><meta property=\"og:description\""
-                        + " content=\"some description\" />"
-                        + "<meta property=\"og:image\" content=\"http://example.com/image1.jpg\" />"
-                        + "<meta property=\"og:image\" content=\"http://example.com/image2.jpg\" />"
-                        + "<title>hello</title>"
-                        + "</head><body></body></html>";
-        Metadata metadata = new Metadata();
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test1.getBytes(ISO_8859_1)),
-                new BodyContentHandler(), metadata, new ParseContext());
-        assertEquals("some description", metadata.get("og:description"));
-        assertTrue(metadata.isMultiValued("og:image"));
-    }
-
-    // TIKA-1011
-    @Test
-    public void testUserDefinedCharset() throws Exception {
-        String content = new Tika().parseToString(
-                HtmlParserTest.class.getResourceAsStream("/test-documents/testUserDefinedCharset.mhtml"), new Metadata());
-        assertNotNull(content);
-    }
-
-    //TIKA-1001
-    @Test
-    public void testNoisyMetaCharsetHeaders() throws Exception {
-        Tika tika = new Tika();
-        String hit = "\u0623\u0639\u0631\u0628";
-
-        for (int i = 1; i <= 4; i++) {
-            String fileName = "/test-documents/testHTMLNoisyMetaEncoding_" + i + ".html";
-            String content = tika.parseToString(
-                    HtmlParserTest.class.getResourceAsStream(fileName));
-            assertTrue("testing: " + fileName, content.contains(hit));
-        }
-    }
-
-    // TIKA-1193
-    @Test
-    public void testCustomHtmlSchema() throws Exception {
-        // Default schema does not allow tables inside anchors
-        String test = "<html><body><a><table><tr><td>text</tr></tr></table></a></body></html>";
-
-        Metadata metadata = new Metadata();
-        LinkContentHandler linkContentHandler = new LinkContentHandler();
-
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test.getBytes(ISO_8859_1)),
-                linkContentHandler, metadata, new ParseContext());
-
-        // Expect no anchor text
-        assertEquals("", linkContentHandler.getLinks().get(0).getText());
-
-        // We'll change the schema to allow tables inside anchors!
-        Schema schema = new HTMLSchema();
-        schema.elementType("a", HTMLSchema.M_ANY, 65535, 0);
-
-        ParseContext parseContext = new ParseContext();
-        parseContext.set(Schema.class, schema);
-        linkContentHandler = new LinkContentHandler();
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test.getBytes(ISO_8859_1)),
-                linkContentHandler, metadata, parseContext);
-
-        // Expect anchor text
-        assertEquals("\ttext\n\n", linkContentHandler.getLinks().get(0).getText());
-    }
-
-    /**
-     * Test case for TIKA-820:  Locator is unset for HTML parser
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-820">TIKA-820</a>
-     */
-    @Test
-    public void testLocator() throws Exception {
-        final int line = 0;
-        final int col = 1;
-        final int[] textPosition = new int[2];
-
-        new HtmlParser().parse(HtmlParserTest.class.getResourceAsStream("/test-documents/testHTML.html"),
-                new ContentHandler() {
-                    Locator locator;
-
-                    public void setDocumentLocator(Locator locator) {
-                        this.locator = locator;
-                    }
-
-                    public void startDocument() throws SAXException {
-                    }
-
-                    public void endDocument() throws SAXException {
-                    }
-
-                    public void startPrefixMapping(String prefix, String uri)
-                            throws SAXException {
-                    }
-
-                    public void endPrefixMapping(String prefix)
-                            throws SAXException {
-                    }
-
-                    public void startElement(String uri, String localName,
-                                             String qName, Attributes atts) throws SAXException {
-                    }
-
-                    public void endElement(String uri, String localName,
-                                           String qName) throws SAXException {
-                    }
-
-                    public void characters(char[] ch, int start, int length)
-                            throws SAXException {
-                        String text = new String(ch, start, length);
-                        if (text.equals("Test Indexation Html") && locator != null) {
-                            textPosition[line] = locator.getLineNumber();
-                            textPosition[col] = locator.getColumnNumber();
-                        }
-                    }
-
-                    public void ignorableWhitespace(char[] ch, int start,
-                                                    int length) throws SAXException {
-                    }
-
-                    public void processingInstruction(String target, String data)
-                            throws SAXException {
-                    }
-
-                    public void skippedEntity(String name) throws SAXException {
-                    }
-                },
-                new Metadata(),
-                new ParseContext());
-
-        // The text occurs at line 24 (if lines start at 0) or 25 (if lines start at 1).
-        assertEquals(24, textPosition[line]);
-        // The column reported seems fuzzy, just test it is close enough.
-        assertTrue(Math.abs(textPosition[col] - 47) < 10);
-    }
-
-
-    /**
-     * Test case for TIKA-1303: HTML parse should use the first title tag to set value in meta data
-     * and ignore any subsequent title tags found in HTML.
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-1303">TIKA-1303</a>
-     */
-    @Test
-    public void testFirstTitleValueisSetToMetadata() throws Exception {
-        String test = "<html><title>Simple Content</title><body><h1></h1>"
-                + "<title>TitleToIgnore</title></body></html>";
-        Metadata metadata = new Metadata();
-
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test.getBytes(UTF_8)),
-                new BodyContentHandler(), metadata, new ParseContext());
-
-        //Expecting first title to be set in meta data and second one to be ignored.
-        assertEquals("Simple Content", metadata.get(TikaCoreProperties.TITLE));
-    }
-
-    @Test
-    public void testMisleadingMetaContentTypeTags() throws Exception {
-        //TIKA-1519
-
-        String test = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-ELEVEN\">" +
-                "</head><title>title</title><body>body</body></html>";
-        Metadata metadata = new Metadata();
-
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test.getBytes(UTF_8)),
-                new BodyContentHandler(), metadata, new ParseContext());
-        assertEquals("text/html; charset=UTF-ELEVEN", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
-        assertEquals("text/html; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
-
-        test = "<html><head><meta http-equiv=\"content-type\" content=\"application/pdf\">" +
-                "</head><title>title</title><body>body</body></html>";
-        metadata = new Metadata();
-
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test.getBytes(UTF_8)),
-                new BodyContentHandler(), metadata, new ParseContext());
-        assertEquals("application/pdf", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
-        assertEquals("text/html; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
-
-        //test two content values
-        test = "<html><head><meta http-equiv=\"content-type\" content=\"application/pdf\" content=\"application/ms-word\">" +
-                "</head><title>title</title><body>body</body></html>";
-        metadata = new Metadata();
-
-        new HtmlParser().parse(
-                new ByteArrayInputStream(test.getBytes(UTF_8)),
-                new BodyContentHandler(), metadata, new ParseContext());
-        assertEquals("application/ms-word", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
-        assertEquals("text/html; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
-    }
-
-    @Test
-    public void testXHTMLWithMisleading() throws Exception {
-        //first test an acceptable XHTML header with http-equiv tags
-        String test = "<?xml version=\"1.0\" ?>" +
-                "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" +
-                "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n" +
-                "<head>\n" +
-                "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\" />\n" +
-                "<title>title</title></head><body>body</body></html>";
-        Metadata metadata = new Metadata();
-        new AutoDetectParser().parse(
-                new ByteArrayInputStream(test.getBytes(UTF_8)),
-                new BodyContentHandler(), metadata, new ParseContext());
-
-        assertEquals("text/html; charset=iso-8859-1", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
-        assertEquals("application/xhtml+xml; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
-
-        test = "<?xml version=\"1.0\" ?>" +
-                "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" +
-                "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n" +
-                "<head>\n" +
-                "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-NUMBER_SEVEN\" />\n" +
-                "<title>title</title></head><body>body</body></html>";
-        metadata = new Metadata();
-        new AutoDetectParser().parse(
-                new ByteArrayInputStream(test.getBytes(UTF_8)),
-                new BodyContentHandler(), metadata, new ParseContext());
-
-        assertEquals("text/html; charset=iso-NUMBER_SEVEN", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
-        assertEquals("application/xhtml+xml; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
-
-    }
-
-    @Test
-    public void testSkippingCommentsInEncodingDetection() throws Exception {
-
-        byte[] bytes = new String("<html><head>" +
-                "<!--<meta http-equiv=\"Content-Type\" content=\"text/html; charset=ISO-8859-1\"> -->\n" +
-                "   <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"+
-                "</head>"+
-                "<body>"+
-                "\u6709\u4ec0\u4e48\u9700\u8981\u6211\u5e2e\u4f60\u7684" +
-                "</body></html>").getBytes(StandardCharsets.UTF_8);
-        EncodingDetector htmlEncodingDetector = new HtmlEncodingDetector();
-        XMLResult r = getXML(new ByteArrayInputStream(bytes), new AutoDetectParser(), new Metadata());
-        assertContains("\u6709\u4ec0\u4e48\u9700\u8981\u6211\u5e2e\u4f60\u7684", r.xml);
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
+import org.apache.tika.Tika;
+import org.apache.tika.TikaTest;
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Geographic;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.LinkContentHandler;
+import org.apache.tika.sax.TeeContentHandler;
+import org.ccil.cowan.tagsoup.HTMLSchema;
+import org.ccil.cowan.tagsoup.Schema;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class HtmlParserTest extends TikaTest {
+
+    @Test
+    public void testParseAscii() throws Exception {
+        String path = "/test-documents/testHTML.html";
+        final StringWriter href = new StringWriter();
+        final StringWriter name = new StringWriter();
+        ContentHandler body = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        try (InputStream stream = HtmlParserTest.class.getResourceAsStream(path)) {
+            ContentHandler link = new DefaultHandler() {
+                @Override
+                public void startElement(
+                        String u, String l, String n, Attributes a)
+                        throws SAXException {
+                    if ("a".equals(l)) {
+                        if (a.getValue("href") != null) {
+                            href.append(a.getValue("href"));
+                        } else if (a.getValue("name") != null) {
+                            name.append(a.getValue("name"));
+                        }
+                    }
+                }
+            };
+            new HtmlParser().parse(
+                    stream, new TeeContentHandler(body, link),
+                    metadata, new ParseContext());
+        }
+
+        assertEquals(
+                "Title : Test Indexation Html", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Tika Developers", metadata.get("Author"));
+        assertEquals("5", metadata.get("refresh"));
+
+        assertEquals("51.2312", metadata.get(Geographic.LATITUDE));
+        assertEquals("-5.1987", metadata.get(Geographic.LONGITUDE));
+
+        assertEquals("http://www.apache.org/", href.toString());
+        assertEquals("test-anchor", name.toString());
+
+        String content = body.toString();
+        assertTrue(
+                "Did not contain expected text:" + "Test Indexation Html",
+                content.contains("Test Indexation Html"));
+        assertTrue(
+                "Did not contain expected text:" + "Indexation du fichier",
+                content.contains("Indexation du fichier"));
+    }
+
+    @Test
+    @Ignore("The file 'testXHTML_utf8.html' is not available fo testing")
+    public void XtestParseUTF8() throws IOException, SAXException, TikaException {
+        String path = "/test-documents/testXHTML_utf8.html";
+        Metadata metadata = new Metadata();
+        String content = new Tika().parseToString(
+                HtmlParserTest.class.getResourceAsStream(path), metadata);
+
+        assertTrue("Did not contain expected text:"
+                + "Title : Tilte with UTF-8 chars \u221a\u2202\u221a�\u221a\u2022", content
+                .contains("Title : Tilte with UTF-8 chars \u221a\u2202\u221a�\u221a\u2022"));
+
+        assertTrue("Did not contain expected text:"
+                + "Content with UTF-8 chars", content
+                .contains("Content with UTF-8 chars"));
+
+        assertTrue("Did not contain expected text:" + "\u221a\u2022\u221a�\u221a\u2202", content
+                .contains("\u221a\u2022\u221a�\u221a\u2202"));
+    }
+
+    @Test
+    public void testXhtmlParsing() throws Exception {
+        String path = "/test-documents/testXHTML.html";
+        Metadata metadata = new Metadata();
+        String content = new Tika().parseToString(
+                HtmlParserTest.class.getResourceAsStream(path), metadata);
+
+        //can't specify charset because default differs between OS's
+        assertTrue(metadata.get(Metadata.CONTENT_TYPE).startsWith("application/xhtml+xml; charset="));
+        assertEquals("XHTML test document", metadata.get(TikaCoreProperties.TITLE));
+
+        assertEquals("Tika Developers", metadata.get("Author"));
+        assertEquals("5", metadata.get("refresh"));
+        assertContains("ability of Apache Tika", content);
+        assertContains("extract content", content);
+        assertContains("an XHTML document", content);
+    }
+
+    @Test
+    public void testParseEmpty() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(new byte[0]),
+                handler, new Metadata(), new ParseContext());
+        assertEquals("", handler.toString());
+    }
+
+    /**
+     * Test case for TIKA-210
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a>
+     */
+    @Test
+    public void testCharactersDirectlyUnderBodyElement() throws Exception {
+        String test = "<html><body>test</body></html>";
+        String content = new Tika().parseToString(
+                new ByteArrayInputStream(test.getBytes(UTF_8)));
+        assertEquals("test", content);
+    }
+
+    /**
+     * Test case for TIKA-287
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-287">TIKA-287</a>
+     */
+    @Test
+    public void testBaseHref() throws Exception {
+        assertRelativeLink(
+                "http://lucene.apache.org/tika/",
+                "http://lucene.apache.org/", "tika/");
+
+        assertRelativeLink(
+                "http://domain.com/?pid=1",
+                "http://domain.com", "?pid=1");
+        assertRelativeLink(
+                "http://domain.com/?pid=2",
+                "http://domain.com?pid=1", "?pid=2");
+
+        assertRelativeLink(
+                "http://domain.com/file.html",
+                "http://domain.com/path/", "/file.html");
+        assertRelativeLink(
+                "http://domain.com/path/file.html",
+                "http://domain.com/path/", "./file.html");
+        assertRelativeLink(
+                "http://domain.com/path/file.html",
+                "http://domain.com/path/", "file.html");
+
+        assertRelativeLink(
+                "http://domain2.com/newpath",
+                "http://domain.com/path/to/file", "http://domain2.com/newpath");
+
+        // See http://www.communities.hp.com/securitysoftware/blogs/jeff/archive/2007/12/19/RFC-1808-vs-2396-vs-3986_3A00_-Browsers-vs.-programing-languages.aspx
+        // Also http://www.ietf.org/rfc/rfc3986.txt
+        // Also http://issues.apache.org/jira/browse/NUTCH-566
+        // Also http://issues.apache.org/jira/browse/NUTCH-436
+        assertRelativeLink(
+                "http://domain.com/path/?pid=1",
+                "http://domain.com/path/", "?pid=1");
+        assertRelativeLink(
+                "http://domain.com/file?pid=1",
+                "http://domain.com/file", "?pid=1");
+        assertRelativeLink(
+                "http://domain.com/path/d;p?pid=1",
+                "http://domain.com/path/d;p?q#f", "?pid=1");
+    }
+
+    private void assertRelativeLink(String url, String base, String relative)
+            throws Exception {
+        String test =
+                "<html><head><base href=\"" + base + "\"></head>"
+                        + "<body><a href=\"" + relative + "\">test</a></body></html>";
+        final List<String> links = new ArrayList<String>();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                new DefaultHandler() {
+                    @Override
+                    public void startElement(
+                            String u, String l, String name, Attributes atts) {
+                        if (name.equals("a") && atts.getValue("", "href") != null) {
+                            links.add(atts.getValue("", "href"));
+                        }
+                    }
+                },
+                new Metadata(),
+                new ParseContext());
+        assertEquals(1, links.size());
+        assertEquals(url, links.get(0));
+    }
+
+    /**
+     * Test case for TIKA-268
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-268">TIKA-268</a>
+     */
+    @Test
+    public void testWhitespaceBetweenTableCells() throws Exception {
+        String test =
+                "<html><body><table><tr><td>a</td><td>b</td></table></body></html>";
+        String content = new Tika().parseToString(
+                new ByteArrayInputStream(test.getBytes(UTF_8)));
+        assertContains("a", content);
+        assertContains("b", content);
+        assertFalse(content.contains("ab"));
+    }
+
+    /**
+     * Test case for TIKA-332
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-332">TIKA-332</a>
+     */
+    @Test
+    public void testHttpEquivCharset() throws Exception {
+        String test =
+                "<html><head><meta http-equiv=\"content-type\""
+                        + " content=\"text/html; charset=ISO-8859-1\" />"
+                        + "<title>the name is \u00e1ndre</title>"
+                        + "</head><body></body></html>";
+        Metadata metadata = new Metadata();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(ISO_8859_1)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+    }
+
+    /**
+     * Test case for TIKA-892
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-892">TIKA-892</a>
+     */
+    @Test
+    public void testHtml5Charset() throws Exception {
+        String test =
+                "<html><head><meta charset=\"ISO-8859-15\" />"
+                        + "<title>the name is \u00e1ndre</title>"
+                        + "</head><body></body></html>";
+        Metadata metadata = new Metadata();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(ISO_8859_1)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
+    }
+
+    /**
+     * Test case for TIKA-334
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-334">TIKA-334</a>
+     */
+    @Test
+    public void testDetectOfCharset() throws Exception {
+        String test =
+                "<html><head><title>\u017d</title></head><body></body></html>";
+        Metadata metadata = new Metadata();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("\u017d", metadata.get(TikaCoreProperties.TITLE));
+    }
+
+    /**
+     * Test case for TIKA-341
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
+     */
+    @Test
+    public void testUsingCharsetInContentTypeHeader() throws Exception {
+        final String test =
+                "<html><head><title>the name is \u00e1ndre</title></head>"
+                        + "<body></body></html>";
+
+        Metadata metadata = new Metadata();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
+
+        metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(ISO_8859_1)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+    }
+
+    /**
+     * Test case for HTML content like
+     * "&lt;div&gt;foo&lt;br&gt;bar&lt;/div&gt;baz" that should result
+     * in three whitespace-separated tokens "foo", "bar" and "baz" instead
+     * of a single token "foobarbaz".
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-343">TIKA-343</a>
+     */
+    @Test
+    public void testLineBreak() throws Exception {
+        String test = "<html><body><div>foo<br>bar</div>baz</body></html>";
+        String text = new Tika().parseToString(
+                new ByteArrayInputStream(test.getBytes(US_ASCII)));
+        String[] parts = text.trim().split("\\s+");
+        assertEquals(3, parts.length);
+        assertEquals("foo", parts[0]);
+        assertEquals("bar", parts[1]);
+        assertEquals("baz", parts[2]);
+    }
+
+    /**
+     * Test case for TIKA-339: Don't use language returned by CharsetDetector
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-339">TIKA-339</a>
+     */
+    @Test
+    public void testIgnoreCharsetDetectorLanguage() throws Exception {
+        String test = "<html><title>Simple Content</title><body></body></html>";
+        Metadata metadata = new Metadata();
+        metadata.add(Metadata.CONTENT_LANGUAGE, "en");
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                new BodyContentHandler(), metadata, new ParseContext());
+
+        assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
+    }
+
+    /**
+     * Test case for TIKA-349
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-349">TIKA-349</a>
+     */
+    @Test
+    public void testHttpEquivCharsetFunkyAttributes() throws Exception {
+        String test1 =
+                "<html><head><meta http-equiv=\"content-type\""
+                        + " content=\"text/html; charset=ISO-8859-15; charset=iso-8859-15\" />"
+                        + "<title>the name is \u00e1ndre</title>"
+                        + "</head><body></body></html>";
+        Metadata metadata = new Metadata();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test1.getBytes(ISO_8859_1)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
+
+        // Some HTML pages have errors like ';;' versus '; ' as separator
+        String test2 =
+                "<html><head><meta http-equiv=\"content-type\""
+                        + " content=\"text/html;;charset=ISO-8859-15\" />"
+                        + "<title>the name is \u00e1ndre</title>"
+                        + "</head><body></body></html>";
+        metadata = new Metadata();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
+    }
+
+    /**
+     * Test case for TIKA-350
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-350">TIKA-350</a>
+     */
+    @Test
+    public void testUsingFunkyCharsetInContentTypeHeader() throws Exception {
+        final String test =
+                "<html><head><title>the name is \u00e1ndre</title></head>"
+                        + "<body></body></html>";
+
+        Metadata metadata = new Metadata();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
+
+        metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "charset=ISO-8859-1;text/html");
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(ISO_8859_1)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+    }
+
+
+    /**
+     * Test case for TIKA-357
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-357">TIKA-357</a>
+     */
+    @Test
+    public void testMetaHttpEquivWithLotsOfPreambleText() throws Exception {
+        String path = "/test-documents/big-preamble.html";
+        Metadata metadata = new Metadata();
+        // Per the Tika Parser contract the stream is consumed but not closed
+        // by parse(), so the test owns the resource stream and releases it.
+        try (java.io.InputStream stream = HtmlParserTest.class.getResourceAsStream(path)) {
+            new HtmlParser().parse(
+                    stream, new BodyContentHandler(), metadata, new ParseContext());
+        }
+
+        // The charset declared in the document's meta tag must be reported.
+        assertEquals("windows-1251", metadata.get(Metadata.CONTENT_ENCODING));
+    }
+
+    /**
+     * Test case for TIKA-420
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-420">TIKA-420</a>
+     */
+    @Test
+    public void testBoilerplateRemoval() throws Exception {
+        String path = "/test-documents/boilerplate.html";
+
+        Metadata metadata = new Metadata();
+        BodyContentHandler handler = new BodyContentHandler();
+        // Per the Tika Parser contract the stream is consumed but not closed
+        // by parse(), so the test owns the resource stream and releases it.
+        try (java.io.InputStream stream = HtmlParserTest.class.getResourceAsStream(path)) {
+            new HtmlParser().parse(
+                    stream, new BoilerpipeContentHandler(handler), metadata, new ParseContext());
+        }
+
+        // Only the main text may survive; boilerplate and footer text must be gone.
+        String content = handler.toString();
+        assertTrue(content.startsWith("This is the real meat"));
+        assertTrue(content.endsWith("This is the end of the text.\n"));
+        assertFalse(content.contains("boilerplate"));
+        assertFalse(content.contains("footer"));
+    }
+
+    /**
+     * Test case for TIKA-478. Don't emit {@code <head>} sub-elements inside of
+     * {@code <body>}.
+     * <p>
+     * Parses a small document whose head holds a title, a meta and a link
+     * element, serializes the resulting SAX events back to HTML text and checks
+     * with regular expressions where each element ended up.
+     *
+     * @throws Exception if parsing or serialization fails
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-478">TIKA-478</a>
+     */
+    @Test
+    public void testElementOrdering() throws Exception {
+        final String test = "<html><head><title>Title</title>" +
+                "<meta http-equiv=\"content-type\" content=\"text/html\">" +
+                "<link rel=\"next\" href=\"next.html\" />" +
+                "</head><body><p>Simple Content</p></body></html>";
+
+        StringWriter sw = new StringWriter();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                makeHtmlTransformer(sw), new Metadata(), new ParseContext());
+
+        String result = sw.toString();
+
+        // Title element in <head> section
+        assertTrue(Pattern.matches("(?s)<html.*<head>.*<title>Title</title>.*</head>.*$", result));
+
+        // No meta elements in body. The pattern is "<meta " followed by
+        // anything, mirroring the <link> check below; the previous
+        // "<meta. *" (misplaced space) could not match a meta element
+        // that carries attributes, so the assertion was vacuous.
+        assertFalse(Pattern.matches("(?s).*<body>.*<meta .*</body>.*$", result));
+
+        // meta elements should show up in <head> section
+        assertTrue(Pattern.matches("(?s)<html.*<head>.*<meta .*</head>.*$", result));
+
+        // No link elements in body
+        assertFalse(Pattern.matches("(?s).*<body>.*<link .*</body>.*$", result));
+
+        // link element should be in <head> section
+        assertTrue(Pattern.matches("(?s)<html.*<head>.*<link .*</head>.*$", result));
+
+        // There should be ending elements.
+        assertTrue(Pattern.matches("(?s).*</body>.*</html>$", result));
+    }
+
+    /**
+     * Test case for TIKA-463. Don't skip elements that have URLs.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
+     */
+    @Test
+    public void testImgUrlExtraction() throws Exception {
+        final String test = "<html><head><title>Title</title>" +
+                "<base href=\"http://domain.com\" />" +
+                "</head><body><img src=\"image.jpg\" /></body></html>";
+
+        StringWriter sw = new StringWriter();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                makeHtmlTransformer(sw), new Metadata(), new ParseContext());
+
+        String result = sw.toString();
+
+        // <img> tag should exist, with fully resolved URL
+        assertTrue(Pattern.matches("(?s).*src=\"http://domain.com/image.jpg\".*$", result));
+    }
+
+    /**
+     * Test case for TIKA-463. Don't skip elements that have URLs.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
+     */
+    @Test
+    public void testFrameSrcExtraction() throws Exception {
+        final String test = "<html><head><title>Title</title>" +
+                "<base href=\"http://domain.com\" />" +
+                "</head><frameset><frame src=\"frame.html\" /></frameset></html>";
+
+        StringWriter sw = new StringWriter();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                makeHtmlTransformer(sw), new Metadata(), new ParseContext());
+
+        String result = sw.toString();
+
+        // <frame> tag should exist, with fully resolved URL
+        assertTrue(Pattern.matches("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$", result));
+    }
+
+    /**
+     * Test case for TIKA-463. Don't skip elements that have URLs.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
+     */
+    @Test
+    public void testIFrameSrcExtraction() throws Exception {
+        final String test = "<html><head><title>Title</title>" +
+                "<base href=\"http://domain.com\" />" +
+                "</head><body><iframe src =\"framed.html\" width=\"100%\" height=\"300\">" +
+                "<p>Your browser doesn't support iframes!</p></body></html>";
+
+        StringWriter sw = new StringWriter();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                makeHtmlTransformer(sw), new Metadata(), new ParseContext());
+
+        String result = sw.toString();
+
+        // <iframe> tag should exist, with fully resolved URL
+        assertTrue(Pattern.matches("(?s).*<iframe .* src=\"http://domain.com/framed.html\".*$", result));
+    }
+
+    /**
+     * Test case for TIKA-463. Don't skip elements that have URLs.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
+     */
+    @Test
+    public void testAreaExtraction() throws Exception {
+        final String test = "<html><head><title>Title</title>" +
+                "<base href=\"http://domain.com\" />" +
+                "</head><body><p><map name=\"map\" id=\"map\">" +
+                "<area shape=\"rect\" href=\"map.html\" alt=\"\" />" +
+                "</map></p></body></html>";
+
+        StringWriter sw = new StringWriter();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                makeHtmlTransformer(sw), new Metadata(), new ParseContext());
+
+        String result = sw.toString();
+
+        // <map> tag should exist, with <area> tag with fully resolved URL
+        assertTrue(Pattern.matches("(?s).*<map .*<area .* href=\"http://domain.com/map.html\".*</map>.*$", result));
+    }
+
+    /**
+     * Test case for TIKA-463. Don't skip elements that have URLs.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
+     */
+    @Test
+    public void testObjectExtraction() throws Exception {
+        final String test = "<html><head><title>Title</title>" +
+                "<base href=\"http://domain.com\" />" +
+                "</head><body><p><object data=\"object.data\" type=\"text/html\">" +
+                "<param name=\"name\" value=\"value\" />" +
+                "</object></p></body></html>";
+
+        StringWriter sw = new StringWriter();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                makeHtmlTransformer(sw), new Metadata(), new ParseContext());
+
+        String result = sw.toString();
+
+        // <object> tag should exist with fully resolved URLs
+        assertTrue(
+                "<object> tag not correctly found in:\n" + result,
+                Pattern.matches("(?s).*<object data=\"http://domain.com/object.data\".*<param .* name=\"name\" value=\"value\"/>.*</object>.*$", result)
+        );
+    }
+
+    /**
+     * Test case for change related to TIKA-463. Verify proper handling of <meta> tags.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
+     */
+    @Test
+    public void testMetaTagHandling() throws Exception {
+        final String test = "<html><body><h1>header</h1><p>some text</p></body></html>";
+
+        Metadata metadata = new Metadata();
+        metadata.add("Content-Type", "text/html; charset=utf-8");
+        metadata.add("Language", null);
+
+        StringWriter sw = new StringWriter();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                makeHtmlTransformer(sw), metadata, new ParseContext());
+
+        String result = sw.toString();
+
+        // <meta> tag for Content-Type should exist, but nothing for Language
+        assertTrue(Pattern.matches("(?s).*<meta name=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>.*$", result));
+        assertFalse(Pattern.matches("(?s).*<meta name=\"Language\".*$", result));
+    }
+
+    /**
+     * Test case for TIKA-457. Better handling for broken HTML that has <frameset> inside of <body>.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-457">TIKA-457</a>
+     */
+    @Test
+    public void testBrokenFrameset() throws Exception {
+        final String test1 = "<html><head><title>Title</title>" +
+                "<base href=\"http://domain.com\" />" +
+                "</head><body><frameset><frame src=\"frame.html\" /></frameset></body></html>";
+
+        StringWriter sw1 = new StringWriter();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test1.getBytes(UTF_8)),
+                makeHtmlTransformer(sw1), new Metadata(), new ParseContext());
+
+        String result = sw1.toString();
+
+        // <frame> tag should exist, with fully resolved URL
+        assertTrue(Pattern.matches("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$", result));
+
+        // <body> tag should not exist.
+        assertFalse(Pattern.matches("(?s).*<body>.*$", result));
+
+        // Test the example from the Nutch project.
+        final String test2 = "<html><head><title> my title </title></head><body>" +
+                "<frameset rows=\"20,*\"><frame src=\"top.html\"></frame>" +
+                "<frameset cols=\"20,*\"><frame src=\"left.html\"></frame>" +
+                "<frame src=\"invalid.html\"/></frame>" +
+                "<frame src=\"right.html\"></frame>" +
+                "</frameset></frameset></body></html>";
+
+        StringWriter sw2 = new StringWriter();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test2.getBytes(UTF_8)),
+                makeHtmlTransformer(sw2), new Metadata(), new ParseContext());
+
+        result = sw2.toString();
+
+        // <frame> tags should exist, with relative URL (no base element specified)
+        assertTrue(Pattern.matches("(?s).*<frame .* src=\"top.html\"/>.*$", result));
+        assertTrue(Pattern.matches("(?s).*<frame .* src=\"left.html\"/>.*$", result));
+        assertTrue(Pattern.matches("(?s).*<frame .* src=\"invalid.html\"/>.*$", result));
+        assertTrue(Pattern.matches("(?s).*<frame .* src=\"right.html\"/>.*$", result));
+
+        // <body> tag should not exist.
+        assertFalse(Pattern.matches("(?s).*<body>.*$", result));
+    }
+
+    /**
+     * Test case for TIKA-480: fix NPE when using BodyContentHandler or HtmlTransformer
+     * as delegate for BoilerpipeContentHandler
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-480">TIKA-480</a>
+     */
+    @Test
+    public void testBoilerplateDelegation() throws Exception {
+        String path = "/test-documents/boilerplate.html";
+
+        Metadata metadata = new Metadata();
+        StringWriter sw = new StringWriter();
+        // NOTE(review): the handler is passed to the parser directly rather
+        // than through a BoilerpipeContentHandler - confirm this is intended.
+        //
+        // Per the Tika Parser contract the stream is consumed but not closed
+        // by parse(), so the test owns the resource stream and releases it.
+        try (java.io.InputStream stream = HtmlParserTest.class.getResourceAsStream(path)) {
+            new HtmlParser().parse(
+                    stream, makeHtmlTransformer(sw), metadata, new ParseContext());
+        }
+
+        String content = sw.toString();
+
+        // Should have <html>, <head>, <title>, <body> elements
+        assertTrue(Pattern.matches("(?s).*<html xmlns=\"http://www.w3.org/1999/xhtml\">.*</html>.*$", content));
+        assertTrue(Pattern.matches("(?s).*<head>.*</head>.*$", content));
+        assertTrue(Pattern.matches("(?s).*<title>Title</title>.*$", content));
+        assertTrue(Pattern.matches("(?s).*<body>.*</body>.*$", content));
+    }
+
+    /**
+     * Test case for TIKA-481. Verify href in <link> is resolved.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-481">TIKA-481</a>
+     */
+    @Test
+    public void testLinkHrefResolution() throws Exception {
+        final String test = "<html><head><title>Title</title>" +
+                "<base href=\"http://domain.com\" />" +
+                "<link rel=\"next\" href=\"next.html\" />" +
+                "</head><body></body></html>";
+
+        StringWriter sw = new StringWriter();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                makeHtmlTransformer(sw), new Metadata(), new ParseContext());
+
+        String result = sw.toString();
+
+        // <link> tag should exist in <head>, with fully resolved URL
+        assertTrue(Pattern.matches("(?s).*<head>.*<link rel=\"next\" href=\"http://domain.com/next.html\"/>.*</head>.*$", result));
+    }
+
+
+    /**
+     * Builds a ContentHandler that serializes the SAX events it receives as
+     * HTML text into the given writer (typically a StringWriter).
+     *
+     * @param writer Destination for the generated HTML text.
+     * @return ContentHandler that can be handed to the parse() methods.
+     * @throws Exception if the transformer handler cannot be created or configured
+     */
+    private ContentHandler makeHtmlTransformer(Writer writer) throws Exception {
+        TransformerHandler htmlSerializer =
+                ((SAXTransformerFactory) SAXTransformerFactory.newInstance())
+                        .newTransformerHandler();
+
+        // Output settings are applied before the result is attached; the order
+        // of these calls is kept as-is because it may matter to the serializer.
+        htmlSerializer.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
+        htmlSerializer.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
+        htmlSerializer.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8");
+
+        htmlSerializer.setResult(new StreamResult(writer));
+        return htmlSerializer;
+    }
+
+    /**
+     * Test case for TIKA-564. Support returning markup from BoilerpipeContentHandler.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-564">TIKA-564</a>
+     */
+    @Test
+    public void testBoilerplateWithMarkup() throws Exception {
+        String path = "/test-documents/boilerplate.html";
+
+        Metadata metadata = new Metadata();
+        StringWriter sw = new StringWriter();
+        ContentHandler ch = makeHtmlTransformer(sw);
+        BoilerpipeContentHandler bpch = new BoilerpipeContentHandler(ch);
+        bpch.setIncludeMarkup(true);
+
+        // Per the Tika Parser contract the stream is consumed but not closed
+        // by parse(), so the test owns the resource stream and releases it.
+        try (java.io.InputStream stream = HtmlParserTest.class.getResourceAsStream(path)) {
+            new HtmlParser().parse(stream, bpch, metadata, new ParseContext());
+        }
+
+        // Markup must be kept around the main text while boilerplate text is dropped.
+        String content = sw.toString();
+        assertTrue("Has empty table elements", content.contains("<body><table><tr><td><table><tr><td>"));
+        assertTrue("Has empty a element", content.contains("<a shape=\"rect\" href=\"Main.php\"/>"));
+        assertTrue("Has real content", content.contains("<p>This is the real meat"));
+        assertTrue("Ends with appropriate HTML", content.endsWith("</p></body></html>"));
+        assertFalse(content.contains("boilerplate"));
+        assertFalse(content.contains("footer"));
+    }
+
+    /**
+     * Test case for TIKA-434 - Pushback buffer overflow in TagSoup
+     */
+    @Test
+    public void testPushback() throws IOException, TikaException {
+        String content = new Tika().parseToString(
+                HtmlParserTest.class.getResourceAsStream("/test-documents/tika434.html"), new Metadata());
+        assertNotNull(content);
+    }
+
+    /**
+     * Test case for TIKA-869
+     * IdentityHtmlMapper needs to lower-case tag names.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-869">TIKA-869</a>
+     */
+    @Test
+    public void testIdentityMapper() throws Exception {
+        final String html = "<html><head><title>Title</title></head>" +
+                "<body></body></html>";
+        Metadata metadata = new Metadata();
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
+
+        StringWriter sw = new StringWriter();
+
+        new HtmlParser().parse(
+                new ByteArrayInputStream(html.getBytes(UTF_8)),
+                makeHtmlTransformer(sw), metadata, parseContext);
+
+        String result = sw.toString();
+        // Make sure we don't get <body><BODY/></body>
+        assertTrue(Pattern.matches("(?s).*<body/>.*$", result));
+    }
+
+    /**
+     * Test case for TIKA-889
+     * XHTMLContentHandler won't emit newline when html element matches ENDLINE set.
+     * <p>
+     * Parses a one-item unordered list and checks the exact plain-text output
+     * of the body handler.
+     *
+     * @throws Exception if the parse fails
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-889">TIKA-889</a>
+     */
+    @Test
+    public void testNewlineAndIndent() throws Exception {
+        final String html = "<html><head><title>Title</title></head>" +
+                "<body><ul><li>one</li></ul></body></html>";
+
+        BodyContentHandler handler = new BodyContentHandler();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(html.getBytes(UTF_8)),
+                handler, new Metadata(), new ParseContext());
+
+        // Make sure we get exactly <tab>, "one", newline, newline. The expected
+        // text contains no regex metacharacters, so a plain equality check is
+        // equivalent to the former Pattern.matches() call and additionally
+        // reports the actual text when the assertion fails.
+        assertEquals("\tone\n\n", handler.toString());
+    }
+
+    /**
+     * Test case for TIKA-961
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-961">TIKA-961</a>
+     */
+    @Test
+    public void testBoilerplateWhitespace() throws Exception {
+        String path = "/test-documents/boilerplate-whitespace.html";
+
+        Metadata metadata = new Metadata();
+        BodyContentHandler handler = new BodyContentHandler();
+
+        BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(handler);
+        bpHandler.setIncludeMarkup(true);
+
+        // Per the Tika Parser contract the stream is consumed but not closed
+        // by parse(), so the test owns the resource stream and releases it.
+        try (java.io.InputStream stream = HtmlParserTest.class.getResourceAsStream(path)) {
+            new HtmlParser().parse(stream, bpHandler, metadata, new ParseContext());
+        }
+
+        String content = handler.toString();
+
+        // Should not contain item_aitem_b
+        assertFalse(content.contains("item_aitem_b"));
+
+        // Should contain the two list items with a newline in between.
+        assertContains("item_a\nitem_b", content);
+
+        // Should contain \u6709\u4ec0\u4e48\u9700\u8981\u6211\u5e2e\u4f60\u7684 (can i help you) without whitespace
+        assertContains("\u6709\u4ec0\u4e48\u9700\u8981\u6211\u5e2e\u4f60\u7684", content);
+    }
+
+    /**
+     * Test case for TIKA-983:  HTML parser should add Open Graph meta tag data to Metadata returned by parser
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-983">TIKA-983</a>
+     */
+    @Test
+    public void testOpenGraphMetadata() throws Exception {
+        String test1 =
+                "<html><head><meta property=\"og:description\""
+                        + " content=\"some description\" />"
+                        + "<meta property=\"og:image\" content=\"http://example.com/image1.jpg\" />"
+                        + "<meta property=\"og:image\" content=\"http://example.com/image2.jpg\" />"
+                        + "<title>hello</title>"
+                        + "</head><body></body></html>";
+        Metadata metadata = new Metadata();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test1.getBytes(ISO_8859_1)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("some description", metadata.get("og:description"));
+        assertTrue(metadata.isMultiValued("og:image"));
+    }
+
+    // TIKA-1011
+    @Test
+    public void testUserDefinedCharset() throws Exception {
+        String content = new Tika().parseToString(
+                HtmlParserTest.class.getResourceAsStream("/test-documents/testUserDefinedCharset.mhtml"), new Metadata());
+        assertNotNull(content);
+    }
+
+    //TIKA-1001
+    @Test
+    public void testNoisyMetaCharsetHeaders() throws Exception {
+        Tika tika = new Tika();
+        String hit = "\u0623\u0639\u0631\u0628";
+
+        for (int i = 1; i <= 4; i++) {
+            String fileName = "/test-documents/testHTMLNoisyMetaEncoding_" + i + ".html";
+            String content = tika.parseToString(
+                    HtmlParserTest.class.getResourceAsStream(fileName));
+            assertTrue("testing: " + fileName, content.contains(hit));
+        }
+    }
+
+    //

<TRUNCATED>

[12/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index 973d9da..443eb9e 100644
--- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -1,287 +1,287 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.pkg;
-
-import static org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE;
-
-import java.io.BufferedInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Date;
-import java.util.Set;
-
-import org.apache.commons.compress.PasswordRequiredException;
-import org.apache.commons.compress.archivers.ArchiveEntry;
-import org.apache.commons.compress.archivers.ArchiveException;
-import org.apache.commons.compress.archivers.ArchiveInputStream;
-import org.apache.commons.compress.archivers.ArchiveStreamFactory;
-import org.apache.commons.compress.archivers.StreamingNotSupportedException;
-import org.apache.commons.compress.archivers.ar.ArArchiveInputStream;
-import org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream;
-import org.apache.commons.compress.archivers.dump.DumpArchiveInputStream;
-import org.apache.commons.compress.archivers.jar.JarArchiveInputStream;
-import org.apache.commons.compress.archivers.sevenz.SevenZFile;
-import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
-import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
-import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException.Feature;
-import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.exception.EncryptedDocumentException;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.PasswordProvider;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-/**
- * Parser for various packaging formats. Package entries will be written to
- * the XHTML event stream as &lt;div class="package-entry"&gt; elements that
- * contain the (optional) entry name as a &lt;h1&gt; element and the full
- * structured body content of the parsed entry.
- * <p>
- * User must have JCE Unlimited Strength jars installed for encryption to
- * work with 7Z files (see: COMPRESS-299 and TIKA-1521).  If the jars
- * are not installed, an IOException will be thrown, and potentially
- * wrapped in a TikaException.
- */
-public class PackageParser extends AbstractParser {
-
-    /** Serial version UID */
-    private static final long serialVersionUID = -5331043266963888708L;
-
-    private static final MediaType ZIP = MediaType.APPLICATION_ZIP;
-    private static final MediaType JAR = MediaType.application("java-archive");
-    private static final MediaType AR = MediaType.application("x-archive");
-    private static final MediaType CPIO = MediaType.application("x-cpio");
-    private static final MediaType DUMP = MediaType.application("x-tika-unix-dump");
-    private static final MediaType TAR = MediaType.application("x-tar");
-    private static final MediaType SEVENZ = MediaType.application("x-7z-compressed");
-
-    private static final Set<MediaType> SUPPORTED_TYPES =
-            MediaType.set(ZIP, JAR, AR, CPIO, DUMP, TAR, SEVENZ);
-
-    static MediaType getMediaType(ArchiveInputStream stream) {
-        if (stream instanceof JarArchiveInputStream) {
-            return JAR;
-        } else if (stream instanceof ZipArchiveInputStream) {
-            return ZIP;
-        } else if (stream instanceof ArArchiveInputStream) {
-            return AR;
-        } else if (stream instanceof CpioArchiveInputStream) {
-            return CPIO;
-        } else if (stream instanceof DumpArchiveInputStream) {
-            return DUMP;
-        } else if (stream instanceof TarArchiveInputStream) {
-            return TAR;
-        } else if (stream instanceof SevenZWrapper) {
-            return SEVENZ;
-        } else {
-            return MediaType.OCTET_STREAM;
-        }
-    }
-
-    static boolean isZipArchive(MediaType type) {
-        return type.equals(ZIP) || type.equals(JAR);
-    }
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-    public void parse(
-            InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-       
-        // Ensure that the stream supports the mark feature
-        if (! TikaInputStream.isTikaInputStream(stream))
-            stream = new BufferedInputStream(stream);
-        
-        
-        TemporaryResources tmp = new TemporaryResources();
-        ArchiveInputStream ais = null;
-        try {
-            ArchiveStreamFactory factory = context.get(ArchiveStreamFactory.class, new ArchiveStreamFactory());
-            // At the end we want to close the archive stream to release
-            // any associated resources, but the underlying document stream
-            // should not be closed
-            ais = factory.createArchiveInputStream(new CloseShieldInputStream(stream));
-            
-        } catch (StreamingNotSupportedException sne) {
-            // Most archive formats work on streams, but a few need files
-            if (sne.getFormat().equals(ArchiveStreamFactory.SEVEN_Z)) {
-                // Rework as a file, and wrap
-                stream.reset();
-                TikaInputStream tstream = TikaInputStream.get(stream, tmp);
-                
-                // Seven Zip suports passwords, was one given?
-                String password = null;
-                PasswordProvider provider = context.get(PasswordProvider.class);
-                if (provider != null) {
-                    password = provider.getPassword(metadata);
-                }
-                
-                SevenZFile sevenz;
-                if (password == null) {
-                    sevenz = new SevenZFile(tstream.getFile());
-                } else {
-                    sevenz = new SevenZFile(tstream.getFile(), password.getBytes("UnicodeLittleUnmarked"));
-                }
-                
-                // Pending a fix for COMPRESS-269 / TIKA-1525, this bit is a little nasty
-                ais = new SevenZWrapper(sevenz);
-            } else {
-                tmp.close();
-                throw new TikaException("Unknown non-streaming format " + sne.getFormat(), sne);
-            }
-        } catch (ArchiveException e) {
-            tmp.close();
-            throw new TikaException("Unable to unpack document stream", e);
-        }
-
-        MediaType type = getMediaType(ais);
-        if (!type.equals(MediaType.OCTET_STREAM)) {
-            metadata.set(CONTENT_TYPE, type.toString());
-        }
-        // Use the delegate parser to parse the contained document
-        EmbeddedDocumentExtractor extractor = context.get(
-                EmbeddedDocumentExtractor.class,
-                new ParsingEmbeddedDocumentExtractor(context));
-
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-        xhtml.startDocument();
-
-        try {
-            ArchiveEntry entry = ais.getNextEntry();
-            while (entry != null) {
-                if (!entry.isDirectory()) {
-                    parseEntry(ais, entry, extractor, xhtml);
-                }
-                entry = ais.getNextEntry();
-            }
-        } catch (UnsupportedZipFeatureException zfe) {
-            // If it's an encrypted document of unknown password, report as such
-            if (zfe.getFeature() == Feature.ENCRYPTION) {
-                throw new EncryptedDocumentException(zfe);
-            }
-            // Otherwise fall through to raise the exception as normal
-        } catch (PasswordRequiredException pre) {
-            throw new EncryptedDocumentException(pre);
-        } finally {
-            ais.close();
-            tmp.close();
-        }
-
-        xhtml.endDocument();
-    }
-
-    private void parseEntry(
-            ArchiveInputStream archive, ArchiveEntry entry,
-            EmbeddedDocumentExtractor extractor, XHTMLContentHandler xhtml)
-            throws SAXException, IOException, TikaException {
-        String name = entry.getName();
-        if (archive.canReadEntryData(entry)) {
-            // Fetch the metadata on the entry contained in the archive
-            Metadata entrydata = handleEntryMetadata(name, null, 
-                    entry.getLastModifiedDate(), entry.getSize(), xhtml);
-            
-            // Recurse into the entry if desired
-            if (extractor.shouldParseEmbedded(entrydata)) {
-                // For detectors to work, we need a mark/reset supporting
-                // InputStream, which ArchiveInputStream isn't, so wrap
-                TemporaryResources tmp = new TemporaryResources();
-                try {
-                    TikaInputStream tis = TikaInputStream.get(archive, tmp);
-                    extractor.parseEmbedded(tis, xhtml, entrydata, true);
-                } finally {
-                    tmp.dispose();
-                }
-            }
-        } else if (name != null && name.length() > 0) {
-            xhtml.element("p", name);
-        }
-    }
-    
-    protected static Metadata handleEntryMetadata(
-            String name, Date createAt, Date modifiedAt,
-            Long size, XHTMLContentHandler xhtml)
-            throws SAXException, IOException, TikaException {
-        Metadata entrydata = new Metadata();
-        if (createAt != null) {
-            entrydata.set(TikaCoreProperties.CREATED, createAt);
-        }
-        if (modifiedAt != null) {
-            entrydata.set(TikaCoreProperties.MODIFIED, modifiedAt);
-        }
-        if (size != null) {
-            entrydata.set(Metadata.CONTENT_LENGTH, Long.toString(size));
-        }
-        if (name != null && name.length() > 0) {
-            name = name.replace("\\", "/");
-            entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
-            AttributesImpl attributes = new AttributesImpl();
-            attributes.addAttribute("", "class", "class", "CDATA", "embedded");
-            attributes.addAttribute("", "id", "id", "CDATA", name);
-            xhtml.startElement("div", attributes);
-            xhtml.endElement("div");
-
-            entrydata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, name);
-        }
-        return entrydata;
-    }
-
-    // Pending a fix for COMPRESS-269, we have to wrap ourselves
-    private static class SevenZWrapper extends ArchiveInputStream {
-        private SevenZFile file;
-        private SevenZWrapper(SevenZFile file) {
-            this.file = file;
-        }
-        
-        @Override
-        public int read() throws IOException {
-            return file.read();
-        }
-        @Override
-        public int read(byte[] b) throws IOException {
-            return file.read(b);
-        }
-        @Override
-        public int read(byte[] b, int off, int len) throws IOException {
-            return file.read(b, off, len);
-        }
-
-        @Override
-        public ArchiveEntry getNextEntry() throws IOException {
-            return file.getNextEntry();
-        }
-        
-        @Override
-        public void close() throws IOException {
-            file.close();
-        }
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Date;
+import java.util.Set;
+
+import org.apache.commons.compress.PasswordRequiredException;
+import org.apache.commons.compress.archivers.ArchiveEntry;
+import org.apache.commons.compress.archivers.ArchiveException;
+import org.apache.commons.compress.archivers.ArchiveInputStream;
+import org.apache.commons.compress.archivers.ArchiveStreamFactory;
+import org.apache.commons.compress.archivers.StreamingNotSupportedException;
+import org.apache.commons.compress.archivers.ar.ArArchiveInputStream;
+import org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream;
+import org.apache.commons.compress.archivers.dump.DumpArchiveInputStream;
+import org.apache.commons.compress.archivers.jar.JarArchiveInputStream;
+import org.apache.commons.compress.archivers.sevenz.SevenZFile;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
+import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException.Feature;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Parser for various packaging formats. Package entries will be written to
+ * the XHTML event stream as &lt;div class="package-entry"&gt; elements that
+ * contain the (optional) entry name as a &lt;h1&gt; element and the full
+ * structured body content of the parsed entry.
+ * <p>
+ * For every named entry whose data can be read, this parser itself also
+ * emits an empty &lt;div class="embedded" id="<i>name</i>"&gt; marker before
+ * offering the entry to the {@link EmbeddedDocumentExtractor}. Named entries
+ * whose data cannot be read are listed as a &lt;p&gt; element holding just
+ * the entry name. Directory entries are skipped.
+ * <p>
+ * User must have JCE Unlimited Strength jars installed for encryption to
+ * work with 7Z files (see: COMPRESS-299 and TIKA-1521).  If the jars
+ * are not installed, an IOException will be thrown, and potentially
+ * wrapped in a TikaException.
+ */
+public class PackageParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = -5331043266963888708L;
+
+    private static final MediaType ZIP = MediaType.APPLICATION_ZIP;
+    private static final MediaType JAR = MediaType.application("java-archive");
+    private static final MediaType AR = MediaType.application("x-archive");
+    private static final MediaType CPIO = MediaType.application("x-cpio");
+    private static final MediaType DUMP = MediaType.application("x-tika-unix-dump");
+    private static final MediaType TAR = MediaType.application("x-tar");
+    private static final MediaType SEVENZ = MediaType.application("x-7z-compressed");
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            MediaType.set(ZIP, JAR, AR, CPIO, DUMP, TAR, SEVENZ);
+
+    /**
+     * Maps an opened archive stream to the media type of its format.
+     *
+     * @param stream the archive stream to classify (may be <code>null</code>)
+     * @return the matching media type, or {@link MediaType#OCTET_STREAM}
+     *         when the stream is of no known archive class
+     */
+    static MediaType getMediaType(ArchiveInputStream stream) {
+        // The JAR test must come before the ZIP test: it is checked first so
+        // that a jar stream is never reported as plain zip
+        if (stream instanceof JarArchiveInputStream) {
+            return JAR;
+        } else if (stream instanceof ZipArchiveInputStream) {
+            return ZIP;
+        } else if (stream instanceof ArArchiveInputStream) {
+            return AR;
+        } else if (stream instanceof CpioArchiveInputStream) {
+            return CPIO;
+        } else if (stream instanceof DumpArchiveInputStream) {
+            return DUMP;
+        } else if (stream instanceof TarArchiveInputStream) {
+            return TAR;
+        } else if (stream instanceof SevenZWrapper) {
+            return SEVENZ;
+        } else {
+            return MediaType.OCTET_STREAM;
+        }
+    }
+
+    /**
+     * Tells whether the given type is one of the zip-based types known to
+     * this parser (plain zip or jar).
+     *
+     * @param type the media type to test
+     * @return <code>true</code> for zip and jar, <code>false</code> otherwise
+     */
+    static boolean isZipArchive(MediaType type) {
+        return type.equals(ZIP) || type.equals(JAR);
+    }
+
+    /**
+     * Returns the fixed set of archive media types handled by this parser.
+     *
+     * @param context parse context (not consulted)
+     * @return the supported media types
+     */
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Opens the stream as an archive, records the detected content type in
+     * the metadata, and passes every non-directory entry to the
+     * {@link EmbeddedDocumentExtractor} found in the context (or a
+     * {@link ParsingEmbeddedDocumentExtractor} when none is configured).
+     * The given document stream itself is not closed.
+     *
+     * @param stream   the document stream holding the archive
+     * @param handler  receiver of the XHTML SAX events
+     * @param metadata document metadata; the content type is set on it when
+     *                 the archive format is recognised
+     * @param context  parse context; may supply an
+     *                 {@link ArchiveStreamFactory}, a {@link PasswordProvider}
+     *                 and an {@link EmbeddedDocumentExtractor}
+     * @throws IOException   if the archive cannot be read, including a zip
+     *                       feature (other than encryption) that is not
+     *                       supported
+     * @throws SAXException  if the SAX events cannot be processed
+     * @throws TikaException if the archive cannot be unpacked; an
+     *                       {@link EncryptedDocumentException} is thrown when
+     *                       the archive is encrypted and cannot be decrypted
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        // Ensure that the stream supports the mark feature
+        if (!TikaInputStream.isTikaInputStream(stream)) {
+            stream = new BufferedInputStream(stream);
+        }
+
+        TemporaryResources tmp = new TemporaryResources();
+        ArchiveInputStream ais = null;
+        boolean opened = false;
+        try {
+            ais = openArchiveStream(stream, metadata, context, tmp);
+            opened = true;
+        } finally {
+            if (!opened) {
+                // Opening failed on any path (including the 7z fallback,
+                // which may already have spooled the stream to a temporary
+                // file), so release the temporary resources right away
+                tmp.close();
+            }
+        }
+
+        XHTMLContentHandler xhtml;
+        try {
+            MediaType type = getMediaType(ais);
+            if (!type.equals(MediaType.OCTET_STREAM)) {
+                metadata.set(CONTENT_TYPE, type.toString());
+            }
+            // Use the delegate parser to parse the contained document
+            EmbeddedDocumentExtractor extractor = context.get(
+                    EmbeddedDocumentExtractor.class,
+                    new ParsingEmbeddedDocumentExtractor(context));
+
+            xhtml = new XHTMLContentHandler(handler, metadata);
+            xhtml.startDocument();
+
+            ArchiveEntry entry = ais.getNextEntry();
+            while (entry != null) {
+                if (!entry.isDirectory()) {
+                    parseEntry(ais, entry, extractor, xhtml);
+                }
+                entry = ais.getNextEntry();
+            }
+        } catch (UnsupportedZipFeatureException zfe) {
+            // If it's an encrypted document of unknown password, report as such
+            if (zfe.getFeature() == Feature.ENCRYPTION) {
+                throw new EncryptedDocumentException(zfe);
+            }
+            // Otherwise raise the exception as normal; swallowing it here
+            // would report a truncated entry list as a successful parse
+            throw zfe;
+        } catch (PasswordRequiredException pre) {
+            throw new EncryptedDocumentException(pre);
+        } finally {
+            // Close the archive first, then the temporary resources; the
+            // nested finally keeps a failing ais.close() from leaking them
+            try {
+                if (ais != null) {
+                    ais.close();
+                }
+            } finally {
+                tmp.close();
+            }
+        }
+
+        xhtml.endDocument();
+    }
+
+    /**
+     * Opens the document stream as an archive stream, falling back to a
+     * file-based reader for 7z. Temporary resources are registered on
+     * <code>tmp</code> and are NOT released here, not even on failure: the
+     * caller owns <code>tmp</code>.
+     *
+     * @throws IOException   if the stream cannot be reset, spooled or read
+     * @throws TikaException if the format is not recognised, or needs
+     *                       non-streaming access and is not 7z
+     */
+    private static ArchiveInputStream openArchiveStream(
+            InputStream stream, Metadata metadata,
+            ParseContext context, TemporaryResources tmp)
+            throws IOException, TikaException {
+        try {
+            ArchiveStreamFactory factory = context.get(ArchiveStreamFactory.class, new ArchiveStreamFactory());
+            // At the end we want to close the archive stream to release
+            // any associated resources, but the underlying document stream
+            // should not be closed
+            return factory.createArchiveInputStream(new CloseShieldInputStream(stream));
+        } catch (StreamingNotSupportedException sne) {
+            // Most archive formats work on streams, but a few need files
+            if (!sne.getFormat().equals(ArchiveStreamFactory.SEVEN_Z)) {
+                throw new TikaException("Unknown non-streaming format " + sne.getFormat(), sne);
+            }
+            return openSevenZ(stream, metadata, context, tmp);
+        } catch (ArchiveException e) {
+            throw new TikaException("Unable to unpack document stream", e);
+        }
+    }
+
+    /**
+     * Rewinds the stream, spools it to a file via <code>tmp</code> and opens
+     * that file as a 7z archive, using the password from the context's
+     * {@link PasswordProvider} when one is available.
+     *
+     * @throws IOException if the stream cannot be reset or the archive
+     *                     cannot be opened
+     */
+    private static ArchiveInputStream openSevenZ(
+            InputStream stream, Metadata metadata,
+            ParseContext context, TemporaryResources tmp)
+            throws IOException {
+        // Rework as a file, and wrap
+        stream.reset();
+        TikaInputStream tstream = TikaInputStream.get(stream, tmp);
+
+        // Seven Zip supports passwords, was one given?
+        String password = null;
+        PasswordProvider provider = context.get(PasswordProvider.class);
+        if (provider != null) {
+            password = provider.getPassword(metadata);
+        }
+
+        SevenZFile sevenz;
+        if (password == null) {
+            sevenz = new SevenZFile(tstream.getFile());
+        } else {
+            // "UnicodeLittleUnmarked" is the historical Java charset name
+            // for UTF-16LE without a byte order mark
+            sevenz = new SevenZFile(tstream.getFile(), password.getBytes("UnicodeLittleUnmarked"));
+        }
+
+        // Pending a fix for COMPRESS-269 / TIKA-1525, this bit is a little nasty
+        return new SevenZWrapper(sevenz);
+    }
+
+    /**
+     * Handles a single non-directory entry: when its data can be read, the
+     * entry metadata is built and the entry is passed to the extractor if
+     * the extractor wants it; otherwise only its name (when present) is
+     * written out as a paragraph.
+     */
+    private void parseEntry(
+            ArchiveInputStream archive, ArchiveEntry entry,
+            EmbeddedDocumentExtractor extractor, XHTMLContentHandler xhtml)
+            throws SAXException, IOException, TikaException {
+        String name = entry.getName();
+        if (archive.canReadEntryData(entry)) {
+            // Fetch the metadata on the entry contained in the archive
+            Metadata entrydata = handleEntryMetadata(name, null,
+                    entry.getLastModifiedDate(), entry.getSize(), xhtml);
+
+            // Recurse into the entry if desired
+            if (extractor.shouldParseEmbedded(entrydata)) {
+                // For detectors to work, we need a mark/reset supporting
+                // InputStream, which ArchiveInputStream isn't, so wrap
+                TemporaryResources tmp = new TemporaryResources();
+                try {
+                    TikaInputStream tis = TikaInputStream.get(archive, tmp);
+                    extractor.parseEmbedded(tis, xhtml, entrydata, true);
+                } finally {
+                    tmp.dispose();
+                }
+            }
+        } else if (name != null && name.length() > 0) {
+            xhtml.element("p", name);
+        }
+    }
+
+    /**
+     * Builds the metadata for one archive entry and, when the entry has a
+     * non-empty name, writes an empty
+     * &lt;div class="embedded" id="<i>name</i>"&gt; marker to the output.
+     * Backslashes in the name are replaced by forward slashes before the
+     * name is used.
+     *
+     * @param name       entry name, may be <code>null</code> or empty
+     * @param createAt   creation date, skipped when <code>null</code>
+     * @param modifiedAt modification date, skipped when <code>null</code>
+     * @param size       entry size, skipped when <code>null</code>
+     * @param xhtml      handler that receives the marker element
+     * @return the metadata describing the entry
+     * @throws SAXException if the marker element cannot be written
+     */
+    protected static Metadata handleEntryMetadata(
+            String name, Date createAt, Date modifiedAt,
+            Long size, XHTMLContentHandler xhtml)
+            throws SAXException, IOException, TikaException {
+        Metadata entrydata = new Metadata();
+        if (createAt != null) {
+            entrydata.set(TikaCoreProperties.CREATED, createAt);
+        }
+        if (modifiedAt != null) {
+            entrydata.set(TikaCoreProperties.MODIFIED, modifiedAt);
+        }
+        if (size != null) {
+            entrydata.set(Metadata.CONTENT_LENGTH, Long.toString(size));
+        }
+        if (name != null && name.length() > 0) {
+            name = name.replace("\\", "/");
+            entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
+            AttributesImpl attributes = new AttributesImpl();
+            attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+            attributes.addAttribute("", "id", "id", "CDATA", name);
+            xhtml.startElement("div", attributes);
+            xhtml.endElement("div");
+
+            entrydata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, name);
+        }
+        return entrydata;
+    }
+
+    /**
+     * Adapter that exposes a file-based {@link SevenZFile} through the
+     * {@link ArchiveInputStream} API by delegating every call to it.
+     * Pending a fix for COMPRESS-269, we have to wrap ourselves.
+     */
+    private static class SevenZWrapper extends ArchiveInputStream {
+        private final SevenZFile file;
+
+        private SevenZWrapper(SevenZFile file) {
+            this.file = file;
+        }
+
+        @Override
+        public int read() throws IOException {
+            return file.read();
+        }
+
+        @Override
+        public int read(byte[] b) throws IOException {
+            return file.read(b);
+        }
+
+        @Override
+        public int read(byte[] b, int off, int len) throws IOException {
+            return file.read(b, off, len);
+        }
+
+        @Override
+        public ArchiveEntry getNextEntry() throws IOException {
+            return file.getNextEntry();
+        }
+
+        /** Closes the wrapped 7z file. */
+        @Override
+        public void close() throws IOException {
+            file.close();
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index 8276e9a..0a12e15 100644
--- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -1,324 +1,324 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.pkg;
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Enumeration;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Set;
-import java.util.regex.Pattern;
-
-import org.apache.commons.compress.archivers.ArchiveException;
-import org.apache.commons.compress.archivers.ArchiveInputStream;
-import org.apache.commons.compress.archivers.ArchiveStreamFactory;
-import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
-import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
-import org.apache.commons.compress.archivers.zip.ZipFile;
-import org.apache.commons.compress.compressors.CompressorException;
-import org.apache.commons.compress.compressors.CompressorInputStream;
-import org.apache.commons.compress.compressors.CompressorStreamFactory;
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.detect.AbstractDetector;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.iwork.IWorkPackageParser;
-import org.apache.tika.parser.iwork.IWorkPackageParser.IWORKDocumentType;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-/**
- * A detector that works on Zip documents and other archive and compression
- * formats to figure out exactly what the file is.
- */
-public class ZipContainerDetector extends AbstractDetector {
-
-    /** Serial version UID */
-    private static final long serialVersionUID = 2891763938430295453L;
-    
-    private final Detector opcDetector;
-    
-    public ZipContainerDetector() {
-        this.opcDetector = createDetectorProxy("org.apache.tika.parser.opc.OPCDetector");
-    }
-
-    public MediaType detect(InputStream input, Metadata metadata)
-            throws IOException {
-        // Check if we have access to the document
-        if (input == null) {
-            return MediaType.OCTET_STREAM;
-        }
-
-        TemporaryResources tmp = new TemporaryResources();
-        try {
-            TikaInputStream tis = TikaInputStream.get(input, tmp);
-
-            byte[] prefix = new byte[1024]; // enough for all known formats
-            int length = tis.peek(prefix);
-
-            MediaType type = detectArchiveFormat(prefix, length);
-            if (PackageParser.isZipArchive(type)
-                    && TikaInputStream.isTikaInputStream(input)) {
-                return detectZipFormat(tis);
-            } else if (!type.equals(MediaType.OCTET_STREAM)) {
-                return type;
-            } else {
-                return detectCompressorFormat(prefix, length);
-            }
-        } finally {
-            try {
-                tmp.dispose();
-            } catch (TikaException e) {
-                // ignore
-            }
-        }
-    }
-
-    private static MediaType detectCompressorFormat(byte[] prefix, int length) {
-        try {
-            CompressorStreamFactory factory = new CompressorStreamFactory();
-            CompressorInputStream cis = factory.createCompressorInputStream(
-                    new ByteArrayInputStream(prefix, 0, length));
-            try {
-                return CompressorParser.getMediaType(cis);
-            } finally {
-                IOUtils.closeQuietly(cis);
-            }
-        } catch (CompressorException e) {
-            return MediaType.OCTET_STREAM;
-        }
-    }
-
-    private static MediaType detectArchiveFormat(byte[] prefix, int length) {
-        try {
-            ArchiveStreamFactory factory = new ArchiveStreamFactory();
-            ArchiveInputStream ais = factory.createArchiveInputStream(
-                    new ByteArrayInputStream(prefix, 0, length));
-            try {
-                if ((ais instanceof TarArchiveInputStream)
-                        && !TarArchiveInputStream.matches(prefix, length)) {
-                    // ArchiveStreamFactory is too relaxed, see COMPRESS-117
-                    return MediaType.OCTET_STREAM;
-                } else {
-                    return PackageParser.getMediaType(ais);
-                }
-            } finally {
-                IOUtils.closeQuietly(ais);
-            }
-        } catch (ArchiveException e) {
-            return MediaType.OCTET_STREAM;
-        }
-    }
-
-    private MediaType detectZipFormat(TikaInputStream tis) {
-        try {
-            ZipFile zip = new ZipFile(tis.getFile()); // TODO: hasFile()?
-            try {
-                MediaType type = detectOpenDocument(zip);
-                if (type == null) {
-                    type = detectOPCBased(zip, tis);
-                }
-                if (type == null) {
-                    type = detectIWork(zip);
-                }
-                if (type == null) {
-                    type = detectJar(zip);
-                }
-                if (type == null) {
-                    type = detectKmz(zip);
-                }
-                if (type == null) {
-                    type = detectIpa(zip);
-                }
-                if (type != null) {
-                    return type;
-                }
-            } finally {
-                // TODO: shouldn't we record the open
-                // container so it can be later
-                // reused...?
-                // tis.setOpenContainer(zip);
-                try {
-                    zip.close();
-                } catch (IOException e) {
-                    // ignore
-                }
-            }
-        } catch (IOException e) {
-            // ignore
-        }
-        // Fallback: it's still a zip file, we just don't know what kind of one
-        return MediaType.APPLICATION_ZIP;
-    }
-
-    /**
-     * OpenDocument files, along with EPub files and ASiC ones, have a 
-     *  mimetype entry in the root of their Zip file. This entry contains
-     *  the mimetype of the overall file, stored as a single string.  
-     */
-    private static MediaType detectOpenDocument(ZipFile zip) {
-        try {
-            ZipArchiveEntry mimetype = zip.getEntry("mimetype");
-            if (mimetype != null) {
-                try (InputStream stream = zip.getInputStream(mimetype)) {
-                    return MediaType.parse(IOUtils.toString(stream, UTF_8));
-                }
-            } else {
-                return null;
-            }
-        } catch (IOException e) {
-            return null;
-        }
-    }
-
-    private MediaType detectOPCBased(ZipFile zip, TikaInputStream stream) {
-        try {
-            if (zip.getEntry("_rels/.rels") != null
-                    || zip.getEntry("[Content_Types].xml") != null) {
-                MediaType type = this.opcDetector.detect(stream, null);
-                if (type != null) return type;
-                
-                // We don't know what it is, sorry
-                return null;
-            } else {
-                return null;
-            }
-        } catch (IOException e) {
-            return null;
-        } catch (RuntimeException e) {
-            return null;
-        }
-    }
-    
-
-    private static MediaType detectIWork(ZipFile zip) {
-        if (zip.getEntry(IWorkPackageParser.IWORK_COMMON_ENTRY) != null) {
-            // Locate the appropriate index file entry, and reads from that
-            // the root element of the document. That is used to the identify
-            // the correct type of the keynote container.
-            for (String entryName : IWorkPackageParser.IWORK_CONTENT_ENTRIES) {
-               IWORKDocumentType type = IWORKDocumentType.detectType(zip.getEntry(entryName), zip); 
-               if (type != null) {
-                  return type.getType();
-               }
-            }
-            
-            // Not sure, fallback to the container type
-            return MediaType.application("vnd.apple.iwork");
-        } else {
-            return null;
-        }
-    }
-    
-    private static MediaType detectJar(ZipFile zip) {
-       if (zip.getEntry("META-INF/MANIFEST.MF") != null) {
-          // It's a Jar file, or something based on Jar
-          
-          // Is it an Android APK?
-          if (zip.getEntry("AndroidManifest.xml") != null) {
-             return MediaType.application("vnd.android.package-archive");
-          }
-          
-          // Check for WAR and EAR
-          if (zip.getEntry("WEB-INF/") != null) {
-             return MediaType.application("x-tika-java-web-archive");
-          }
-          if (zip.getEntry("META-INF/application.xml") != null) {
-             return MediaType.application("x-tika-java-enterprise-archive");
-          }
-          
-          // Looks like a regular Jar Archive
-          return MediaType.application("java-archive");
-       } else {
-          // Some Android APKs miss the default Manifest
-          if (zip.getEntry("AndroidManifest.xml") != null) {
-             return MediaType.application("vnd.android.package-archive");
-          }
-          
-          return null;
-       }
-    }
-
-    private static MediaType detectKmz(ZipFile zip) {
-        boolean kmlFound = false;
-
-        Enumeration<ZipArchiveEntry> entries = zip.getEntries();
-        while (entries.hasMoreElements()) {
-            ZipArchiveEntry entry = entries.nextElement();
-            String name = entry.getName();
-            if (!entry.isDirectory()
-                    && name.indexOf('/') == -1 && name.indexOf('\\') == -1) {
-                if (name.endsWith(".kml") && !kmlFound) {
-                    kmlFound = true;
-                } else {
-                    return null;
-                }
-            }
-        }
-
-        if (kmlFound) {
-            return MediaType.application("vnd.google-earth.kmz");
-        } else {
-            return null;
-        }
-    }
-
-    /**
-     * To be considered as an IPA file, it needs to match all of these
-     */
-    private static HashSet<Pattern> ipaEntryPatterns = new HashSet<Pattern>() {
-        private static final long serialVersionUID = 6545295886322115362L;
-        {
-           add(Pattern.compile("^Payload/$"));
-           add(Pattern.compile("^Payload/.*\\.app/$"));
-           add(Pattern.compile("^Payload/.*\\.app/_CodeSignature/$"));
-           add(Pattern.compile("^Payload/.*\\.app/_CodeSignature/CodeResources$"));
-           add(Pattern.compile("^Payload/.*\\.app/Info\\.plist$"));
-           add(Pattern.compile("^Payload/.*\\.app/PkgInfo$"));
-    }};
-    @SuppressWarnings("unchecked")
-    private static MediaType detectIpa(ZipFile zip) {
-        // Note - consider generalising this logic, if another format needs many regexp matching
-        Set<Pattern> tmpPatterns = (Set<Pattern>)ipaEntryPatterns.clone();
-        
-        Enumeration<ZipArchiveEntry> entries = zip.getEntries();
-        while (entries.hasMoreElements()) {
-            ZipArchiveEntry entry = entries.nextElement();
-            String name = entry.getName();
-            
-            Iterator<Pattern> ip = tmpPatterns.iterator();
-            while (ip.hasNext()) {
-                if (ip.next().matcher(name).matches()) {
-                    ip.remove();
-                }
-            }
-            if (tmpPatterns.isEmpty()) {
-                // We've found everything we need to find
-                return MediaType.application("x-itunes-ipa");
-            }
-        }
-        
-        // If we get here, not all required entries were found
-        return null;
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Enumeration;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import org.apache.commons.compress.archivers.ArchiveException;
+import org.apache.commons.compress.archivers.ArchiveInputStream;
+import org.apache.commons.compress.archivers.ArchiveStreamFactory;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipFile;
+import org.apache.commons.compress.compressors.CompressorException;
+import org.apache.commons.compress.compressors.CompressorInputStream;
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.detect.AbstractDetector;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.iwork.IWorkPackageParser;
+import org.apache.tika.parser.iwork.IWorkPackageParser.IWORKDocumentType;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * A detector that works on Zip documents and other archive and compression
+ * formats to figure out exactly what the file is.
+ */
+public class ZipContainerDetector extends AbstractDetector {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 2891763938430295453L;
+    
+    private final Detector opcDetector;
+    
+    public ZipContainerDetector() {
+        this.opcDetector = createDetectorProxy("org.apache.tika.parser.opc.OPCDetector");
+    }
+
+    public MediaType detect(InputStream input, Metadata metadata)
+            throws IOException {
+        // Check if we have access to the document
+        if (input == null) {
+            return MediaType.OCTET_STREAM;
+        }
+
+        TemporaryResources tmp = new TemporaryResources();
+        try {
+            TikaInputStream tis = TikaInputStream.get(input, tmp);
+
+            byte[] prefix = new byte[1024]; // enough for all known formats
+            int length = tis.peek(prefix);
+
+            MediaType type = detectArchiveFormat(prefix, length);
+            if (PackageParser.isZipArchive(type)
+                    && TikaInputStream.isTikaInputStream(input)) {
+                return detectZipFormat(tis);
+            } else if (!type.equals(MediaType.OCTET_STREAM)) {
+                return type;
+            } else {
+                return detectCompressorFormat(prefix, length);
+            }
+        } finally {
+            try {
+                tmp.dispose();
+            } catch (TikaException e) {
+                // ignore
+            }
+        }
+    }
+
+    private static MediaType detectCompressorFormat(byte[] prefix, int length) {
+        try {
+            CompressorStreamFactory factory = new CompressorStreamFactory();
+            CompressorInputStream cis = factory.createCompressorInputStream(
+                    new ByteArrayInputStream(prefix, 0, length));
+            try {
+                return CompressorParser.getMediaType(cis);
+            } finally {
+                IOUtils.closeQuietly(cis);
+            }
+        } catch (CompressorException e) {
+            return MediaType.OCTET_STREAM;
+        }
+    }
+
+    private static MediaType detectArchiveFormat(byte[] prefix, int length) {
+        try {
+            ArchiveStreamFactory factory = new ArchiveStreamFactory();
+            ArchiveInputStream ais = factory.createArchiveInputStream(
+                    new ByteArrayInputStream(prefix, 0, length));
+            try {
+                if ((ais instanceof TarArchiveInputStream)
+                        && !TarArchiveInputStream.matches(prefix, length)) {
+                    // ArchiveStreamFactory is too relaxed, see COMPRESS-117
+                    return MediaType.OCTET_STREAM;
+                } else {
+                    return PackageParser.getMediaType(ais);
+                }
+            } finally {
+                IOUtils.closeQuietly(ais);
+            }
+        } catch (ArchiveException e) {
+            return MediaType.OCTET_STREAM;
+        }
+    }
+
+    private MediaType detectZipFormat(TikaInputStream tis) {
+        try {
+            ZipFile zip = new ZipFile(tis.getFile()); // TODO: hasFile()?
+            try {
+                MediaType type = detectOpenDocument(zip);
+                if (type == null) {
+                    type = detectOPCBased(zip, tis);
+                }
+                if (type == null) {
+                    type = detectIWork(zip);
+                }
+                if (type == null) {
+                    type = detectJar(zip);
+                }
+                if (type == null) {
+                    type = detectKmz(zip);
+                }
+                if (type == null) {
+                    type = detectIpa(zip);
+                }
+                if (type != null) {
+                    return type;
+                }
+            } finally {
+                // TODO: shouldn't we record the open
+                // container so it can be later
+                // reused...?
+                // tis.setOpenContainer(zip);
+                try {
+                    zip.close();
+                } catch (IOException e) {
+                    // ignore
+                }
+            }
+        } catch (IOException e) {
+            // ignore
+        }
+        // Fallback: it's still a zip file, we just don't know what kind of one
+        return MediaType.APPLICATION_ZIP;
+    }
+
+    /**
+     * OpenDocument files, along with EPub files and ASiC ones, have a 
+     *  mimetype entry in the root of their Zip file. This entry contains
+     *  the mimetype of the overall file, stored as a single string.  
+     */
+    private static MediaType detectOpenDocument(ZipFile zip) {
+        try {
+            ZipArchiveEntry mimetype = zip.getEntry("mimetype");
+            if (mimetype != null) {
+                try (InputStream stream = zip.getInputStream(mimetype)) {
+                    return MediaType.parse(IOUtils.toString(stream, UTF_8));
+                }
+            } else {
+                return null;
+            }
+        } catch (IOException e) {
+            return null;
+        }
+    }
+
+    private MediaType detectOPCBased(ZipFile zip, TikaInputStream stream) {
+        try {
+            if (zip.getEntry("_rels/.rels") != null
+                    || zip.getEntry("[Content_Types].xml") != null) {
+                MediaType type = this.opcDetector.detect(stream, null);
+                if (type != null) return type;
+                
+                // We don't know what it is, sorry
+                return null;
+            } else {
+                return null;
+            }
+        } catch (IOException e) {
+            return null;
+        } catch (RuntimeException e) {
+            return null;
+        }
+    }
+    
+
+    private static MediaType detectIWork(ZipFile zip) {
+        if (zip.getEntry(IWorkPackageParser.IWORK_COMMON_ENTRY) != null) {
+            // Locate the appropriate index file entry, and reads from that
+            // the root element of the document. That is used to the identify
+            // the correct type of the keynote container.
+            for (String entryName : IWorkPackageParser.IWORK_CONTENT_ENTRIES) {
+               IWORKDocumentType type = IWORKDocumentType.detectType(zip.getEntry(entryName), zip); 
+               if (type != null) {
+                  return type.getType();
+               }
+            }
+            
+            // Not sure, fallback to the container type
+            return MediaType.application("vnd.apple.iwork");
+        } else {
+            return null;
+        }
+    }
+    
+    private static MediaType detectJar(ZipFile zip) {
+       if (zip.getEntry("META-INF/MANIFEST.MF") != null) {
+          // It's a Jar file, or something based on Jar
+          
+          // Is it an Android APK?
+          if (zip.getEntry("AndroidManifest.xml") != null) {
+             return MediaType.application("vnd.android.package-archive");
+          }
+          
+          // Check for WAR and EAR
+          if (zip.getEntry("WEB-INF/") != null) {
+             return MediaType.application("x-tika-java-web-archive");
+          }
+          if (zip.getEntry("META-INF/application.xml") != null) {
+             return MediaType.application("x-tika-java-enterprise-archive");
+          }
+          
+          // Looks like a regular Jar Archive
+          return MediaType.application("java-archive");
+       } else {
+          // Some Android APKs miss the default Manifest
+          if (zip.getEntry("AndroidManifest.xml") != null) {
+             return MediaType.application("vnd.android.package-archive");
+          }
+          
+          return null;
+       }
+    }
+
+    private static MediaType detectKmz(ZipFile zip) {
+        boolean kmlFound = false;
+
+        Enumeration<ZipArchiveEntry> entries = zip.getEntries();
+        while (entries.hasMoreElements()) {
+            ZipArchiveEntry entry = entries.nextElement();
+            String name = entry.getName();
+            if (!entry.isDirectory()
+                    && name.indexOf('/') == -1 && name.indexOf('\\') == -1) {
+                if (name.endsWith(".kml") && !kmlFound) {
+                    kmlFound = true;
+                } else {
+                    return null;
+                }
+            }
+        }
+
+        if (kmlFound) {
+            return MediaType.application("vnd.google-earth.kmz");
+        } else {
+            return null;
+        }
+    }
+
+    /**
+     * To be considered as an IPA file, it needs to match all of these
+     */
+    private static HashSet<Pattern> ipaEntryPatterns = new HashSet<Pattern>() {
+        private static final long serialVersionUID = 6545295886322115362L;
+        {
+           add(Pattern.compile("^Payload/$"));
+           add(Pattern.compile("^Payload/.*\\.app/$"));
+           add(Pattern.compile("^Payload/.*\\.app/_CodeSignature/$"));
+           add(Pattern.compile("^Payload/.*\\.app/_CodeSignature/CodeResources$"));
+           add(Pattern.compile("^Payload/.*\\.app/Info\\.plist$"));
+           add(Pattern.compile("^Payload/.*\\.app/PkgInfo$"));
+    }};
+    @SuppressWarnings("unchecked")
+    private static MediaType detectIpa(ZipFile zip) {
+        // Note - consider generalising this logic, if another format needs many regexp matching
+        Set<Pattern> tmpPatterns = (Set<Pattern>)ipaEntryPatterns.clone();
+        
+        Enumeration<ZipArchiveEntry> entries = zip.getEntries();
+        while (entries.hasMoreElements()) {
+            ZipArchiveEntry entry = entries.nextElement();
+            String name = entry.getName();
+            
+            Iterator<Pattern> ip = tmpPatterns.iterator();
+            while (ip.hasNext()) {
+                if (ip.next().matcher(name).matches()) {
+                    ip.remove();
+                }
+            }
+            if (tmpPatterns.isEmpty()) {
+                // We've found everything we need to find
+                return MediaType.application("x-itunes-ipa");
+            }
+        }
+        
+        // If we get here, not all required entries were found
+        return null;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java
index e7625b4..25dfc44 100644
--- a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java
+++ b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java
@@ -1,78 +1,78 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.iwork;
-
-import static org.junit.Assert.assertEquals;
-
-import org.junit.Test;
-
-/**
- * Test class for the <code>AutoPageNumberUtils</code> helper class.
- */
-public class AutoPageNumberUtilsTest {
-
-	/**
-	 * Check upper-case alpha-numeric numbers are generated based on the 
-	 * input page number.
-	 */
-    @Test
-	public void testAlphaUpper() {
-		assertEquals("A", AutoPageNumberUtils.asAlphaNumeric(1));
-		assertEquals("Z", AutoPageNumberUtils.asAlphaNumeric(26));
-		assertEquals("AA", AutoPageNumberUtils.asAlphaNumeric(27));
-		assertEquals("ZZ", AutoPageNumberUtils.asAlphaNumeric(52));
-		assertEquals("AAA", AutoPageNumberUtils.asAlphaNumeric(53));
-		assertEquals("ZZZ", AutoPageNumberUtils.asAlphaNumeric(78));
-	}
-
-	/**
-	 * Check lower-case alpha-numeric numbers are generated based on the 
-	 * input page number.
-	 */
-    @Test
-	public void testAlphaLower() {
-		assertEquals("a", AutoPageNumberUtils.asAlphaNumericLower(1));
-		assertEquals("z", AutoPageNumberUtils.asAlphaNumericLower(26));
-		assertEquals("aa", AutoPageNumberUtils.asAlphaNumericLower(27));
-		assertEquals("zz", AutoPageNumberUtils.asAlphaNumericLower(52));
-		assertEquals("aaa", AutoPageNumberUtils.asAlphaNumericLower(53));
-		assertEquals("zzz", AutoPageNumberUtils.asAlphaNumericLower(78));
-	}
-
-	/**
-	 * Check upper-case Roman numerals numbers are generated based on the 
-	 * input page number.
-	 */
-    @Test
-	public void testRomanUpper() {
-		assertEquals("I", AutoPageNumberUtils.asRomanNumerals(1));
-		assertEquals("XXVI", AutoPageNumberUtils.asRomanNumerals(26));
-		assertEquals("XXVII", AutoPageNumberUtils.asRomanNumerals(27));
-	}
-
-	/**
-	 * Check lower-case Roman numerals numbers are generated based on the 
-	 * input page number.
-	 */
-    @Test
-	public void testRomanLower() {
-		assertEquals("i", AutoPageNumberUtils.asRomanNumeralsLower(1));
-		assertEquals("xxvi", AutoPageNumberUtils.asRomanNumeralsLower(26));
-		assertEquals("xxvii", AutoPageNumberUtils.asRomanNumeralsLower(27));
-	}
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.Test;
+
+/**
+ * Test class for the <code>AutoPageNumberUtils</code> helper class.
+ */
+public class AutoPageNumberUtilsTest {
+
+	/**
+	 * Check upper-case alpha-numeric numbers are generated based on the 
+	 * input page number.
+	 */
+    @Test
+	public void testAlphaUpper() {
+		assertEquals("A", AutoPageNumberUtils.asAlphaNumeric(1));
+		assertEquals("Z", AutoPageNumberUtils.asAlphaNumeric(26));
+		assertEquals("AA", AutoPageNumberUtils.asAlphaNumeric(27));
+		assertEquals("ZZ", AutoPageNumberUtils.asAlphaNumeric(52));
+		assertEquals("AAA", AutoPageNumberUtils.asAlphaNumeric(53));
+		assertEquals("ZZZ", AutoPageNumberUtils.asAlphaNumeric(78));
+	}
+
+	/**
+	 * Check lower-case alpha-numeric numbers are generated based on the 
+	 * input page number.
+	 */
+    @Test
+	public void testAlphaLower() {
+		assertEquals("a", AutoPageNumberUtils.asAlphaNumericLower(1));
+		assertEquals("z", AutoPageNumberUtils.asAlphaNumericLower(26));
+		assertEquals("aa", AutoPageNumberUtils.asAlphaNumericLower(27));
+		assertEquals("zz", AutoPageNumberUtils.asAlphaNumericLower(52));
+		assertEquals("aaa", AutoPageNumberUtils.asAlphaNumericLower(53));
+		assertEquals("zzz", AutoPageNumberUtils.asAlphaNumericLower(78));
+	}
+
+	/**
+	 * Check upper-case Roman numerals numbers are generated based on the 
+	 * input page number.
+	 */
+    @Test
+	public void testRomanUpper() {
+		assertEquals("I", AutoPageNumberUtils.asRomanNumerals(1));
+		assertEquals("XXVI", AutoPageNumberUtils.asRomanNumerals(26));
+		assertEquals("XXVII", AutoPageNumberUtils.asRomanNumerals(27));
+	}
+
+	/**
+	 * Check lower-case Roman numerals numbers are generated based on the 
+	 * input page number.
+	 */
+    @Test
+	public void testRomanLower() {
+		assertEquals("i", AutoPageNumberUtils.asRomanNumeralsLower(1));
+		assertEquals("xxvi", AutoPageNumberUtils.asRomanNumeralsLower(26));
+		assertEquals("xxvii", AutoPageNumberUtils.asRomanNumeralsLower(27));
+	}
+
+}

[22/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
index 75b556c..a32d406 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
@@ -1,496 +1,496 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.odf;
-
-import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
-
-import javax.xml.namespace.QName;
-import javax.xml.parsers.SAXParser;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.BitSet;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
-import java.util.Stack;
-
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.ElementMappingContentHandler;
-import org.apache.tika.sax.ElementMappingContentHandler.TargetElement;
-import org.apache.tika.sax.OfflineContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-import org.xml.sax.helpers.DefaultHandler;
-
-/**
- * Parser for ODF <code>content.xml</code> files.
- */
-public class OpenDocumentContentParser extends AbstractParser {
-    private interface Style {
-    }
-
-    private static class TextStyle implements Style {
-        public boolean italic;
-        public boolean bold;
-        public boolean underlined;
-    }
-
-    private static class ListStyle implements Style {
-        public boolean ordered;
-
-        public String getTag() {
-            return ordered ? "ol" : "ul";
-        }
-    }
-
-    private static final class OpenDocumentElementMappingContentHandler extends
-            ElementMappingContentHandler {
-        private final ContentHandler handler;
-        private final BitSet textNodeStack = new BitSet();
-        private int nodeDepth = 0;
-        private int completelyFiltered = 0;
-        private Stack<String> headingStack = new Stack<String>();
-        private Map<String, TextStyle> textStyleMap = new HashMap<String, TextStyle>();
-        private Map<String, ListStyle> listStyleMap = new HashMap<String, ListStyle>();
-        private TextStyle textStyle;
-        private TextStyle lastTextStyle;
-        private Stack<ListStyle> listStyleStack = new Stack<ListStyle>();
-        private ListStyle listStyle;
-
-        private OpenDocumentElementMappingContentHandler(ContentHandler handler,
-                                                         Map<QName, TargetElement> mappings) {
-            super(handler, mappings);
-            this.handler = handler;
-        }
-
-        @Override
-        public void characters(char[] ch, int start, int length)
-                throws SAXException {
-            // only forward content of tags from text:-namespace
-            if (completelyFiltered == 0 && nodeDepth > 0
-                    && textNodeStack.get(nodeDepth - 1)) {
-                lazyEndSpan();
-                super.characters(ch, start, length);
-            }
-        }
-
-        // helper for checking tags which need complete filtering
-        // (with sub-tags)
-        private boolean needsCompleteFiltering(
-                String namespaceURI, String localName) {
-            if (TEXT_NS.equals(namespaceURI)) {
-                return localName.endsWith("-template")
-                        || localName.endsWith("-style");
-            }
-            return TABLE_NS.equals(namespaceURI) && "covered-table-cell".equals(localName);
-        }
-
-        // map the heading level to <hX> HTML tags
-        private String getXHTMLHeaderTagName(Attributes atts) {
-            String depthStr = atts.getValue(TEXT_NS, "outline-level");
-            if (depthStr == null) {
-                return "h1";
-            }
-
-            int depth = Integer.parseInt(depthStr);
-            if (depth >= 6) {
-                return "h6";
-            } else if (depth <= 1) {
-                return "h1";
-            } else {
-                return "h" + depth;
-            }
-        }
-
-        /**
-         * Check if a node is a text node
-         */
-        private boolean isTextNode(String namespaceURI, String localName) {
-            if (TEXT_NS.equals(namespaceURI) && !localName.equals("page-number") && !localName.equals("page-count")) {
-                return true;
-            }
-            if (SVG_NS.equals(namespaceURI)) {
-                return "title".equals(localName) ||
-                        "desc".equals(localName);
-            }
-            return false;
-        }
-
-        private void startList(String name) throws SAXException {
-            String elementName = "ul";
-            if (name != null) {
-                ListStyle style = listStyleMap.get(name);
-                elementName = style != null ? style.getTag() : "ul";
-                listStyleStack.push(style);
-            }
-            handler.startElement(XHTML, elementName, elementName, EMPTY_ATTRIBUTES);
-        }
-
-        private void endList() throws SAXException {
-            String elementName = "ul";
-            if (!listStyleStack.isEmpty()) {
-                ListStyle style = listStyleStack.pop();
-                elementName = style != null ? style.getTag() : "ul";
-            }
-            handler.endElement(XHTML, elementName, elementName);
-        }
-
-        private void startSpan(String name) throws SAXException {
-            if (name == null) {
-                return;
-            }
-
-            TextStyle style = textStyleMap.get(name);
-            if (style == null) {
-                return;
-            }
-
-            // End tags that refer to no longer valid styles
-            if (!style.underlined && lastTextStyle != null && lastTextStyle.underlined) {
-                handler.endElement(XHTML, "u", "u");
-            }
-            if (!style.italic && lastTextStyle != null && lastTextStyle.italic) {
-                handler.endElement(XHTML, "i", "i");
-            }
-            if (!style.bold && lastTextStyle != null && lastTextStyle.bold) {
-                handler.endElement(XHTML, "b", "b");
-            }
-
-            // Start tags for new styles
-            if (style.bold && (lastTextStyle == null || !lastTextStyle.bold)) {
-                handler.startElement(XHTML, "b", "b", EMPTY_ATTRIBUTES);
-            }
-            if (style.italic && (lastTextStyle == null || !lastTextStyle.italic)) {
-                handler.startElement(XHTML, "i", "i", EMPTY_ATTRIBUTES);
-            }
-            if (style.underlined && (lastTextStyle == null || !lastTextStyle.underlined)) {
-                handler.startElement(XHTML, "u", "u", EMPTY_ATTRIBUTES);
-            }
-
-            textStyle = style;
-            lastTextStyle = null;
-        }
-
-        private void endSpan() throws SAXException {
-            lastTextStyle = textStyle;
-            textStyle = null;
-        }
-
-        private void lazyEndSpan() throws SAXException {
-            if (lastTextStyle == null) {
-                return;
-            }
-
-            if (lastTextStyle.underlined) {
-                handler.endElement(XHTML, "u", "u");
-            }
-            if (lastTextStyle.italic) {
-                handler.endElement(XHTML, "i", "i");
-            }
-            if (lastTextStyle.bold) {
-                handler.endElement(XHTML, "b", "b");
-            }
-
-            lastTextStyle = null;
-        }
-
-        @Override
-        public void startElement(
-                String namespaceURI, String localName, String qName,
-                Attributes attrs) throws SAXException {
-            // keep track of current node type. If it is a text node,
-            // a bit at the current depth its set in textNodeStack.
-            // characters() checks the top bit to determine, if the
-            // actual node is a text node to print out nodeDepth contains
-            // the depth of the current node and also marks top of stack.
-            assert nodeDepth >= 0;
-
-            // Set styles
-            if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
-                String family = attrs.getValue(STYLE_NS, "family");
-                if ("text".equals(family)) {
-                    textStyle = new TextStyle();
-                    String name = attrs.getValue(STYLE_NS, "name");
-                    textStyleMap.put(name, textStyle);
-                }
-            } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
-                listStyle = new ListStyle();
-                String name = attrs.getValue(STYLE_NS, "name");
-                listStyleMap.put(name, listStyle);
-            } else if (textStyle != null && STYLE_NS.equals(namespaceURI)
-                    && "text-properties".equals(localName)) {
-                String fontStyle = attrs.getValue(FORMATTING_OBJECTS_NS, "font-style");
-                if ("italic".equals(fontStyle) || "oblique".equals(fontStyle)) {
-                    textStyle.italic = true;
-                }
-                String fontWeight = attrs.getValue(FORMATTING_OBJECTS_NS, "font-weight");
-                if ("bold".equals(fontWeight) || "bolder".equals(fontWeight)
-                        || (fontWeight != null && Character.isDigit(fontWeight.charAt(0))
-                        && Integer.valueOf(fontWeight) > 500)) {
-                    textStyle.bold = true;
-                }
-                String underlineStyle = attrs.getValue(STYLE_NS, "text-underline-style");
-                if (underlineStyle != null) {
-                    textStyle.underlined = true;
-                }
-            } else if (listStyle != null && TEXT_NS.equals(namespaceURI)) {
-                if ("list-level-style-bullet".equals(localName)) {
-                    listStyle.ordered = false;
-                } else if ("list-level-style-number".equals(localName)) {
-                    listStyle.ordered = true;
-                }
-            }
-
-            textNodeStack.set(nodeDepth++,
-                    isTextNode(namespaceURI, localName));
-            // filter *all* content of some tags
-            assert completelyFiltered >= 0;
-
-            if (needsCompleteFiltering(namespaceURI, localName)) {
-                completelyFiltered++;
-            }
-            // call next handler if no filtering
-            if (completelyFiltered == 0) {
-                // special handling of text:h, that are directly passed
-                // to incoming handler
-                if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
-                    final String el = headingStack.push(getXHTMLHeaderTagName(attrs));
-                    handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES);
-                } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
-                    startList(attrs.getValue(TEXT_NS, "style-name"));
-                } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
-                    startSpan(attrs.getValue(TEXT_NS, "style-name"));
-                } else {
-                    super.startElement(namespaceURI, localName, qName, attrs);
-                }
-            }
-        }
-
-        @Override
-        public void endElement(
-                String namespaceURI, String localName, String qName)
-                throws SAXException {
-            if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
-                textStyle = null;
-            } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
-                listStyle = null;
-            }
-
-            // call next handler if no filtering
-            if (completelyFiltered == 0) {
-                // special handling of text:h, that are directly passed
-                // to incoming handler
-                if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
-                    final String el = headingStack.pop();
-                    handler.endElement(XHTMLContentHandler.XHTML, el, el);
-                } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
-                    endList();
-                } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
-                    endSpan();
-                } else {
-                    if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
-                        lazyEndSpan();
-                    }
-                    super.endElement(namespaceURI, localName, qName);
-                }
-
-                // special handling of tabulators
-                if (TEXT_NS.equals(namespaceURI)
-                        && ("tab-stop".equals(localName)
-                        || "tab".equals(localName))) {
-                    this.characters(TAB, 0, TAB.length);
-                }
-            }
-
-            // revert filter for *all* content of some tags
-            if (needsCompleteFiltering(namespaceURI, localName)) {
-                completelyFiltered--;
-            }
-            assert completelyFiltered >= 0;
-
-            // reduce current node depth
-            nodeDepth--;
-            assert nodeDepth >= 0;
-        }
-
-        @Override
-        public void startPrefixMapping(String prefix, String uri) {
-            // remove prefix mappings as they should not occur in XHTML
-        }
-
-        @Override
-        public void endPrefixMapping(String prefix) {
-            // remove prefix mappings as they should not occur in XHTML
-        }
-    }
-
-    public static final String TEXT_NS =
-            "urn:oasis:names:tc:opendocument:xmlns:text:1.0";
-
-    public static final String TABLE_NS =
-            "urn:oasis:names:tc:opendocument:xmlns:table:1.0";
-
-    public static final String STYLE_NS =
-            "urn:oasis:names:tc:opendocument:xmlns:style:1.0";
-
-    public static final String FORMATTING_OBJECTS_NS =
-            "urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0";
-
-    public static final String OFFICE_NS =
-            "urn:oasis:names:tc:opendocument:xmlns:office:1.0";
-
-    public static final String SVG_NS =
-            "urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0";
-
-    public static final String PRESENTATION_NS =
-            "urn:oasis:names:tc:opendocument:xmlns:presentation:1.0";
-
-    public static final String DRAW_NS =
-            "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0";
-
-    public static final String XLINK_NS = "http://www.w3.org/1999/xlink";
-
-    protected static final char[] TAB = new char[]{'\t'};
-
-    private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
-
-    /**
-     * Mappings between ODF tag names and XHTML tag names
-     * (including attributes). All other tag names/attributes are ignored
-     * and left out from event stream.
-     */
-    private static final HashMap<QName, TargetElement> MAPPINGS =
-            new HashMap<QName, TargetElement>();
-
-    static {
-        // general mappings of text:-tags
-        MAPPINGS.put(
-                new QName(TEXT_NS, "p"),
-                new TargetElement(XHTML, "p"));
-        // text:h-tags are mapped specifically in startElement/endElement
-        MAPPINGS.put(
-                new QName(TEXT_NS, "line-break"),
-                new TargetElement(XHTML, "br"));
-        MAPPINGS.put(
-                new QName(TEXT_NS, "list-item"),
-                new TargetElement(XHTML, "li"));
-        MAPPINGS.put(
-                new QName(TEXT_NS, "note"),
-                new TargetElement(XHTML, "div"));
-        MAPPINGS.put(
-                new QName(OFFICE_NS, "annotation"),
-                new TargetElement(XHTML, "div"));
-        MAPPINGS.put(
-                new QName(PRESENTATION_NS, "notes"),
-                new TargetElement(XHTML, "div"));
-        MAPPINGS.put(
-                new QName(DRAW_NS, "object"),
-                new TargetElement(XHTML, "object"));
-        MAPPINGS.put(
-                new QName(DRAW_NS, "text-box"),
-                new TargetElement(XHTML, "div"));
-        MAPPINGS.put(
-                new QName(SVG_NS, "title"),
-                new TargetElement(XHTML, "span"));
-        MAPPINGS.put(
-                new QName(SVG_NS, "desc"),
-                new TargetElement(XHTML, "span"));
-        MAPPINGS.put(
-                new QName(TEXT_NS, "span"),
-                new TargetElement(XHTML, "span"));
-
-        final HashMap<QName, QName> aAttsMapping =
-                new HashMap<QName, QName>();
-        aAttsMapping.put(
-                new QName(XLINK_NS, "href"),
-                new QName("href"));
-        aAttsMapping.put(
-                new QName(XLINK_NS, "title"),
-                new QName("title"));
-        MAPPINGS.put(
-                new QName(TEXT_NS, "a"),
-                new TargetElement(XHTML, "a", aAttsMapping));
-
-        // create HTML tables from table:-tags
-        MAPPINGS.put(
-                new QName(TABLE_NS, "table"),
-                new TargetElement(XHTML, "table"));
-        // repeating of rows is ignored; for columns, see below!
-        MAPPINGS.put(
-                new QName(TABLE_NS, "table-row"),
-                new TargetElement(XHTML, "tr"));
-        // special mapping for rowspan/colspan attributes
-        final HashMap<QName, QName> tableCellAttsMapping =
-                new HashMap<QName, QName>();
-        tableCellAttsMapping.put(
-                new QName(TABLE_NS, "number-columns-spanned"),
-                new QName("colspan"));
-        tableCellAttsMapping.put(
-                new QName(TABLE_NS, "number-rows-spanned"),
-                new QName("rowspan"));
-        /* TODO: The following is not correct, the cell should be repeated not spanned!
-         * Code generates a HTML cell, spanning all repeated columns, to make the cell look correct.
-         * Problems may occur when both spanning and repeating is given, which is not allowed by spec.
-         * Cell spanning instead of repeating  is not a problem, because OpenOffice uses it
-         * only for empty cells.
-         */
-        tableCellAttsMapping.put(
-                new QName(TABLE_NS, "number-columns-repeated"),
-                new QName("colspan"));
-        MAPPINGS.put(
-                new QName(TABLE_NS, "table-cell"),
-                new TargetElement(XHTML, "td", tableCellAttsMapping));
-    }
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return Collections.emptySet(); // not a top-level parser
-    }
-
-    public void parse(
-            InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-        parseInternal(stream,
-                new XHTMLContentHandler(handler, metadata),
-                metadata, context);
-    }
-
-    void parseInternal(
-            InputStream stream, final ContentHandler handler,
-            Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-
-        DefaultHandler dh = new OpenDocumentElementMappingContentHandler(handler, MAPPINGS);
-
-
-        SAXParser parser = context.getSAXParser();
-        parser.parse(
-            new CloseShieldInputStream(stream),
-            new OfflineContentHandler(
-                    new NSNormalizerContentHandler(dh)));
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
+
+import javax.xml.namespace.QName;
+import javax.xml.parsers.SAXParser;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.BitSet;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.Stack;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.ElementMappingContentHandler;
+import org.apache.tika.sax.ElementMappingContentHandler.TargetElement;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Parser for ODF <code>content.xml</code> files.
+ */
+public class OpenDocumentContentParser extends AbstractParser {
+    private interface Style {
+    }
+
+    private static class TextStyle implements Style {
+        public boolean italic;
+        public boolean bold;
+        public boolean underlined;
+    }
+
+    private static class ListStyle implements Style {
+        public boolean ordered;
+
+        public String getTag() {
+            return ordered ? "ol" : "ul";
+        }
+    }
+
+    private static final class OpenDocumentElementMappingContentHandler extends
+            ElementMappingContentHandler {
+        private final ContentHandler handler;
+        private final BitSet textNodeStack = new BitSet();
+        private int nodeDepth = 0;
+        private int completelyFiltered = 0;
+        private Stack<String> headingStack = new Stack<String>();
+        private Map<String, TextStyle> textStyleMap = new HashMap<String, TextStyle>();
+        private Map<String, ListStyle> listStyleMap = new HashMap<String, ListStyle>();
+        private TextStyle textStyle;
+        private TextStyle lastTextStyle;
+        private Stack<ListStyle> listStyleStack = new Stack<ListStyle>();
+        private ListStyle listStyle;
+
+        private OpenDocumentElementMappingContentHandler(ContentHandler handler,
+                                                         Map<QName, TargetElement> mappings) {
+            super(handler, mappings);
+            this.handler = handler;
+        }
+
+        @Override
+        public void characters(char[] ch, int start, int length)
+                throws SAXException {
+            // only forward content of tags from text:-namespace
+            if (completelyFiltered == 0 && nodeDepth > 0
+                    && textNodeStack.get(nodeDepth - 1)) {
+                lazyEndSpan();
+                super.characters(ch, start, length);
+            }
+        }
+
+        // helper for checking tags which need complete filtering
+        // (with sub-tags)
+        private boolean needsCompleteFiltering(
+                String namespaceURI, String localName) {
+            if (TEXT_NS.equals(namespaceURI)) {
+                return localName.endsWith("-template")
+                        || localName.endsWith("-style");
+            }
+            return TABLE_NS.equals(namespaceURI) && "covered-table-cell".equals(localName);
+        }
+
+        // map the heading level to <hX> HTML tags
+        private String getXHTMLHeaderTagName(Attributes atts) {
+            String depthStr = atts.getValue(TEXT_NS, "outline-level");
+            if (depthStr == null) {
+                return "h1";
+            }
+
+            int depth = Integer.parseInt(depthStr);
+            if (depth >= 6) {
+                return "h6";
+            } else if (depth <= 1) {
+                return "h1";
+            } else {
+                return "h" + depth;
+            }
+        }
+
+        /**
+         * Check if a node is a text node
+         */
+        private boolean isTextNode(String namespaceURI, String localName) {
+            if (TEXT_NS.equals(namespaceURI) && !localName.equals("page-number") && !localName.equals("page-count")) {
+                return true;
+            }
+            if (SVG_NS.equals(namespaceURI)) {
+                return "title".equals(localName) ||
+                        "desc".equals(localName);
+            }
+            return false;
+        }
+
+        private void startList(String name) throws SAXException {
+            String elementName = "ul";
+            if (name != null) {
+                ListStyle style = listStyleMap.get(name);
+                elementName = style != null ? style.getTag() : "ul";
+                listStyleStack.push(style);
+            }
+            handler.startElement(XHTML, elementName, elementName, EMPTY_ATTRIBUTES);
+        }
+
+        private void endList() throws SAXException {
+            String elementName = "ul";
+            if (!listStyleStack.isEmpty()) {
+                ListStyle style = listStyleStack.pop();
+                elementName = style != null ? style.getTag() : "ul";
+            }
+            handler.endElement(XHTML, elementName, elementName);
+        }
+
+        private void startSpan(String name) throws SAXException {
+            if (name == null) {
+                return;
+            }
+
+            TextStyle style = textStyleMap.get(name);
+            if (style == null) {
+                return;
+            }
+
+            // End tags that refer to no longer valid styles
+            if (!style.underlined && lastTextStyle != null && lastTextStyle.underlined) {
+                handler.endElement(XHTML, "u", "u");
+            }
+            if (!style.italic && lastTextStyle != null && lastTextStyle.italic) {
+                handler.endElement(XHTML, "i", "i");
+            }
+            if (!style.bold && lastTextStyle != null && lastTextStyle.bold) {
+                handler.endElement(XHTML, "b", "b");
+            }
+
+            // Start tags for new styles
+            if (style.bold && (lastTextStyle == null || !lastTextStyle.bold)) {
+                handler.startElement(XHTML, "b", "b", EMPTY_ATTRIBUTES);
+            }
+            if (style.italic && (lastTextStyle == null || !lastTextStyle.italic)) {
+                handler.startElement(XHTML, "i", "i", EMPTY_ATTRIBUTES);
+            }
+            if (style.underlined && (lastTextStyle == null || !lastTextStyle.underlined)) {
+                handler.startElement(XHTML, "u", "u", EMPTY_ATTRIBUTES);
+            }
+
+            textStyle = style;
+            lastTextStyle = null;
+        }
+
+        private void endSpan() throws SAXException {
+            lastTextStyle = textStyle;
+            textStyle = null;
+        }
+
+        private void lazyEndSpan() throws SAXException {
+            if (lastTextStyle == null) {
+                return;
+            }
+
+            if (lastTextStyle.underlined) {
+                handler.endElement(XHTML, "u", "u");
+            }
+            if (lastTextStyle.italic) {
+                handler.endElement(XHTML, "i", "i");
+            }
+            if (lastTextStyle.bold) {
+                handler.endElement(XHTML, "b", "b");
+            }
+
+            lastTextStyle = null;
+        }
+
+        @Override
+        public void startElement(
+                String namespaceURI, String localName, String qName,
+                Attributes attrs) throws SAXException {
+            // keep track of current node type. If it is a text node,
+            // a bit at the current depth is set in textNodeStack.
+            // characters() checks the top bit to determine whether the
+            // current node is a text node to print out. nodeDepth contains
+            // the depth of the current node and also marks top of stack.
+            assert nodeDepth >= 0;
+
+            // Set styles
+            if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
+                String family = attrs.getValue(STYLE_NS, "family");
+                if ("text".equals(family)) {
+                    textStyle = new TextStyle();
+                    String name = attrs.getValue(STYLE_NS, "name");
+                    textStyleMap.put(name, textStyle);
+                }
+            } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
+                listStyle = new ListStyle();
+                String name = attrs.getValue(STYLE_NS, "name");
+                listStyleMap.put(name, listStyle);
+            } else if (textStyle != null && STYLE_NS.equals(namespaceURI)
+                    && "text-properties".equals(localName)) {
+                String fontStyle = attrs.getValue(FORMATTING_OBJECTS_NS, "font-style");
+                if ("italic".equals(fontStyle) || "oblique".equals(fontStyle)) {
+                    textStyle.italic = true;
+                }
+                String fontWeight = attrs.getValue(FORMATTING_OBJECTS_NS, "font-weight");
+                if ("bold".equals(fontWeight) || "bolder".equals(fontWeight)
+                        || (fontWeight != null && Character.isDigit(fontWeight.charAt(0))
+                        && Integer.valueOf(fontWeight) > 500)) {
+                    textStyle.bold = true;
+                }
+                String underlineStyle = attrs.getValue(STYLE_NS, "text-underline-style");
+                if (underlineStyle != null) {
+                    textStyle.underlined = true;
+                }
+            } else if (listStyle != null && TEXT_NS.equals(namespaceURI)) {
+                if ("list-level-style-bullet".equals(localName)) {
+                    listStyle.ordered = false;
+                } else if ("list-level-style-number".equals(localName)) {
+                    listStyle.ordered = true;
+                }
+            }
+
+            textNodeStack.set(nodeDepth++,
+                    isTextNode(namespaceURI, localName));
+            // filter *all* content of some tags
+            assert completelyFiltered >= 0;
+
+            if (needsCompleteFiltering(namespaceURI, localName)) {
+                completelyFiltered++;
+            }
+            // call next handler if no filtering
+            if (completelyFiltered == 0) {
+                // special handling of text:h, that are directly passed
+                // to incoming handler
+                if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
+                    final String el = headingStack.push(getXHTMLHeaderTagName(attrs));
+                    handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES);
+                } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
+                    startList(attrs.getValue(TEXT_NS, "style-name"));
+                } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
+                    startSpan(attrs.getValue(TEXT_NS, "style-name"));
+                } else {
+                    super.startElement(namespaceURI, localName, qName, attrs);
+                }
+            }
+        }
+
+        @Override
+        public void endElement(
+                String namespaceURI, String localName, String qName)
+                throws SAXException {
+            if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
+                textStyle = null;
+            } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
+                listStyle = null;
+            }
+
+            // call next handler if no filtering
+            if (completelyFiltered == 0) {
+                // special handling of text:h, that are directly passed
+                // to incoming handler
+                if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
+                    final String el = headingStack.pop();
+                    handler.endElement(XHTMLContentHandler.XHTML, el, el);
+                } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
+                    endList();
+                } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
+                    endSpan();
+                } else {
+                    if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
+                        lazyEndSpan();
+                    }
+                    super.endElement(namespaceURI, localName, qName);
+                }
+
+                // special handling of tabulators
+                if (TEXT_NS.equals(namespaceURI)
+                        && ("tab-stop".equals(localName)
+                        || "tab".equals(localName))) {
+                    this.characters(TAB, 0, TAB.length);
+                }
+            }
+
+            // revert filter for *all* content of some tags
+            if (needsCompleteFiltering(namespaceURI, localName)) {
+                completelyFiltered--;
+            }
+            assert completelyFiltered >= 0;
+
+            // reduce current node depth
+            nodeDepth--;
+            assert nodeDepth >= 0;
+        }
+
+        @Override
+        public void startPrefixMapping(String prefix, String uri) {
+            // remove prefix mappings as they should not occur in XHTML
+        }
+
+        @Override
+        public void endPrefixMapping(String prefix) {
+            // remove prefix mappings as they should not occur in XHTML
+        }
+    }
+
+    public static final String TEXT_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:text:1.0";
+
+    public static final String TABLE_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:table:1.0";
+
+    public static final String STYLE_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:style:1.0";
+
+    public static final String FORMATTING_OBJECTS_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0";
+
+    public static final String OFFICE_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:office:1.0";
+
+    public static final String SVG_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0";
+
+    public static final String PRESENTATION_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:presentation:1.0";
+
+    public static final String DRAW_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0";
+
+    public static final String XLINK_NS = "http://www.w3.org/1999/xlink";
+
+    protected static final char[] TAB = new char[]{'\t'};
+
+    private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
+
+    /**
+     * Mappings between ODF tag names and XHTML tag names
+     * (including attributes). All other tag names/attributes are ignored
+     * and left out from event stream.
+     */
+    private static final HashMap<QName, TargetElement> MAPPINGS =
+            new HashMap<QName, TargetElement>();
+
+    static {
+        // general mappings of text:-tags
+        MAPPINGS.put(
+                new QName(TEXT_NS, "p"),
+                new TargetElement(XHTML, "p"));
+        // text:h-tags are mapped specifically in startElement/endElement
+        MAPPINGS.put(
+                new QName(TEXT_NS, "line-break"),
+                new TargetElement(XHTML, "br"));
+        MAPPINGS.put(
+                new QName(TEXT_NS, "list-item"),
+                new TargetElement(XHTML, "li"));
+        MAPPINGS.put(
+                new QName(TEXT_NS, "note"),
+                new TargetElement(XHTML, "div"));
+        MAPPINGS.put(
+                new QName(OFFICE_NS, "annotation"),
+                new TargetElement(XHTML, "div"));
+        MAPPINGS.put(
+                new QName(PRESENTATION_NS, "notes"),
+                new TargetElement(XHTML, "div"));
+        MAPPINGS.put(
+                new QName(DRAW_NS, "object"),
+                new TargetElement(XHTML, "object"));
+        MAPPINGS.put(
+                new QName(DRAW_NS, "text-box"),
+                new TargetElement(XHTML, "div"));
+        MAPPINGS.put(
+                new QName(SVG_NS, "title"),
+                new TargetElement(XHTML, "span"));
+        MAPPINGS.put(
+                new QName(SVG_NS, "desc"),
+                new TargetElement(XHTML, "span"));
+        MAPPINGS.put(
+                new QName(TEXT_NS, "span"),
+                new TargetElement(XHTML, "span"));
+
+        final HashMap<QName, QName> aAttsMapping =
+                new HashMap<QName, QName>();
+        aAttsMapping.put(
+                new QName(XLINK_NS, "href"),
+                new QName("href"));
+        aAttsMapping.put(
+                new QName(XLINK_NS, "title"),
+                new QName("title"));
+        MAPPINGS.put(
+                new QName(TEXT_NS, "a"),
+                new TargetElement(XHTML, "a", aAttsMapping));
+
+        // create HTML tables from table:-tags
+        MAPPINGS.put(
+                new QName(TABLE_NS, "table"),
+                new TargetElement(XHTML, "table"));
+        // repeating of rows is ignored; for columns, see below!
+        MAPPINGS.put(
+                new QName(TABLE_NS, "table-row"),
+                new TargetElement(XHTML, "tr"));
+        // special mapping for rowspan/colspan attributes
+        final HashMap<QName, QName> tableCellAttsMapping =
+                new HashMap<QName, QName>();
+        tableCellAttsMapping.put(
+                new QName(TABLE_NS, "number-columns-spanned"),
+                new QName("colspan"));
+        tableCellAttsMapping.put(
+                new QName(TABLE_NS, "number-rows-spanned"),
+                new QName("rowspan"));
+        /* TODO: The following is not correct, the cell should be repeated not spanned!
+         * Code generates a HTML cell, spanning all repeated columns, to make the cell look correct.
+         * Problems may occur when both spanning and repeating is given, which is not allowed by spec.
+         * Cell spanning instead of repeating  is not a problem, because OpenOffice uses it
+         * only for empty cells.
+         */
+        tableCellAttsMapping.put(
+                new QName(TABLE_NS, "number-columns-repeated"),
+                new QName("colspan"));
+        MAPPINGS.put(
+                new QName(TABLE_NS, "table-cell"),
+                new TargetElement(XHTML, "td", tableCellAttsMapping));
+    }
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return Collections.emptySet(); // not a top-level parser
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        parseInternal(stream,
+                new XHTMLContentHandler(handler, metadata),
+                metadata, context);
+    }
+
+    void parseInternal(
+            InputStream stream, final ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        DefaultHandler dh = new OpenDocumentElementMappingContentHandler(handler, MAPPINGS);
+
+
+        SAXParser parser = context.getSAXParser();
+        parser.parse(
+            new CloseShieldInputStream(stream),
+            new OfflineContentHandler(
+                    new NSNormalizerContentHandler(dh)));
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
index 4713022..14b9674 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
@@ -1,199 +1,199 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.odf;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.DublinCore;
-import org.apache.tika.metadata.MSOffice;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLCore;
-import org.apache.tika.metadata.PagedText;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.xml.AttributeDependantMetadataHandler;
-import org.apache.tika.parser.xml.AttributeMetadataHandler;
-import org.apache.tika.parser.xml.ElementMetadataHandler;
-import org.apache.tika.parser.xml.MetadataHandler;
-import org.apache.tika.parser.xml.XMLParser;
-import org.apache.tika.sax.TeeContentHandler;
-import org.apache.tika.sax.xpath.CompositeMatcher;
-import org.apache.tika.sax.xpath.Matcher;
-import org.apache.tika.sax.xpath.MatchingContentHandler;
-import org.apache.tika.sax.xpath.XPathParser;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Parser for OpenDocument <code>meta.xml</code> files.
- */
-public class OpenDocumentMetaParser extends XMLParser {
-    /**
-     * Serial version UID
-     */
-    private static final long serialVersionUID = -8739250869531737584L;
-
-    private static final String META_NS = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0";
-    private static final XPathParser META_XPATH = new XPathParser("meta", META_NS);
-
-    /**
-     * @see OfficeOpenXMLCore#SUBJECT
-     * @deprecated use OfficeOpenXMLCore#SUBJECT
-     */
-    @Deprecated
-    private static final Property TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR =
-            Property.composite(Office.INITIAL_AUTHOR,
-                    new Property[]{Property.externalText("initial-creator")});
-
-    private static ContentHandler getDublinCoreHandler(
-            Metadata metadata, Property property, String element) {
-        return new ElementMetadataHandler(
-                DublinCore.NAMESPACE_URI_DC, element,
-                metadata, property);
-    }
-
-    private static ContentHandler getMeta(
-            ContentHandler ch, Metadata md, Property property, String element) {
-        Matcher matcher = new CompositeMatcher(
-                META_XPATH.parse("//meta:" + element),
-                META_XPATH.parse("//meta:" + element + "//text()"));
-        ContentHandler branch =
-                new MatchingContentHandler(new MetadataHandler(md, property), matcher);
-        return new TeeContentHandler(ch, branch);
-    }
-
-    private static ContentHandler getUserDefined(
-            ContentHandler ch, Metadata md) {
-        Matcher matcher = new CompositeMatcher(
-                META_XPATH.parse("//meta:user-defined/@meta:name"),
-                META_XPATH.parse("//meta:user-defined//text()"));
-        // eg <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes custom:Info1=Text1
-        ContentHandler branch = new MatchingContentHandler(
-                new AttributeDependantMetadataHandler(md, "meta:name", Metadata.USER_DEFINED_METADATA_NAME_PREFIX),
-                matcher);
-        return new TeeContentHandler(ch, branch);
-    }
-
-    @Deprecated
-    private static ContentHandler getStatistic(
-            ContentHandler ch, Metadata md, String name, String attribute) {
-        Matcher matcher =
-                META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
-        ContentHandler branch = new MatchingContentHandler(
-                new AttributeMetadataHandler(META_NS, attribute, md, name), matcher);
-        return new TeeContentHandler(ch, branch);
-    }
-
-    private static ContentHandler getStatistic(
-            ContentHandler ch, Metadata md, Property property, String attribute) {
-        Matcher matcher =
-                META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
-        ContentHandler branch = new MatchingContentHandler(
-                new AttributeMetadataHandler(META_NS, attribute, md, property), matcher);
-        return new TeeContentHandler(ch, branch);
-    }
-
-    protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context) {
-        // We can no longer extend DcXMLParser due to the handling of dc:subject and dc:date
-        // Process the Dublin Core Attributes 
-        ch = new TeeContentHandler(super.getContentHandler(ch, md, context),
-                getDublinCoreHandler(md, TikaCoreProperties.TITLE, "title"),
-                getDublinCoreHandler(md, TikaCoreProperties.CREATOR, "creator"),
-                getDublinCoreHandler(md, TikaCoreProperties.DESCRIPTION, "description"),
-                getDublinCoreHandler(md, TikaCoreProperties.PUBLISHER, "publisher"),
-                getDublinCoreHandler(md, TikaCoreProperties.CONTRIBUTOR, "contributor"),
-                getDublinCoreHandler(md, TikaCoreProperties.TYPE, "type"),
-                getDublinCoreHandler(md, TikaCoreProperties.FORMAT, "format"),
-                getDublinCoreHandler(md, TikaCoreProperties.IDENTIFIER, "identifier"),
-                getDublinCoreHandler(md, TikaCoreProperties.LANGUAGE, "language"),
-                getDublinCoreHandler(md, TikaCoreProperties.RIGHTS, "rights"));
-
-        // Process the OO Meta Attributes
-        ch = getMeta(ch, md, TikaCoreProperties.CREATED, "creation-date");
-        // ODF uses dc:date for modified
-        ch = new TeeContentHandler(ch, new ElementMetadataHandler(
-                DublinCore.NAMESPACE_URI_DC, "date",
-                md, TikaCoreProperties.MODIFIED));
-
-        // ODF uses dc:subject for description
-        ch = new TeeContentHandler(ch, new ElementMetadataHandler(
-                DublinCore.NAMESPACE_URI_DC, "subject",
-                md, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT));
-        ch = getMeta(ch, md, TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, "keyword");
-
-        ch = getMeta(ch, md, Property.externalText(MSOffice.EDIT_TIME), "editing-duration");
-        ch = getMeta(ch, md, Property.externalText("editing-cycles"), "editing-cycles");
-        ch = getMeta(ch, md, TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR, "initial-creator");
-        ch = getMeta(ch, md, Property.externalText("generator"), "generator");
-
-        // Process the user defined Meta Attributes
-        ch = getUserDefined(ch, md);
-
-        // Process the OO Statistics Attributes
-        ch = getStatistic(ch, md, Office.OBJECT_COUNT, "object-count");
-        ch = getStatistic(ch, md, Office.IMAGE_COUNT, "image-count");
-        ch = getStatistic(ch, md, Office.PAGE_COUNT, "page-count");
-        ch = getStatistic(ch, md, PagedText.N_PAGES, "page-count");
-        ch = getStatistic(ch, md, Office.TABLE_COUNT, "table-count");
-        ch = getStatistic(ch, md, Office.PARAGRAPH_COUNT, "paragraph-count");
-        ch = getStatistic(ch, md, Office.WORD_COUNT, "word-count");
-        ch = getStatistic(ch, md, Office.CHARACTER_COUNT, "character-count");
-
-        // Legacy, Tika-1.0 style attributes
-        // TODO Remove these in Tika 2.0
-        ch = getStatistic(ch, md, MSOffice.OBJECT_COUNT, "object-count");
-        ch = getStatistic(ch, md, MSOffice.IMAGE_COUNT, "image-count");
-        ch = getStatistic(ch, md, MSOffice.PAGE_COUNT, "page-count");
-        ch = getStatistic(ch, md, MSOffice.TABLE_COUNT, "table-count");
-        ch = getStatistic(ch, md, MSOffice.PARAGRAPH_COUNT, "paragraph-count");
-        ch = getStatistic(ch, md, MSOffice.WORD_COUNT, "word-count");
-        ch = getStatistic(ch, md, MSOffice.CHARACTER_COUNT, "character-count");
-
-        // Legacy Statistics Attributes, replaced with real keys above
-        // TODO Remove these shortly, eg after Tika 1.1 (TIKA-770)
-        ch = getStatistic(ch, md, "nbPage", "page-count");
-        ch = getStatistic(ch, md, "nbPara", "paragraph-count");
-        ch = getStatistic(ch, md, "nbWord", "word-count");
-        ch = getStatistic(ch, md, "nbCharacter", "character-count");
-        ch = getStatistic(ch, md, "nbTab", "table-count");
-        ch = getStatistic(ch, md, "nbObject", "object-count");
-        ch = getStatistic(ch, md, "nbImg", "image-count");
-
-        // Normalise the rest
-        ch = new NSNormalizerContentHandler(ch);
-        return ch;
-    }
-
-    @Override
-    public void parse(
-            InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-        super.parse(stream, handler, metadata, context);
-        // Copy subject to description for OO2
-        String odfSubject = metadata.get(OfficeOpenXMLCore.SUBJECT);
-        if (odfSubject != null && !odfSubject.equals("") &&
-                (metadata.get(TikaCoreProperties.DESCRIPTION) == null || metadata.get(TikaCoreProperties.DESCRIPTION).equals(""))) {
-            metadata.set(TikaCoreProperties.DESCRIPTION, odfSubject);
-        }
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.DublinCore;
+import org.apache.tika.metadata.MSOffice;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.PagedText;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.xml.AttributeDependantMetadataHandler;
+import org.apache.tika.parser.xml.AttributeMetadataHandler;
+import org.apache.tika.parser.xml.ElementMetadataHandler;
+import org.apache.tika.parser.xml.MetadataHandler;
+import org.apache.tika.parser.xml.XMLParser;
+import org.apache.tika.sax.TeeContentHandler;
+import org.apache.tika.sax.xpath.CompositeMatcher;
+import org.apache.tika.sax.xpath.Matcher;
+import org.apache.tika.sax.xpath.MatchingContentHandler;
+import org.apache.tika.sax.xpath.XPathParser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for OpenDocument <code>meta.xml</code> files.
+ */
+public class OpenDocumentMetaParser extends XMLParser {
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -8739250869531737584L;
+
+    private static final String META_NS = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0";
+    private static final XPathParser META_XPATH = new XPathParser("meta", META_NS);
+
+    /**
+     * @see OfficeOpenXMLCore#SUBJECT
+     * @deprecated use OfficeOpenXMLCore#SUBJECT
+     */
+    @Deprecated
+    private static final Property TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR =
+            Property.composite(Office.INITIAL_AUTHOR,
+                    new Property[]{Property.externalText("initial-creator")});
+
+    private static ContentHandler getDublinCoreHandler(
+            Metadata metadata, Property property, String element) {
+        return new ElementMetadataHandler(
+                DublinCore.NAMESPACE_URI_DC, element,
+                metadata, property);
+    }
+
+    private static ContentHandler getMeta(
+            ContentHandler ch, Metadata md, Property property, String element) {
+        Matcher matcher = new CompositeMatcher(
+                META_XPATH.parse("//meta:" + element),
+                META_XPATH.parse("//meta:" + element + "//text()"));
+        ContentHandler branch =
+                new MatchingContentHandler(new MetadataHandler(md, property), matcher);
+        return new TeeContentHandler(ch, branch);
+    }
+
+    private static ContentHandler getUserDefined(
+            ContentHandler ch, Metadata md) {
+        Matcher matcher = new CompositeMatcher(
+                META_XPATH.parse("//meta:user-defined/@meta:name"),
+                META_XPATH.parse("//meta:user-defined//text()"));
+        // eg <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes custom:Info1=Text1
+        ContentHandler branch = new MatchingContentHandler(
+                new AttributeDependantMetadataHandler(md, "meta:name", Metadata.USER_DEFINED_METADATA_NAME_PREFIX),
+                matcher);
+        return new TeeContentHandler(ch, branch);
+    }
+
+    @Deprecated
+    private static ContentHandler getStatistic(
+            ContentHandler ch, Metadata md, String name, String attribute) {
+        Matcher matcher =
+                META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
+        ContentHandler branch = new MatchingContentHandler(
+                new AttributeMetadataHandler(META_NS, attribute, md, name), matcher);
+        return new TeeContentHandler(ch, branch);
+    }
+
+    private static ContentHandler getStatistic(
+            ContentHandler ch, Metadata md, Property property, String attribute) {
+        Matcher matcher =
+                META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
+        ContentHandler branch = new MatchingContentHandler(
+                new AttributeMetadataHandler(META_NS, attribute, md, property), matcher);
+        return new TeeContentHandler(ch, branch);
+    }
+
+    protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context) {
+        // We can no longer extend DcXMLParser due to the handling of dc:subject and dc:date
+        // Process the Dublin Core Attributes 
+        ch = new TeeContentHandler(super.getContentHandler(ch, md, context),
+                getDublinCoreHandler(md, TikaCoreProperties.TITLE, "title"),
+                getDublinCoreHandler(md, TikaCoreProperties.CREATOR, "creator"),
+                getDublinCoreHandler(md, TikaCoreProperties.DESCRIPTION, "description"),
+                getDublinCoreHandler(md, TikaCoreProperties.PUBLISHER, "publisher"),
+                getDublinCoreHandler(md, TikaCoreProperties.CONTRIBUTOR, "contributor"),
+                getDublinCoreHandler(md, TikaCoreProperties.TYPE, "type"),
+                getDublinCoreHandler(md, TikaCoreProperties.FORMAT, "format"),
+                getDublinCoreHandler(md, TikaCoreProperties.IDENTIFIER, "identifier"),
+                getDublinCoreHandler(md, TikaCoreProperties.LANGUAGE, "language"),
+                getDublinCoreHandler(md, TikaCoreProperties.RIGHTS, "rights"));
+
+        // Process the OO Meta Attributes
+        ch = getMeta(ch, md, TikaCoreProperties.CREATED, "creation-date");
+        // ODF uses dc:date for modified
+        ch = new TeeContentHandler(ch, new ElementMetadataHandler(
+                DublinCore.NAMESPACE_URI_DC, "date",
+                md, TikaCoreProperties.MODIFIED));
+
+        // ODF uses dc:subject for description
+        ch = new TeeContentHandler(ch, new ElementMetadataHandler(
+                DublinCore.NAMESPACE_URI_DC, "subject",
+                md, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT));
+        ch = getMeta(ch, md, TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, "keyword");
+
+        ch = getMeta(ch, md, Property.externalText(MSOffice.EDIT_TIME), "editing-duration");
+        ch = getMeta(ch, md, Property.externalText("editing-cycles"), "editing-cycles");
+        ch = getMeta(ch, md, TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR, "initial-creator");
+        ch = getMeta(ch, md, Property.externalText("generator"), "generator");
+
+        // Process the user defined Meta Attributes
+        ch = getUserDefined(ch, md);
+
+        // Process the OO Statistics Attributes
+        ch = getStatistic(ch, md, Office.OBJECT_COUNT, "object-count");
+        ch = getStatistic(ch, md, Office.IMAGE_COUNT, "image-count");
+        ch = getStatistic(ch, md, Office.PAGE_COUNT, "page-count");
+        ch = getStatistic(ch, md, PagedText.N_PAGES, "page-count");
+        ch = getStatistic(ch, md, Office.TABLE_COUNT, "table-count");
+        ch = getStatistic(ch, md, Office.PARAGRAPH_COUNT, "paragraph-count");
+        ch = getStatistic(ch, md, Office.WORD_COUNT, "word-count");
+        ch = getStatistic(ch, md, Office.CHARACTER_COUNT, "character-count");
+
+        // Legacy, Tika-1.0 style attributes
+        // TODO Remove these in Tika 2.0
+        ch = getStatistic(ch, md, MSOffice.OBJECT_COUNT, "object-count");
+        ch = getStatistic(ch, md, MSOffice.IMAGE_COUNT, "image-count");
+        ch = getStatistic(ch, md, MSOffice.PAGE_COUNT, "page-count");
+        ch = getStatistic(ch, md, MSOffice.TABLE_COUNT, "table-count");
+        ch = getStatistic(ch, md, MSOffice.PARAGRAPH_COUNT, "paragraph-count");
+        ch = getStatistic(ch, md, MSOffice.WORD_COUNT, "word-count");
+        ch = getStatistic(ch, md, MSOffice.CHARACTER_COUNT, "character-count");
+
+        // Legacy Statistics Attributes, replaced with real keys above
+        // TODO Remove these shortly, eg after Tika 1.1 (TIKA-770)
+        ch = getStatistic(ch, md, "nbPage", "page-count");
+        ch = getStatistic(ch, md, "nbPara", "paragraph-count");
+        ch = getStatistic(ch, md, "nbWord", "word-count");
+        ch = getStatistic(ch, md, "nbCharacter", "character-count");
+        ch = getStatistic(ch, md, "nbTab", "table-count");
+        ch = getStatistic(ch, md, "nbObject", "object-count");
+        ch = getStatistic(ch, md, "nbImg", "image-count");
+
+        // Normalise the rest
+        ch = new NSNormalizerContentHandler(ch);
+        return ch;
+    }
+
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        super.parse(stream, handler, metadata, context);
+        // Copy subject to description for OO2
+        String odfSubject = metadata.get(OfficeOpenXMLCore.SUBJECT);
+        if (odfSubject != null && !odfSubject.equals("") &&
+                (metadata.get(TikaCoreProperties.DESCRIPTION) == null || metadata.get(TikaCoreProperties.DESCRIPTION).equals(""))) {
+            metadata.set(TikaCoreProperties.DESCRIPTION, odfSubject);
+        }
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index 2739340..00145d2 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -1,225 +1,225 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.odf;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.Enumeration;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipFile;
-import java.util.zip.ZipInputStream;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.EndDocumentShieldingContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-/**
- * OpenOffice parser
- */
-public class OpenDocumentParser extends AbstractParser {
-
-    /**
-     * Serial version UID
-     */
-    private static final long serialVersionUID = -6410276875438618287L;
-
-    private static final Set<MediaType> SUPPORTED_TYPES =
-            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
-                    MediaType.application("vnd.sun.xml.writer"),
-                    MediaType.application("vnd.oasis.opendocument.text"),
-                    MediaType.application("vnd.oasis.opendocument.graphics"),
-                    MediaType.application("vnd.oasis.opendocument.presentation"),
-                    MediaType.application("vnd.oasis.opendocument.spreadsheet"),
-                    MediaType.application("vnd.oasis.opendocument.chart"),
-                    MediaType.application("vnd.oasis.opendocument.image"),
-                    MediaType.application("vnd.oasis.opendocument.formula"),
-                    MediaType.application("vnd.oasis.opendocument.text-master"),
-                    MediaType.application("vnd.oasis.opendocument.text-web"),
-                    MediaType.application("vnd.oasis.opendocument.text-template"),
-                    MediaType.application("vnd.oasis.opendocument.graphics-template"),
-                    MediaType.application("vnd.oasis.opendocument.presentation-template"),
-                    MediaType.application("vnd.oasis.opendocument.spreadsheet-template"),
-                    MediaType.application("vnd.oasis.opendocument.chart-template"),
-                    MediaType.application("vnd.oasis.opendocument.image-template"),
-                    MediaType.application("vnd.oasis.opendocument.formula-template"),
-                    MediaType.application("x-vnd.oasis.opendocument.text"),
-                    MediaType.application("x-vnd.oasis.opendocument.graphics"),
-                    MediaType.application("x-vnd.oasis.opendocument.presentation"),
-                    MediaType.application("x-vnd.oasis.opendocument.spreadsheet"),
-                    MediaType.application("x-vnd.oasis.opendocument.chart"),
-                    MediaType.application("x-vnd.oasis.opendocument.image"),
-                    MediaType.application("x-vnd.oasis.opendocument.formula"),
-                    MediaType.application("x-vnd.oasis.opendocument.text-master"),
-                    MediaType.application("x-vnd.oasis.opendocument.text-web"),
-                    MediaType.application("x-vnd.oasis.opendocument.text-template"),
-                    MediaType.application("x-vnd.oasis.opendocument.graphics-template"),
-                    MediaType.application("x-vnd.oasis.opendocument.presentation-template"),
-                    MediaType.application("x-vnd.oasis.opendocument.spreadsheet-template"),
-                    MediaType.application("x-vnd.oasis.opendocument.chart-template"),
-                    MediaType.application("x-vnd.oasis.opendocument.image-template"),
-                    MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
-
-    private static final String META_NAME = "meta.xml";
-
-    private Parser meta = new OpenDocumentMetaParser();
-
-    private Parser content = new OpenDocumentContentParser();
-
-    public Parser getMetaParser() {
-        return meta;
-    }
-
-    public void setMetaParser(Parser meta) {
-        this.meta = meta;
-    }
-
-    public Parser getContentParser() {
-        return content;
-    }
-
-    public void setContentParser(Parser content) {
-        this.content = content;
-    }
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-    public void parse(
-            InputStream stream, ContentHandler baseHandler,
-            Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-
-        // Open the Zip stream
-        // Use a File if we can, and an already open zip is even better
-        ZipFile zipFile = null;
-        ZipInputStream zipStream = null;
-        if (stream instanceof TikaInputStream) {
-            TikaInputStream tis = (TikaInputStream) stream;
-            Object container = ((TikaInputStream) stream).getOpenContainer();
-            if (container instanceof ZipFile) {
-                zipFile = (ZipFile) container;
-            } else if (tis.hasFile()) {
-                zipFile = new ZipFile(tis.getFile());
-            } else {
-                zipStream = new ZipInputStream(stream);
-            }
-        } else {
-            zipStream = new ZipInputStream(stream);
-        }
-
-        // Prepare to handle the content
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);
-
-        // As we don't know which of the metadata or the content
-        //  we'll hit first, catch the endDocument call initially
-        EndDocumentShieldingContentHandler handler =
-                new EndDocumentShieldingContentHandler(xhtml);
-
-        if (zipFile != null) {
-            try {
-                handleZipFile(zipFile, metadata, context, handler);
-            } finally {
-                //Do we want to close silently == catch an exception here?
-                zipFile.close();
-            }
-        } else {
-            try {
-                handleZipStream(zipStream, metadata, context, handler);
-            } finally {
-                //Do we want to close silently == catch an exception here?
-                zipStream.close();
-            }
-        }
-
-        // Only now call the end document
-        if (handler.getEndDocumentWasCalled()) {
-            handler.reallyEndDocument();
-        }
-    }
-
-    private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context, EndDocumentShieldingContentHandler handler) throws IOException, TikaException, SAXException {
-        ZipEntry entry = zipStream.getNextEntry();
-        while (entry != null) {
-            handleZipEntry(entry, zipStream, metadata, context, handler);
-            entry = zipStream.getNextEntry();
-        }
-    }
-
-    private void handleZipFile(ZipFile zipFile, Metadata metadata,
-                               ParseContext context, EndDocumentShieldingContentHandler handler)
-            throws IOException, TikaException, SAXException {
-        // If we can, process the metadata first, then the
-        //  rest of the file afterwards (TIKA-1353)
-        // Only possible to guarantee that when opened from a file not a stream
-
-        ZipEntry entry = zipFile.getEntry(META_NAME);
-        if (entry != null) {
-            handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
-        }
-
-        Enumeration<? extends ZipEntry> entries = zipFile.entries();
-        while (entries.hasMoreElements()) {
-            entry = entries.nextElement();
-            if (!META_NAME.equals(entry.getName())) {
-                handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
-            }
-        }
-    }
-    private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
-                                ParseContext context, EndDocumentShieldingContentHandler handler)
-            throws IOException, SAXException, TikaException {
-        if (entry == null) return;
-
-        if (entry.getName().equals("mimetype")) {
-            String type = IOUtils.toString(zip, UTF_8);
-            metadata.set(Metadata.CONTENT_TYPE, type);
-        } else if (entry.getName().equals(META_NAME)) {
-            meta.parse(zip, new DefaultHandler(), metadata, context);
-        } else if (entry.getName().endsWith("content.xml")) {
-            if (content instanceof OpenDocumentContentParser) {
-                ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
-            } else {
-                // Foreign content parser was set:
-                content.parse(zip, handler, metadata, context);
-            }
-        } else if (entry.getName().endsWith("styles.xml")) {
-            if (content instanceof OpenDocumentContentParser) {
-                ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
-            } else {
-                // Foreign content parser was set:
-                content.parse(zip, handler, metadata, context);
-            }
-        }
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+import java.util.zip.ZipInputStream;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.EndDocumentShieldingContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * OpenOffice parser
+ */
+public class OpenDocumentParser extends AbstractParser {
+
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -6410276875438618287L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.application("vnd.sun.xml.writer"),
+                    MediaType.application("vnd.oasis.opendocument.text"),
+                    MediaType.application("vnd.oasis.opendocument.graphics"),
+                    MediaType.application("vnd.oasis.opendocument.presentation"),
+                    MediaType.application("vnd.oasis.opendocument.spreadsheet"),
+                    MediaType.application("vnd.oasis.opendocument.chart"),
+                    MediaType.application("vnd.oasis.opendocument.image"),
+                    MediaType.application("vnd.oasis.opendocument.formula"),
+                    MediaType.application("vnd.oasis.opendocument.text-master"),
+                    MediaType.application("vnd.oasis.opendocument.text-web"),
+                    MediaType.application("vnd.oasis.opendocument.text-template"),
+                    MediaType.application("vnd.oasis.opendocument.graphics-template"),
+                    MediaType.application("vnd.oasis.opendocument.presentation-template"),
+                    MediaType.application("vnd.oasis.opendocument.spreadsheet-template"),
+                    MediaType.application("vnd.oasis.opendocument.chart-template"),
+                    MediaType.application("vnd.oasis.opendocument.image-template"),
+                    MediaType.application("vnd.oasis.opendocument.formula-template"),
+                    MediaType.application("x-vnd.oasis.opendocument.text"),
+                    MediaType.application("x-vnd.oasis.opendocument.graphics"),
+                    MediaType.application("x-vnd.oasis.opendocument.presentation"),
+                    MediaType.application("x-vnd.oasis.opendocument.spreadsheet"),
+                    MediaType.application("x-vnd.oasis.opendocument.chart"),
+                    MediaType.application("x-vnd.oasis.opendocument.image"),
+                    MediaType.application("x-vnd.oasis.opendocument.formula"),
+                    MediaType.application("x-vnd.oasis.opendocument.text-master"),
+                    MediaType.application("x-vnd.oasis.opendocument.text-web"),
+                    MediaType.application("x-vnd.oasis.opendocument.text-template"),
+                    MediaType.application("x-vnd.oasis.opendocument.graphics-template"),
+                    MediaType.application("x-vnd.oasis.opendocument.presentation-template"),
+                    MediaType.application("x-vnd.oasis.opendocument.spreadsheet-template"),
+                    MediaType.application("x-vnd.oasis.opendocument.chart-template"),
+                    MediaType.application("x-vnd.oasis.opendocument.image-template"),
+                    MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
+
+    private static final String META_NAME = "meta.xml";
+
+    private Parser meta = new OpenDocumentMetaParser();
+
+    private Parser content = new OpenDocumentContentParser();
+
+    public Parser getMetaParser() {
+        return meta;
+    }
+
+    public void setMetaParser(Parser meta) {
+        this.meta = meta;
+    }
+
+    public Parser getContentParser() {
+        return content;
+    }
+
+    public void setContentParser(Parser content) {
+        this.content = content;
+    }
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler baseHandler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        // Open the Zip stream
+        // Use a File if we can, and an already open zip is even better
+        ZipFile zipFile = null;
+        ZipInputStream zipStream = null;
+        if (stream instanceof TikaInputStream) {
+            TikaInputStream tis = (TikaInputStream) stream;
+            Object container = ((TikaInputStream) stream).getOpenContainer();
+            if (container instanceof ZipFile) {
+                zipFile = (ZipFile) container;
+            } else if (tis.hasFile()) {
+                zipFile = new ZipFile(tis.getFile());
+            } else {
+                zipStream = new ZipInputStream(stream);
+            }
+        } else {
+            zipStream = new ZipInputStream(stream);
+        }
+
+        // Prepare to handle the content
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);
+
+        // As we don't know which of the metadata or the content
+        //  we'll hit first, catch the endDocument call initially
+        EndDocumentShieldingContentHandler handler =
+                new EndDocumentShieldingContentHandler(xhtml);
+
+        if (zipFile != null) {
+            try {
+                handleZipFile(zipFile, metadata, context, handler);
+            } finally {
+                //Do we want to close silently == catch an exception here?
+                zipFile.close();
+            }
+        } else {
+            try {
+                handleZipStream(zipStream, metadata, context, handler);
+            } finally {
+                //Do we want to close silently == catch an exception here?
+                zipStream.close();
+            }
+        }
+
+        // Only now call the end document
+        if (handler.getEndDocumentWasCalled()) {
+            handler.reallyEndDocument();
+        }
+    }
+
+    private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context, EndDocumentShieldingContentHandler handler) throws IOException, TikaException, SAXException {
+        ZipEntry entry = zipStream.getNextEntry();
+        while (entry != null) {
+            handleZipEntry(entry, zipStream, metadata, context, handler);
+            entry = zipStream.getNextEntry();
+        }
+    }
+
+    private void handleZipFile(ZipFile zipFile, Metadata metadata,
+                               ParseContext context, EndDocumentShieldingContentHandler handler)
+            throws IOException, TikaException, SAXException {
+        // If we can, process the metadata first, then the
+        //  rest of the file afterwards (TIKA-1353)
+        // Only possible to guarantee that when opened from a file not a stream
+
+        ZipEntry entry = zipFile.getEntry(META_NAME);
+        if (entry != null) {
+            handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
+        }
+
+        Enumeration<? extends ZipEntry> entries = zipFile.entries();
+        while (entries.hasMoreElements()) {
+            entry = entries.nextElement();
+            if (!META_NAME.equals(entry.getName())) {
+                handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
+            }
+        }
+    }
+    private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
+                                ParseContext context, EndDocumentShieldingContentHandler handler)
+            throws IOException, SAXException, TikaException {
+        if (entry == null) return;
+
+        if (entry.getName().equals("mimetype")) {
+            String type = IOUtils.toString(zip, UTF_8);
+            metadata.set(Metadata.CONTENT_TYPE, type);
+        } else if (entry.getName().equals(META_NAME)) {
+            meta.parse(zip, new DefaultHandler(), metadata, context);
+        } else if (entry.getName().endsWith("content.xml")) {
+            if (content instanceof OpenDocumentContentParser) {
+                ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
+            } else {
+                // Foreign content parser was set:
+                content.parse(zip, handler, metadata, context);
+            }
+        } else if (entry.getName().endsWith("styles.xml")) {
+            if (content instanceof OpenDocumentContentParser) {
+                ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
+            } else {
+                // Foreign content parser was set:
+                content.parse(zip, handler, metadata, context);
+            }
+        }
+    }
+}

[03/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java
index 45f0388..da046aa 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java
@@ -1,43 +1,43 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import java.util.Locale;
-
-/**
- * Alternative HTML mapping rules that pass the input HTML as-is without any
- * modifications.
- *
- * @since Apache Tika 0.8
- */
-public class IdentityHtmlMapper implements HtmlMapper {
-
-    public static final HtmlMapper INSTANCE = new IdentityHtmlMapper();
-
-    public boolean isDiscardElement(String name) {
-        return false;
-    }
-
-    public String mapSafeAttribute(String elementName, String attributeName) {
-        return attributeName.toLowerCase(Locale.ENGLISH);
-    }
-
-    public String mapSafeElement(String name) {
-        return name.toLowerCase(Locale.ENGLISH);
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.util.Locale;
+
+/**
+ * Alternative HTML mapping rules that pass the input HTML as-is without any
+ * modifications.
+ *
+ * @since Apache Tika 0.8
+ */
+public class IdentityHtmlMapper implements HtmlMapper {
+
+    public static final HtmlMapper INSTANCE = new IdentityHtmlMapper();
+
+    public boolean isDiscardElement(String name) {
+        return false;
+    }
+
+    public String mapSafeAttribute(String elementName, String attributeName) {
+        return attributeName.toLowerCase(Locale.ENGLISH);
+    }
+
+    public String mapSafeElement(String name) {
+        return name.toLowerCase(Locale.ENGLISH);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java
index 336ae75..221a87a 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java
@@ -1,78 +1,78 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import javax.xml.XMLConstants;
-import java.util.Locale;
-
-import org.apache.tika.sax.ContentHandlerDecorator;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-/**
- * Content handler decorator that downgrades XHTML elements to
- * old-style HTML elements before passing them on to the decorated
- * content handler. This downgrading consists of dropping all namespaces
- * (and namespaced attributes) and uppercasing all element names.
- * Used by the {@link HtmlParser} to make all incoming HTML look the same.
- */
-class XHTMLDowngradeHandler extends ContentHandlerDecorator {
-
-    public XHTMLDowngradeHandler(ContentHandler handler) {
-        super(handler);
-    }
-
-    @Override
-    public void startElement(
-            String uri, String localName, String name, Attributes atts)
-            throws SAXException {
-        String upper = localName.toUpperCase(Locale.ENGLISH);
-
-        AttributesImpl attributes = new AttributesImpl();
-        for (int i = 0; i < atts.getLength(); i++) {
-            String auri = atts.getURI(i);
-            String local = atts.getLocalName(i);
-            String qname = atts.getQName(i);
-            if (XMLConstants.NULL_NS_URI.equals(auri)
-                    && !local.equals(XMLConstants.XMLNS_ATTRIBUTE)
-                    && !qname.startsWith(XMLConstants.XMLNS_ATTRIBUTE + ":")) {
-                attributes.addAttribute(
-                        auri, local, qname, atts.getType(i), atts.getValue(i));
-            }
-        }
-
-        super.startElement(XMLConstants.NULL_NS_URI, upper, upper, attributes);
-    }
-
-    @Override
-    public void endElement(String uri, String localName, String name)
-            throws SAXException {
-        String upper = localName.toUpperCase(Locale.ENGLISH);
-        super.endElement(XMLConstants.NULL_NS_URI, upper, upper);
-    }
-
-    @Override
-    public void startPrefixMapping(String prefix, String uri) {
-    }
-
-    @Override
-    public void endPrefixMapping(String prefix) {
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import javax.xml.XMLConstants;
+import java.util.Locale;
+
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Content handler decorator that downgrades XHTML elements to
+ * old-style HTML elements before passing them on to the decorated
+ * content handler. This downgrading consists of dropping all namespaces
+ * (and namespaced attributes) and uppercasing all element names.
+ * Used by the {@link HtmlParser} to make all incoming HTML look the same.
+ */
+class XHTMLDowngradeHandler extends ContentHandlerDecorator {
+
+    public XHTMLDowngradeHandler(ContentHandler handler) {
+        super(handler);
+    }
+
+    @Override
+    public void startElement(
+            String uri, String localName, String name, Attributes atts)
+            throws SAXException {
+        String upper = localName.toUpperCase(Locale.ENGLISH);
+
+        AttributesImpl attributes = new AttributesImpl();
+        for (int i = 0; i < atts.getLength(); i++) {
+            String auri = atts.getURI(i);
+            String local = atts.getLocalName(i);
+            String qname = atts.getQName(i);
+            if (XMLConstants.NULL_NS_URI.equals(auri)
+                    && !local.equals(XMLConstants.XMLNS_ATTRIBUTE)
+                    && !qname.startsWith(XMLConstants.XMLNS_ATTRIBUTE + ":")) {
+                attributes.addAttribute(
+                        auri, local, qname, atts.getType(i), atts.getValue(i));
+            }
+        }
+
+        super.startElement(XMLConstants.NULL_NS_URI, upper, upper, attributes);
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String name)
+            throws SAXException {
+        String upper = localName.toUpperCase(Locale.ENGLISH);
+        super.endElement(XMLConstants.NULL_NS_URI, upper, upper);
+    }
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri) {
+    }
+
+    @Override
+    public void endPrefixMapping(String prefix) {
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index 9740eff..2c8942e 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -1,376 +1,376 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mail;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.text.DateFormat;
-import java.text.DateFormatSymbols;
-import java.text.ParseException;
-import java.text.SimpleDateFormat;
-import java.util.Date;
-import java.util.Locale;
-import java.util.TimeZone;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.james.mime4j.MimeException;
-import org.apache.james.mime4j.codec.DecodeMonitor;
-import org.apache.james.mime4j.codec.DecoderUtil;
-import org.apache.james.mime4j.dom.address.Address;
-import org.apache.james.mime4j.dom.address.AddressList;
-import org.apache.james.mime4j.dom.address.Mailbox;
-import org.apache.james.mime4j.dom.address.MailboxList;
-import org.apache.james.mime4j.dom.field.AddressListField;
-import org.apache.james.mime4j.dom.field.DateTimeField;
-import org.apache.james.mime4j.dom.field.MailboxListField;
-import org.apache.james.mime4j.dom.field.ParsedField;
-import org.apache.james.mime4j.dom.field.UnstructuredField;
-import org.apache.james.mime4j.field.LenientFieldParser;
-import org.apache.james.mime4j.parser.ContentHandler;
-import org.apache.james.mime4j.stream.BodyDescriptor;
-import org.apache.james.mime4j.stream.Field;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.SAXException;
-
-import static org.apache.tika.utils.DateUtils.MIDDAY;
-import static org.apache.tika.utils.DateUtils.UTC;
-
-/**
- * Bridge between mime4j's content handler and the generic Sax content handler
- * used by Tika. See
- * http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/ContentHandler.html
- */
-class MailContentHandler implements ContentHandler {
-
-    //TIKA-1970 Mac Mail's format
-    private static final Pattern GENERAL_TIME_ZONE_NO_MINUTES_PATTERN =
-            Pattern.compile("(?:UTC|GMT)([+-])(\\d?\\d)\\Z");
-
-    //find a time ending in am/pm without a space: 10:30am and
-    //use this pattern to insert space: 10:30 am
-    private static final Pattern AM_PM = Pattern.compile("(?i)(\\d)([ap]m)\\b");
-
-    private static final DateFormat[] ALTERNATE_DATE_FORMATS = new DateFormat[] {
-            //note that the string is "cleaned" before processing:
-            //1) condense multiple whitespace to single space
-            //2) trim()
-            //3) strip out commas
-            //4) insert space before am/pm
-
-            //May 16 2016 1:32am
-            createDateFormat("MMM dd yy hh:mm a", null),
-
-            //this is a standard pattern handled by mime4j;
-            //but mime4j fails with leading whitespace
-            createDateFormat("EEE d MMM yy HH:mm:ss Z", UTC),
-
-            createDateFormat("EEE d MMM yy HH:mm:ss z", UTC),
-
-            createDateFormat("EEE d MMM yy HH:mm:ss", null),// no timezone
-
-            createDateFormat("EEEEE MMM d yy hh:mm a", null),// Sunday, May 15 2016 1:32 PM
-
-            //16 May 2016 at 09:30:32  GMT+1 (Mac Mail TIKA-1970)
-            createDateFormat("d MMM yy 'at' HH:mm:ss z", UTC),   // UTC/Zulu
-
-            createDateFormat("yy-MM-dd HH:mm:ss", null),
-
-            createDateFormat("MM/dd/yy hh:mm a", null, false),
-
-            //now dates without times
-            createDateFormat("MMM d yy", MIDDAY, false),
-            createDateFormat("EEE d MMM yy", MIDDAY, false),
-            createDateFormat("d MMM yy", MIDDAY, false),
-            createDateFormat("yy/MM/dd", MIDDAY, false),
-            createDateFormat("MM/dd/yy", MIDDAY, false)
-    };
-
-    private static DateFormat createDateFormat(String format, TimeZone timezone) {
-        return createDateFormat(format, timezone, true);
-    }
-
-    private static DateFormat createDateFormat(String format, TimeZone timezone, boolean isLenient) {
-        SimpleDateFormat sdf =
-                new SimpleDateFormat(format, new DateFormatSymbols(Locale.US));
-        if (timezone != null) {
-            sdf.setTimeZone(timezone);
-        }
-        sdf.setLenient(isLenient);
-        return sdf;
-    }
-
-    private boolean strictParsing = false;
-
-    private XHTMLContentHandler handler;
-    private Metadata metadata;
-    private EmbeddedDocumentExtractor extractor;
-
-    private boolean inPart = false;
-
-    MailContentHandler(XHTMLContentHandler xhtml, Metadata metadata, ParseContext context, boolean strictParsing) {
-        this.handler = xhtml;
-        this.metadata = metadata;
-        this.strictParsing = strictParsing;
-
-        // Fetch / Build an EmbeddedDocumentExtractor with which
-        //  to handle/process the parts/attachments
-
-        // Was an EmbeddedDocumentExtractor explicitly supplied?
-        this.extractor = context.get(EmbeddedDocumentExtractor.class);
-
-        // If there's no EmbeddedDocumentExtractor, then try using a normal parser
-        // This will ensure that the contents are made available to the user, so
-        //  the see the text, but without fine-grained control/extraction
-        // (This also maintains backward compatibility with older versions!)
-        if (this.extractor == null) {
-            // If the user gave a parser, use that, if not the default
-            Parser parser = context.get(AutoDetectParser.class);
-            if (parser == null) {
-                parser = context.get(Parser.class);
-            }
-            if (parser == null) {
-                TikaConfig tikaConfig = context.get(TikaConfig.class);
-                if (tikaConfig == null) {
-                    tikaConfig = TikaConfig.getDefaultConfig();
-                }
-                parser = new AutoDetectParser(tikaConfig.getParser());
-            }
-            ParseContext ctx = new ParseContext();
-            ctx.set(Parser.class, parser);
-            extractor = new ParsingEmbeddedDocumentExtractor(ctx);
-        }
-    }
-
-    public void body(BodyDescriptor body, InputStream is) throws MimeException,
-            IOException {
-        // use a different metadata object
-        // in order to specify the mime type of the
-        // sub part without damaging the main metadata
-
-        Metadata submd = new Metadata();
-        submd.set(Metadata.CONTENT_TYPE, body.getMimeType());
-        submd.set(Metadata.CONTENT_ENCODING, body.getCharset());
-
-        try {
-            if (extractor.shouldParseEmbedded(submd)) {
-                extractor.parseEmbedded(is, handler, submd, false);
-            }
-        } catch (SAXException e) {
-            throw new MimeException(e);
-        }
-    }
-
-    public void endBodyPart() throws MimeException {
-        try {
-            handler.endElement("p");
-            handler.endElement("div");
-        } catch (SAXException e) {
-            throw new MimeException(e);
-        }
-    }
-
-    public void endHeader() throws MimeException {
-    }
-
-    public void startMessage() throws MimeException {
-        try {
-            handler.startDocument();
-        } catch (SAXException e) {
-            throw new MimeException(e);
-        }
-    }
-
-    public void endMessage() throws MimeException {
-        try {
-            handler.endDocument();
-        } catch (SAXException e) {
-            throw new MimeException(e);
-        }
-    }
-
-    public void endMultipart() throws MimeException {
-        inPart = false;
-    }
-
-    public void epilogue(InputStream is) throws MimeException, IOException {
-    }
-
-    /**
-     * Header for the whole message or its parts
-     *
-     * @see <a href="http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/">
-     *     http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/</a>
-     * Field.html
-     */
-    public void field(Field field) throws MimeException {
-        // inPart indicates whether these metadata correspond to the
-        // whole message or its parts
-        if (inPart) {
-            return;
-        }
-
-        try {
-            String fieldname = field.getName();
-            ParsedField parsedField = LenientFieldParser.getParser().parse(
-                    field, DecodeMonitor.SILENT);
-            if (fieldname.equalsIgnoreCase("From")) {
-                MailboxListField fromField = (MailboxListField) parsedField;
-                MailboxList mailboxList = fromField.getMailboxList();
-                if (fromField.isValidField() && mailboxList != null) {
-                    for (Address address : mailboxList) {
-                        String from = getDisplayString(address);
-                        metadata.add(Metadata.MESSAGE_FROM, from);
-                        metadata.add(TikaCoreProperties.CREATOR, from);
-                    }
-                } else {
-                    String from = stripOutFieldPrefix(field, "From:");
-                    if (from.startsWith("<")) {
-                        from = from.substring(1);
-                    }
-                    if (from.endsWith(">")) {
-                        from = from.substring(0, from.length() - 1);
-                    }
-                    metadata.add(Metadata.MESSAGE_FROM, from);
-                    metadata.add(TikaCoreProperties.CREATOR, from);
-                }
-            } else if (fieldname.equalsIgnoreCase("Subject")) {
-                metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE,
-                        ((UnstructuredField) parsedField).getValue());
-            } else if (fieldname.equalsIgnoreCase("To")) {
-                processAddressList(parsedField, "To:", Metadata.MESSAGE_TO);
-            } else if (fieldname.equalsIgnoreCase("CC")) {
-                processAddressList(parsedField, "Cc:", Metadata.MESSAGE_CC);
-            } else if (fieldname.equalsIgnoreCase("BCC")) {
-                processAddressList(parsedField, "Bcc:", Metadata.MESSAGE_BCC);
-            } else if (fieldname.equalsIgnoreCase("Date")) {
-                DateTimeField dateField = (DateTimeField) parsedField;
-                Date date = dateField.getDate();
-                if (date == null) {
-                    date = tryOtherDateFormats(field.getBody());
-                }
-                metadata.set(TikaCoreProperties.CREATED, date);
-            }
-        } catch (RuntimeException me) {
-            if (strictParsing) {
-                throw me;
-            }
-        }
-    }
-
-    private static synchronized Date tryOtherDateFormats(String text) {
-        if (text == null) {
-            return null;
-        }
-        //strip out additional spaces and trim
-        text = text.replaceAll("\\s+", " ").trim();
-
-        //strip out commas
-        text = text.replaceAll(",", "");
-        Matcher matcher = GENERAL_TIME_ZONE_NO_MINUTES_PATTERN.matcher(text);
-        if (matcher.find()) {
-            text = matcher.replaceFirst("GMT$1$2:00");
-        }
-
-        matcher = AM_PM.matcher(text);
-        if (matcher.find()) {
-            text = matcher.replaceFirst("$1 $2");
-        }
-
-        for (DateFormat format : ALTERNATE_DATE_FORMATS) {
-            try {
-                return format.parse(text);
-            } catch (ParseException e) {
-            }
-        }
-        return null;
-    }
-
-    private void processAddressList(ParsedField field, String addressListType,
-                                    String metadataField) throws MimeException {
-        AddressListField toField = (AddressListField) field;
-        if (toField.isValidField()) {
-            AddressList addressList = toField.getAddressList();
-            for (Address address : addressList) {
-                metadata.add(metadataField, getDisplayString(address));
-            }
-        } else {
-            String to = stripOutFieldPrefix(field,
-                    addressListType);
-            for (String eachTo : to.split(",")) {
-                metadata.add(metadataField, eachTo.trim());
-            }
-        }
-    }
-
-    private String getDisplayString(Address address) {
-        if (address instanceof Mailbox) {
-            Mailbox mailbox = (Mailbox) address;
-            String name = mailbox.getName();
-            if (name != null && name.length() > 0) {
-                name = DecoderUtil.decodeEncodedWords(name, DecodeMonitor.SILENT);
-                return name + " <" + mailbox.getAddress() + ">";
-            } else {
-                return mailbox.getAddress();
-            }
-        } else {
-            return address.toString();
-        }
-    }
-
-    public void preamble(InputStream is) throws MimeException, IOException {
-    }
-
-    public void raw(InputStream is) throws MimeException, IOException {
-    }
-
-    public void startBodyPart() throws MimeException {
-        try {
-            handler.startElement("div", "class", "email-entry");
-            handler.startElement("p");
-        } catch (SAXException e) {
-            throw new MimeException(e);
-        }
-    }
-
-    public void startHeader() throws MimeException {
-        // TODO Auto-generated method stub
-
-    }
-
-    public void startMultipart(BodyDescriptor descr) throws MimeException {
-        inPart = true;
-    }
-
-    private String stripOutFieldPrefix(Field field, String fieldname) {
-        String temp = field.getRaw().toString();
-        int loc = fieldname.length();
-        while (temp.charAt(loc) == ' ') {
-            loc++;
-        }
-        return temp.substring(loc);
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mail;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.DateFormat;
+import java.text.DateFormatSymbols;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.Locale;
+import java.util.TimeZone;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.james.mime4j.MimeException;
+import org.apache.james.mime4j.codec.DecodeMonitor;
+import org.apache.james.mime4j.codec.DecoderUtil;
+import org.apache.james.mime4j.dom.address.Address;
+import org.apache.james.mime4j.dom.address.AddressList;
+import org.apache.james.mime4j.dom.address.Mailbox;
+import org.apache.james.mime4j.dom.address.MailboxList;
+import org.apache.james.mime4j.dom.field.AddressListField;
+import org.apache.james.mime4j.dom.field.DateTimeField;
+import org.apache.james.mime4j.dom.field.MailboxListField;
+import org.apache.james.mime4j.dom.field.ParsedField;
+import org.apache.james.mime4j.dom.field.UnstructuredField;
+import org.apache.james.mime4j.field.LenientFieldParser;
+import org.apache.james.mime4j.parser.ContentHandler;
+import org.apache.james.mime4j.stream.BodyDescriptor;
+import org.apache.james.mime4j.stream.Field;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+import static org.apache.tika.utils.DateUtils.MIDDAY;
+import static org.apache.tika.utils.DateUtils.UTC;
+
+/**
+ * Bridge between mime4j's content handler and the generic Sax content handler
+ * used by Tika. See
+ * http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/ContentHandler.html
+ */
+class MailContentHandler implements ContentHandler {
+
+    //TIKA-1970 Mac Mail's format
+    //Matches a trailing UTC/GMT offset given in hours only (e.g. GMT+1);
+    //tryOtherDateFormats rewrites it with an explicit ":00" minutes part.
+    private static final Pattern GENERAL_TIME_ZONE_NO_MINUTES_PATTERN =
+            Pattern.compile("(?:UTC|GMT)([+-])(\\d?\\d)\\Z");
+
+    //find a time ending in am/pm without a space: 10:30am and
+    //use this pattern to insert space: 10:30 am
+    private static final Pattern AM_PM = Pattern.compile("(?i)(\\d)([ap]m)\\b");
+
+    //Fallback formats, tried in array order by tryOtherDateFormats when
+    //mime4j returns no date for the Date header.
+    //These are SimpleDateFormat instances, which are not thread-safe; they are
+    //only used inside the static synchronized tryOtherDateFormats method.
+    private static final DateFormat[] ALTERNATE_DATE_FORMATS = new DateFormat[] {
+            //note that the string is "cleaned" before processing:
+            //1) condense multiple whitespace to single space
+            //2) trim()
+            //3) strip out commas
+            //4) insert space before am/pm
+
+            //May 16 2016 1:32am
+            createDateFormat("MMM dd yy hh:mm a", null),
+
+            //this is a standard pattern handled by mime4j;
+            //but mime4j fails with leading whitespace
+            createDateFormat("EEE d MMM yy HH:mm:ss Z", UTC),
+
+            createDateFormat("EEE d MMM yy HH:mm:ss z", UTC),
+
+            createDateFormat("EEE d MMM yy HH:mm:ss", null),// no timezone
+
+            createDateFormat("EEEEE MMM d yy hh:mm a", null),// Sunday, May 15 2016 1:32 PM
+
+            //16 May 2016 at 09:30:32  GMT+1 (Mac Mail TIKA-1970)
+            createDateFormat("d MMM yy 'at' HH:mm:ss z", UTC),   // UTC/Zulu
+
+            createDateFormat("yy-MM-dd HH:mm:ss", null),
+
+            //strict (non-lenient) parsing from here on
+            createDateFormat("MM/dd/yy hh:mm a", null, false),
+
+            //now dates without times
+            createDateFormat("MMM d yy", MIDDAY, false),
+            createDateFormat("EEE d MMM yy", MIDDAY, false),
+            createDateFormat("d MMM yy", MIDDAY, false),
+            createDateFormat("yy/MM/dd", MIDDAY, false),
+            createDateFormat("MM/dd/yy", MIDDAY, false)
+    };
+
+    /**
+     * Builds a lenient date format; shorthand for the three-argument overload.
+     *
+     * @param format   a {@link SimpleDateFormat} pattern
+     * @param timezone time zone to apply, or {@code null} to keep the default
+     */
+    private static DateFormat createDateFormat(String format, TimeZone timezone) {
+        final boolean lenient = true;
+        return createDateFormat(format, timezone, lenient);
+    }
+
+    /**
+     * Builds a date format that uses US date symbols.
+     *
+     * @param format    a {@link SimpleDateFormat} pattern
+     * @param timezone  time zone to apply, or {@code null} to keep the default
+     * @param isLenient whether parsing should be lenient
+     */
+    private static DateFormat createDateFormat(String format, TimeZone timezone, boolean isLenient) {
+        DateFormatSymbols usSymbols = new DateFormatSymbols(Locale.US);
+        SimpleDateFormat result = new SimpleDateFormat(format, usSymbols);
+        result.setLenient(isLenient);
+        if (timezone != null) {
+            result.setTimeZone(timezone);
+        }
+        return result;
+    }
+
+    // when true, a RuntimeException raised while handling a header is rethrown
+    // by field(Field) instead of being dropped
+    private boolean strictParsing = false;
+
+    // SAX output that receives the document and body-part elements
+    private XHTMLContentHandler handler;
+    // metadata that field(Field) fills from the message headers
+    private Metadata metadata;
+    // processes the content of each body passed to body(...)
+    private EmbeddedDocumentExtractor extractor;
+
+    // set by startMultipart and cleared by endMultipart; while set,
+    // field(Field) skips the headers it is given
+    private boolean inPart = false;
+
+    /**
+     * Creates a handler that writes content to {@code xhtml} and header
+     * values to {@code metadata}.
+     * <p>
+     * Body parts are processed by the {@link EmbeddedDocumentExtractor} found
+     * in {@code context}. When none is supplied, a
+     * {@link ParsingEmbeddedDocumentExtractor} is built around the first of:
+     * the context's {@link AutoDetectParser}, the context's {@link Parser},
+     * or a new {@link AutoDetectParser} based on the context's
+     * {@link TikaConfig} (the default config if that is absent too).
+     *
+     * @param xhtml         output for the message content
+     * @param metadata      metadata to populate from the headers
+     * @param context       parse context consulted for extractor, parser and config
+     * @param strictParsing whether header handling failures are rethrown
+     */
+    MailContentHandler(XHTMLContentHandler xhtml, Metadata metadata, ParseContext context, boolean strictParsing) {
+        this.handler = xhtml;
+        this.metadata = metadata;
+        this.strictParsing = strictParsing;
+
+        // An explicitly supplied extractor always wins.
+        EmbeddedDocumentExtractor supplied = context.get(EmbeddedDocumentExtractor.class);
+        if (supplied != null) {
+            this.extractor = supplied;
+            return;
+        }
+        this.extractor = null;
+
+        // No extractor given: fall back to a plain parser so that the contents
+        // are still made available to the user (so they see the text), just
+        // without fine-grained control over extraction. This also keeps
+        // backward compatibility with older versions.
+        Parser fallback = context.get(AutoDetectParser.class);
+        if (fallback == null) {
+            fallback = context.get(Parser.class);
+        }
+        if (fallback == null) {
+            TikaConfig config = context.get(TikaConfig.class);
+            if (config == null) {
+                config = TikaConfig.getDefaultConfig();
+            }
+            fallback = new AutoDetectParser(config.getParser());
+        }
+
+        ParseContext embeddedContext = new ParseContext();
+        embeddedContext.set(Parser.class, fallback);
+        this.extractor = new ParsingEmbeddedDocumentExtractor(embeddedContext);
+    }
+
+    /**
+     * Hands one body to the embedded document extractor.
+     * <p>
+     * The body's MIME type and charset are put into a fresh {@link Metadata}
+     * object so that the main message metadata is left untouched.
+     *
+     * @param body descriptor providing the MIME type and charset of the body
+     * @param is   stream with the body content
+     * @throws MimeException wrapping any {@link SAXException} from the extractor
+     */
+    public void body(BodyDescriptor body, InputStream is) throws MimeException,
+            IOException {
+        Metadata partMetadata = new Metadata();
+        partMetadata.set(Metadata.CONTENT_TYPE, body.getMimeType());
+        partMetadata.set(Metadata.CONTENT_ENCODING, body.getCharset());
+
+        try {
+            if (!extractor.shouldParseEmbedded(partMetadata)) {
+                return;
+            }
+            extractor.parseEmbedded(is, handler, partMetadata, false);
+        } catch (SAXException saxFailure) {
+            throw new MimeException(saxFailure);
+        }
+    }
+
+    /**
+     * Closes the {@code p} and {@code div} elements opened by
+     * {@link #startBodyPart()}.
+     *
+     * @throws MimeException wrapping any {@link SAXException} from the output handler
+     */
+    public void endBodyPart() throws MimeException {
+        try {
+            // innermost element first
+            handler.endElement("p");
+            handler.endElement("div");
+        } catch (SAXException saxFailure) {
+            throw new MimeException(saxFailure);
+        }
+    }
+
+    /**
+     * No-op: nothing is done when a header block ends.
+     */
+    public void endHeader() throws MimeException {
+    }
+
+    /**
+     * Starts the output document when a message begins.
+     *
+     * @throws MimeException wrapping any {@link SAXException} from the output handler
+     */
+    public void startMessage() throws MimeException {
+        try {
+            handler.startDocument();
+        } catch (SAXException saxFailure) {
+            throw new MimeException(saxFailure);
+        }
+    }
+
+    /**
+     * Ends the output document when a message ends.
+     *
+     * @throws MimeException wrapping any {@link SAXException} from the output handler
+     */
+    public void endMessage() throws MimeException {
+        try {
+            handler.endDocument();
+        } catch (SAXException saxFailure) {
+            throw new MimeException(saxFailure);
+        }
+    }
+
+    /**
+     * Clears {@code inPart}, so that {@link #field(Field)} processes
+     * headers again.
+     */
+    public void endMultipart() throws MimeException {
+        inPart = false;
+    }
+
+    /**
+     * No-op: the epilogue stream is not read.
+     */
+    public void epilogue(InputStream is) throws MimeException, IOException {
+    }
+
+    /**
+     * Records selected message headers in the metadata.
+     * <p>
+     * Handles From, Subject, To, Cc, Bcc and Date (names matched
+     * case-insensitively); every other header is parsed but otherwise ignored.
+     * Nothing is done while {@code inPart} is set, i.e. between
+     * {@link #startMultipart(BodyDescriptor)} and {@link #endMultipart()}.
+     * A {@link RuntimeException} raised while handling a header is rethrown
+     * only when strict parsing is enabled; otherwise the header is dropped.
+     *
+     * @param field the header field reported by mime4j
+     * @see Field
+     */
+    public void field(Field field) throws MimeException {
+        if (inPart) {
+            return;
+        }
+
+        try {
+            String headerName = field.getName();
+            ParsedField parsed = LenientFieldParser.getParser().parse(
+                    field, DecodeMonitor.SILENT);
+            if (headerName.equalsIgnoreCase("From")) {
+                recordSenders(field, (MailboxListField) parsed);
+            } else if (headerName.equalsIgnoreCase("Subject")) {
+                String subject = ((UnstructuredField) parsed).getValue();
+                metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE, subject);
+            } else if (headerName.equalsIgnoreCase("To")) {
+                processAddressList(parsed, "To:", Metadata.MESSAGE_TO);
+            } else if (headerName.equalsIgnoreCase("CC")) {
+                processAddressList(parsed, "Cc:", Metadata.MESSAGE_CC);
+            } else if (headerName.equalsIgnoreCase("BCC")) {
+                processAddressList(parsed, "Bcc:", Metadata.MESSAGE_BCC);
+            } else if (headerName.equalsIgnoreCase("Date")) {
+                recordSentDate(field, (DateTimeField) parsed);
+            }
+        } catch (RuntimeException failure) {
+            if (strictParsing) {
+                throw failure;
+            }
+        }
+    }
+
+    /**
+     * Adds every sender of a From header as both message-from and creator.
+     * When the header did not parse into a mailbox list, the raw header text
+     * is used instead, minus the "From:" prefix and one enclosing angle
+     * bracket on either side.
+     */
+    private void recordSenders(Field field, MailboxListField fromField) {
+        MailboxList senders = fromField.getMailboxList();
+        if (fromField.isValidField() && senders != null) {
+            for (Address sender : senders) {
+                String display = getDisplayString(sender);
+                metadata.add(Metadata.MESSAGE_FROM, display);
+                metadata.add(TikaCoreProperties.CREATOR, display);
+            }
+            return;
+        }
+
+        String rawSender = stripOutFieldPrefix(field, "From:");
+        if (rawSender.startsWith("<")) {
+            rawSender = rawSender.substring(1);
+        }
+        if (rawSender.endsWith(">")) {
+            rawSender = rawSender.substring(0, rawSender.length() - 1);
+        }
+        metadata.add(Metadata.MESSAGE_FROM, rawSender);
+        metadata.add(TikaCoreProperties.CREATOR, rawSender);
+    }
+
+    /**
+     * Stores the Date header as the creation date, falling back to the
+     * alternate date formats when mime4j could not produce a date.
+     */
+    private void recordSentDate(Field field, DateTimeField dateField) {
+        Date sent = dateField.getDate();
+        if (sent == null) {
+            sent = tryOtherDateFormats(field.getBody());
+        }
+        metadata.set(TikaCoreProperties.CREATED, sent);
+    }
+
+    /**
+     * Tries to parse a date string with each of the alternate formats in turn.
+     * <p>
+     * Before parsing, the text is cleaned: whitespace runs are collapsed to a
+     * single space, the result is trimmed, commas are removed, a trailing
+     * hours-only UTC/GMT offset gets ":00" minutes, and a space is inserted
+     * before the first am/pm marker that directly follows a digit.
+     * The method is synchronized because the shared formats are reused
+     * across calls.
+     *
+     * @param text the raw date text, may be {@code null}
+     * @return the first successfully parsed date, or {@code null} when the
+     *         text is {@code null} or no format matches
+     */
+    private static synchronized Date tryOtherDateFormats(String text) {
+        if (text == null) {
+            return null;
+        }
+
+        // collapse whitespace and trim, then drop commas
+        String cleaned = text.replaceAll("\\s+", " ").trim().replaceAll(",", "");
+
+        // replaceFirst leaves the text unchanged when there is no match
+        cleaned = GENERAL_TIME_ZONE_NO_MINUTES_PATTERN.matcher(cleaned)
+                .replaceFirst("GMT$1$2:00");
+        cleaned = AM_PM.matcher(cleaned).replaceFirst("$1 $2");
+
+        for (DateFormat candidate : ALTERNATE_DATE_FORMATS) {
+            try {
+                return candidate.parse(cleaned);
+            } catch (ParseException ignored) {
+                // this layout does not fit; move on to the next one
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Adds every address of an address-list header (To, Cc, Bcc) to the
+     * metadata under {@code metadataField}.
+     * <p>
+     * When the field is valid and yields an address list, each address is
+     * added in display form. Otherwise the raw header text, minus its
+     * prefix, is split on commas and each trimmed piece is added. A valid
+     * field without an address list also takes the raw fallback, in line
+     * with the null check applied to the From header's mailbox list.
+     *
+     * @param field           the parsed header; must be an {@link AddressListField}
+     * @param addressListType the header prefix including the colon, e.g. "To:"
+     * @param metadataField   the metadata key to add the addresses to
+     */
+    private void processAddressList(ParsedField field, String addressListType,
+                                    String metadataField) throws MimeException {
+        AddressListField listField = (AddressListField) field;
+        // only ask for the list when the field parsed cleanly
+        AddressList addresses =
+                listField.isValidField() ? listField.getAddressList() : null;
+        if (addresses != null) {
+            for (Address address : addresses) {
+                metadata.add(metadataField, getDisplayString(address));
+            }
+        } else {
+            String rawList = stripOutFieldPrefix(field, addressListType);
+            for (String entry : rawList.split(",")) {
+                metadata.add(metadataField, entry.trim());
+            }
+        }
+    }
+
+    /**
+     * Renders an address for the metadata.
+     * <p>
+     * A {@link Mailbox} with a non-empty name becomes
+     * {@code "decoded name <address>"}; a mailbox without a name becomes just
+     * its address; any other kind of address uses its {@code toString()}.
+     */
+    private String getDisplayString(Address address) {
+        if (!(address instanceof Mailbox)) {
+            return address.toString();
+        }
+
+        Mailbox box = (Mailbox) address;
+        String displayName = box.getName();
+        if (displayName == null || displayName.length() == 0) {
+            return box.getAddress();
+        }
+
+        // the name may contain encoded words; decode them before display
+        String decoded = DecoderUtil.decodeEncodedWords(displayName, DecodeMonitor.SILENT);
+        return decoded + " <" + box.getAddress() + ">";
+    }
+
+    /**
+     * No-op: the preamble stream is not read.
+     */
+    public void preamble(InputStream is) throws MimeException, IOException {
+    }
+
+    /**
+     * No-op: the raw stream is not read.
+     */
+    public void raw(InputStream is) throws MimeException, IOException {
+    }
+
+    /**
+     * Opens a {@code div} with class {@code email-entry} and a {@code p}
+     * inside it; both are closed again by {@link #endBodyPart()}.
+     *
+     * @throws MimeException wrapping any {@link SAXException} from the output handler
+     */
+    public void startBodyPart() throws MimeException {
+        try {
+            // outer container first, then the paragraph inside it
+            handler.startElement("div", "class", "email-entry");
+            handler.startElement("p");
+        } catch (SAXException saxFailure) {
+            throw new MimeException(saxFailure);
+        }
+    }
+
+    /**
+     * No-op: nothing is done when a header block starts.
+     */
+    public void startHeader() throws MimeException {
+        // no work needed here; headers are handled one by one in field(Field)
+
+    }
+
+    /**
+     * Sets {@code inPart}; from then on {@link #field(Field)} returns without
+     * processing headers until {@link #endMultipart()} clears the flag.
+     */
+    public void startMultipart(BodyDescriptor descr) throws MimeException {
+        inPart = true;
+    }
+
+    /**
+     * Returns the raw header text with its leading prefix and any spaces that
+     * follow the prefix removed.
+     * <p>
+     * The prefix is skipped by length only: the first
+     * {@code fieldname.length()} characters of the raw header are dropped
+     * without being compared to {@code fieldname}. A header with no value, or
+     * with nothing but spaces after the prefix, yields an empty string rather
+     * than running past the end of the text.
+     *
+     * @param field     the header whose raw text is used
+     * @param fieldname the prefix to skip, including the colon, e.g. "From:"
+     * @return the remainder of the raw header, possibly empty
+     */
+    private String stripOutFieldPrefix(Field field, String fieldname) {
+        String rawHeader = field.getRaw().toString();
+        // never start beyond the end of a header shorter than the prefix
+        int start = Math.min(fieldname.length(), rawHeader.length());
+        while (start < rawHeader.length() && rawHeader.charAt(start) == ' ') {
+            start++;
+        }
+        return rawHeader.substring(start);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
index 9ac02a7..6299d3f 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
@@ -1,95 +1,95 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mail;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.james.mime4j.MimeException;
-import org.apache.james.mime4j.parser.MimeStreamParser;
-import org.apache.james.mime4j.stream.MimeConfig;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Uses apache-mime4j to parse emails. Each part is treated with the
- * corresponding parser and displayed within elements.
- * <p/>
- * A {@link MimeEntityConfig} object can be passed in the parsing context
- * to better control the parsing process.
- *
- * @author jnioche@digitalpebble.com
- */
-public class RFC822Parser extends AbstractParser {
-    /**
-     * Serial version UID
-     */
-    private static final long serialVersionUID = -5504243905998074168L;
-
-    private static final Set<MediaType> SUPPORTED_TYPES = Collections
-            .singleton(MediaType.parse("message/rfc822"));
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-    public void parse(InputStream stream, ContentHandler handler,
-                      Metadata metadata, ParseContext context) throws IOException,
-            SAXException, TikaException {
-        // Get the mime4j configuration, or use a default one
-        MimeConfig config = new MimeConfig();
-        config.setMaxLineLen(100000);
-        config.setMaxHeaderLen(100000); // max length of any individual header
-        config = context.get(MimeConfig.class, config);
-
-        MimeStreamParser parser = new MimeStreamParser(config);
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-
-        MailContentHandler mch = new MailContentHandler(
-                xhtml, metadata, context, config.isStrictParsing());
-        parser.setContentHandler(mch);
-        parser.setContentDecoding(true);
-        
-        TikaInputStream tstream = TikaInputStream.get(stream);
-        try {
-            parser.parse(tstream);
-        } catch (IOException e) {
-            tstream.throwIfCauseOf(e);
-            throw new TikaException("Failed to parse an email message", e);
-        } catch (MimeException e) {
-            // Unwrap the exception in case it was not thrown by mime4j
-            Throwable cause = e.getCause();
-            if (cause instanceof TikaException) {
-                throw (TikaException) cause;
-            } else if (cause instanceof SAXException) {
-                throw (SAXException) cause;
-            } else {
-                throw new TikaException("Failed to parse an email message", e);
-            }
-        }
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mail;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.james.mime4j.MimeException;
+import org.apache.james.mime4j.parser.MimeStreamParser;
+import org.apache.james.mime4j.stream.MimeConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Uses apache-mime4j to parse emails. Each part is treated with the
+ * corresponding parser and displayed within elements.
+ * <p/>
+ * A {@link MimeEntityConfig} object can be passed in the parsing context
+ * to better control the parsing process.
+ *
+ * @author jnioche@digitalpebble.com
+ */
+public class RFC822Parser extends AbstractParser {
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -5504243905998074168L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections
+            .singleton(MediaType.parse("message/rfc822"));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(InputStream stream, ContentHandler handler,
+                      Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+        // Get the mime4j configuration, or use a default one
+        MimeConfig config = new MimeConfig();
+        config.setMaxLineLen(100000);
+        config.setMaxHeaderLen(100000); // max length of any individual header
+        config = context.get(MimeConfig.class, config);
+
+        MimeStreamParser parser = new MimeStreamParser(config);
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
+        MailContentHandler mch = new MailContentHandler(
+                xhtml, metadata, context, config.isStrictParsing());
+        parser.setContentHandler(mch);
+        parser.setContentDecoding(true);
+        
+        TikaInputStream tstream = TikaInputStream.get(stream);
+        try {
+            parser.parse(tstream);
+        } catch (IOException e) {
+            tstream.throwIfCauseOf(e);
+            throw new TikaException("Failed to parse an email message", e);
+        } catch (MimeException e) {
+            // Unwrap the exception in case it was not thrown by mime4j
+            Throwable cause = e.getCause();
+            if (cause instanceof TikaException) {
+                throw (TikaException) cause;
+            } else if (cause instanceof SAXException) {
+                throw (SAXException) cause;
+            } else {
+                throw new TikaException("Failed to parse an email message", e);
+            }
+        }
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java
index 5be4b0b..cc10dd2 100644
--- a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java
+++ b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java
@@ -1,75 +1,75 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.feed;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class FeedParserTest {
-    @Test
-    public void testRSSParser() throws Exception {
-        try (InputStream input = FeedParserTest.class.getResourceAsStream(
-                "/test-documents/rsstest.rss")) {
-            Metadata metadata = new Metadata();
-            ContentHandler handler = new BodyContentHandler();
-            ParseContext context = new ParseContext();
-
-            new FeedParser().parse(input, handler, metadata, context);
-
-            String content = handler.toString();
-            assertFalse(content == null);
-
-            assertEquals("Sample RSS File for Junit test",
-                    metadata.get(TikaCoreProperties.DESCRIPTION));
-            assertEquals("TestChannel", metadata.get(TikaCoreProperties.TITLE));
-
-            // TODO find a way of testing the paragraphs and anchors
-        }
-    }
-
-
-    @Test
-    public void testAtomParser() throws Exception {
-        try (InputStream input = FeedParserTest.class.getResourceAsStream(
-                "/test-documents/testATOM.atom")) {
-            Metadata metadata = new Metadata();
-            ContentHandler handler = new BodyContentHandler();
-            ParseContext context = new ParseContext();
-
-            new FeedParser().parse(input, handler, metadata, context);
-
-            String content = handler.toString();
-            assertFalse(content == null);
-
-            assertEquals("Sample Atom File for Junit test",
-                    metadata.get(TikaCoreProperties.DESCRIPTION));
-            assertEquals("Test Atom Feed", metadata.get(TikaCoreProperties.TITLE));
-
-            // TODO Check some more
-        }
-    }
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.feed;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class FeedParserTest {
+    @Test
+    public void testRSSParser() throws Exception {
+        try (InputStream input = FeedParserTest.class.getResourceAsStream(
+                "/test-documents/rsstest.rss")) {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            ParseContext context = new ParseContext();
+
+            new FeedParser().parse(input, handler, metadata, context);
+
+            String content = handler.toString();
+            assertFalse(content == null);
+
+            assertEquals("Sample RSS File for Junit test",
+                    metadata.get(TikaCoreProperties.DESCRIPTION));
+            assertEquals("TestChannel", metadata.get(TikaCoreProperties.TITLE));
+
+            // TODO find a way of testing the paragraphs and anchors
+        }
+    }
+
+
+    @Test
+    public void testAtomParser() throws Exception {
+        try (InputStream input = FeedParserTest.class.getResourceAsStream(
+                "/test-documents/testATOM.atom")) {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            ParseContext context = new ParseContext();
+
+            new FeedParser().parse(input, handler, metadata, context);
+
+            String content = handler.toString();
+            assertFalse(content == null);
+
+            assertEquals("Sample Atom File for Junit test",
+                    metadata.get(TikaCoreProperties.DESCRIPTION));
+            assertEquals("Test Atom Feed", metadata.get(TikaCoreProperties.TITLE));
+
+            // TODO Check some more
+        }
+    }
+
+}

[16/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
index 3d28b35..25e567f 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
@@ -1,382 +1,382 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-
-import java.util.List;
-
-import org.apache.tika.extractor.ContainerExtractor;
-import org.apache.tika.extractor.ParserContainerExtractor;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaMetadataKeys;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.RecursiveParserWrapper;
-import org.junit.Test;
-
-/**
- * Tests that the various POI powered parsers are
- * able to extract their embedded contents.
- */
-public class POIContainerExtractionTest extends AbstractPOIContainerExtractionTest {
-
-    /**
-     * For office files which don't have anything embedded in them
-     */
-    @Test
-    public void testWithoutEmbedded() throws Exception {
-        ContainerExtractor extractor = new ParserContainerExtractor();
-
-        String[] files = new String[]{
-                "testEXCEL.xls", "testWORD.doc", "testPPT.ppt",
-                "testVISIO.vsd", "test-outlook.msg"
-        };
-        for (String file : files) {
-            // Process it without recursing
-            TrackingHandler handler = process(file, extractor, false);
-
-            // Won't have fired
-            assertEquals(0, handler.filenames.size());
-            assertEquals(0, handler.mediaTypes.size());
-
-            // Ditto with recursing
-            handler = process(file, extractor, true);
-            assertEquals(0, handler.filenames.size());
-            assertEquals(0, handler.mediaTypes.size());
-        }
-    }
-
-    /**
-     * Office files with embedded images, but no other
-     * office files in them
-     */
-    @Test
-    public void testEmbeddedImages() throws Exception {
-        ContainerExtractor extractor = new ParserContainerExtractor();
-        TrackingHandler handler;
-
-        // Excel with 1 image
-        handler = process("testEXCEL_1img.xls", extractor, false);
-        assertEquals(1, handler.filenames.size());
-        assertEquals(1, handler.mediaTypes.size());
-
-        assertEquals(null, handler.filenames.get(0));
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
-
-
-        // PowerPoint with 2 images + sound
-        // TODO
-
-
-        // Word with 1 image
-        handler = process("testWORD_1img.doc", extractor, false);
-        assertEquals(1, handler.filenames.size());
-        assertEquals(1, handler.mediaTypes.size());
-
-        assertEquals("image1.png", handler.filenames.get(0));
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
-
-
-        // Word with 3 images
-        handler = process("testWORD_3imgs.doc", extractor, false);
-        assertEquals(3, handler.filenames.size());
-        assertEquals(3, handler.mediaTypes.size());
-
-        assertEquals("image1.png", handler.filenames.get(0));
-        assertEquals("image2.jpg", handler.filenames.get(1));
-        assertEquals("image3.png", handler.filenames.get(2));
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
-        assertEquals(TYPE_JPG, handler.mediaTypes.get(1));
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
-    }
-
-    /**
-     * Office files which have other office files
-     * embedded into them. The embedded office files
-     * will sometimes have images in them.
-     * <p/>
-     * eg xls
-     * -> word
-     * -> image
-     * -> image
-     * -> powerpoint
-     * -> excel
-     * -> image
-     */
-    @Test
-    public void testEmbeddedOfficeFiles() throws Exception {
-        ContainerExtractor extractor = new ParserContainerExtractor();
-        TrackingHandler handler;
-
-
-        // Excel with a word doc and a powerpoint doc, both of which have images in them
-        // Without recursion, should see both documents + the images
-        handler = process("testEXCEL_embeded.xls", extractor, false);
-        assertEquals(5, handler.filenames.size());
-        assertEquals(5, handler.mediaTypes.size());
-
-        // We don't know their filenames
-        assertEquals(null, handler.filenames.get(0));
-        assertEquals(null, handler.filenames.get(1));
-        assertEquals(null, handler.filenames.get(2));
-        assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
-        assertEquals("MBD00032A24.doc", handler.filenames.get(4));
-        // But we do know their types
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
-        assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded office doc
-        assertEquals(TYPE_DOC, handler.mediaTypes.get(4)); // Embedded office doc
-
-
-        // With recursion, should get the images embedded in the office files too
-        handler = process("testEXCEL_embeded.xls", extractor, true);
-        assertEquals(17, handler.filenames.size());
-        assertEquals(17, handler.mediaTypes.size());
-
-        assertEquals(null, handler.filenames.get(0));
-        assertEquals(null, handler.filenames.get(1));
-        assertEquals(null, handler.filenames.get(2));
-        assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
-        assertEquals("1", handler.filenames.get(4));
-        assertEquals(null, handler.filenames.get(5));
-        assertEquals("2", handler.filenames.get(6));
-        assertEquals("image1.png", handler.filenames.get(7));
-        assertEquals("image2.jpg", handler.filenames.get(8));
-        assertEquals("image3.png", handler.filenames.get(9));
-        assertEquals("image1.png", handler.filenames.get(16));
-
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
-        assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded presentation
-        assertEquals(TYPE_XLS, handler.mediaTypes.get(4)); // Embedded XLS
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
-        assertEquals(TYPE_DOC, handler.mediaTypes.get(6)); // Embedded office doc
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // Embedded image
-        assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // Embedded image
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
-        assertEquals(TYPE_DOC, handler.mediaTypes.get(15)); // Embedded office doc
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(16)); // Embedded image
-
-        // Word with .docx, powerpoint and excel
-        handler = process("testWORD_embeded.doc", extractor, false);
-        assertEquals(9, handler.filenames.size());
-        assertEquals(9, handler.mediaTypes.size());
-
-        // Filenames are a bit iffy...
-        // Should really be 3*embedded pictures then 3*icons then embedded docs
-        assertEquals("image1.emf", handler.filenames.get(0));
-        assertEquals("image4.png", handler.filenames.get(1));
-        assertEquals("image5.jpg", handler.filenames.get(2));
-        assertEquals("image6.png", handler.filenames.get(3));
-        assertEquals("image2.emf", handler.filenames.get(4));
-        assertEquals("image3.emf", handler.filenames.get(5));
-        assertEquals(null, handler.filenames.get(6));
-        assertEquals("_1345471035.ppt", handler.filenames.get(7));
-        assertEquals("_1345470949.xls", handler.filenames.get(8));
-
-        // But we do know their types
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc?
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image - logo
-        assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image - safe
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - try
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded office doc?
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded office doc?
-        assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office doc
-        assertEquals(TYPE_PPT, handler.mediaTypes.get(7)); // Embedded office doc
-        assertEquals(TYPE_XLS, handler.mediaTypes.get(8)); // Embedded office doc
-
-
-        // With recursion, should get their images too
-        handler = process("testWORD_embeded.doc", extractor, true);
-        assertEquals(16, handler.filenames.size());
-        assertEquals(16, handler.mediaTypes.size());
-
-        // We don't know their filenames, except for doc images + docx
-        assertEquals("image1.emf", handler.filenames.get(0));
-        assertEquals("image4.png", handler.filenames.get(1));
-        assertEquals("image5.jpg", handler.filenames.get(2));
-        assertEquals("image6.png", handler.filenames.get(3));
-        assertEquals("image2.emf", handler.filenames.get(4));
-        assertEquals("image3.emf", handler.filenames.get(5));
-        assertEquals(null, handler.filenames.get(6));
-        assertEquals("image2.png", handler.filenames.get(7));
-        assertEquals("image3.jpeg", handler.filenames.get(8));
-        assertEquals("image4.png", handler.filenames.get(9));
-        for (int i = 11; i < 14; i++) {
-            assertNull(handler.filenames.get(i));
-        }
-        // But we do know their types
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image - logo
-        assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image - safe
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - try
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded office doc
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded office doc
-        assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office doc
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(7));  //    PNG inside .docx
-        assertEquals(TYPE_JPG, handler.mediaTypes.get(8));  //    JPG inside .docx
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(9));  //    PNG inside .docx
-        assertEquals(TYPE_PPT, handler.mediaTypes.get(10)); // Embedded office doc
-        assertEquals(TYPE_XLS, handler.mediaTypes.get(14)); // Embedded office doc
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(15)); //    PNG inside .xls
-
-
-        // PowerPoint with excel and word
-        handler = process("testPPT_embeded.ppt", extractor, false);
-        assertEquals(7, handler.filenames.size());
-        assertEquals(7, handler.mediaTypes.size());
-
-        // We don't get all that helpful filenames
-        assertEquals("1", handler.filenames.get(0));
-        assertEquals("2", handler.filenames.get(1));
-        assertEquals(null, handler.filenames.get(2));
-        assertEquals(null, handler.filenames.get(3));
-        assertEquals(null, handler.filenames.get(4));
-        assertEquals(null, handler.filenames.get(5));
-        assertEquals(null, handler.filenames.get(6));
-        // But we do know their types
-        assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office doc
-        assertEquals(TYPE_DOC, handler.mediaTypes.get(1)); // Embedded office doc
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(2)); // Icon of embedded office doc
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(3)); // Icon of embedded office doc
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(4)); // Embedded image
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(6)); // Embedded image
-
-        // Run again on PowerPoint but with recursion
-        handler = process("testPPT_embeded.ppt", extractor, true);
-        assertEquals(11, handler.filenames.size());
-        assertEquals(11, handler.mediaTypes.size());
-
-        assertEquals("1", handler.filenames.get(0));
-        assertEquals(null, handler.filenames.get(1));
-        assertEquals("2", handler.filenames.get(2));
-        assertEquals("image1.png", handler.filenames.get(3));
-        assertEquals("image2.jpg", handler.filenames.get(4));
-        assertEquals("image3.png", handler.filenames.get(5));
-        assertEquals(null, handler.filenames.get(6));
-        assertEquals(null, handler.filenames.get(7));
-        assertEquals(null, handler.filenames.get(8));
-        assertEquals(null, handler.filenames.get(9));
-        assertEquals(null, handler.filenames.get(10));
-
-        assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office doc
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); //    PNG inside .xls
-        assertEquals(TYPE_DOC, handler.mediaTypes.get(2)); // Embedded office doc
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(3));  //    PNG inside .docx
-        assertEquals(TYPE_JPG, handler.mediaTypes.get(4));  //    JPG inside .docx
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(5));  //    PNG inside .docx
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(6)); // Icon of embedded office doc
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(7)); // Icon of embedded office doc
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(8)); // Embedded image
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(10)); // Embedded image
-
-
-        // Word, with a non-office file (PDF)
-        handler = process("testWORD_embedded_pdf.doc", extractor, true);
-        assertEquals(2, handler.filenames.size());
-        assertEquals(2, handler.mediaTypes.size());
-
-        assertEquals("image1.emf", handler.filenames.get(0));
-        assertEquals("_1402837031.pdf", handler.filenames.get(1));
-
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded pdf
-        assertEquals(TYPE_PDF, handler.mediaTypes.get(1)); // The embedded PDF itself
-
-
-        // Outlook with a text file and a word document
-        handler = process("testMSG_att_doc.msg", extractor, true);
-        assertEquals(2, handler.filenames.size());
-        assertEquals(2, handler.mediaTypes.size());
-
-        assertEquals("test-unicode.doc", handler.filenames.get(0));
-        assertEquals(TYPE_DOC, handler.mediaTypes.get(0));
-
-        assertEquals("pj1.txt", handler.filenames.get(1));
-        assertEquals(TYPE_TXT, handler.mediaTypes.get(1));
-
-
-        // Outlook with a pdf and another outlook message
-        handler = process("testMSG_att_msg.msg", extractor, true);
-        assertEquals(2, handler.filenames.size());
-        assertEquals(2, handler.mediaTypes.size());
-
-        assertEquals("__substg1.0_3701000D.msg", handler.filenames.get(0));
-        assertEquals(TYPE_MSG, handler.mediaTypes.get(0));
-
-        assertEquals("smbprn.00009008.KdcPjl.pdf", handler.filenames.get(1));
-        assertEquals(TYPE_PDF, handler.mediaTypes.get(1));
-    }
-
-    @Test
-    public void testEmbeddedOfficeFilesXML() throws Exception {
-        ContainerExtractor extractor = new ParserContainerExtractor();
-        TrackingHandler handler;
-
-        handler = process("EmbeddedDocument.docx", extractor, false);
-        assertTrue(handler.filenames.contains("Microsoft_Office_Excel_97-2003_Worksheet1.bin"));
-        assertEquals(2, handler.filenames.size());
-    }
-
-    @Test
-    public void testPowerpointImages() throws Exception {
-        ContainerExtractor extractor = new ParserContainerExtractor();
-        TrackingHandler handler;
-
-        handler = process("pictures.ppt", extractor, false);
-        assertTrue(handler.mediaTypes.contains(new MediaType("image", "jpeg")));
-        assertTrue(handler.mediaTypes.contains(new MediaType("image", "png")));
-    }
-
-    @Test
-    public void testEmbeddedStorageId() throws Exception {
-
-        List<Metadata> list = getRecursiveJson("testWORD_embeded.doc");
-        //.docx
-        assertEquals("{F4754C9B-64F5-4B40-8AF4-679732AC0607}",
-                list.get(10).get(TikaMetadataKeys.EMBEDDED_STORAGE_CLASS_ID));
-        //_1345471035.ppt
-        assertEquals("{64818D10-4F9B-11CF-86EA-00AA00B929E8}",
-                list.get(14).get(TikaMetadataKeys.EMBEDDED_STORAGE_CLASS_ID));
-        //_1345470949.xls
-        assertEquals("{00020820-0000-0000-C000-000000000046}",
-                list.get(16).get(TikaMetadataKeys.EMBEDDED_STORAGE_CLASS_ID));
-
-    }
-
-    @Test
-    public void testEmbeddedGraphChart() throws Exception {
-        //doc converts a chart to a actual xls file
-        //so we only need to look in ppt and xls
-        for (String suffix : new String[]{"ppt", "xls"}) {
-            List<Metadata> list = getRecursiveJson("testMSChart-govdocs-428996."+suffix);
-            boolean found = false;
-            for (Metadata m : list) {
-                if (m.get(Metadata.CONTENT_TYPE).equals(POIFSContainerDetector.MS_GRAPH_CHART.toString())) {
-                    found = true;
-                }
-                assertNull(m.get(RecursiveParserWrapper.EMBEDDED_EXCEPTION));
-            }
-            assertTrue("didn't find chart in "+suffix, found);
-        }
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.util.List;
+
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.junit.Test;
+
+/**
+ * Tests that the various POI powered parsers are
+ * able to extract their embedded contents.
+ */
+public class POIContainerExtractionTest extends AbstractPOIContainerExtractionTest {
+
+    /**
+     * For office files which don't have anything embedded in them
+     */
+    @Test
+    public void testWithoutEmbedded() throws Exception {
+        ContainerExtractor extractor = new ParserContainerExtractor();
+
+        String[] files = new String[]{
+                "testEXCEL.xls", "testWORD.doc", "testPPT.ppt",
+                "testVISIO.vsd", "test-outlook.msg"
+        };
+        for (String file : files) {
+            // Process it without recursing
+            TrackingHandler handler = process(file, extractor, false);
+
+            // Won't have fired
+            assertEquals(0, handler.filenames.size());
+            assertEquals(0, handler.mediaTypes.size());
+
+            // Ditto with recursing
+            handler = process(file, extractor, true);
+            assertEquals(0, handler.filenames.size());
+            assertEquals(0, handler.mediaTypes.size());
+        }
+    }
+
+    /**
+     * Office files with embedded images, but no other
+     * office files in them
+     */
+    @Test
+    public void testEmbeddedImages() throws Exception {
+        ContainerExtractor extractor = new ParserContainerExtractor();
+        TrackingHandler handler;
+
+        // Excel with 1 image
+        handler = process("testEXCEL_1img.xls", extractor, false);
+        assertEquals(1, handler.filenames.size());
+        assertEquals(1, handler.mediaTypes.size());
+
+        assertEquals(null, handler.filenames.get(0));
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+
+
+        // PowerPoint with 2 images + sound
+        // TODO
+
+
+        // Word with 1 image
+        handler = process("testWORD_1img.doc", extractor, false);
+        assertEquals(1, handler.filenames.size());
+        assertEquals(1, handler.mediaTypes.size());
+
+        assertEquals("image1.png", handler.filenames.get(0));
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+
+
+        // Word with 3 images
+        handler = process("testWORD_3imgs.doc", extractor, false);
+        assertEquals(3, handler.filenames.size());
+        assertEquals(3, handler.mediaTypes.size());
+
+        assertEquals("image1.png", handler.filenames.get(0));
+        assertEquals("image2.jpg", handler.filenames.get(1));
+        assertEquals("image3.png", handler.filenames.get(2));
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(1));
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
+    }
+
+    /**
+     * Office files which have other office files
+     * embedded into them. The embedded office files
+     * will sometimes have images in them.
+     * <p/>
+     * eg xls
+     * -> word
+     * -> image
+     * -> image
+     * -> powerpoint
+     * -> excel
+     * -> image
+     */
+    @Test
+    public void testEmbeddedOfficeFiles() throws Exception {
+        ContainerExtractor extractor = new ParserContainerExtractor();
+        TrackingHandler handler;
+
+
+        // Excel with a word doc and a powerpoint doc, both of which have images in them
+        // Without recursion, should see both documents + the images
+        handler = process("testEXCEL_embeded.xls", extractor, false);
+        assertEquals(5, handler.filenames.size());
+        assertEquals(5, handler.mediaTypes.size());
+
+        // We don't know their filenames
+        assertEquals(null, handler.filenames.get(0));
+        assertEquals(null, handler.filenames.get(1));
+        assertEquals(null, handler.filenames.get(2));
+        assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
+        assertEquals("MBD00032A24.doc", handler.filenames.get(4));
+        // But we do know their types
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
+        assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded office doc
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(4)); // Embedded office doc
+
+
+        // With recursion, should get the images embedded in the office files too
+        handler = process("testEXCEL_embeded.xls", extractor, true);
+        assertEquals(17, handler.filenames.size());
+        assertEquals(17, handler.mediaTypes.size());
+
+        assertEquals(null, handler.filenames.get(0));
+        assertEquals(null, handler.filenames.get(1));
+        assertEquals(null, handler.filenames.get(2));
+        assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
+        assertEquals("1", handler.filenames.get(4));
+        assertEquals(null, handler.filenames.get(5));
+        assertEquals("2", handler.filenames.get(6));
+        assertEquals("image1.png", handler.filenames.get(7));
+        assertEquals("image2.jpg", handler.filenames.get(8));
+        assertEquals("image3.png", handler.filenames.get(9));
+        assertEquals("image1.png", handler.filenames.get(16));
+
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
+        assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded presentation
+        assertEquals(TYPE_XLS, handler.mediaTypes.get(4)); // Embedded XLS
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(6)); // Embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // Embedded image
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(15)); // Embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(16)); // Embedded image
+
+        // Word with .docx, powerpoint and excel
+        handler = process("testWORD_embeded.doc", extractor, false);
+        assertEquals(9, handler.filenames.size());
+        assertEquals(9, handler.mediaTypes.size());
+
+        // Filenames are a bit iffy...
+        // Should really be 3*embedded pictures then 3*icons then embedded docs
+        assertEquals("image1.emf", handler.filenames.get(0));
+        assertEquals("image4.png", handler.filenames.get(1));
+        assertEquals("image5.jpg", handler.filenames.get(2));
+        assertEquals("image6.png", handler.filenames.get(3));
+        assertEquals("image2.emf", handler.filenames.get(4));
+        assertEquals("image3.emf", handler.filenames.get(5));
+        assertEquals(null, handler.filenames.get(6));
+        assertEquals("_1345471035.ppt", handler.filenames.get(7));
+        assertEquals("_1345470949.xls", handler.filenames.get(8));
+
+        // But we do know their types
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc?
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image - logo
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image - safe
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - try
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded office doc?
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded office doc?
+        assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office doc
+        assertEquals(TYPE_PPT, handler.mediaTypes.get(7)); // Embedded office doc
+        assertEquals(TYPE_XLS, handler.mediaTypes.get(8)); // Embedded office doc
+
+
+        // With recursion, should get their images too
+        handler = process("testWORD_embeded.doc", extractor, true);
+        assertEquals(16, handler.filenames.size());
+        assertEquals(16, handler.mediaTypes.size());
+
+        // We don't know their filenames, except for doc images + docx
+        assertEquals("image1.emf", handler.filenames.get(0));
+        assertEquals("image4.png", handler.filenames.get(1));
+        assertEquals("image5.jpg", handler.filenames.get(2));
+        assertEquals("image6.png", handler.filenames.get(3));
+        assertEquals("image2.emf", handler.filenames.get(4));
+        assertEquals("image3.emf", handler.filenames.get(5));
+        assertEquals(null, handler.filenames.get(6));
+        assertEquals("image2.png", handler.filenames.get(7));
+        assertEquals("image3.jpeg", handler.filenames.get(8));
+        assertEquals("image4.png", handler.filenames.get(9));
+        for (int i = 11; i < 14; i++) {
+            assertNull(handler.filenames.get(i));
+        }
+        // But we do know their types
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image - logo
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image - safe
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - try
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded office doc
+        assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(7));  //    PNG inside .docx
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(8));  //    JPG inside .docx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(9));  //    PNG inside .docx
+        assertEquals(TYPE_PPT, handler.mediaTypes.get(10)); // Embedded office doc
+        assertEquals(TYPE_XLS, handler.mediaTypes.get(14)); // Embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(15)); //    PNG inside .xls
+
+
+        // PowerPoint with excel and word
+        handler = process("testPPT_embeded.ppt", extractor, false);
+        assertEquals(7, handler.filenames.size());
+        assertEquals(7, handler.mediaTypes.size());
+
+        // We don't get all that helpful filenames
+        assertEquals("1", handler.filenames.get(0));
+        assertEquals("2", handler.filenames.get(1));
+        assertEquals(null, handler.filenames.get(2));
+        assertEquals(null, handler.filenames.get(3));
+        assertEquals(null, handler.filenames.get(4));
+        assertEquals(null, handler.filenames.get(5));
+        assertEquals(null, handler.filenames.get(6));
+        // But we do know their types
+        assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office doc
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(1)); // Embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(2)); // Icon of embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(3)); // Icon of embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(4)); // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(6)); // Embedded image
+
+        // Run again on PowerPoint but with recursion
+        handler = process("testPPT_embeded.ppt", extractor, true);
+        assertEquals(11, handler.filenames.size());
+        assertEquals(11, handler.mediaTypes.size());
+
+        assertEquals("1", handler.filenames.get(0));
+        assertEquals(null, handler.filenames.get(1));
+        assertEquals("2", handler.filenames.get(2));
+        assertEquals("image1.png", handler.filenames.get(3));
+        assertEquals("image2.jpg", handler.filenames.get(4));
+        assertEquals("image3.png", handler.filenames.get(5));
+        assertEquals(null, handler.filenames.get(6));
+        assertEquals(null, handler.filenames.get(7));
+        assertEquals(null, handler.filenames.get(8));
+        assertEquals(null, handler.filenames.get(9));
+        assertEquals(null, handler.filenames.get(10));
+
+        assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); //    PNG inside .xls
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(2)); // Embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(3));  //    PNG inside .docx
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(4));  //    JPG inside .docx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(5));  //    PNG inside .docx
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(6)); // Icon of embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(7)); // Icon of embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(8)); // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(10)); // Embedded image
+
+
+        // Word, with a non-office file (PDF)
+        handler = process("testWORD_embedded_pdf.doc", extractor, true);
+        assertEquals(2, handler.filenames.size());
+        assertEquals(2, handler.mediaTypes.size());
+
+        assertEquals("image1.emf", handler.filenames.get(0));
+        assertEquals("_1402837031.pdf", handler.filenames.get(1));
+
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded pdf
+        assertEquals(TYPE_PDF, handler.mediaTypes.get(1)); // The embedded PDF itself
+
+
+        // Outlook with a text file and a word document
+        handler = process("testMSG_att_doc.msg", extractor, true);
+        assertEquals(2, handler.filenames.size());
+        assertEquals(2, handler.mediaTypes.size());
+
+        assertEquals("test-unicode.doc", handler.filenames.get(0));
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(0));
+
+        assertEquals("pj1.txt", handler.filenames.get(1));
+        assertEquals(TYPE_TXT, handler.mediaTypes.get(1));
+
+
+        // Outlook with a pdf and another outlook message
+        handler = process("testMSG_att_msg.msg", extractor, true);
+        assertEquals(2, handler.filenames.size());
+        assertEquals(2, handler.mediaTypes.size());
+
+        assertEquals("__substg1.0_3701000D.msg", handler.filenames.get(0));
+        assertEquals(TYPE_MSG, handler.mediaTypes.get(0));
+
+        assertEquals("smbprn.00009008.KdcPjl.pdf", handler.filenames.get(1));
+        assertEquals(TYPE_PDF, handler.mediaTypes.get(1));
+    }
+
+    @Test
+    public void testEmbeddedOfficeFilesXML() throws Exception {
+        ContainerExtractor extractor = new ParserContainerExtractor();
+        TrackingHandler handler;
+
+        handler = process("EmbeddedDocument.docx", extractor, false);
+        assertTrue(handler.filenames.contains("Microsoft_Office_Excel_97-2003_Worksheet1.bin"));
+        assertEquals(2, handler.filenames.size());
+    }
+
+    @Test
+    public void testPowerpointImages() throws Exception {
+        ContainerExtractor extractor = new ParserContainerExtractor();
+        TrackingHandler handler;
+
+        handler = process("pictures.ppt", extractor, false);
+        assertTrue(handler.mediaTypes.contains(new MediaType("image", "jpeg")));
+        assertTrue(handler.mediaTypes.contains(new MediaType("image", "png")));
+    }
+
+    @Test
+    public void testEmbeddedStorageId() throws Exception {
+
+        List<Metadata> list = getRecursiveJson("testWORD_embeded.doc");
+        //.docx
+        assertEquals("{F4754C9B-64F5-4B40-8AF4-679732AC0607}",
+                list.get(10).get(TikaMetadataKeys.EMBEDDED_STORAGE_CLASS_ID));
+        //_1345471035.ppt
+        assertEquals("{64818D10-4F9B-11CF-86EA-00AA00B929E8}",
+                list.get(14).get(TikaMetadataKeys.EMBEDDED_STORAGE_CLASS_ID));
+        //_1345470949.xls
+        assertEquals("{00020820-0000-0000-C000-000000000046}",
+                list.get(16).get(TikaMetadataKeys.EMBEDDED_STORAGE_CLASS_ID));
+
+    }
+
+    @Test
+    public void testEmbeddedGraphChart() throws Exception {
+        //doc converts a chart to a actual xls file
+        //so we only need to look in ppt and xls
+        for (String suffix : new String[]{"ppt", "xls"}) {
+            List<Metadata> list = getRecursiveJson("testMSChart-govdocs-428996."+suffix);
+            boolean found = false;
+            for (Metadata m : list) {
+                if (m.get(Metadata.CONTENT_TYPE).equals(POIFSContainerDetector.MS_GRAPH_CHART.toString())) {
+                    found = true;
+                }
+                assertNull(m.get(RecursiveParserWrapper.EMBEDDED_EXCEPTION));
+            }
+            assertTrue("didn't find chart in "+suffix, found);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index 32d462e..79d53d2 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -1,251 +1,251 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import java.io.InputStream;
-import java.util.List;
-import java.util.Locale;
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLCore;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class PowerPointParserTest extends TikaTest {
-
-    @Test
-    public void testPowerPointParser() throws Exception {
-        try (InputStream input = PowerPointParserTest.class.getResourceAsStream(
-                "/test-documents/testPPT.ppt")) {
-            Metadata metadata = new Metadata();
-            ContentHandler handler = new BodyContentHandler();
-            new OfficeParser().parse(input, handler, metadata, new ParseContext());
-
-            assertEquals(
-                    "application/vnd.ms-powerpoint",
-                    metadata.get(Metadata.CONTENT_TYPE));
-            assertEquals("Sample Powerpoint Slide", metadata.get(TikaCoreProperties.TITLE));
-            assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
-            assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
-            String content = handler.toString();
-            assertContains("Sample Powerpoint Slide", content);
-            assertContains("Powerpoint X for Mac", content);
-        }
-    }
-
-    @Test
-    public void testVarious() throws Exception {
-        Metadata metadata = new Metadata();
-        String xml = getXML("testPPT_various.ppt", metadata).xml;
-        assertContains("<p>Footnote appears here", xml);
-        assertContains("<p>[1] This is a footnote.", xml);
-        assertContains("<p>This is the header text.</p>", xml);
-        assertContains("<p>This is the footer text.</p>", xml);
-        assertContains("<p>Here is a text box</p>", xml);
-        assertContains("<p>Bold ", xml);
-        assertContains("italic underline superscript subscript", xml);
-        assertContains("underline", xml);
-        assertContains("superscript", xml);
-        assertContains("subscript", xml);
-        assertContains("<p>Here is a citation:", xml);
-        assertContains("Figure 1 This is a caption for Figure 1", xml);
-        assertContains("(Kramer)", xml);
-        assertContains("<table><tr>\t<td>Row 1 Col 1</td>", xml);
-        assertContains("<td>Row 2 Col 2</td>\t<td>Row 2 Col 3</td></tr>", xml);
-        assertContains("<p>Row 1 column 1</p>", xml);
-        assertContains("<p>Row 2 column 2</p>", xml);
-        assertContains("<p><a href=\"http://tika.apache.org/\">This is a hyperlink</a>", xml);
-        assertContains("<p>Here is a list:", xml);
-        for(int row=1;row<=3;row++) {
-            //assertContains("�\tBullet " + row, content);
-            //assertContains("\u00b7\tBullet " + row, content);
-            assertContains("<li>Bullet " + row, xml);
-        }
-        assertContains("Here is a numbered list:", xml);
-        for(int row=1;row<=3;row++) {
-            //assertContains(row + ")\tNumber bullet " + row, content);
-            //assertContains(row + ") Number bullet " + row, content);
-            // TODO: OOXMLExtractor fails to number the bullets:
-            assertContains("<li>Number bullet " + row, xml);
-        }
-
-        for(int row=1;row<=2;row++) {
-            for(int col=1;col<=3;col++) {
-                assertContains("Row " + row + " Col " + col, xml);
-            }
-        }
-        assertContains("Keyword1 Keyword2", xml);
-        assertEquals("Keyword1 Keyword2",
-                     metadata.get(TikaCoreProperties.KEYWORDS));
-
-        assertContains("Subject is here", xml);
-        assertEquals("Subject is here",
-                     metadata.get(OfficeOpenXMLCore.SUBJECT));
-        // TODO: Remove subject in Tika 2.0
-        assertEquals("Subject is here",
-                     metadata.get(Metadata.SUBJECT));
-
-        assertContains("Suddenly some Japanese text:", xml);
-        // Special version of (GHQ)
-        assertContains("\uff08\uff27\uff28\uff31\uff09", xml);
-        // 6 other characters
-        assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f",
-                xml);
-
-        assertContains("And then some Gothic text:", xml);
-        assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A",
-                xml);
-    }
-
-    @Test
-    public void testMasterFooter() throws Exception {
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = PowerPointParserTest.class.getResourceAsStream(
-                "/test-documents/testPPT_masterFooter.ppt")) {
-            new OfficeParser().parse(stream, handler, metadata, new ParseContext());
-        }
-
-        String content = handler.toString();
-        assertContains("Master footer is here", content);
-
-        // Make sure boilerplate text didn't come through:
-        assertEquals(-1, content.indexOf("Click to edit Master"));
-
-        //TIKA-1171
-        assertEquals(-1, content.indexOf("*"));
-    }
-
-    /**
-     * TIKA-712 Master Slide Text from PPT and PPTX files
-     *  should be extracted too
-     */
-    @Test
-    public void testMasterText() throws Exception {
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = PowerPointParserTest.class.getResourceAsStream(
-                "/test-documents/testPPT_masterText.ppt")) {
-            new OfficeParser().parse(stream, handler, metadata, new ParseContext());
-        }
-
-        String content = handler.toString();
-        assertContains("Text that I added to the master slide", content);
-
-        // Make sure boilerplate text didn't come through:
-        assertEquals(-1, content.indexOf("Click to edit Master"));
-
-        //TIKA-1171
-        assertEquals(-1, content.indexOf("*"));
-    }
-
-    @Test
-    public void testMasterText2() throws Exception {
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = PowerPointParserTest.class.getResourceAsStream(
-                "/test-documents/testPPT_masterText2.ppt")) {
-            new OfficeParser().parse(stream, handler, metadata, new ParseContext());
-        }
-
-        String content = handler.toString();
-        assertContains("Text that I added to the master slide", content);
-
-        // Make sure boilerplate text didn't come through:
-        assertEquals(-1, content.indexOf("Click to edit Master"));
-        //TIKA-1171
-        assertEquals(-1, content.indexOf("*"));
-    }
-
-    /**
-     * Ensures that custom OLE2 (HPSF) properties are extracted
-     */
-    @Test
-    public void testCustomProperties() throws Exception {
-        Metadata metadata = new Metadata();
-
-        try (InputStream input = PowerPointParserTest.class.getResourceAsStream(
-                "/test-documents/testPPT_custom_props.ppt")) {
-            ContentHandler handler = new BodyContentHandler(-1);
-            ParseContext context = new ParseContext();
-            context.set(Locale.class, Locale.US);
-            new OfficeParser().parse(input, handler, metadata, context);
-        }
-
-        assertEquals("application/vnd.ms-powerpoint", metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("JOUVIN ETIENNE", metadata.get(TikaCoreProperties.CREATOR));
-        assertEquals("EJ04325S", metadata.get(TikaCoreProperties.MODIFIER));
-        assertEquals("EJ04325S", metadata.get(Metadata.LAST_AUTHOR));
-        assertEquals("2011-08-22T13:32:58Z", metadata.get(TikaCoreProperties.MODIFIED));
-        assertEquals("2011-08-22T13:32:58Z", metadata.get(Metadata.DATE));
-        assertEquals("2011-08-22T13:30:53Z", metadata.get(TikaCoreProperties.CREATED));
-        assertEquals("2011-08-22T13:30:53Z", metadata.get(Metadata.CREATION_DATE));
-        assertEquals("1", metadata.get(Office.SLIDE_COUNT));
-        assertEquals("3", metadata.get(Office.WORD_COUNT));
-        assertEquals("Test extraction properties pptx", metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("true", metadata.get("custom:myCustomBoolean"));
-        assertEquals("3", metadata.get("custom:myCustomNumber"));
-        assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
-        assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
-        assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
-    }
-
-    // TIKA-1025
-    @Test
-    public void testEmbeddedPlacedholder() throws Exception {
-        XMLResult result = getXML("testPPT_embedded2.ppt");
-        assertContains("<div class=\"embedded\" id=\"1\" />", result.xml);
-        assertContains("<div class=\"embedded\" id=\"14\" />", result.xml);
-    }
-
-    // TIKA-817
-    @Test
-    public void testAutoDatePPT() throws Exception {
-        //decision was made in POI-52367 not to generate
-        //autodate automatically.  For pptx, where value is stored,
-        //value is extracted.  For ppt, however, no date is extracted.
-        XMLResult result = getXML("testPPT_autodate.ppt");
-        assertContains(
-                "<div class=\"slide-content\"><p>Now</p>",
-                result.xml);
-    }
-
-    @Test
-    public void testCommentAuthorship() throws Exception {
-        XMLResult r = getXML("testPPT_comment.ppt");
-        assertContains("<p class=\"slide-comment\"><b>Allison, Timothy B. (ATB)", r.xml);
-    }
-
-    @Test
-    public void testEmbeddedPDF() throws Exception {
-        List<Metadata> metadataList = getRecursiveJson("testPPT_embeddedPDF.ppt");
-        assertEquals("application/pdf", metadataList.get(1).get(Metadata.CONTENT_TYPE));
-        assertEquals("3.pdf", metadataList.get(1).get(Metadata.RESOURCE_NAME_KEY));
-        assertEquals("application/pdf", metadataList.get(2).get(Metadata.CONTENT_TYPE));
-        assertEquals("4.pdf", metadataList.get(2).get(Metadata.RESOURCE_NAME_KEY));
-    }
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.InputStream;
+import java.util.List;
+import java.util.Locale;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class PowerPointParserTest extends TikaTest {
+
+    @Test
+    public void testPowerPointParser() throws Exception {
+        try (InputStream input = PowerPointParserTest.class.getResourceAsStream(
+                "/test-documents/testPPT.ppt")) {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new OfficeParser().parse(input, handler, metadata, new ParseContext());
+
+            assertEquals(
+                    "application/vnd.ms-powerpoint",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals("Sample Powerpoint Slide", metadata.get(TikaCoreProperties.TITLE));
+            assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
+            assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+            String content = handler.toString();
+            assertContains("Sample Powerpoint Slide", content);
+            assertContains("Powerpoint X for Mac", content);
+        }
+    }
+
+    @Test
+    public void testVarious() throws Exception {
+        Metadata metadata = new Metadata();
+        String xml = getXML("testPPT_various.ppt", metadata).xml;
+        assertContains("<p>Footnote appears here", xml);
+        assertContains("<p>[1] This is a footnote.", xml);
+        assertContains("<p>This is the header text.</p>", xml);
+        assertContains("<p>This is the footer text.</p>", xml);
+        assertContains("<p>Here is a text box</p>", xml);
+        assertContains("<p>Bold ", xml);
+        assertContains("italic underline superscript subscript", xml);
+        assertContains("underline", xml);
+        assertContains("superscript", xml);
+        assertContains("subscript", xml);
+        assertContains("<p>Here is a citation:", xml);
+        assertContains("Figure 1 This is a caption for Figure 1", xml);
+        assertContains("(Kramer)", xml);
+        assertContains("<table><tr>\t<td>Row 1 Col 1</td>", xml);
+        assertContains("<td>Row 2 Col 2</td>\t<td>Row 2 Col 3</td></tr>", xml);
+        assertContains("<p>Row 1 column 1</p>", xml);
+        assertContains("<p>Row 2 column 2</p>", xml);
+        assertContains("<p><a href=\"http://tika.apache.org/\">This is a hyperlink</a>", xml);
+        assertContains("<p>Here is a list:", xml);
+        for(int row=1;row<=3;row++) {
+            //assertContains("�\tBullet " + row, content);
+            //assertContains("\u00b7\tBullet " + row, content);
+            assertContains("<li>Bullet " + row, xml);
+        }
+        assertContains("Here is a numbered list:", xml);
+        for(int row=1;row<=3;row++) {
+            //assertContains(row + ")\tNumber bullet " + row, content);
+            //assertContains(row + ") Number bullet " + row, content);
+            // TODO: OOXMLExtractor fails to number the bullets:
+            assertContains("<li>Number bullet " + row, xml);
+        }
+
+        for(int row=1;row<=2;row++) {
+            for(int col=1;col<=3;col++) {
+                assertContains("Row " + row + " Col " + col, xml);
+            }
+        }
+        assertContains("Keyword1 Keyword2", xml);
+        assertEquals("Keyword1 Keyword2",
+                     metadata.get(TikaCoreProperties.KEYWORDS));
+
+        assertContains("Subject is here", xml);
+        assertEquals("Subject is here",
+                     metadata.get(OfficeOpenXMLCore.SUBJECT));
+        // TODO: Remove subject in Tika 2.0
+        assertEquals("Subject is here",
+                     metadata.get(Metadata.SUBJECT));
+
+        assertContains("Suddenly some Japanese text:", xml);
+        // Special version of (GHQ)
+        assertContains("\uff08\uff27\uff28\uff31\uff09", xml);
+        // 6 other characters
+        assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f",
+                xml);
+
+        assertContains("And then some Gothic text:", xml);
+        assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A",
+                xml);
+    }
+
+    @Test
+    public void testMasterFooter() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = PowerPointParserTest.class.getResourceAsStream(
+                "/test-documents/testPPT_masterFooter.ppt")) {
+            new OfficeParser().parse(stream, handler, metadata, new ParseContext());
+        }
+
+        String content = handler.toString();
+        assertContains("Master footer is here", content);
+
+        // Make sure boilerplate text didn't come through:
+        assertEquals(-1, content.indexOf("Click to edit Master"));
+
+        //TIKA-1171
+        assertEquals(-1, content.indexOf("*"));
+    }
+
+    /**
+     * TIKA-712 Master Slide Text from PPT and PPTX files
+     *  should be extracted too
+     */
+    @Test
+    public void testMasterText() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = PowerPointParserTest.class.getResourceAsStream(
+                "/test-documents/testPPT_masterText.ppt")) {
+            new OfficeParser().parse(stream, handler, metadata, new ParseContext());
+        }
+
+        String content = handler.toString();
+        assertContains("Text that I added to the master slide", content);
+
+        // Make sure boilerplate text didn't come through:
+        assertEquals(-1, content.indexOf("Click to edit Master"));
+
+        //TIKA-1171
+        assertEquals(-1, content.indexOf("*"));
+    }
+
+    @Test
+    public void testMasterText2() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = PowerPointParserTest.class.getResourceAsStream(
+                "/test-documents/testPPT_masterText2.ppt")) {
+            new OfficeParser().parse(stream, handler, metadata, new ParseContext());
+        }
+
+        String content = handler.toString();
+        assertContains("Text that I added to the master slide", content);
+
+        // Make sure boilerplate text didn't come through:
+        assertEquals(-1, content.indexOf("Click to edit Master"));
+        //TIKA-1171
+        assertEquals(-1, content.indexOf("*"));
+    }
+
+    /**
+     * Ensures that custom OLE2 (HPSF) properties are extracted
+     */
+    @Test
+    public void testCustomProperties() throws Exception {
+        Metadata metadata = new Metadata();
+
+        try (InputStream input = PowerPointParserTest.class.getResourceAsStream(
+                "/test-documents/testPPT_custom_props.ppt")) {
+            ContentHandler handler = new BodyContentHandler(-1);
+            ParseContext context = new ParseContext();
+            context.set(Locale.class, Locale.US);
+            new OfficeParser().parse(input, handler, metadata, context);
+        }
+
+        assertEquals("application/vnd.ms-powerpoint", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("JOUVIN ETIENNE", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("EJ04325S", metadata.get(TikaCoreProperties.MODIFIER));
+        assertEquals("EJ04325S", metadata.get(Metadata.LAST_AUTHOR));
+        assertEquals("2011-08-22T13:32:58Z", metadata.get(TikaCoreProperties.MODIFIED));
+        assertEquals("2011-08-22T13:32:58Z", metadata.get(Metadata.DATE));
+        assertEquals("2011-08-22T13:30:53Z", metadata.get(TikaCoreProperties.CREATED));
+        assertEquals("2011-08-22T13:30:53Z", metadata.get(Metadata.CREATION_DATE));
+        assertEquals("1", metadata.get(Office.SLIDE_COUNT));
+        assertEquals("3", metadata.get(Office.WORD_COUNT));
+        assertEquals("Test extraction properties pptx", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("true", metadata.get("custom:myCustomBoolean"));
+        assertEquals("3", metadata.get("custom:myCustomNumber"));
+        assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
+        assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
+        assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
+    }
+
+    // TIKA-1025
+    @Test
+    public void testEmbeddedPlacedholder() throws Exception {
+        XMLResult result = getXML("testPPT_embedded2.ppt");
+        assertContains("<div class=\"embedded\" id=\"1\" />", result.xml);
+        assertContains("<div class=\"embedded\" id=\"14\" />", result.xml);
+    }
+
+    // TIKA-817
+    @Test
+    public void testAutoDatePPT() throws Exception {
+        //decision was made in POI-52367 not to generate
+        //autodate automatically.  For pptx, where value is stored,
+        //value is extracted.  For ppt, however, no date is extracted.
+        XMLResult result = getXML("testPPT_autodate.ppt");
+        assertContains(
+                "<div class=\"slide-content\"><p>Now</p>",
+                result.xml);
+    }
+
+    @Test
+    public void testCommentAuthorship() throws Exception {
+        XMLResult r = getXML("testPPT_comment.ppt");
+        assertContains("<p class=\"slide-comment\"><b>Allison, Timothy B. (ATB)", r.xml);
+    }
+
+    @Test
+    public void testEmbeddedPDF() throws Exception {
+        List<Metadata> metadataList = getRecursiveJson("testPPT_embeddedPDF.ppt");
+        assertEquals("application/pdf", metadataList.get(1).get(Metadata.CONTENT_TYPE));
+        assertEquals("3.pdf", metadataList.get(1).get(Metadata.RESOURCE_NAME_KEY));
+        assertEquals("application/pdf", metadataList.get(2).get(Metadata.CONTENT_TYPE));
+        assertEquals("4.pdf", metadataList.get(2).get(Metadata.RESOURCE_NAME_KEY));
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java
index a3ccefc..a37e44d 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java
@@ -1,53 +1,53 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import static org.apache.tika.TikaTest.assertContains;
-import static org.junit.Assert.assertEquals;
-
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class PublisherParserTest {
-
-    @Test
-    public void testPublisherParser() throws Exception {
-        try (InputStream input = PublisherParserTest.class.getResourceAsStream(
-                "/test-documents/testPUBLISHER.pub")) {
-            Metadata metadata = new Metadata();
-            ContentHandler handler = new BodyContentHandler();
-            new OfficeParser().parse(input, handler, metadata, new ParseContext());
-
-            assertEquals(
-                    "application/x-mspublisher",
-                    metadata.get(Metadata.CONTENT_TYPE));
-            assertEquals(null, metadata.get(TikaCoreProperties.TITLE));
-            assertEquals("Nick Burch", metadata.get(TikaCoreProperties.CREATOR));
-            assertEquals("Nick Burch", metadata.get(Metadata.AUTHOR));
-            String content = handler.toString();
-            assertContains("0123456789", content);
-            assertContains("abcdef", content);
-        }
-    }
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class PublisherParserTest {
+
+    @Test
+    public void testPublisherParser() throws Exception {
+        try (InputStream input = PublisherParserTest.class.getResourceAsStream(
+                "/test-documents/testPUBLISHER.pub")) {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new OfficeParser().parse(input, handler, metadata, new ParseContext());
+
+            assertEquals(
+                    "application/x-mspublisher",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals(null, metadata.get(TikaCoreProperties.TITLE));
+            assertEquals("Nick Burch", metadata.get(TikaCoreProperties.CREATOR));
+            assertEquals("Nick Burch", metadata.get(Metadata.AUTHOR));
+            String content = handler.toString();
+            assertContains("0123456789", content);
+            assertContains("abcdef", content);
+        }
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java
index 4edb5ee..8062555 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java
@@ -1,98 +1,98 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.tika.TikaTest.TrackingHandler;
-import org.apache.tika.detect.DefaultDetector;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.extractor.ContainerExtractor;
-import org.apache.tika.extractor.ParserContainerExtractor;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-/**
- * Tests for the TNEF (winmail.dat) parser
- */
-public class TNEFParserTest extends AbstractPOIContainerExtractionTest {
-    private static final String file = "testWINMAIL.dat";
-
-    @Test
-    public void testBasics() throws Exception {
-        Detector detector = new DefaultDetector();
-        try (TikaInputStream stream = getTestFile(file)) {
-            assertEquals(
-                    MediaType.application("vnd.ms-tnef"),
-                    detector.detect(stream, new Metadata()));
-        }
-    }
-
-    @Test
-    public void testMetadata() throws Exception {
-        TikaInputStream stream = getTestFile(file);
-
-        Metadata metadata = new Metadata();
-        ContentHandler handler = new BodyContentHandler();
-
-        TNEFParser tnef = new TNEFParser();
-        tnef.parse(stream, handler, metadata, new ParseContext());
-
-        assertEquals("This is a test message", metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("This is a test message", metadata.get(Metadata.SUBJECT));
-    }
-
-    /**
-     * Check the Rtf and Attachments are returned
-     * as expected
-     */
-    @Test
-    public void testBodyAndAttachments() throws Exception {
-        ContainerExtractor extractor = new ParserContainerExtractor();
-
-        // Process it with recursing
-        // Will have the message body RTF and the attachments
-        TrackingHandler handler = process(file, extractor, true);
-        assertEquals(6, handler.filenames.size());
-        assertEquals(6, handler.mediaTypes.size());
-
-        // We know the filenames for all of them
-        assertEquals("message.rtf", handler.filenames.get(0));
-        assertEquals(MediaType.application("rtf"), handler.mediaTypes.get(0));
-
-        assertEquals("quick.doc", handler.filenames.get(1));
-        assertEquals(MediaType.application("msword"), handler.mediaTypes.get(1));
-
-        assertEquals("quick.html", handler.filenames.get(2));
-        assertEquals(MediaType.text("html"), handler.mediaTypes.get(2));
-
-        assertEquals("quick.pdf", handler.filenames.get(3));
-        assertEquals(MediaType.application("pdf"), handler.mediaTypes.get(3));
-
-        assertEquals("quick.txt", handler.filenames.get(4));
-        assertEquals(MediaType.text("plain"), handler.mediaTypes.get(4));
-
-        assertEquals("quick.xml", handler.filenames.get(5));
-        assertEquals(MediaType.application("xml"), handler.mediaTypes.get(5));
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.tika.TikaTest.TrackingHandler;
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Tests for the TNEF (winmail.dat) parser
+ */
+public class TNEFParserTest extends AbstractPOIContainerExtractionTest {
+    private static final String file = "testWINMAIL.dat";
+
+    @Test
+    public void testBasics() throws Exception {
+        Detector detector = new DefaultDetector();
+        try (TikaInputStream stream = getTestFile(file)) {
+            assertEquals(
+                    MediaType.application("vnd.ms-tnef"),
+                    detector.detect(stream, new Metadata()));
+        }
+    }
+
+    @Test
+    public void testMetadata() throws Exception {
+        TikaInputStream stream = getTestFile(file);
+
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+
+        TNEFParser tnef = new TNEFParser();
+        tnef.parse(stream, handler, metadata, new ParseContext());
+
+        assertEquals("This is a test message", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("This is a test message", metadata.get(Metadata.SUBJECT));
+    }
+
+    /**
+     * Check the Rtf and Attachments are returned
+     * as expected
+     */
+    @Test
+    public void testBodyAndAttachments() throws Exception {
+        ContainerExtractor extractor = new ParserContainerExtractor();
+
+        // Process it with recursing
+        // Will have the message body RTF and the attachments
+        TrackingHandler handler = process(file, extractor, true);
+        assertEquals(6, handler.filenames.size());
+        assertEquals(6, handler.mediaTypes.size());
+
+        // We know the filenames for all of them
+        assertEquals("message.rtf", handler.filenames.get(0));
+        assertEquals(MediaType.application("rtf"), handler.mediaTypes.get(0));
+
+        assertEquals("quick.doc", handler.filenames.get(1));
+        assertEquals(MediaType.application("msword"), handler.mediaTypes.get(1));
+
+        assertEquals("quick.html", handler.filenames.get(2));
+        assertEquals(MediaType.text("html"), handler.mediaTypes.get(2));
+
+        assertEquals("quick.pdf", handler.filenames.get(3));
+        assertEquals(MediaType.application("pdf"), handler.mediaTypes.get(3));
+
+        assertEquals("quick.txt", handler.filenames.get(4));
+        assertEquals(MediaType.text("plain"), handler.mediaTypes.get(4));
+
+        assertEquals("quick.xml", handler.filenames.get(5));
+        assertEquals(MediaType.application("xml"), handler.mediaTypes.get(5));
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java
index 3002187..06320fe 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java
@@ -1,51 +1,51 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import static org.apache.tika.TikaTest.assertContains;
-import static org.junit.Assert.assertEquals;
-
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class VisioParserTest {
-
-    @Test
-    public void testVisioParser() throws Exception {
-        try (InputStream input = VisioParserTest.class.getResourceAsStream(
-                "/test-documents/testVISIO.vsd")) {
-            Metadata metadata = new Metadata();
-            ContentHandler handler = new BodyContentHandler();
-            new OfficeParser().parse(input, handler, metadata, new ParseContext());
-
-            assertEquals(
-                    "application/vnd.visio",
-                    metadata.get(Metadata.CONTENT_TYPE));
-            assertEquals("", metadata.get(TikaCoreProperties.TITLE));
-            assertEquals("Hogwarts", metadata.get(TikaCoreProperties.CREATOR));
-            String content = handler.toString();
-            assertContains("Some random text, on a page", content);
-        }
-    }
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class VisioParserTest {
+
+    @Test
+    public void testVisioParser() throws Exception {
+        try (InputStream input = VisioParserTest.class.getResourceAsStream(
+                "/test-documents/testVISIO.vsd")) {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new OfficeParser().parse(input, handler, metadata, new ParseContext());
+
+            assertEquals(
+                    "application/vnd.visio",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals("", metadata.get(TikaCoreProperties.TITLE));
+            assertEquals("Hogwarts", metadata.get(TikaCoreProperties.CREATOR));
+            String content = handler.toString();
+            assertContains("Some random text, on a page", content);
+        }
+    }
+
+}

[06/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java
index 49afdd7..e1a0ff0 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java
@@ -1,54 +1,54 @@
-/**
- * ******************************************************************************
- * Copyright (C) 2005, International Business Machines Corporation and         *
- * others. All Rights Reserved.                                                *
- * ******************************************************************************
- */
-package org.apache.tika.parser.txt;
-
-/**
- * Abstract class for recognizing a single charset.
- * Part of the implementation of ICU's CharsetDetector.
- *
- * Each specific charset that can be recognized will have an instance
- * of some subclass of this class.  All interaction between the overall
- * CharsetDetector and the stuff specific to an individual charset happens
- * via the interface provided here.
- *
- * Instances of CharsetDetector DO NOT have or maintain 
- * state pertaining to a specific match or detect operation.
- * The WILL be shared by multiple instances of CharsetDetector.
- * They encapsulate const charset-specific information.
- *
- * @internal
- */
-abstract class CharsetRecognizer {
-    /**
-     * Get the IANA name of this charset.
-     * @return the charset name.
-     */
-    abstract String getName();
-
-    /**
-     * Get the ISO language code for this charset.
-     * @return the language code, or <code>null</code> if the language cannot be determined.
-     */
-    public String getLanguage() {
-        return null;
-    }
-
-    /**
-     * Test the match of this charset with the input text data
-     *      which is obtained via the CharsetDetector object.
-     *
-     * @param det  The CharsetDetector, which contains the input text
-     *             to be checked for being in this charset.
-     * @return Two values packed into one int  (Damn java, anyhow)
-     *             <br/>
-     *             bits 0-7:  the match confidence, ranging from 0-100
-     *             <br/>
-     *             bits 8-15: The match reason, an enum-like value.
-     */
-    abstract int match(CharsetDetector det);
-
-}
+/**
+ * ******************************************************************************
+ * Copyright (C) 2005, International Business Machines Corporation and         *
+ * others. All Rights Reserved.                                                *
+ * ******************************************************************************
+ */
+package org.apache.tika.parser.txt;
+
+/**
+ * Abstract class for recognizing a single charset.
+ * Part of the implementation of ICU's CharsetDetector.
+ *
+ * Each specific charset that can be recognized will have an instance
+ * of some subclass of this class.  All interaction between the overall
+ * CharsetDetector and the stuff specific to an individual charset happens
+ * via the interface provided here.
+ *
+ * Instances of CharsetRecognizer DO NOT have or maintain
+ * state pertaining to a specific match or detect operation.
+ * They WILL be shared by multiple instances of CharsetDetector.
+ * They encapsulate const charset-specific information.
+ *
+ * @internal
+ */
+abstract class CharsetRecognizer {
+    /**
+     * Get the IANA name of this charset.
+     *
+     * @return the charset name.
+     */
+    abstract String getName();
+
+    /**
+     * Get the ISO language code for this charset.
+     * This base implementation always returns <code>null</code>.
+     *
+     * @return the language code, or <code>null</code> if the language cannot be determined.
+     */
+    public String getLanguage() {
+        return null;
+    }
+
+    /**
+     * Test the match of this charset with the input text data
+     *      which is obtained via the CharsetDetector object.
+     *
+     * @param det  The CharsetDetector, which contains the input text
+     *             to be checked for being in this charset.
+     * @return Two values packed into one int:
+     *             <br/>
+     *             bits 0-7:  the match confidence, ranging from 0-100
+     *             <br/>
+     *             bits 8-15: The match reason, an enum-like value.
+     */
+    abstract int match(CharsetDetector det);
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
index f704557..2b20495 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
@@ -1,98 +1,98 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.txt;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.Charset;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.config.ServiceLoader;
-import org.apache.tika.detect.AutoDetectReader;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Plain text parser. The text encoding of the document stream is
- * automatically detected based on the byte patterns found at the
- * beginning of the stream and the given document metadata, most
- * notably the <code>charset</code> parameter of a
- * {@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE} value.
- * <p/>
- * This parser sets the following output metadata entries:
- * <dl>
- * <dt>{@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE}</dt>
- * <dd><code>text/plain; charset=...</code></dd>
- * </dl>
- */
-public class TXTParser extends AbstractParser {
-
-    /**
-     * Serial version UID
-     */
-    private static final long serialVersionUID = -6656102320836888910L;
-
-    private static final Set<MediaType> SUPPORTED_TYPES =
-            Collections.singleton(MediaType.TEXT_PLAIN);
-
-    private static final ServiceLoader LOADER =
-            new ServiceLoader(TXTParser.class.getClassLoader());
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-    public void parse(
-            InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-        // Automatically detect the character encoding
-        try (AutoDetectReader reader = new AutoDetectReader(
-                new CloseShieldInputStream(stream), metadata,
-                context.get(ServiceLoader.class, LOADER))) {
-            Charset charset = reader.getCharset();
-            MediaType type = new MediaType(MediaType.TEXT_PLAIN, charset);
-            metadata.set(Metadata.CONTENT_TYPE, type.toString());
-            // deprecated, see TIKA-431
-            metadata.set(Metadata.CONTENT_ENCODING, charset.name());
-
-            XHTMLContentHandler xhtml =
-                    new XHTMLContentHandler(handler, metadata);
-            xhtml.startDocument();
-
-            xhtml.startElement("p");
-            char[] buffer = new char[4096];
-            int n = reader.read(buffer);
-            while (n != -1) {
-                xhtml.characters(buffer, 0, n);
-                n = reader.read(buffer);
-            }
-            xhtml.endElement("p");
-
-            xhtml.endDocument();
-        }
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.txt;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for plain text documents. The character encoding is detected
+ * automatically from the byte patterns at the start of the stream and
+ * from the supplied document metadata, in particular the
+ * <code>charset</code> parameter of a
+ * {@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE} value.
+ * <p/>
+ * Output metadata entries set by this parser:
+ * <dl>
+ * <dt>{@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE}</dt>
+ * <dd><code>text/plain; charset=...</code></dd>
+ * </dl>
+ */
+public class TXTParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = -6656102320836888910L;
+
+    /** The single media type reported by {@link #getSupportedTypes(ParseContext)}. */
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.TEXT_PLAIN);
+
+    /** Loader handed to the parse context as the default {@link ServiceLoader}. */
+    private static final ServiceLoader LOADER =
+            new ServiceLoader(TXTParser.class.getClassLoader());
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Reads the stream as text and emits it as one XHTML paragraph.
+     * The detected charset is recorded in the content type and content
+     * encoding metadata before any SAX events are produced.
+     *
+     * @param stream   document stream; wrapped in a close shield before
+     *                 being handed to the reader
+     * @param handler  receiver of the generated XHTML SAX events
+     * @param metadata input hints for detection and output metadata
+     * @param context  parse context, consulted for a {@link ServiceLoader}
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        InputStream shielded = new CloseShieldInputStream(stream);
+        ServiceLoader loader = context.get(ServiceLoader.class, LOADER);
+
+        // The reader detects the character encoding on construction
+        try (AutoDetectReader reader =
+                     new AutoDetectReader(shielded, metadata, loader)) {
+            Charset detected = reader.getCharset();
+            metadata.set(
+                    Metadata.CONTENT_TYPE,
+                    new MediaType(MediaType.TEXT_PLAIN, detected).toString());
+            // deprecated, see TIKA-431
+            metadata.set(Metadata.CONTENT_ENCODING, detected.name());
+
+            XHTMLContentHandler xhtml =
+                    new XHTMLContentHandler(handler, metadata);
+            xhtml.startDocument();
+            xhtml.startElement("p");
+
+            // Copy the decoded text through in fixed-size chunks
+            char[] chunk = new char[4096];
+            for (int count = reader.read(chunk);
+                    count != -1;
+                    count = reader.read(chunk)) {
+                xhtml.characters(chunk, 0, count);
+            }
+
+            xhtml.endElement("p");
+            xhtml.endDocument();
+        }
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
index 11bea1d..d36f79c 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
@@ -1,93 +1,93 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.xml;
-
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.xml.sax.helpers.DefaultHandler;
-
-/**
- * Base class for SAX handlers that map SAX events into document metadata.
- *
- * @since Apache Tika 0.10
- */
-class AbstractMetadataHandler extends DefaultHandler {
-
-    private final Metadata metadata;
-    private final Property property;
-    private final String name;
-
-    protected AbstractMetadataHandler(Metadata metadata, String name) {
-        this.metadata = metadata;
-        this.property = null;
-        this.name = name;
-    }
-    protected AbstractMetadataHandler(Metadata metadata, Property property) {
-       this.metadata = metadata;
-       this.property = property;
-       this.name = property.getName();
-   }
-
-    /**
-     * Adds the given metadata value. The value is ignored if it is
-     * <code>null</code> or empty. If the metadata entry already exists,
-     * then the given value is appended to it with a comma as the separator.
-     *
-     * @param value metadata value
-     */
-    protected void addMetadata(String value) {
-        if (value != null && value.length() > 0) {
-            if (metadata.isMultiValued(name)) {
-                // Add the value, assuming it's not already there
-                List<String> previous = Arrays.asList(metadata.getValues(name));
-                if (!previous.contains(value)) {
-                    if (property != null) {
-                       metadata.add(property, value);
-                    } else {
-                       metadata.add(name, value);
-                    }
-                }
-            } else {
-                // Set the value, assuming it's not already there
-                String previous = metadata.get(name);
-                if (previous != null && previous.length() > 0) {
-                    if (!previous.equals(value)) {
-                       if (property != null) {
-                          if (property.isMultiValuePermitted()) {
-                              metadata.add(property, value);
-                          } else {
-                              // Replace the existing value if isMultiValuePermitted is false
-                              metadata.set(property, value);
-                          }
-                       } else {
-                          metadata.add(name, value);
-                       }
-                    }
-                } else {
-                   if (property != null) {
-                      metadata.set(property, value);
-                   } else {
-                      metadata.set(name, value);
-                   }
-                }
-            }
-        }
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Base class for SAX handlers that map SAX events into document metadata.
+ *
+ * @since Apache Tika 0.10
+ */
+class AbstractMetadataHandler extends DefaultHandler {
+
+    private final Metadata metadata;
+    private final Property property;
+    private final String name;
+
+    /**
+     * Creates a handler that writes to the metadata entry with the given
+     * plain name; no {@link Property} is associated with it.
+     */
+    protected AbstractMetadataHandler(Metadata metadata, String name) {
+        this.metadata = metadata;
+        this.property = null;
+        this.name = name;
+    }
+
+    /**
+     * Creates a handler that writes to the given property; the entry name
+     * is taken from {@link Property#getName()}.
+     */
+    protected AbstractMetadataHandler(Metadata metadata, Property property) {
+        this.metadata = metadata;
+        this.property = property;
+        this.name = property.getName();
+    }
+
+    /**
+     * Records the given metadata value. A <code>null</code> or empty value
+     * is ignored, as is a value equal to one already stored. Otherwise:
+     * <ul>
+     * <li>if the entry is already multi-valued, the value is added to it;</li>
+     * <li>if the entry holds one different non-empty value, the new value is
+     *     added, unless a property is set that does not permit multiple
+     *     values, in which case the existing value is replaced;</li>
+     * <li>if the entry is missing or empty, the value is set.</li>
+     * </ul>
+     *
+     * @param value metadata value
+     */
+    protected void addMetadata(String value) {
+        if (value == null || value.isEmpty()) {
+            return;
+        }
+
+        if (metadata.isMultiValued(name)) {
+            // Already several values: only add one that is not present yet
+            List<String> existing = Arrays.asList(metadata.getValues(name));
+            if (!existing.contains(value)) {
+                appendValue(value);
+            }
+            return;
+        }
+
+        String current = metadata.get(name);
+        if (current == null || current.isEmpty()) {
+            // Nothing stored so far
+            replaceValue(value);
+            return;
+        }
+        if (current.equals(value)) {
+            return;
+        }
+
+        if (property != null && !property.isMultiValuePermitted()) {
+            // Replace the existing value if isMultiValuePermitted is false
+            metadata.set(property, value);
+        } else {
+            appendValue(value);
+        }
+    }
+
+    /** Adds the value via the property when one is set, else via the name. */
+    private void appendValue(String value) {
+        if (property == null) {
+            metadata.add(name, value);
+        } else {
+            metadata.add(property, value);
+        }
+    }
+
+    /** Sets the value via the property when one is set, else via the name. */
+    private void replaceValue(String value) {
+        if (property == null) {
+            metadata.set(name, value);
+        } else {
+            metadata.set(property, value);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java
index 2c6b054..c1795fa 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java
@@ -1,82 +1,82 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.xml;
-
-import org.apache.tika.metadata.Metadata;
-import org.xml.sax.Attributes;
-import org.xml.sax.helpers.DefaultHandler;
-
-/**
- * This adds a Metadata entry for a given node.
- * The textual content of the node is used as the
- *  value, and the Metadata name is taken from
- *  an attribute, with a prefix if required. 
- */
-public class AttributeDependantMetadataHandler extends DefaultHandler {
-
-    private final Metadata metadata;
-
-    private final String nameHoldingAttribute;
-    private final String namePrefix;
-    private String name;
-
-    private final StringBuilder buffer = new StringBuilder();
-
-    public AttributeDependantMetadataHandler(Metadata metadata, String nameHoldingAttribute, String namePrefix) {
-        this.metadata = metadata;
-        this.nameHoldingAttribute = nameHoldingAttribute;
-        this.namePrefix = namePrefix;
-    }
-
-    public void addMetadata(String value) {
-        if(name == null || name.length() == 0) {
-           // We didn't find the attribute which holds the name
-           return;
-        }
-        if (value.length() > 0) {
-            String previous = metadata.get(name);
-            if (previous != null && previous.length() > 0) {
-                value = previous + ", " + value;
-            }
-            metadata.set(name, value);
-        }
-    }
-
-    public void endElement(String uri, String localName, String name) {
-        addMetadata(buffer.toString());
-        buffer.setLength(0);
-    }
-
-    public void startElement(
-            String uri, String localName, String name, Attributes attributes) {
-        String rawName = attributes.getValue(nameHoldingAttribute);
-        if (rawName != null) {
-           if (namePrefix == null) {
-              this.name = rawName;
-           } else {
-              this.name = namePrefix + rawName;
-           }
-        }
-        // All other attributes are ignored
-    }
-
-    
-    public void characters(char[] ch, int start, int length) {
-        buffer.append(ch, start, length);
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.Attributes;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * This adds a Metadata entry for a given node.
+ * The textual content of the node is used as the
+ *  value, and the Metadata name is taken from
+ *  an attribute, with a prefix if required.
+ */
+public class AttributeDependantMetadataHandler extends DefaultHandler {
+
+    private final Metadata metadata;
+
+    /** Name of the attribute whose value supplies the metadata name. */
+    private final String nameHoldingAttribute;
+    /** Optional prefix put in front of the attribute value; may be null. */
+    private final String namePrefix;
+    /**
+     * Metadata name taken from the most recent element that carried the
+     * name-holding attribute.
+     * NOTE(review): this is never cleared, so an element without the
+     * attribute reuses the name of an earlier element - confirm intended.
+     */
+    private String name;
+
+    /** Character data collected since the last end-of-element event. */
+    private final StringBuilder buffer = new StringBuilder();
+
+    /**
+     * @param metadata             metadata object to write entries into
+     * @param nameHoldingAttribute attribute whose value names the entry
+     * @param namePrefix           prefix for the entry name, or
+     *                             <code>null</code> for none
+     */
+    public AttributeDependantMetadataHandler(Metadata metadata, String nameHoldingAttribute, String namePrefix) {
+        this.metadata = metadata;
+        this.nameHoldingAttribute = nameHoldingAttribute;
+        this.namePrefix = namePrefix;
+    }
+
+    /**
+     * Stores the value under the current metadata name. Nothing happens
+     * when no name is known yet, or when the value is <code>null</code>
+     * or empty. If the entry already holds a non-empty value, the new
+     * value is appended to it, separated by a comma and a space.
+     *
+     * @param value metadata value; <code>null</code> is ignored
+     */
+    public void addMetadata(String value) {
+        if (name == null || name.length() == 0) {
+            // We didn't find the attribute which holds the name
+            return;
+        }
+        // Guard against null as well as empty, as this method is public
+        if (value == null || value.length() == 0) {
+            return;
+        }
+        String previous = metadata.get(name);
+        if (previous != null && previous.length() > 0) {
+            value = previous + ", " + value;
+        }
+        metadata.set(name, value);
+    }
+
+    /** Flushes the collected character data as a metadata value. */
+    @Override
+    public void endElement(String uri, String localName, String name) {
+        addMetadata(buffer.toString());
+        buffer.setLength(0);
+    }
+
+    /**
+     * Picks up the metadata name from the name-holding attribute, if the
+     * element has it, applying the prefix when one was configured.
+     */
+    @Override
+    public void startElement(
+            String uri, String localName, String name, Attributes attributes) {
+        String rawName = attributes.getValue(nameHoldingAttribute);
+        if (rawName != null) {
+            if (namePrefix == null) {
+                this.name = rawName;
+            } else {
+                this.name = namePrefix + rawName;
+            }
+        }
+        // All other attributes are ignored
+    }
+
+    /** Collects character data for the element currently being read. */
+    @Override
+    public void characters(char[] ch, int start, int length) {
+        buffer.append(ch, start, length);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
index 0140421..dba5e4c 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
@@ -1,61 +1,61 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.xml;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-
-/**
- * SAX event handler that maps the contents of an XML attribute into
- * a metadata field.
- *
- * @since Apache Tika 0.10
- */
-public class AttributeMetadataHandler extends AbstractMetadataHandler {
-
-    private final String uri;
-
-    private final String localName;
-
-    public AttributeMetadataHandler(
-            String uri, String localName, Metadata metadata, String name) {
-        super(metadata, name);
-        this.uri = uri;
-        this.localName = localName;
-    }
-    public AttributeMetadataHandler(
-          String uri, String localName, Metadata metadata, Property property) {
-      super(metadata, property);
-      this.uri = uri;
-      this.localName = localName;
-  }
-
-    @Override
-    public void startElement(
-            String uri, String localName, String qName, Attributes attributes)
-            throws SAXException {
-        for (int i = 0; i < attributes.getLength(); i++) {
-            if (attributes.getURI(i).equals(this.uri)
-                    && attributes.getLocalName(i).equals(this.localName)) {
-                addMetadata(attributes.getValue(i).trim());
-            }
-        }
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+/**
+ * SAX event handler that copies the value of one XML attribute,
+ * identified by namespace URI and local name, into a metadata field.
+ *
+ * @since Apache Tika 0.10
+ */
+public class AttributeMetadataHandler extends AbstractMetadataHandler {
+
+    /** Namespace URI of the attribute to capture. */
+    private final String uri;
+
+    /** Local name of the attribute to capture. */
+    private final String localName;
+
+    /** Creates a handler that stores the attribute under a plain metadata name. */
+    public AttributeMetadataHandler(
+            String uri, String localName, Metadata metadata, String name) {
+        super(metadata, name);
+        this.uri = uri;
+        this.localName = localName;
+    }
+
+    /** Creates a handler that stores the attribute under a metadata property. */
+    public AttributeMetadataHandler(
+            String uri, String localName, Metadata metadata, Property property) {
+        super(metadata, property);
+        this.uri = uri;
+        this.localName = localName;
+    }
+
+    /**
+     * Scans the attributes of the element and records the trimmed value of
+     * every attribute whose namespace URI and local name both match.
+     */
+    @Override
+    public void startElement(
+            String uri, String localName, String qName, Attributes attributes)
+            throws SAXException {
+        for (int idx = 0; idx < attributes.getLength(); idx++) {
+            if (!attributes.getURI(idx).equals(this.uri)) {
+                continue;
+            }
+            if (!attributes.getLocalName(idx).equals(this.localName)) {
+                continue;
+            }
+            addMetadata(attributes.getValue(idx).trim());
+        }
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
index 9e27801..5999773 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
@@ -1,60 +1,60 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.xml;
-
-import org.apache.tika.metadata.DublinCore;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.TeeContentHandler;
-import org.xml.sax.ContentHandler;
-
-/**
- * Dublin Core metadata parser
- */
-public class DcXMLParser extends XMLParser {
-
-    /** Serial version UID */
-    private static final long serialVersionUID = 4905318835463880819L;
-
-    private static ContentHandler getDublinCoreHandler(
-            Metadata metadata, Property property, String element) {
-        return new ElementMetadataHandler(
-                DublinCore.NAMESPACE_URI_DC, element,
-                metadata, property);
-    }
-
-    protected ContentHandler getContentHandler(
-            ContentHandler handler, Metadata metadata, ParseContext context) {
-        return new TeeContentHandler(
-                super.getContentHandler(handler, metadata, context),
-                getDublinCoreHandler(metadata, TikaCoreProperties.TITLE, "title"),
-                getDublinCoreHandler(metadata, TikaCoreProperties.KEYWORDS, "subject"),
-                getDublinCoreHandler(metadata, TikaCoreProperties.CREATOR, "creator"),
-                getDublinCoreHandler(metadata, TikaCoreProperties.DESCRIPTION, "description"),
-                getDublinCoreHandler(metadata, TikaCoreProperties.PUBLISHER, "publisher"),
-                getDublinCoreHandler(metadata, TikaCoreProperties.CONTRIBUTOR, "contributor"),
-                getDublinCoreHandler(metadata, TikaCoreProperties.CREATED, "date"),
-                getDublinCoreHandler(metadata, TikaCoreProperties.TYPE, "type"),
-                getDublinCoreHandler(metadata, TikaCoreProperties.FORMAT, "format"),
-                getDublinCoreHandler(metadata, TikaCoreProperties.IDENTIFIER, "identifier"),
-                getDublinCoreHandler(metadata, TikaCoreProperties.LANGUAGE, "language"),
-                getDublinCoreHandler(metadata, TikaCoreProperties.RIGHTS, "rights"));
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.tika.metadata.DublinCore;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.TeeContentHandler;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Dublin Core metadata parser
+ */
+public class DcXMLParser extends XMLParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 4905318835463880819L;
+
+    private static ContentHandler getDublinCoreHandler(
+            Metadata metadata, Property property, String element) {
+        return new ElementMetadataHandler(
+                DublinCore.NAMESPACE_URI_DC, element,
+                metadata, property);
+    }
+
+    protected ContentHandler getContentHandler(
+            ContentHandler handler, Metadata metadata, ParseContext context) {
+        return new TeeContentHandler(
+                super.getContentHandler(handler, metadata, context),
+                getDublinCoreHandler(metadata, TikaCoreProperties.TITLE, "title"),
+                getDublinCoreHandler(metadata, TikaCoreProperties.KEYWORDS, "subject"),
+                getDublinCoreHandler(metadata, TikaCoreProperties.CREATOR, "creator"),
+                getDublinCoreHandler(metadata, TikaCoreProperties.DESCRIPTION, "description"),
+                getDublinCoreHandler(metadata, TikaCoreProperties.PUBLISHER, "publisher"),
+                getDublinCoreHandler(metadata, TikaCoreProperties.CONTRIBUTOR, "contributor"),
+                getDublinCoreHandler(metadata, TikaCoreProperties.CREATED, "date"),
+                getDublinCoreHandler(metadata, TikaCoreProperties.TYPE, "type"),
+                getDublinCoreHandler(metadata, TikaCoreProperties.FORMAT, "format"),
+                getDublinCoreHandler(metadata, TikaCoreProperties.IDENTIFIER, "identifier"),
+                getDublinCoreHandler(metadata, TikaCoreProperties.LANGUAGE, "language"),
+                getDublinCoreHandler(metadata, TikaCoreProperties.RIGHTS, "rights"));
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
index b69f65b..d5bfb1c 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
@@ -1,255 +1,255 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.xml;
-
-import java.util.Arrays;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.xml.sax.Attributes;
-
-/**
- * SAX event handler that maps the contents of an XML element into
- * a metadata field.
- *
- * @since Apache Tika 0.10
- */
-public class ElementMetadataHandler extends AbstractMetadataHandler {
-    /**
-     * Logger for this class
-     */
-    private static final Log logger = LogFactory
-            .getLog(ElementMetadataHandler.class);
-
-    private static final String LOCAL_NAME_RDF_BAG = "Bag";
-    private static final String LOCAL_NAME_RDF_LI = "li";
-    private static final String URI_RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
-
-    private final String uri;
-
-    private final String localName;
-
-    private final Metadata metadata;
-
-    private final String name;
-    private Property targetProperty;
-
-    private final boolean allowDuplicateValues;
-    private final boolean allowEmptyValues;
-
-    /**
-     * The buffer used to capture characters when inside a bag li element.
-     */
-    private final StringBuilder bufferBagged = new StringBuilder();
-
-    /**
-     * The buffer used to capture characters inside standard elements.
-     */
-    private final StringBuilder bufferBagless = new StringBuilder();
-
-    /**
-     * Whether or not the value was found in a standard element structure or inside a bag.
-     */
-    private boolean isBagless = true;
-
-    private int matchLevel = 0;
-    private int parentMatchLevel = 0;
-
-    /**
-     * Constructor for string metadata keys.
-     *
-     * @param uri the uri of the namespace of the element
-     * @param localName the local name of the element
-     * @param metadata the Tika metadata object to populate
-     * @param name the Tika metadata field key
-     */
-    public ElementMetadataHandler(
-            String uri, String localName, Metadata metadata, String name) {
-        super(metadata, name);
-        this.uri = uri;
-        this.localName = localName;
-        this.metadata = metadata;
-        this.name = name;
-        this.allowDuplicateValues = false;
-        this.allowEmptyValues = false;
-        if (logger.isTraceEnabled()) {
-            logger.trace("created simple handler for " + this.name);
-        }
-    }
-
-    /**
-     * Constructor for string metadata keys which allows change of behavior
-     * for duplicate and empty entry values.
-     *
-     * @param uri the uri of the namespace of the element
-     * @param localName the local name of the element
-     * @param metadata the Tika metadata object to populate
-     * @param name the Tika metadata field key
-     * @param allowDuplicateValues add duplicate values to the Tika metadata
-     * @param allowEmptyValues add empty values to the Tika metadata
-     */
-    public ElementMetadataHandler(
-            String uri, String localName, Metadata metadata, String name, boolean allowDuplicateValues, boolean allowEmptyValues) {
-        super(metadata, name);
-        this.uri = uri;
-        this.localName = localName;
-        this.metadata = metadata;
-        this.name = name;
-        this.allowDuplicateValues = allowDuplicateValues;
-        this.allowEmptyValues = allowEmptyValues;
-        if (logger.isTraceEnabled()) {
-                logger.trace("created simple handler for " + this.name);
-        }
-    }
-
-    /**
-     * Constructor for Property metadata keys.
-     *
-     * @param uri the uri of the namespace of the element
-     * @param localName the local name of the element
-     * @param metadata the Tika metadata object to populate
-     * @param targetProperty the Tika metadata Property key
-     */
-    public ElementMetadataHandler(
-            String uri, String localName, Metadata metadata, Property targetProperty) {
-        super(metadata, targetProperty);
-        this.uri = uri;
-        this.localName = localName;
-        this.metadata = metadata;
-        this.targetProperty = targetProperty;
-        this.name = targetProperty.getName();
-        this.allowDuplicateValues = false;
-        this.allowEmptyValues = false;
-        if (logger.isTraceEnabled()) {
-            logger.trace("created property handler for " + this.name);
-        }
-    }
-
-    /**
-     * Constructor for Property metadata keys which allows change of behavior
-     * for duplicate and empty entry values.
-     *
-     * @param uri the uri of the namespace of the element
-     * @param localName the local name of the element
-     * @param metadata the Tika metadata object to populate
-     * @param targetProperty the Tika metadata Property key
-     * @param allowDuplicateValues add duplicate values to the Tika metadata
-     * @param allowEmptyValues add empty values to the Tika metadata
-     */
-    public ElementMetadataHandler(
-            String uri, String localName, Metadata metadata, Property targetProperty, boolean allowDuplicateValues, boolean allowEmptyValues) {
-        super(metadata, targetProperty);
-        this.uri = uri;
-        this.localName = localName;
-        this.metadata = metadata;
-        this.targetProperty = targetProperty;
-        this.name = targetProperty.getName();
-        this.allowDuplicateValues = allowDuplicateValues;
-        this.allowEmptyValues = allowEmptyValues;
-        if (logger.isTraceEnabled()) {
-                logger.trace("created property handler for " + this.name);
-        }
-    }
-
-    protected boolean isMatchingParentElement(String uri, String localName) {
-        return (uri.equals(this.uri) && localName.equals(this.localName));
-    }
-
-    protected boolean isMatchingElement(String uri, String localName) {
-        // match if we're inside the parent element or within some bag element
-        return (uri.equals(this.uri) && localName.equals(this.localName)) ||
-                (parentMatchLevel > 0 &&
-                        ((uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_BAG)) ||
-                        (uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_LI))
-                )
-        );
-    }
-
-    @Override
-    public void startElement(
-            String uri, String localName, String name, Attributes attributes) {
-        if (isMatchingElement(uri, localName)) {
-            matchLevel++;
-        }
-        if (isMatchingParentElement(uri, localName)) {
-            parentMatchLevel++;
-        }
-    }
-
-    @Override
-    public void endElement(String uri, String localName, String name) {
-        if (isMatchingParentElement(uri, localName)) {
-            parentMatchLevel--;
-        }
-        if (isMatchingElement(uri, localName)) {
-            matchLevel--;
-            if (matchLevel == 2) {
-                // we're inside a bag li element, add the bagged buffer
-                addMetadata(bufferBagged.toString().trim());
-                bufferBagged.setLength(0);
-                isBagless = false;
-            }
-            if (matchLevel == 0 && isBagless) {
-                String valueBagless = bufferBagless.toString();
-                if (valueBagless.length() > 0 && !valueBagless.contains(LOCAL_NAME_RDF_BAG)) {
-                    // we're in a standard element, add the bagless buffer
-                    addMetadata(valueBagless.trim());
-                    bufferBagless.setLength(0);
-                }
-                isBagless = true;
-            }
-        }
-    }
-
-    @Override
-    public void characters(char[] ch, int start, int length) {
-        // We need to append to both buffers since we don't if we're inside a bag until we're done
-        if (parentMatchLevel > 0 && matchLevel > 2) {
-            bufferBagged.append(ch, start, length);
-        }
-        if (parentMatchLevel > 0 && matchLevel > 0) {
-            bufferBagless.append(ch, start, length);
-        }
-    }
-
-    @Override
-    public void ignorableWhitespace(char[] ch, int start, int length) {
-        characters(ch, start, length);
-    }
-
-    @Override
-    protected void addMetadata(String value) {
-        if (logger.isTraceEnabled()) {
-            logger.trace("adding " + name + "=" + value);
-        }
-        if (targetProperty != null && targetProperty.isMultiValuePermitted()) {
-            if ((value != null && value.length() > 0) || allowEmptyValues) {
-                if (value == null || value.length() == 0 && allowEmptyValues) {
-                    value = "";
-                }
-                String[] previous = metadata.getValues(name);
-                if (previous == null || !Arrays.asList(previous).contains(value) || allowDuplicateValues) {
-                    metadata.add(targetProperty, value);
-                }
-            }
-        } else {
-            super.addMetadata(value);
-        }
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import java.util.Arrays;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.xml.sax.Attributes;
+
+/**
+ * SAX event handler that maps the contents of an XML element into
+ * a metadata field.
+ *
+ * @since Apache Tika 0.10
+ */
+public class ElementMetadataHandler extends AbstractMetadataHandler {
+    /**
+     * Logger for this class
+     */
+    private static final Log logger = LogFactory
+            .getLog(ElementMetadataHandler.class);
+
+    private static final String LOCAL_NAME_RDF_BAG = "Bag";
+    private static final String LOCAL_NAME_RDF_LI = "li";
+    private static final String URI_RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
+
+    private final String uri;
+
+    private final String localName;
+
+    private final Metadata metadata;
+
+    private final String name;
+    private Property targetProperty;
+
+    private final boolean allowDuplicateValues;
+    private final boolean allowEmptyValues;
+
+    /**
+     * The buffer used to capture characters when inside a bag li element.
+     */
+    private final StringBuilder bufferBagged = new StringBuilder();
+
+    /**
+     * The buffer used to capture characters inside standard elements.
+     */
+    private final StringBuilder bufferBagless = new StringBuilder();
+
+    /**
+     * Whether or not the value was found in a standard element structure or inside a bag.
+     */
+    private boolean isBagless = true;
+
+    private int matchLevel = 0;
+    private int parentMatchLevel = 0;
+
+    /**
+     * Constructor for string metadata keys.
+     *
+     * @param uri the uri of the namespace of the element
+     * @param localName the local name of the element
+     * @param metadata the Tika metadata object to populate
+     * @param name the Tika metadata field key
+     */
+    public ElementMetadataHandler(
+            String uri, String localName, Metadata metadata, String name) {
+        super(metadata, name);
+        this.uri = uri;
+        this.localName = localName;
+        this.metadata = metadata;
+        this.name = name;
+        this.allowDuplicateValues = false;
+        this.allowEmptyValues = false;
+        if (logger.isTraceEnabled()) {
+            logger.trace("created simple handler for " + this.name);
+        }
+    }
+
+    /**
+     * Constructor for string metadata keys which allows change of behavior
+     * for duplicate and empty entry values.
+     *
+     * @param uri the uri of the namespace of the element
+     * @param localName the local name of the element
+     * @param metadata the Tika metadata object to populate
+     * @param name the Tika metadata field key
+     * @param allowDuplicateValues add duplicate values to the Tika metadata
+     * @param allowEmptyValues add empty values to the Tika metadata
+     */
+    public ElementMetadataHandler(
+            String uri, String localName, Metadata metadata, String name, boolean allowDuplicateValues, boolean allowEmptyValues) {
+        super(metadata, name);
+        this.uri = uri;
+        this.localName = localName;
+        this.metadata = metadata;
+        this.name = name;
+        this.allowDuplicateValues = allowDuplicateValues;
+        this.allowEmptyValues = allowEmptyValues;
+        if (logger.isTraceEnabled()) {
+                logger.trace("created simple handler for " + this.name);
+        }
+    }
+
+    /**
+     * Constructor for Property metadata keys.
+     *
+     * @param uri the uri of the namespace of the element
+     * @param localName the local name of the element
+     * @param metadata the Tika metadata object to populate
+     * @param targetProperty the Tika metadata Property key
+     */
+    public ElementMetadataHandler(
+            String uri, String localName, Metadata metadata, Property targetProperty) {
+        super(metadata, targetProperty);
+        this.uri = uri;
+        this.localName = localName;
+        this.metadata = metadata;
+        this.targetProperty = targetProperty;
+        this.name = targetProperty.getName();
+        this.allowDuplicateValues = false;
+        this.allowEmptyValues = false;
+        if (logger.isTraceEnabled()) {
+            logger.trace("created property handler for " + this.name);
+        }
+    }
+
+    /**
+     * Constructor for Property metadata keys which allows change of behavior
+     * for duplicate and empty entry values.
+     *
+     * @param uri the uri of the namespace of the element
+     * @param localName the local name of the element
+     * @param metadata the Tika metadata object to populate
+     * @param targetProperty the Tika metadata Property key
+     * @param allowDuplicateValues add duplicate values to the Tika metadata
+     * @param allowEmptyValues add empty values to the Tika metadata
+     */
+    public ElementMetadataHandler(
+            String uri, String localName, Metadata metadata, Property targetProperty, boolean allowDuplicateValues, boolean allowEmptyValues) {
+        super(metadata, targetProperty);
+        this.uri = uri;
+        this.localName = localName;
+        this.metadata = metadata;
+        this.targetProperty = targetProperty;
+        this.name = targetProperty.getName();
+        this.allowDuplicateValues = allowDuplicateValues;
+        this.allowEmptyValues = allowEmptyValues;
+        if (logger.isTraceEnabled()) {
+                logger.trace("created property handler for " + this.name);
+        }
+    }
+
+    protected boolean isMatchingParentElement(String uri, String localName) {
+        return (uri.equals(this.uri) && localName.equals(this.localName));
+    }
+
+    protected boolean isMatchingElement(String uri, String localName) {
+        // match if we're inside the parent element or within some bag element
+        return (uri.equals(this.uri) && localName.equals(this.localName)) ||
+                (parentMatchLevel > 0 &&
+                        ((uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_BAG)) ||
+                        (uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_LI))
+                )
+        );
+    }
+
+    @Override
+    public void startElement(
+            String uri, String localName, String name, Attributes attributes) {
+        if (isMatchingElement(uri, localName)) {
+            matchLevel++;
+        }
+        if (isMatchingParentElement(uri, localName)) {
+            parentMatchLevel++;
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String name) {
+        if (isMatchingParentElement(uri, localName)) {
+            parentMatchLevel--;
+        }
+        if (isMatchingElement(uri, localName)) {
+            matchLevel--;
+            if (matchLevel == 2) {
+                // we're inside a bag li element, add the bagged buffer
+                addMetadata(bufferBagged.toString().trim());
+                bufferBagged.setLength(0);
+                isBagless = false;
+            }
+            if (matchLevel == 0 && isBagless) {
+                String valueBagless = bufferBagless.toString();
+                if (valueBagless.length() > 0 && !valueBagless.contains(LOCAL_NAME_RDF_BAG)) {
+                    // we're in a standard element, add the bagless buffer
+                    addMetadata(valueBagless.trim());
+                    bufferBagless.setLength(0);
+                }
+                isBagless = true;
+            }
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length) {
+        // We need to append to both buffers since we don't if we're inside a bag until we're done
+        if (parentMatchLevel > 0 && matchLevel > 2) {
+            bufferBagged.append(ch, start, length);
+        }
+        if (parentMatchLevel > 0 && matchLevel > 0) {
+            bufferBagless.append(ch, start, length);
+        }
+    }
+
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) {
+        characters(ch, start, length);
+    }
+
+    @Override
+    protected void addMetadata(String value) {
+        if (logger.isTraceEnabled()) {
+            logger.trace("adding " + name + "=" + value);
+        }
+        if (targetProperty != null && targetProperty.isMultiValuePermitted()) {
+            if ((value != null && value.length() > 0) || allowEmptyValues) {
+                if (value == null || value.length() == 0 && allowEmptyValues) {
+                    value = "";
+                }
+                String[] previous = metadata.getValues(name);
+                if (previous == null || !Arrays.asList(previous).contains(value) || allowDuplicateValues) {
+                    metadata.add(targetProperty, value);
+                }
+            }
+        } else {
+            super.addMetadata(value);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java
index 3c58c9e..e79bbfc 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java
@@ -1,117 +1,117 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.xml;
-
-import org.apache.commons.codec.binary.Base64;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaMetadataKeys;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.ParseContext;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.util.Collections;
-import java.util.Set;
-
-public class FictionBookParser extends XMLParser {
-    private static final long serialVersionUID = 4195954546491524374L;
-    
-    @Override
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return Collections.singleton(MediaType.application("x-fictionbook+xml"));
-    }
-
-    @Override
-    protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, ParseContext context) {
-        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
-
-        if (ex == null) {
-            ex = new ParsingEmbeddedDocumentExtractor(context);
-        }
-
-        return new BinaryElementsDataHandler(ex, handler);
-    }
-
-    private static class BinaryElementsDataHandler extends DefaultHandler {
-        private static final String ELEMENT_BINARY = "binary";
-
-        private boolean binaryMode = false;
-        private static final String ATTRIBUTE_ID = "id";
-
-        private final EmbeddedDocumentExtractor partExtractor;
-        private final ContentHandler handler;
-        private final StringBuilder binaryData = new StringBuilder();
-        private Metadata metadata;
-        private static final String ATTRIBUTE_CONTENT_TYPE = "content-type";
-
-        private BinaryElementsDataHandler(EmbeddedDocumentExtractor partExtractor, ContentHandler handler) {
-            this.partExtractor = partExtractor;
-            this.handler = handler;
-        }
-
-        @Override
-        public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
-            binaryMode = ELEMENT_BINARY.equals(localName);
-            if (binaryMode) {
-                binaryData.setLength(0);
-                metadata = new Metadata();
-
-                metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, attributes.getValue(ATTRIBUTE_ID));
-                metadata.set(Metadata.CONTENT_TYPE, attributes.getValue(ATTRIBUTE_CONTENT_TYPE));
-            }
-        }
-
-        @Override
-        public void endElement(String uri, String localName, String qName) throws SAXException {
-            if (binaryMode) {
-                try {
-                    partExtractor.parseEmbedded(
-                            new ByteArrayInputStream(Base64.decodeBase64(binaryData.toString())),
-                            handler,
-                            metadata,
-                            true
-                    );
-                } catch (IOException e) {
-                    throw new SAXException("IOException in parseEmbedded", e);
-                }
-
-                binaryMode = false;
-                binaryData.setLength(0);
-            }
-        }
-
-        @Override
-        public void characters(char[] ch, int start, int length) throws SAXException {
-            if (!binaryMode) {
-                handler.characters(ch, start, length);
-            } else {
-                binaryData.append(ch, start, length);
-            }
-        }
-
-        @Override
-        public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
-            handler.ignorableWhitespace(ch, start, length);
-        }
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.commons.codec.binary.Base64;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Set;
+
+public class FictionBookParser extends XMLParser {
+    private static final long serialVersionUID = 4195954546491524374L;
+    
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return Collections.singleton(MediaType.application("x-fictionbook+xml"));
+    }
+
+    @Override
+    protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, ParseContext context) {
+        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
+
+        if (ex == null) {
+            ex = new ParsingEmbeddedDocumentExtractor(context);
+        }
+
+        return new BinaryElementsDataHandler(ex, handler);
+    }
+
+    private static class BinaryElementsDataHandler extends DefaultHandler {
+        private static final String ELEMENT_BINARY = "binary";
+
+        private boolean binaryMode = false;
+        private static final String ATTRIBUTE_ID = "id";
+
+        private final EmbeddedDocumentExtractor partExtractor;
+        private final ContentHandler handler;
+        private final StringBuilder binaryData = new StringBuilder();
+        private Metadata metadata;
+        private static final String ATTRIBUTE_CONTENT_TYPE = "content-type";
+
+        private BinaryElementsDataHandler(EmbeddedDocumentExtractor partExtractor, ContentHandler handler) {
+            this.partExtractor = partExtractor;
+            this.handler = handler;
+        }
+
+        @Override
+        public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
+            binaryMode = ELEMENT_BINARY.equals(localName);
+            if (binaryMode) {
+                binaryData.setLength(0);
+                metadata = new Metadata();
+
+                metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, attributes.getValue(ATTRIBUTE_ID));
+                metadata.set(Metadata.CONTENT_TYPE, attributes.getValue(ATTRIBUTE_CONTENT_TYPE));
+            }
+        }
+
+        @Override
+        public void endElement(String uri, String localName, String qName) throws SAXException {
+            if (binaryMode) {
+                try {
+                    partExtractor.parseEmbedded(
+                            new ByteArrayInputStream(Base64.decodeBase64(binaryData.toString())),
+                            handler,
+                            metadata,
+                            true
+                    );
+                } catch (IOException e) {
+                    throw new SAXException("IOException in parseEmbedded", e);
+                }
+
+                binaryMode = false;
+                binaryData.setLength(0);
+            }
+        }
+
+        @Override
+        public void characters(char[] ch, int start, int length) throws SAXException {
+            if (!binaryMode) {
+                handler.characters(ch, start, length);
+            } else {
+                binaryData.append(ch, start, length);
+            }
+        }
+
+        @Override
+        public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+            handler.ignorableWhitespace(ch, start, length);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
index edda097..3fee00a 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
@@ -1,85 +1,85 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.xml;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.xml.sax.Attributes;
-import org.xml.sax.helpers.DefaultHandler;
-
-/**
- * This adds Metadata entries with a specified name for
- *  the textual content of a node (if present), and 
- *  all attribute values passed through the matcher
- *  (but not their names). 
- *
- * @deprecated Use the {@link AttributeMetadataHandler} and
- *             {@link ElementMetadataHandler} classes instead
- */
-public class MetadataHandler extends DefaultHandler {
-
-    private final Metadata metadata;
-
-    private final Property property;
-    private final String name;
-
-    private final StringBuilder buffer = new StringBuilder();
-
-    public MetadataHandler(Metadata metadata, String name) {
-        this.metadata = metadata;
-        this.property = null;
-        this.name = name;
-    }
-    public MetadataHandler(Metadata metadata, Property property) {
-       this.metadata = metadata;
-       this.property = property;
-       this.name = property.getName();
-   }
-
-    public void addMetadata(String value) {
-        if (value.length() > 0) {
-            String previous = metadata.get(name);
-            if (previous != null && previous.length() > 0) {
-                value = previous + ", " + value;
-            }
-            
-            if (this.property != null) {
-               metadata.set(property, value);
-            } else {
-               metadata.set(name, value);
-            }
-        }
-    }
-
-    public void endElement(String uri, String localName, String name) {
-        addMetadata(buffer.toString());
-        buffer.setLength(0);
-    }
-
-    public void startElement(
-            String uri, String localName, String name, Attributes attributes) {
-        for (int i = 0; i < attributes.getLength(); i++) {
-            addMetadata(attributes.getValue(i));
-        }
-    }
-
-    
-    public void characters(char[] ch, int start, int length) {
-        buffer.append(ch, start, length);
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.xml.sax.Attributes;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * This adds Metadata entries with a specified name for
+ *  the textual content of a node (if present), and 
+ *  all attribute values passed through the matcher
+ *  (but not their names). 
+ *
+ * @deprecated Use the {@link AttributeMetadataHandler} and
+ *             {@link ElementMetadataHandler} classes instead
+ */
+public class MetadataHandler extends DefaultHandler {
+
+    private final Metadata metadata;
+
+    private final Property property;
+    private final String name;
+
+    private final StringBuilder buffer = new StringBuilder();
+
+    public MetadataHandler(Metadata metadata, String name) {
+        this.metadata = metadata;
+        this.property = null;
+        this.name = name;
+    }
+    public MetadataHandler(Metadata metadata, Property property) {
+       this.metadata = metadata;
+       this.property = property;
+       this.name = property.getName();
+   }
+
+    public void addMetadata(String value) {
+        if (value.length() > 0) {
+            String previous = metadata.get(name);
+            if (previous != null && previous.length() > 0) {
+                value = previous + ", " + value;
+            }
+            
+            if (this.property != null) {
+               metadata.set(property, value);
+            } else {
+               metadata.set(name, value);
+            }
+        }
+    }
+
+    public void endElement(String uri, String localName, String name) {
+        addMetadata(buffer.toString());
+        buffer.setLength(0);
+    }
+
+    public void startElement(
+            String uri, String localName, String name, Attributes attributes) {
+        for (int i = 0; i < attributes.getLength(); i++) {
+            addMetadata(attributes.getValue(i));
+        }
+    }
+
+    
+    public void characters(char[] ch, int start, int length) {
+        buffer.append(ch, start, length);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/XMLParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/XMLParser.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/XMLParser.java
index 6e3d374..b17058d 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/XMLParser.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/XMLParser.java
@@ -1,89 +1,89 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.xml;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Set;
-
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.EmbeddedContentHandler;
-import org.apache.tika.sax.OfflineContentHandler;
-import org.apache.tika.sax.TaggedContentHandler;
-import org.apache.tika.sax.TextContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * XML parser.
- */
-public class XMLParser extends AbstractParser {
-
-    /** Serial version UID */
-    private static final long serialVersionUID = -6028836725280212837L;
-
-    private static final Set<MediaType> SUPPORTED_TYPES =
-        Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
-                MediaType.application("xml"),
-                MediaType.image("svg+xml"))));
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-    public void parse(
-            InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-        if (metadata.get(Metadata.CONTENT_TYPE) == null) {
-            metadata.set(Metadata.CONTENT_TYPE, "application/xml");
-        }
-
-        final XHTMLContentHandler xhtml =
-            new XHTMLContentHandler(handler, metadata);
-        xhtml.startDocument();
-        xhtml.startElement("p");
-
-        TaggedContentHandler tagged = new TaggedContentHandler(handler);
-        try {
-            context.getSAXParser().parse(
-                    new CloseShieldInputStream(stream),
-                    new OfflineContentHandler(new EmbeddedContentHandler(
-                            getContentHandler(tagged, metadata, context))));
-        } catch (SAXException e) {
-            tagged.throwIfCauseOf(e);
-            throw new TikaException("XML parse error", e);
-        } finally {
-            xhtml.endElement("p");
-            xhtml.endDocument();
-        }
-    }
-
-    protected ContentHandler getContentHandler(
-            ContentHandler handler, Metadata metadata, ParseContext context) {
-        return new TextContentHandler(handler, true);
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.TaggedContentHandler;
+import org.apache.tika.sax.TextContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * XML parser.
+ */
+public class XMLParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = -6028836725280212837L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+        Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                MediaType.application("xml"),
+                MediaType.image("svg+xml"))));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        if (metadata.get(Metadata.CONTENT_TYPE) == null) {
+            metadata.set(Metadata.CONTENT_TYPE, "application/xml");
+        }
+
+        final XHTMLContentHandler xhtml =
+            new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.startElement("p");
+
+        TaggedContentHandler tagged = new TaggedContentHandler(handler);
+        try {
+            context.getSAXParser().parse(
+                    new CloseShieldInputStream(stream),
+                    new OfflineContentHandler(new EmbeddedContentHandler(
+                            getContentHandler(tagged, metadata, context))));
+        } catch (SAXException e) {
+            tagged.throwIfCauseOf(e);
+            throw new TikaException("XML parse error", e);
+        } finally {
+            xhtml.endElement("p");
+            xhtml.endDocument();
+        }
+    }
+
+    protected ContentHandler getContentHandler(
+            ContentHandler handler, Metadata metadata, ParseContext context) {
+        return new TextContentHandler(handler, true);
+    }
+}

[07/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java
index 2ccab7b..87f831b 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java
@@ -1,1353 +1,1353 @@
-/*
- ****************************************************************************
- * Copyright (C) 2005-2009, International Business Machines Corporation and *
- * others. All Rights Reserved.                                             *
- ************************************************************************** *
- *
- */
-package org.apache.tika.parser.txt;
-
-import java.nio.ByteBuffer;
-
-/**
- * This class recognizes single-byte encodings. Because the encoding scheme is so
- * simple, language statistics are used to do the matching.
- * <p/>
- * The Recognizer works by first mapping from bytes in the encoding under test
- * into that Recognizer's ngram space. Normally this means performing a
- * lowercase, and excluding codepoints that don't correspond to numbers of
- * letters. (Accented letters may or may not be ignored or normalised, depending
- * on the needs of the ngrams)
- * Then, ngram analysis is run against the transformed text, and a confidence
- * is calculated.
- * <p/>
- * For many of our Recognizers, we have one ngram set per language in each
- * encoding, and do a simultanious language+charset detection.
- * <p/>
- * When adding new Recognizers, the easiest way is to byte map to an existing
- * encoding for which we have ngrams, excluding non text, and re-use the ngrams.
- *
- * @internal
- */
-abstract class CharsetRecog_sbcs extends CharsetRecognizer {
-
-    protected boolean haveC1Bytes = false;
-
-    /* (non-Javadoc)
-     * @see com.ibm.icu.text.CharsetRecognizer#getName()
-     */
-    abstract String getName();
-
-    /* (non-Javadoc)
-     * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
-     */
-    abstract int match(CharsetDetector det);
-
-    int match(CharsetDetector det, int[] ngrams, byte[] byteMap) {
-        return match(det, ngrams, byteMap, (byte) 0x20);
-    }
-
-    int match(CharsetDetector det, int[] ngrams, byte[] byteMap, byte spaceChar) {
-        NGramParser parser = new NGramParser(ngrams, byteMap);
-
-        haveC1Bytes = det.fC1Bytes;
-
-        return parser.parse(det, spaceChar);
-    }
-
-    static class NGramParser {
-        //        private static final int N_GRAM_SIZE = 3;
-        private static final int N_GRAM_MASK = 0xFFFFFF;
-
-        private int byteIndex = 0;
-        private int ngram = 0;
-
-        private int[] ngramList;
-        private byte[] byteMap;
-
-        private int ngramCount;
-        private int hitCount;
-
-        private byte spaceChar;
-
-        public NGramParser(int[] theNgramList, byte[] theByteMap) {
-            ngramList = theNgramList;
-            byteMap = theByteMap;
-
-            ngram = 0;
-
-            ngramCount = hitCount = 0;
-        }
-
-        /*
-         * Binary search for value in table, which must have exactly 64 entries.
-         */
-        private static int search(int[] table, int value) {
-            int index = 0;
-
-            if (table[index + 32] <= value) {
-                index += 32;
-            }
-
-            if (table[index + 16] <= value) {
-                index += 16;
-            }
-
-            if (table[index + 8] <= value) {
-                index += 8;
-            }
-
-            if (table[index + 4] <= value) {
-                index += 4;
-            }
-
-            if (table[index + 2] <= value) {
-                index += 2;
-            }
-
-            if (table[index + 1] <= value) {
-                index += 1;
-            }
-
-            if (table[index] > value) {
-                index -= 1;
-            }
-
-            if (index < 0 || table[index] != value) {
-                return -1;
-            }
-
-            return index;
-        }
-
-        private void lookup(int thisNgram) {
-            ngramCount += 1;
-
-            if (search(ngramList, thisNgram) >= 0) {
-                hitCount += 1;
-            }
-
-        }
-
-        private void addByte(int b) {
-            ngram = ((ngram << 8) + (b & 0xFF)) & N_GRAM_MASK;
-            lookup(ngram);
-        }
-
-        private int nextByte(CharsetDetector det) {
-            if (byteIndex >= det.fInputLen) {
-                return -1;
-            }
-
-            return det.fInputBytes[byteIndex++] & 0xFF;
-        }
-
-        public int parse(CharsetDetector det) {
-            return parse(det, (byte) 0x20);
-        }
-
-        public int parse(CharsetDetector det, byte spaceCh) {
-            int b;
-            boolean ignoreSpace = false;
-            this.spaceChar = spaceCh;
-
-            while ((b = nextByte(det)) >= 0) {
-                byte mb = byteMap[b];
-
-                // TODO: 0x20 might not be a space in all character sets...
-                if (mb != 0) {
-                    if (!(mb == spaceChar && ignoreSpace)) {
-                        addByte(mb);
-                    }
-
-                    ignoreSpace = (mb == spaceChar);
-                } else if (mb == 0 && b != 0) {
-                    // Indicates an invalid character in the charset
-                    // Bump the ngram count up a bit to indicate uncertainty
-                    ngramCount += 4;
-                }
-            }
-
-            // TODO: Is this OK? The buffer could have ended in the middle of a word...
-            addByte(spaceChar);
-
-            double rawPercent = (double) hitCount / (double) ngramCount;
-
-//                if (rawPercent <= 2.0) {
-//                    return 0;
-//                }
-
-            // TODO - This is a bit of a hack to take care of a case
-            // were we were getting a confidence of 135...
-            if (rawPercent > 0.33) {
-                return 98;
-            }
-
-            return (int) (rawPercent * 300.0);
-        }
-    }
-
-    abstract static class CharsetRecog_8859_1 extends CharsetRecog_sbcs {
-        protected static byte[] byteMap = {
-/* 0x00-0x07 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0x08-0x0f */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0x10-0x17 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0x18-0x1f */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0x20-0x27 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00, 
-/* 0x28-0x2f */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0x30-0x37 */ (byte) 0x30, (byte) 0x31, (byte) 0x32, (byte) 0x33, (byte) 0x34, (byte) 0x35, (byte) 0x36, (byte) 0x37, 
-/* 0x38-0x3f */ (byte) 0x38, (byte) 0x39, (byte) 0x40, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0x40-0x47 */ (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, 
-/* 0x48-0x4f */ (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F, 
-/* 0x50-0x57 */ (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, 
-/* 0x58-0x0f */ (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0x60-0x67 */ (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, 
-/* 0x68-0x6f */ (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F, 
-/* 0x70-0x77 */ (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, 
-/* 0x78-0x7f */ (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0x80-0x87 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0x88-0x8f */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0x90-0x97 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0x98-0x9f */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0xa0-0xa7 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0xa8-0xaf */ (byte) 0x20, (byte) 0x20, (byte) 0xAA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0xb0-0xb7 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xB5, (byte) 0x20, (byte) 0x20, 
-/* 0xb8-0xbf */ (byte) 0x20, (byte) 0x20, (byte) 0xBA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0xc0-0xc7 */ (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, 
-/* 0xc8-0xcf */ (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF, 
-/* 0xd0-0xd7 */ (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20, 
-/* 0xd8-0xdf */ (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xDF, 
-/* 0xe0-0xe7 */ (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, 
-/* 0xe8-0xef */ (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF, 
-/* 0xf0-0xf7 */ (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20, 
-/* 0xf8-0xff */ (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
-        };
-
-        public String getName() {
-            return haveC1Bytes ? "windows-1252" : "ISO-8859-1";
-        }
-    }
-
-    static class CharsetRecog_8859_1_da extends CharsetRecog_8859_1 {
-        private static int[] ngrams = {
-                0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620,
-                0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320,
-                0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520,
-                0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572,
-        };
-
-        public String getLanguage() {
-            return "da";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
-
-    static class CharsetRecog_8859_1_de extends CharsetRecog_8859_1 {
-        private static int[] ngrams = {
-                0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F,
-                0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220,
-                0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465,
-                0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572,
-        };
-
-        public String getLanguage() {
-            return "de";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
-
-    static class CharsetRecog_8859_1_en extends CharsetRecog_8859_1 {
-        private static int[] ngrams = {
-                0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F,
-                0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74,
-                0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420,
-                0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320,
-        };
-
-        public String getLanguage() {
-            return "en";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
-
-    static class CharsetRecog_8859_1_es extends CharsetRecog_8859_1 {
-        private static int[] ngrams = {
-                0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
-                0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C,
-                0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064,
-                0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20,
-        };
-
-        public String getLanguage() {
-            return "es";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
-
-    static class CharsetRecog_8859_1_fr extends CharsetRecog_8859_1 {
-        private static int[] ngrams = {
-                0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61, 0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E,
-                0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20,
-                0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420, 0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420,
-                0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064, 0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220,
-        };
-
-        public String getLanguage() {
-            return "fr";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
-
-    static class CharsetRecog_8859_1_it extends CharsetRecog_8859_1 {
-        private static int[] ngrams = {
-                0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E, 0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073,
-                0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064, 0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220,
-                0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20, 0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20,
-                0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572, 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F,
-        };
-
-        public String getLanguage() {
-            return "it";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
-
-    static class CharsetRecog_8859_1_nl extends CharsetRecog_8859_1 {
-        private static int[] ngrams = {
-                0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665,
-                0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E,
-                0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F,
-                0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F,
-        };
-
-        public String getLanguage() {
-            return "nl";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
-
-    static class CharsetRecog_8859_1_no extends CharsetRecog_8859_1 {
-        private static int[] ngrams = {
-                0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469,
-                0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574, 0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474,
-                0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520, 0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65,
-                0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465, 0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572,
-        };
-
-        public String getLanguage() {
-            return "no";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
-
-    static class CharsetRecog_8859_1_pt extends CharsetRecog_8859_1 {
-        private static int[] ngrams = {
-                0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61, 0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
-                0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20, 0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20,
-                0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065,
-                0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F,
-        };
-
-        public String getLanguage() {
-            return "pt";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
-
-    static class CharsetRecog_8859_1_sv extends CharsetRecog_8859_1 {
-        private static int[] ngrams = {
-                0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E, 0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469,
-                0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220,
-                0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564, 0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20,
-                0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073, 0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220,
-        };
-
-        public String getLanguage() {
-            return "sv";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
-
-    abstract static class CharsetRecog_8859_2 extends CharsetRecog_sbcs {
-        protected static byte[] byteMap = {
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
-                (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-                (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
-                (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
-                (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-                (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
-                (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0xB1, (byte) 0x20, (byte) 0xB3, (byte) 0x20, (byte) 0xB5, (byte) 0xB6, (byte) 0x20,
-                (byte) 0x20, (byte) 0xB9, (byte) 0xBA, (byte) 0xBB, (byte) 0xBC, (byte) 0x20, (byte) 0xBE, (byte) 0xBF,
-                (byte) 0x20, (byte) 0xB1, (byte) 0x20, (byte) 0xB3, (byte) 0x20, (byte) 0xB5, (byte) 0xB6, (byte) 0xB7,
-                (byte) 0x20, (byte) 0xB9, (byte) 0xBA, (byte) 0xBB, (byte) 0xBC, (byte) 0x20, (byte) 0xBE, (byte) 0xBF,
-                (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
-                (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
-                (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20,
-                (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xDF,
-                (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
-                (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
-                (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20,
-                (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0x20,
-        };
-
-        public String getName() {
-            return haveC1Bytes ? "windows-1250" : "ISO-8859-2";
-        }
-    }
-
-    static class CharsetRecog_8859_2_cs extends CharsetRecog_8859_2 {
-        private static int[] ngrams = {
-                0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64, 0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F,
-                0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073, 0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465,
-                0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E, 0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865,
-                0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20, 0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564,
-        };
-
-        public String getLanguage() {
-            return "cs";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
-
-    static class CharsetRecog_8859_2_hu extends CharsetRecog_8859_2 {
-        private static int[] ngrams = {
-                0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F, 0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69,
-                0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073, 0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20,
-                0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061,
-                0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74, 0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320,
-        };
-
-        public String getLanguage() {
-            return "hu";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
-
-    static class CharsetRecog_8859_2_pl extends CharsetRecog_8859_2 {
-        private static int[] ngrams = {
-                0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61, 0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779,
-                0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79, 0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20,
-                0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920, 0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769,
-                0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69, 0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720,
-        };
-
-        public String getLanguage() {
-            return "pl";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
-
-    static class CharsetRecog_8859_2_ro extends CharsetRecog_8859_2 {
-        private static int[] ngrams = {
-                0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469, 0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69,
-                0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172, 0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070,
-                0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72,
-                0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520, 0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20,
-        };
-
-        public String getLanguage() {
-            return "ro";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
-
-    abstract static class CharsetRecog_8859_5 extends CharsetRecog_sbcs {
-        protected static byte[] byteMap = {
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
-                (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-                (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
-                (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
-                (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-                (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
-                (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
-                (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0x20, (byte) 0xFE, (byte) 0xFF,
-                (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7,
-                (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF,
-                (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
-                (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
-                (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7,
-                (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF,
-                (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
-                (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
-                (byte) 0x20, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
-                (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0x20, (byte) 0xFE, (byte) 0xFF,
-        };
-
-        public String getName() {
-            return "ISO-8859-5";
-        }
-    }
-
-    static class CharsetRecog_8859_5_ru extends CharsetRecog_8859_5 {
-        private static int[] ngrams = {
-                0x20D220, 0x20D2DE, 0x20D4DE, 0x20D7D0, 0x20D820, 0x20DAD0, 0x20DADE, 0x20DDD0, 0x20DDD5, 0x20DED1, 0x20DFDE, 0x20DFE0, 0x20E0D0, 0x20E1DE, 0x20E1E2, 0x20E2DE,
-                0x20E7E2, 0x20EDE2, 0xD0DDD8, 0xD0E2EC, 0xD3DE20, 0xD5DBEC, 0xD5DDD8, 0xD5E1E2, 0xD5E220, 0xD820DF, 0xD8D520, 0xD8D820, 0xD8EF20, 0xDBD5DD, 0xDBD820, 0xDBECDD,
-                0xDDD020, 0xDDD520, 0xDDD8D5, 0xDDD8EF, 0xDDDE20, 0xDDDED2, 0xDE20D2, 0xDE20DF, 0xDE20E1, 0xDED220, 0xDED2D0, 0xDED3DE, 0xDED920, 0xDEDBEC, 0xDEDC20, 0xDEE1E2,
-                0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2, 0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520,
-        };
-
-        public String getLanguage() {
-            return "ru";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
-
-    abstract static class CharsetRecog_8859_6 extends CharsetRecog_sbcs {
-        protected static byte[] byteMap = {
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
-                (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-                (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
-                (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
-                (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-                (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
-                (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0xC1, (byte) 0xC2, (byte) 0xC3, (byte) 0xC4, (byte) 0xC5, (byte) 0xC6, (byte) 0xC7,
-                (byte) 0xC8, (byte) 0xC9, (byte) 0xCA, (byte) 0xCB, (byte) 0xCC, (byte) 0xCD, (byte) 0xCE, (byte) 0xCF,
-                (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7,
-                (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
-                (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-        };
-
-        public String getName() {
-            return "ISO-8859-6";
-        }
-    }
-
-    static class CharsetRecog_8859_6_ar extends CharsetRecog_8859_6 {
-        private static int[] ngrams = {
-                0x20C7E4, 0x20C7E6, 0x20C8C7, 0x20D9E4, 0x20E1EA, 0x20E4E4, 0x20E5E6, 0x20E8C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E420, 0xC7E4C3, 0xC7E4C7, 0xC7E4C8,
-                0xC7E4CA, 0xC7E4CC, 0xC7E4CD, 0xC7E4CF, 0xC7E4D3, 0xC7E4D9, 0xC7E4E2, 0xC7E4E5, 0xC7E4E8, 0xC7E4EA, 0xC7E520, 0xC7E620, 0xC7E6CA, 0xC820C7, 0xC920C7, 0xC920E1,
-                0xC920E4, 0xC920E5, 0xC920E8, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xD920C7, 0xD9E4E9, 0xE1EA20, 0xE420C7, 0xE4C920, 0xE4E920, 0xE4EA20,
-                0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4, 0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620,
-        };
-
-        public String getLanguage() {
-            return "ar";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
-
-    abstract static class CharsetRecog_8859_7 extends CharsetRecog_sbcs {
-        protected static byte[] byteMap = {
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
-                (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-                (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
-                (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
-                (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-                (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
-                (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0xA1, (byte) 0xA2, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xDC, (byte) 0x20,
-                (byte) 0xDD, (byte) 0xDE, (byte) 0xDF, (byte) 0x20, (byte) 0xFC, (byte) 0x20, (byte) 0xFD, (byte) 0xFE,
-                (byte) 0xC0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
-                (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
-                (byte) 0xF0, (byte) 0xF1, (byte) 0x20, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
-                (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF,
-                (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
-                (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
-                (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
-                (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0x20,
-        };
-
-        public String getName() {
-            return haveC1Bytes ? "windows-1253" : "ISO-8859-7";
-        }
-    }
-
-    static class CharsetRecog_8859_7_el extends CharsetRecog_8859_7 {
-        private static int[] ngrams = {
-                0x20E1ED, 0x20E1F0, 0x20E3E9, 0x20E4E9, 0x20E5F0, 0x20E720, 0x20EAE1, 0x20ECE5, 0x20EDE1, 0x20EF20, 0x20F0E1, 0x20F0EF, 0x20F0F1, 0x20F3F4, 0x20F3F5, 0x20F4E7,
-                0x20F4EF, 0xDFE120, 0xE120E1, 0xE120F4, 0xE1E920, 0xE1ED20, 0xE1F0FC, 0xE1F220, 0xE3E9E1, 0xE5E920, 0xE5F220, 0xE720F4, 0xE7ED20, 0xE7F220, 0xE920F4, 0xE9E120,
-                0xE9EADE, 0xE9F220, 0xEAE1E9, 0xEAE1F4, 0xECE520, 0xED20E1, 0xED20E5, 0xED20F0, 0xEDE120, 0xEFF220, 0xEFF520, 0xF0EFF5, 0xF0F1EF, 0xF0FC20, 0xF220E1, 0xF220E5,
-                0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9, 0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20,
-        };
-
-        public String getLanguage() {
-            return "el";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
-
-    abstract static class CharsetRecog_8859_8 extends CharsetRecog_sbcs {
-        protected static byte[] byteMap = {
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
-                (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-                (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
-                (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
-                (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-                (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
-                (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xB5, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
-                (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
-                (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
-                (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-        };
-
-        public String getName() {
-            return haveC1Bytes ? "windows-1255" : "ISO-8859-8";
-        }
-    }
-
-    static class CharsetRecog_8859_8_I_he extends CharsetRecog_8859_8 {
-        private static int[] ngrams = {
-                0x20E0E5, 0x20E0E7, 0x20E0E9, 0x20E0FA, 0x20E1E9, 0x20E1EE, 0x20E4E0, 0x20E4E5, 0x20E4E9, 0x20E4EE, 0x20E4F2, 0x20E4F9, 0x20E4FA, 0x20ECE0, 0x20ECE4, 0x20EEE0,
-                0x20F2EC, 0x20F9EC, 0xE0FA20, 0xE420E0, 0xE420E1, 0xE420E4, 0xE420EC, 0xE420EE, 0xE420F9, 0xE4E5E0, 0xE5E020, 0xE5ED20, 0xE5EF20, 0xE5F820, 0xE5FA20, 0xE920E4,
-                0xE9E420, 0xE9E5FA, 0xE9E9ED, 0xE9ED20, 0xE9EF20, 0xE9F820, 0xE9FA20, 0xEC20E0, 0xEC20E4, 0xECE020, 0xECE420, 0xED20E0, 0xED20E1, 0xED20E4, 0xED20EC, 0xED20EE,
-                0xED20F9, 0xEEE420, 0xEF20E4, 0xF0E420, 0xF0E920, 0xF0E9ED, 0xF2EC20, 0xF820E4, 0xF8E9ED, 0xF9EC20, 0xFA20E0, 0xFA20E1, 0xFA20E4, 0xFA20EC, 0xFA20EE, 0xFA20F9,
-        };
-
-        public String getName() {
-            return haveC1Bytes ? "windows-1255" : /*"ISO-8859-8-I"*/ "ISO-8859-8";
-        }
-
-        public String getLanguage() {
-            return "he";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
-
-    static class CharsetRecog_8859_8_he extends CharsetRecog_8859_8 {
-        private static int[] ngrams = {
-                0x20E0E5, 0x20E0EC, 0x20E4E9, 0x20E4EC, 0x20E4EE, 0x20E4F0, 0x20E9F0, 0x20ECF2, 0x20ECF9, 0x20EDE5, 0x20EDE9, 0x20EFE5, 0x20EFE9, 0x20F8E5, 0x20F8E9, 0x20FAE0,
-                0x20FAE5, 0x20FAE9, 0xE020E4, 0xE020EC, 0xE020ED, 0xE020FA, 0xE0E420, 0xE0E5E4, 0xE0EC20, 0xE0EE20, 0xE120E4, 0xE120ED, 0xE120FA, 0xE420E4, 0xE420E9, 0xE420EC,
-                0xE420ED, 0xE420EF, 0xE420F8, 0xE420FA, 0xE4EC20, 0xE5E020, 0xE5E420, 0xE7E020, 0xE9E020, 0xE9E120, 0xE9E420, 0xEC20E4, 0xEC20ED, 0xEC20FA, 0xECF220, 0xECF920,
-                0xEDE9E9, 0xEDE9F0, 0xEDE9F8, 0xEE20E4, 0xEE20ED, 0xEE20FA, 0xEEE120, 0xEEE420, 0xF2E420, 0xF920E4, 0xF920ED, 0xF920FA, 0xF9E420, 0xFAE020, 0xFAE420, 0xFAE5E9,
-        };
-
-        public String getLanguage() {
-            return "he";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
-
-    abstract static class CharsetRecog_8859_9 extends CharsetRecog_sbcs {
-        protected static byte[] byteMap = {
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
-                (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-                (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
-                (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
-                (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-                (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
-                (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0xAA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xB5, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0xBA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
-                (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
-                (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20,
-                (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0x69, (byte) 0xFE, (byte) 0xDF,
-                (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
-                (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
-                (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20,
-                (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
-        };
-
-        public String getName() {
-            return haveC1Bytes ? "windows-1254" : "ISO-8859-9";
-        }
-    }
-
-    static class CharsetRecog_8859_9_tr extends CharsetRecog_8859_9 {
-        private static int[] ngrams = {
-                0x206261, 0x206269, 0x206275, 0x206461, 0x206465, 0x206765, 0x206861, 0x20696C, 0x206B61, 0x206B6F, 0x206D61, 0x206F6C, 0x207361, 0x207461, 0x207665, 0x207961,
-                0x612062, 0x616B20, 0x616C61, 0x616D61, 0x616E20, 0x616EFD, 0x617220, 0x617261, 0x6172FD, 0x6173FD, 0x617961, 0x626972, 0x646120, 0x646520, 0x646920, 0x652062,
-                0x65206B, 0x656469, 0x656E20, 0x657220, 0x657269, 0x657369, 0x696C65, 0x696E20, 0x696E69, 0x697220, 0x6C616E, 0x6C6172, 0x6C6520, 0x6C6572, 0x6E2061, 0x6E2062,
-                0x6E206B, 0x6E6461, 0x6E6465, 0x6E6520, 0x6E6920, 0x6E696E, 0x6EFD20, 0x72696E, 0x72FD6E, 0x766520, 0x796120, 0x796F72, 0xFD6E20, 0xFD6E64, 0xFD6EFD, 0xFDF0FD,
-        };
-
-        public String getLanguage() {
-            return "tr";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
-
-    static class CharsetRecog_windows_1251 extends CharsetRecog_sbcs {
-        private static int[] ngrams = {
-                0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE,
-                0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED,
-                0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2,
-                0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520,
-        };
-
-        private static byte[] byteMap = {
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
-                (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-                (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
-                (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
-                (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-                (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
-                (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x90, (byte) 0x83, (byte) 0x20, (byte) 0x83, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x9A, (byte) 0x20, (byte) 0x9C, (byte) 0x9D, (byte) 0x9E, (byte) 0x9F,
-                (byte) 0x90, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x9A, (byte) 0x20, (byte) 0x9C, (byte) 0x9D, (byte) 0x9E, (byte) 0x9F,
-                (byte) 0x20, (byte) 0xA2, (byte) 0xA2, (byte) 0xBC, (byte) 0x20, (byte) 0xB4, (byte) 0x20, (byte) 0x20,
-                (byte) 0xB8, (byte) 0x20, (byte) 0xBA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xBF,
-                (byte) 0x20, (byte) 0x20, (byte) 0xB3, (byte) 0xB3, (byte) 0xB4, (byte) 0xB5, (byte) 0x20, (byte) 0x20,
-                (byte) 0xB8, (byte) 0x20, (byte) 0xBA, (byte) 0x20, (byte) 0xBC, (byte) 0xBE, (byte) 0xBE, (byte) 0xBF,
-                (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
-                (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
-                (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
-                (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
-                (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
-                (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
-                (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
-                (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
-        };
-
-        public String getName() {
-            return "windows-1251";
-        }
-
-        public String getLanguage() {
-            return "ru";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
-
-    static class CharsetRecog_IBM866_ru extends CharsetRecog_sbcs {
-        private static int[] ngrams = {
-                0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE,
-                0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED,
-                0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2,
-                0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520,
-        };
-
-        // bytemap converts cp866 chars to cp1251 chars, so ngrams are still unchanged
-        private static byte[] byteMap = {
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
-                (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-                (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
-                (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
-                (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-                (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
-                (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
-                (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
-                (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
-                (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
-                (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
-                (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
-                (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
-                (byte) 0xB8, (byte) 0xB8, (byte) 0xBA, (byte) 0xBA, (byte) 0xBF, (byte) 0xBF, (byte) 0xA2, (byte) 0xA2,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-        };
-
-        public String getName() {
-            return "IBM866";
-        }
-
-        public String getLanguage() {
-            return "ru";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
-
-    static class CharsetRecog_windows_1256 extends CharsetRecog_sbcs {
-        private static int[] ngrams = {
-                0x20C7E1, 0x20C7E4, 0x20C8C7, 0x20DAE1, 0x20DDED, 0x20E1E1, 0x20E3E4, 0x20E6C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E120, 0xC7E1C3, 0xC7E1C7, 0xC7E1C8,
-                0xC7E1CA, 0xC7E1CC, 0xC7E1CD, 0xC7E1CF, 0xC7E1D3, 0xC7E1DA, 0xC7E1DE, 0xC7E1E3, 0xC7E1E6, 0xC7E1ED, 0xC7E320, 0xC7E420, 0xC7E4CA, 0xC820C7, 0xC920C7, 0xC920DD,
-                0xC920E1, 0xC920E3, 0xC920E6, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xDA20C7, 0xDAE1EC, 0xDDED20, 0xE120C7, 0xE1C920, 0xE1EC20, 0xE1ED20,
-                0xE320C7, 0xE3C720, 0xE3C920, 0xE3E420, 0xE420C7, 0xE520C7, 0xE5C720, 0xE6C7E1, 0xE6E420, 0xEC20C7, 0xED20C7, 0xED20E3, 0xED20E6, 0xEDC920, 0xEDD120, 0xEDE420,
-        };
-
-        private static byte[] byteMap = {
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
-                (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-                (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
-                (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
-                (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-                (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
-                (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x81, (byte) 0x20, (byte) 0x83, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x88, (byte) 0x20, (byte) 0x8A, (byte) 0x20, (byte) 0x9C, (byte) 0x8D, (byte) 0x8E, (byte) 0x8F,
-                (byte) 0x90, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x98, (byte) 0x20, (byte) 0x9A, (byte) 0x20, (byte) 0x9C, (byte) 0x20, (byte) 0x20, (byte) 0x9F,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0xAA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xB5, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0xC0, (byte) 0xC1, (byte) 0xC2, (byte) 0xC3, (byte) 0xC4, (byte) 0xC5, (byte) 0xC6, (byte) 0xC7,
-                (byte) 0xC8, (byte) 0xC9, (byte) 0xCA, (byte) 0xCB, (byte) 0xCC, (byte) 0xCD, (byte) 0xCE, (byte) 0xCF,
-                (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0x20,
-                (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF,
-                (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
-                (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xF4, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0xF9, (byte) 0x20, (byte) 0xFB, (byte) 0xFC, (byte) 0x20, (byte) 0x20, (byte) 0xFF,
-        };
-
-        public String getName() {
-            return "windows-1256";
-        }
-
-        public String getLanguage() {
-            return "ar";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
-
-    static class CharsetRecog_KOI8_R extends CharsetRecog_sbcs {
-        private static int[] ngrams = {
-                0x20C4CF, 0x20C920, 0x20CBC1, 0x20CBCF, 0x20CEC1, 0x20CEC5, 0x20CFC2, 0x20D0CF, 0x20D0D2, 0x20D2C1, 0x20D3CF, 0x20D3D4, 0x20D4CF, 0x20D720, 0x20D7CF, 0x20DAC1,
-                0x20DCD4, 0x20DED4, 0xC1CEC9, 0xC1D4D8, 0xC5CCD8, 0xC5CEC9, 0xC5D3D4, 0xC5D420, 0xC7CF20, 0xC920D0, 0xC9C520, 0xC9C920, 0xC9D120, 0xCCC5CE, 0xCCC920, 0xCCD8CE,
-                0xCEC120, 0xCEC520, 0xCEC9C5, 0xCEC9D1, 0xCECF20, 0xCECFD7, 0xCF20D0, 0xCF20D3, 0xCF20D7, 0xCFC7CF, 0xCFCA20, 0xCFCCD8, 0xCFCD20, 0xCFD3D4, 0xCFD720, 0xCFD7C1,
-                0xD0CFCC, 0xD0D2C5, 0xD0D2C9, 0xD0D2CF, 0xD2C1D7, 0xD2C5C4, 0xD3D120, 0xD3D4C1, 0xD3D4C9, 0xD3D4D7, 0xD4C5CC, 0xD4CF20, 0xD4CFD2, 0xD4D820, 0xD9C820, 0xDED4CF,
-        };
-
-        private static byte[] byteMap = {
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
-                (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-                (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
-                (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
-                (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-                (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
-                (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xA3, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xA3, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0xC0, (byte) 0xC1, (byte) 0xC2, (byte) 0xC3, (byte) 0xC4, (byte) 0xC5, (byte) 0xC6, (byte) 0xC7,
-                (byte) 0xC8, (byte) 0xC9, (byte) 0xCA, (byte) 0xCB, (byte) 0xCC, (byte) 0xCD, (byte) 0xCE, (byte) 0xCF,
-                (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7,
-                (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF,
-                (byte) 0xC0, (byte) 0xC1, (byte) 0xC2, (byte) 0xC3, (byte) 0xC4, (byte) 0xC5, (byte) 0xC6, (byte) 0xC7,
-                (byte) 0xC8, (byte) 0xC9, (byte) 0xCA, (byte) 0xCB, (byte) 0xCC, (byte) 0xCD, (byte) 0xCE, (byte) 0xCF,
-                (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7,
-                (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF,
-        };
-
-        public String getName() {
-            return "KOI8-R";
-        }
-
-        public String getLanguage() {
-            return "ru";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
-
-    abstract static class CharsetRecog_IBM424_he extends CharsetRecog_sbcs {
-        protected static byte[] byteMap = {
-/*                 -0           -1           -2           -3           -4           -5           -6           -7           -8           -9           -A           -B           -C           -D           -E           -F   */
-/* 0- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
-/* 1- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
-/* 2- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
-/* 3- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
-/* 4- */    (byte) 0x40, (byte) 0x41, (byte) 0x42, (byte) 0x43, (byte) 0x44, (byte) 0x45, (byte) 0x46, (byte) 0x47, (byte) 0x48, (byte) 0x49, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
-/* 5- */    (byte) 0x40, (byte) 0x51, (byte) 0x52, (byte) 0x53, (byte) 0x54, (byte) 0x55, (byte) 0x56, (byte) 0x57, (byte) 0x58, (byte) 0x59, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
-/* 6- */    (byte) 0x40, (byte) 0x40, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, (byte) 0x68, (byte) 0x69, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
-/* 7- */    (byte) 0x40, (byte) 0x71, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x00, (byte) 0x40, (byte) 0x40, 
-/* 8- */    (byte) 0x40, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
-/* 9- */    (byte) 0x40, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
-/* A- */    (byte) 0xA0, (byte) 0x40, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
-/* B- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
-/* C- */    (byte) 0x40, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
-/* D- */    (byte) 0x40, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
-/* E- */    (byte) 0x40, (byte) 0x40, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
-/* F- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-        };
-
-        public String getLanguage() {
-            return "he";
-        }
-    }
-
-    static class CharsetRecog_IBM424_he_rtl extends CharsetRecog_IBM424_he {
-        private static int[] ngrams = {
-                0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641,
-                0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045,
-                0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056,
-                0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069,
-        };
-
-        public String getName() {
-            return "IBM424_rtl";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap, (byte) 0x40);
-        }
-    }
-
-    static class CharsetRecog_IBM424_he_ltr extends CharsetRecog_IBM424_he {
-        private static int[] ngrams = {
-                0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x405462, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x407141,
-                0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x414645, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x454054,
-                0x454055, 0x454057, 0x454068, 0x454071, 0x455440, 0x464140, 0x464540, 0x484140, 0x514140, 0x514240, 0x514540, 0x544045, 0x544055, 0x544071, 0x546240, 0x546940,
-                0x555151, 0x555158, 0x555168, 0x564045, 0x564055, 0x564071, 0x564240, 0x564540, 0x624540, 0x694045, 0x694055, 0x694071, 0x694540, 0x714140, 0x714540, 0x714651
-
-        };
-
-        public String getName() {
-            return "IBM424_ltr";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap, (byte) 0x40);
-        }
-    }
-
-    abstract static class CharsetRecog_IBM420_ar extends CharsetRecog_sbcs {
-        protected static byte[] byteMap = {
-/*                 -0           -1           -2           -3           -4           -5           -6           -7           -8           -9           -A           -B           -C           -D           -E           -F   */
-/* 0- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 1- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 2- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 3- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 4- */    (byte) 0x40, (byte) 0x40, (byte) 0x42, (byte) 0x43, (byte) 0x44, (byte) 0x45, (byte) 0x46, (byte) 0x47, (byte) 0x48, (byte) 0x49, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 5- */    (byte) 0x40, (byte) 0x51, (byte) 0x52, (byte) 0x40, (byte) 0x40, (byte) 0x55, (byte) 0x56, (byte) 0x57, (byte) 0x58, (byte) 0x59, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 6- */    (byte) 0x40, (byte) 0x40, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, (byte) 0x68, (byte) 0x69, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 7- */    (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, (byte) 0x78, (byte) 0x79, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 8- */    (byte) 0x80, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x8A, (byte) 0x8B, (byte) 0x8C, (byte) 0x8D, (byte) 0x8E, (byte) 0x8F,
-/* 9- */    (byte) 0x90, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0x9A, (byte) 0x9B, (byte) 0x9C, (byte) 0x9D, (byte) 0x9E, (byte) 0x9F,
-/* A- */    (byte) 0xA0, (byte) 0x40, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0xAA, (byte) 0xAB, (byte) 0xAC, (byte) 0xAD, (byte) 0xAE, (byte) 0xAF,
-/* B- */    (byte) 0xB0, (byte) 0xB1, (byte) 0xB2, (byte) 0xB3, (byte) 0xB4, (byte) 0xB5, (byte) 0x40, (byte) 0x40, (byte) 0xB8, (byte) 0xB9, (byte) 0xBA, (byte) 0xBB, (byte) 0xBC, (byte) 0xBD, (byte) 0xBE, (byte) 0xBF,
-/* C- */    (byte) 0x40, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x40, (byte) 0xCB, (byte) 0x40, (byte) 0xCD, (byte) 0x40, (byte) 0xCF,
-/* D- */    (byte) 0x40, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF,
-/* E- */    (byte) 0x40, (byte) 0x40, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0xEA, (byte) 0xEB, (byte) 0x40, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
-/* F- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0x40,
-        };
-        protected static byte[] unshapeMap = {
-/*                 -0           -1           -2           -3           -4           -5           -6           -7           -8           -9           -A           -B           -C           -D           -E           -F   */
-/* 0- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 1- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 2- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 3- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 4- */    (byte) 0x40, (byte) 0x40, (byte) 0x42, (byte) 0x42, (byte) 0x44, (byte) 0x45, (byte) 0x46, (byte) 0x47, (byte) 0x47, (byte) 0x49, (byte) 0x4A, (byte) 0x4B, (byte) 0x4C, (byte) 0x4D, (byte) 0x4E, (byte) 0x4F,
-/* 5- */    (byte) 0x50, (byte) 0x49, (byte) 0x52, (byte) 0x53, (byte) 0x54, (byte) 0x55, (byte) 0x56, (byte) 0x56, (byte) 0x58, (byte) 0x58, (byte) 0x5A, (byte) 0x5B, (byte) 0x5C, (byte) 0x5D, (byte) 0x5E, (byte) 0x5F,
-/* 6- */    (byte) 0x60, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x63, (byte) 0x65, (byte) 0x65, (byte) 0x67, (byte) 0x67, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-/* 7- */    (byte) 0x69, (byte) 0x71, (byte) 0x71, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, (byte) 0x77, (byte) 0x79, (byte) 0x7A, (byte) 0x7B, (byte) 0x7C, (byte)

<TRUNCATED>

[05/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
index 6d1c99a..9d9a138 100644
--- a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
+++ b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
@@ -1,274 +1,274 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.txt;
-
-import static java.nio.charset.StandardCharsets.ISO_8859_1;
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
-
-import java.io.ByteArrayInputStream;
-import java.io.StringWriter;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.WriteOutContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.helpers.DefaultHandler;
-
-public class TXTParserTest extends TikaTest {
-
-    private Parser parser = new TXTParser();
-
-    @Test
-    public void testEnglishText() throws Exception {
-        String text =
-                "Hello, World! This is simple UTF-8 text content written"
-                        + " in English to test autodetection of both the character"
-                        + " encoding and the language of the input stream.";
-
-        Metadata metadata = new Metadata();
-        StringWriter writer = new StringWriter();
-        parser.parse(
-                new ByteArrayInputStream(text.getBytes(ISO_8859_1)),
-                new WriteOutContentHandler(writer),
-                metadata,
-                new ParseContext());
-        String content = writer.toString();
-
-        assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
-
-        // TIKA-501: Remove language detection from TXTParser
-        assertNull(metadata.get(Metadata.CONTENT_LANGUAGE));
-        assertNull(metadata.get(TikaCoreProperties.LANGUAGE));
-
-        assertContains("Hello", content);
-        assertContains("World", content);
-        assertContains("autodetection", content);
-        assertContains("stream", content);
-    }
-
-    @Test
-    public void testUTF8Text() throws Exception {
-        String text = "I\u00F1t\u00EBrn\u00E2ti\u00F4n\u00E0liz\u00E6ti\u00F8n";
-
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-        parser.parse(
-                new ByteArrayInputStream(text.getBytes(UTF_8)),
-                handler, metadata, new ParseContext());
-        assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
-
-        assertContains(text, handler.toString());
-    }
-
-    @Test
-    public void testEmptyText() throws Exception {
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-        parser.parse(
-                new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
-        assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("\n", handler.toString());
-    }
-
-    /**
-     * Test for the heuristics that we use to assign an eight-bit character
-     * encoding to mostly ASCII sequences. If a more specific match can not
-     * be made, a string with a CR(LF) in it is most probably windows-1252,
-     * otherwise ISO-8859-1, except if it contains the currency/euro symbol
-     * (byte 0xa4) in which case it's more likely to be ISO-8859-15.
-     */
-    @Test
-    public void testLatinDetectionHeuristics() throws Exception {
-        String windows = "test\r\n";
-        String unix = "test\n";
-        String euro = "test \u20ac\n";
-
-        Metadata metadata;
-
-        metadata = new Metadata();
-        parser.parse(
-                new ByteArrayInputStream(windows.getBytes("ISO-8859-15")),
-                new DefaultHandler(), metadata, new ParseContext());
-        assertEquals(
-                "text/plain; charset=windows-1252",
-                metadata.get(Metadata.CONTENT_TYPE));
-
-        metadata = new Metadata();
-        parser.parse(
-                new ByteArrayInputStream(unix.getBytes("ISO-8859-15")),
-                new DefaultHandler(), metadata, new ParseContext());
-        assertEquals(
-                "text/plain; charset=ISO-8859-1",
-                metadata.get(Metadata.CONTENT_TYPE));
-
-        metadata = new Metadata();
-        parser.parse(
-                new ByteArrayInputStream(euro.getBytes("ISO-8859-15")),
-                new DefaultHandler(), metadata, new ParseContext());
-        assertEquals(
-                "text/plain; charset=ISO-8859-15",
-                metadata.get(Metadata.CONTENT_TYPE));
-    }
-
-    /**
-     * Test case for TIKA-240: Drop the BOM when extracting plain text
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-240">TIKA-240</a>
-     */
-    @Test
-    public void testDropByteOrderMark() throws Exception {
-        assertExtractText("UTF-8 BOM", "test", new byte[]{
-                (byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 't', 'e', 's', 't'});
-        assertExtractText("UTF-16 BE BOM", "test", new byte[]{
-                (byte) 0xFE, (byte) 0xFF, 0, 't', 0, 'e', 0, 's', 0, 't'});
-        assertExtractText("UTF-16 LE BOM", "test", new byte[]{
-                (byte) 0xFF, (byte) 0xFE, 't', 0, 'e', 0, 's', 0, 't', 0});
-    }
-
-    /**
-     * Test case for TIKA-335: using incoming charset
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a>
-     */
-    @Test
-    public void testUseIncomingCharsetAsHint() throws Exception {
-        // Could be ISO 8859-1 or ISO 8859-15 or ...
-        // u00e1 is latin small letter a with acute
-        final String test2 = "the name is \u00e1ndre";
-
-        Metadata metadata = new Metadata();
-        parser.parse(
-                new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
-                new BodyContentHandler(), metadata, new ParseContext());
-        assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
-
-        metadata.set(Metadata.CONTENT_TYPE, "text/plain; charset=ISO-8859-15");
-        parser.parse(
-                new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
-                new BodyContentHandler(), metadata, new ParseContext());
-        assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
-    }
-
-    /**
-     * Test case for TIKA-341: using charset in content-type
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
-     */
-    @Test
-    public void testUsingCharsetInContentTypeHeader() throws Exception {
-        // Could be ISO 8859-1 or ISO 8859-15 or ...
-        // u00e1 is latin small letter a with acute
-        final String test2 = "the name is \u00e1ndre";
-
-        Metadata metadata = new Metadata();
-        parser.parse(
-                new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
-                new BodyContentHandler(), metadata, new ParseContext());
-        assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
-
-        metadata = new Metadata();
-        metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-15");
-        parser.parse(
-                new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
-                new BodyContentHandler(), metadata, new ParseContext());
-        assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
-    }
-
-    private void assertExtractText(String msg, String expected, byte[] input)
-            throws Exception {
-        ContentHandler handler = new BodyContentHandler() {
-            public void ignorableWhitespace(char[] ch, int off, int len) {
-                // Ignore the whitespace added by XHTMLContentHandler
-            }
-        };
-        Metadata metadata = new Metadata();
-        parser.parse(new ByteArrayInputStream(input), handler, metadata, new ParseContext());
-        assertEquals(msg, expected, handler.toString());
-    }
-
-    /**
-     * Test case for TIKA-339: don't override incoming language
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a>
-     */
-    @Test
-    public void testRetainIncomingLanguage() throws Exception {
-        final String test = "Simple Content";
-
-        Metadata metadata = new Metadata();
-        metadata.set(TikaCoreProperties.LANGUAGE, "en");
-
-        parser.parse(
-                new ByteArrayInputStream(test.getBytes(UTF_8)),
-                new BodyContentHandler(), metadata, new ParseContext());
-
-        assertEquals("en", metadata.get(TikaCoreProperties.LANGUAGE));
-    }
-
-    @Test
-    public void testCP866() throws Exception {
-        XMLResult r = getXML("russian.cp866.txt", parser);
-        assertEquals("text/plain; charset=IBM866", r.metadata.get(Metadata.CONTENT_TYPE));
-    }
-
-    @Test
-    public void testEBCDIC_CP500() throws Exception {
-        XMLResult r = getXML("english.cp500.txt", parser);
-        assertEquals("text/plain; charset=IBM500", r.metadata.get(Metadata.CONTENT_TYPE));
-
-        // Additional check that it isn't too eager on short blocks of text
-        r = getXML(
-                new ByteArrayInputStream("<html><body>hello world</body></html>".getBytes(ISO_8859_1)),
-                parser, new Metadata());
-
-        assertEquals("text/plain; charset=ISO-8859-1", r.metadata.get(Metadata.CONTENT_TYPE));
-    }
-
-    /**
-     * Test case for TIKA-771: "Hello, World!" in UTF-8/ASCII gets detected as IBM500
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-771">TIKA-771</a>
-     */
-    @Test
-    public void testCharsetDetectionWithShortSnipet() throws Exception {
-        final String text = "Hello, World!";
-        XMLResult r = getXML(
-                new ByteArrayInputStream(text.getBytes(UTF_8)), parser, new Metadata());
-        assertEquals("text/plain; charset=ISO-8859-1", r.metadata.get(Metadata.CONTENT_TYPE));
-
-        // Now verify that if we tell the parser the encoding is UTF-8, that's what
-        // we get back (see TIKA-868)
-        r.metadata.set(Metadata.CONTENT_TYPE, "application/binary; charset=UTF-8");
-        parser.parse(
-                new ByteArrayInputStream(text.getBytes(UTF_8)),
-                new BodyContentHandler(), r.metadata, new ParseContext());
-        assertEquals("text/plain; charset=UTF-8", r.metadata.get(Metadata.CONTENT_TYPE));
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.txt;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+import java.io.ByteArrayInputStream;
+import java.io.StringWriter;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class TXTParserTest extends TikaTest {
+
+    private Parser parser = new TXTParser();
+
+    @Test
+    public void testEnglishText() throws Exception {
+        String text =
+                "Hello, World! This is simple UTF-8 text content written"
+                        + " in English to test autodetection of both the character"
+                        + " encoding and the language of the input stream.";
+
+        Metadata metadata = new Metadata();
+        StringWriter writer = new StringWriter();
+        parser.parse(
+                new ByteArrayInputStream(text.getBytes(ISO_8859_1)),
+                new WriteOutContentHandler(writer),
+                metadata,
+                new ParseContext());
+        String content = writer.toString();
+
+        assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
+
+        // TIKA-501: Remove language detection from TXTParser
+        assertNull(metadata.get(Metadata.CONTENT_LANGUAGE));
+        assertNull(metadata.get(TikaCoreProperties.LANGUAGE));
+
+        assertContains("Hello", content);
+        assertContains("World", content);
+        assertContains("autodetection", content);
+        assertContains("stream", content);
+    }
+
+    @Test
+    public void testUTF8Text() throws Exception {
+        String text = "I\u00F1t\u00EBrn\u00E2ti\u00F4n\u00E0liz\u00E6ti\u00F8n";
+
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        parser.parse(
+                new ByteArrayInputStream(text.getBytes(UTF_8)),
+                handler, metadata, new ParseContext());
+        assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
+
+        assertContains(text, handler.toString());
+    }
+
+    @Test
+    public void testEmptyText() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        parser.parse(
+                new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
+        assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("\n", handler.toString());
+    }
+
+    /**
+     * Test for the heuristics that we use to assign an eight-bit character
+     * encoding to mostly ASCII sequences. If a more specific match can not
+     * be made, a string with a CR(LF) in it is most probably windows-1252,
+     * otherwise ISO-8859-1, except if it contains the currency/euro symbol
+     * (byte 0xa4) in which case it's more likely to be ISO-8859-15.
+     */
+    @Test
+    public void testLatinDetectionHeuristics() throws Exception {
+        String windows = "test\r\n";
+        String unix = "test\n";
+        String euro = "test \u20ac\n";
+
+        Metadata metadata;
+
+        metadata = new Metadata();
+        parser.parse(
+                new ByteArrayInputStream(windows.getBytes("ISO-8859-15")),
+                new DefaultHandler(), metadata, new ParseContext());
+        assertEquals(
+                "text/plain; charset=windows-1252",
+                metadata.get(Metadata.CONTENT_TYPE));
+
+        metadata = new Metadata();
+        parser.parse(
+                new ByteArrayInputStream(unix.getBytes("ISO-8859-15")),
+                new DefaultHandler(), metadata, new ParseContext());
+        assertEquals(
+                "text/plain; charset=ISO-8859-1",
+                metadata.get(Metadata.CONTENT_TYPE));
+
+        metadata = new Metadata();
+        parser.parse(
+                new ByteArrayInputStream(euro.getBytes("ISO-8859-15")),
+                new DefaultHandler(), metadata, new ParseContext());
+        assertEquals(
+                "text/plain; charset=ISO-8859-15",
+                metadata.get(Metadata.CONTENT_TYPE));
+    }
+
+    /**
+     * Test case for TIKA-240: Drop the BOM when extracting plain text
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-240">TIKA-240</a>
+     */
+    @Test
+    public void testDropByteOrderMark() throws Exception {
+        assertExtractText("UTF-8 BOM", "test", new byte[]{
+                (byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 't', 'e', 's', 't'});
+        assertExtractText("UTF-16 BE BOM", "test", new byte[]{
+                (byte) 0xFE, (byte) 0xFF, 0, 't', 0, 'e', 0, 's', 0, 't'});
+        assertExtractText("UTF-16 LE BOM", "test", new byte[]{
+                (byte) 0xFF, (byte) 0xFE, 't', 0, 'e', 0, 's', 0, 't', 0});
+    }
+
+    /**
+     * Test case for TIKA-335: using incoming charset
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a>
+     */
+    @Test
+    public void testUseIncomingCharsetAsHint() throws Exception {
+        // Could be ISO 8859-1 or ISO 8859-15 or ...
+        // u00e1 is latin small letter a with acute
+        final String test2 = "the name is \u00e1ndre";
+
+        Metadata metadata = new Metadata();
+        parser.parse(
+                new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
+
+        metadata.set(Metadata.CONTENT_TYPE, "text/plain; charset=ISO-8859-15");
+        parser.parse(
+                new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
+    }
+
+    /**
+     * Test case for TIKA-341: using charset in content-type
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
+     */
+    @Test
+    public void testUsingCharsetInContentTypeHeader() throws Exception {
+        // Could be ISO 8859-1 or ISO 8859-15 or ...
+        // u00e1 is latin small letter a with acute
+        final String test2 = "the name is \u00e1ndre";
+
+        Metadata metadata = new Metadata();
+        parser.parse(
+                new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
+
+        metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-15");
+        parser.parse(
+                new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
+    }
+
+    private void assertExtractText(String msg, String expected, byte[] input)
+            throws Exception {
+        ContentHandler handler = new BodyContentHandler() {
+            public void ignorableWhitespace(char[] ch, int off, int len) {
+                // Ignore the whitespace added by XHTMLContentHandler
+            }
+        };
+        Metadata metadata = new Metadata();
+        parser.parse(new ByteArrayInputStream(input), handler, metadata, new ParseContext());
+        assertEquals(msg, expected, handler.toString());
+    }
+
+    /**
+     * Test case for TIKA-339: don't override incoming language
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a>
+     */
+    @Test
+    public void testRetainIncomingLanguage() throws Exception {
+        final String test = "Simple Content";
+
+        Metadata metadata = new Metadata();
+        metadata.set(TikaCoreProperties.LANGUAGE, "en");
+
+        parser.parse(
+                new ByteArrayInputStream(test.getBytes(UTF_8)),
+                new BodyContentHandler(), metadata, new ParseContext());
+
+        assertEquals("en", metadata.get(TikaCoreProperties.LANGUAGE));
+    }
+
+    @Test
+    public void testCP866() throws Exception {
+        XMLResult r = getXML("russian.cp866.txt", parser);
+        assertEquals("text/plain; charset=IBM866", r.metadata.get(Metadata.CONTENT_TYPE));
+    }
+
+    @Test
+    public void testEBCDIC_CP500() throws Exception {
+        XMLResult r = getXML("english.cp500.txt", parser);
+        assertEquals("text/plain; charset=IBM500", r.metadata.get(Metadata.CONTENT_TYPE));
+
+        // Additional check that it isn't too eager on short blocks of text
+        r = getXML(
+                new ByteArrayInputStream("<html><body>hello world</body></html>".getBytes(ISO_8859_1)),
+                parser, new Metadata());
+
+        assertEquals("text/plain; charset=ISO-8859-1", r.metadata.get(Metadata.CONTENT_TYPE));
+    }
+
+    /**
+     * Test case for TIKA-771: "Hello, World!" in UTF-8/ASCII gets detected as IBM500
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-771">TIKA-771</a>
+     */
+    @Test
+    public void testCharsetDetectionWithShortSnipet() throws Exception {
+        final String text = "Hello, World!";
+        XMLResult r = getXML(
+                new ByteArrayInputStream(text.getBytes(UTF_8)), parser, new Metadata());
+        assertEquals("text/plain; charset=ISO-8859-1", r.metadata.get(Metadata.CONTENT_TYPE));
+
+        // Now verify that if we tell the parser the encoding is UTF-8, that's what
+        // we get back (see TIKA-868)
+        r.metadata.set(Metadata.CONTENT_TYPE, "application/binary; charset=UTF-8");
+        parser.parse(
+                new ByteArrayInputStream(text.getBytes(UTF_8)),
+                new BodyContentHandler(), r.metadata, new ParseContext());
+        assertEquals("text/plain; charset=UTF-8", r.metadata.get(Metadata.CONTENT_TYPE));
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
index 665151d..2458963 100644
--- a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
+++ b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
@@ -1,87 +1,87 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.xml;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.junit.Test;
-
-public class DcXMLParserTest extends TikaTest {
-
-    @Test
-    public void testXMLParserAsciiChars() throws Exception {
-        XMLResult result = getXML("testXML.xml", new DcXMLParser());
-        Metadata metadata = result.metadata;
-            assertEquals(
-                    "application/xml",
-                    metadata.get(Metadata.CONTENT_TYPE));
-            assertEquals("Tika test document", metadata.get(TikaCoreProperties.TITLE));
-            assertEquals("Rida Benjelloun", metadata.get(TikaCoreProperties.CREATOR));
-
-            // The file contains 5 dc:subject tags, which come through as
-            //  a multi-valued Tika Metadata entry in file order
-            assertEquals(true, metadata.isMultiValued(TikaCoreProperties.KEYWORDS));
-            assertEquals(5, metadata.getValues(TikaCoreProperties.KEYWORDS).length);
-            assertEquals("Java", metadata.getValues(TikaCoreProperties.KEYWORDS)[0]);
-            assertEquals("XML", metadata.getValues(TikaCoreProperties.KEYWORDS)[1]);
-            assertEquals("XSLT", metadata.getValues(TikaCoreProperties.KEYWORDS)[2]);
-            assertEquals("JDOM", metadata.getValues(TikaCoreProperties.KEYWORDS)[3]);
-            assertEquals("Indexation", metadata.getValues(TikaCoreProperties.KEYWORDS)[4]);
-            assertEquals(true, metadata.isMultiValued(Metadata.SUBJECT));
-            assertEquals(5, metadata.getValues(Metadata.SUBJECT).length);
-            assertEquals("Java", metadata.getValues(Metadata.SUBJECT)[0]);
-            assertEquals("XML", metadata.getValues(Metadata.SUBJECT)[1]);
-            assertEquals("XSLT", metadata.getValues(Metadata.SUBJECT)[2]);
-            assertEquals("JDOM", metadata.getValues(Metadata.SUBJECT)[3]);
-            assertEquals("Indexation", metadata.getValues(Metadata.SUBJECT)[4]);
-
-            assertEquals(
-                    "Framework d\'indexation des documents XML, HTML, PDF etc..",
-                    metadata.get(TikaCoreProperties.DESCRIPTION));
-            assertEquals(
-                    "http://www.apache.org",
-                    metadata.get(TikaCoreProperties.IDENTIFIER));
-            assertEquals("test", metadata.get(TikaCoreProperties.TYPE));
-            assertEquals("application/msword", metadata.get(TikaCoreProperties.FORMAT));
-            assertEquals("Fr", metadata.get(TikaCoreProperties.LANGUAGE));
-            assertTrue(metadata.get(TikaCoreProperties.RIGHTS).contains("testing chars"));
-
-            assertContains("Tika test document", result.xml);
-
-            assertEquals("2000-12-01T00:00:00.000Z", metadata.get(TikaCoreProperties.CREATED));
-
-    }
-    
-    @Test
-    public void testXMLParserNonAsciiChars() throws Exception {
-        XMLResult r = getXML("testXML.xml", new DcXMLParser());
-        final String expected = "Archim\u00E8de et Lius \u00E0 Ch\u00E2teauneuf testing chars en \u00E9t\u00E9";
-        assertEquals(expected, r.metadata.get(TikaCoreProperties.RIGHTS));
-    }
-
-    // TIKA-1048
-    @Test
-    public void testNoSpaces() throws Exception {
-      String text = getXML("testXML2.xml").xml;
-      assertFalse(text.contains("testSubject"));
-    }
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Test;
+
+public class DcXMLParserTest extends TikaTest {
+
+    @Test
+    public void testXMLParserAsciiChars() throws Exception {
+        XMLResult result = getXML("testXML.xml", new DcXMLParser());
+        Metadata metadata = result.metadata;
+            assertEquals(
+                    "application/xml",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals("Tika test document", metadata.get(TikaCoreProperties.TITLE));
+            assertEquals("Rida Benjelloun", metadata.get(TikaCoreProperties.CREATOR));
+
+            // The file contains 5 dc:subject tags, which come through as
+            //  a multi-valued Tika Metadata entry in file order
+            assertEquals(true, metadata.isMultiValued(TikaCoreProperties.KEYWORDS));
+            assertEquals(5, metadata.getValues(TikaCoreProperties.KEYWORDS).length);
+            assertEquals("Java", metadata.getValues(TikaCoreProperties.KEYWORDS)[0]);
+            assertEquals("XML", metadata.getValues(TikaCoreProperties.KEYWORDS)[1]);
+            assertEquals("XSLT", metadata.getValues(TikaCoreProperties.KEYWORDS)[2]);
+            assertEquals("JDOM", metadata.getValues(TikaCoreProperties.KEYWORDS)[3]);
+            assertEquals("Indexation", metadata.getValues(TikaCoreProperties.KEYWORDS)[4]);
+            assertEquals(true, metadata.isMultiValued(Metadata.SUBJECT));
+            assertEquals(5, metadata.getValues(Metadata.SUBJECT).length);
+            assertEquals("Java", metadata.getValues(Metadata.SUBJECT)[0]);
+            assertEquals("XML", metadata.getValues(Metadata.SUBJECT)[1]);
+            assertEquals("XSLT", metadata.getValues(Metadata.SUBJECT)[2]);
+            assertEquals("JDOM", metadata.getValues(Metadata.SUBJECT)[3]);
+            assertEquals("Indexation", metadata.getValues(Metadata.SUBJECT)[4]);
+
+            assertEquals(
+                    "Framework d\'indexation des documents XML, HTML, PDF etc..",
+                    metadata.get(TikaCoreProperties.DESCRIPTION));
+            assertEquals(
+                    "http://www.apache.org",
+                    metadata.get(TikaCoreProperties.IDENTIFIER));
+            assertEquals("test", metadata.get(TikaCoreProperties.TYPE));
+            assertEquals("application/msword", metadata.get(TikaCoreProperties.FORMAT));
+            assertEquals("Fr", metadata.get(TikaCoreProperties.LANGUAGE));
+            assertTrue(metadata.get(TikaCoreProperties.RIGHTS).contains("testing chars"));
+
+            assertContains("Tika test document", result.xml);
+
+            assertEquals("2000-12-01T00:00:00.000Z", metadata.get(TikaCoreProperties.CREATED));
+
+    }
+    
+    @Test
+    public void testXMLParserNonAsciiChars() throws Exception {
+        XMLResult r = getXML("testXML.xml", new DcXMLParser());
+        final String expected = "Archim\u00E8de et Lius \u00E0 Ch\u00E2teauneuf testing chars en \u00E9t\u00E9";
+        assertEquals(expected, r.metadata.get(TikaCoreProperties.RIGHTS));
+    }
+
+    // TIKA-1048
+    @Test
+    public void testNoSpaces() throws Exception {
+      String text = getXML("testXML2.xml").xml;
+      assertFalse(text.contains("testSubject"));
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java
index 536f9d7..39e15d3 100644
--- a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java
+++ b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java
@@ -1,116 +1,116 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.xml;
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.TeeContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class EmptyAndDuplicateElementsXMLParserTest extends TikaTest {
-    
-    private Property FIRST_NAME = Property.internalTextBag(
-            "custom" + Metadata.NAMESPACE_PREFIX_DELIMITER + "FirstName");
-    private Property LAST_NAME = Property.internalTextBag(
-            "custom" + Metadata.NAMESPACE_PREFIX_DELIMITER + "LastName");
-
-    @Test
-    public void testDefaultBehavior() throws Exception {
-        XMLResult r = getXML("testXML3.xml", new DefaultCustomXMLTestParser());
-        Metadata metadata = r.metadata;
-
-        assertEquals(4, metadata.getValues(FIRST_NAME).length);
-        assertEquals(2, metadata.getValues(LAST_NAME).length);
-
-        assertEquals("John", metadata.getValues(FIRST_NAME)[0]);
-        assertEquals("Smith", metadata.getValues(LAST_NAME)[0]);
-
-        assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]);
-        assertEquals("Doe", metadata.getValues(LAST_NAME)[1]);
-
-        // We didn't know Bob's last name, but now we don't know an entry existed
-        assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]);
-
-        // We don't know Kate's last name because it was a duplicate
-        assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]);
-    }
-    
-    @Test
-    public void testEmptiesAndRepeats() throws Exception {
-        XMLResult r = getXML("testXML3.xml", new AllowEmptiesAndDuplicatesCustomXMLTestParser());
-        Metadata metadata = r.metadata;
-
-        assertEquals(4, metadata.getValues(FIRST_NAME).length);
-        assertEquals(4, metadata.getValues(LAST_NAME).length);
-
-        assertEquals("John", metadata.getValues(FIRST_NAME)[0]);
-        assertEquals("Smith", metadata.getValues(LAST_NAME)[0]);
-
-        assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]);
-        assertEquals("Doe", metadata.getValues(LAST_NAME)[1]);
-
-        assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]);
-        assertEquals("", metadata.getValues(LAST_NAME)[2]);
-
-        assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]);
-        assertEquals("Smith", metadata.getValues(LAST_NAME)[3]);
-
-    }
-    
-    private class DefaultCustomXMLTestParser extends XMLParser {
-    
-        private static final long serialVersionUID = 2458579047014545931L;
-
-        protected ElementMetadataHandler getCustomElementHandler(Metadata metadata, Property tikaProperty, String localPart) {
-            return new ElementMetadataHandler(
-                    "http://custom",
-                    localPart,
-                    metadata,
-                    tikaProperty);
-        }
-        
-        protected ContentHandler getContentHandler(
-                ContentHandler handler, Metadata metadata, ParseContext context) {
-            return new TeeContentHandler(
-                    super.getContentHandler(handler, metadata, context),
-                    getCustomElementHandler(metadata, FIRST_NAME, "FirstName"),
-                    getCustomElementHandler(metadata, LAST_NAME, "LastName"));
-        }
-    }
-    
-    private class AllowEmptiesAndDuplicatesCustomXMLTestParser extends DefaultCustomXMLTestParser {
-        
-        private static final long serialVersionUID = 3735646809954466229L;
-
-        protected ElementMetadataHandler getCustomElementHandler(Metadata metadata, Property tikaProperty, String localPart) {
-            return new ElementMetadataHandler(
-                    "http://custom",
-                    localPart,
-                    metadata,
-                    tikaProperty,
-                    true,
-                    true);
-        }
-    }
-    
-    
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.TeeContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class EmptyAndDuplicateElementsXMLParserTest extends TikaTest {
+    
+    private Property FIRST_NAME = Property.internalTextBag(
+            "custom" + Metadata.NAMESPACE_PREFIX_DELIMITER + "FirstName");
+    private Property LAST_NAME = Property.internalTextBag(
+            "custom" + Metadata.NAMESPACE_PREFIX_DELIMITER + "LastName");
+
+    @Test
+    public void testDefaultBehavior() throws Exception {
+        XMLResult r = getXML("testXML3.xml", new DefaultCustomXMLTestParser());
+        Metadata metadata = r.metadata;
+
+        assertEquals(4, metadata.getValues(FIRST_NAME).length);
+        assertEquals(2, metadata.getValues(LAST_NAME).length);
+
+        assertEquals("John", metadata.getValues(FIRST_NAME)[0]);
+        assertEquals("Smith", metadata.getValues(LAST_NAME)[0]);
+
+        assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]);
+        assertEquals("Doe", metadata.getValues(LAST_NAME)[1]);
+
+        // We didn't know Bob's last name, but now we don't know an entry existed
+        assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]);
+
+        // We don't know Kate's last name because it was a duplicate
+        assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]);
+    }
+    
+    @Test
+    public void testEmptiesAndRepeats() throws Exception {
+        XMLResult r = getXML("testXML3.xml", new AllowEmptiesAndDuplicatesCustomXMLTestParser());
+        Metadata metadata = r.metadata;
+
+        assertEquals(4, metadata.getValues(FIRST_NAME).length);
+        assertEquals(4, metadata.getValues(LAST_NAME).length);
+
+        assertEquals("John", metadata.getValues(FIRST_NAME)[0]);
+        assertEquals("Smith", metadata.getValues(LAST_NAME)[0]);
+
+        assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]);
+        assertEquals("Doe", metadata.getValues(LAST_NAME)[1]);
+
+        assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]);
+        assertEquals("", metadata.getValues(LAST_NAME)[2]);
+
+        assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]);
+        assertEquals("Smith", metadata.getValues(LAST_NAME)[3]);
+
+    }
+    
+    private class DefaultCustomXMLTestParser extends XMLParser {
+    
+        private static final long serialVersionUID = 2458579047014545931L;
+
+        protected ElementMetadataHandler getCustomElementHandler(Metadata metadata, Property tikaProperty, String localPart) {
+            return new ElementMetadataHandler(
+                    "http://custom",
+                    localPart,
+                    metadata,
+                    tikaProperty);
+        }
+        
+        protected ContentHandler getContentHandler(
+                ContentHandler handler, Metadata metadata, ParseContext context) {
+            return new TeeContentHandler(
+                    super.getContentHandler(handler, metadata, context),
+                    getCustomElementHandler(metadata, FIRST_NAME, "FirstName"),
+                    getCustomElementHandler(metadata, LAST_NAME, "LastName"));
+        }
+    }
+    
+    private class AllowEmptiesAndDuplicatesCustomXMLTestParser extends DefaultCustomXMLTestParser {
+        
+        private static final long serialVersionUID = 3735646809954466229L;
+
+        protected ElementMetadataHandler getCustomElementHandler(Metadata metadata, Property tikaProperty, String localPart) {
+            return new ElementMetadataHandler(
+                    "http://custom",
+                    localPart,
+                    metadata,
+                    tikaProperty,
+                    true,
+                    true);
+        }
+    }
+    
+    
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java
index aee7307..8ee966c 100644
--- a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java
+++ b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java
@@ -1,54 +1,54 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.xml;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.InputStream;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.extractor.ContainerExtractor;
-import org.apache.tika.extractor.ParserContainerExtractor;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.junit.Test;
-
-public class FictionBookParserTest extends TikaTest {
-  
-    @Test
-    public void testFB2() throws Exception {
-        XMLResult r = getXML("test.fb2", new FictionBookParser(), new Metadata(), new ParseContext());
-        assertContains("1812", r.xml);
-    }
-
-    @Test
-    public void testEmbedded() throws Exception {
-        try (InputStream input = getTestDocumentAsStream("test.fb2")) {
-            ContainerExtractor extractor = new ParserContainerExtractor();
-            TikaInputStream stream = TikaInputStream.get(input);
-
-            assertEquals(true, extractor.isSupported(stream));
-
-            // Process it
-            TrackingHandler handler = new TrackingHandler();
-            extractor.extract(stream, null, handler);
-
-            assertEquals(2, handler.filenames.size());
-        }
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.junit.Test;
+
+public class FictionBookParserTest extends TikaTest {
+  
+    @Test
+    public void testFB2() throws Exception {
+        XMLResult r = getXML("test.fb2", new FictionBookParser(), new Metadata(), new ParseContext());
+        assertContains("1812", r.xml);
+    }
+
+    @Test
+    public void testEmbedded() throws Exception {
+        try (InputStream input = getTestDocumentAsStream("test.fb2")) {
+            ContainerExtractor extractor = new ParserContainerExtractor();
+            TikaInputStream stream = TikaInputStream.get(input);
+
+            assertEquals(true, extractor.isSupported(stream));
+
+            // Process it
+            TrackingHandler handler = new TrackingHandler();
+            extractor.extract(stream, null, handler);
+
+            assertEquals(2, handler.filenames.size());
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/pom.xml b/tika-parser-modules/tika-parser-web-module/pom.xml
index 53aadb2..ee9e24c 100644
--- a/tika-parser-modules/tika-parser-web-module/pom.xml
+++ b/tika-parser-modules/tika-parser-web-module/pom.xml
@@ -1,89 +1,89 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-modules</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-web-module</artifactId>
-  <name>Apache Tika parser web module</name>
-  <url>http://tika.apache.org/</url>
-  
-  <properties>
-    <mime4j.version>0.7.2</mime4j.version>
-  </properties>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-core</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.ccil.cowan.tagsoup</groupId>
-      <artifactId>tagsoup</artifactId>
-      <version>1.2.1</version>
-    </dependency>
-    <dependency>
-      <groupId>de.l3s.boilerpipe</groupId>
-      <artifactId>boilerpipe</artifactId>
-      <version>1.1.0</version>
-    </dependency>
-    <dependency>
-      <groupId>com.rometools</groupId>
-      <artifactId>rome</artifactId>
-      <version>1.5.1</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.james</groupId>
-      <artifactId>apache-mime4j-core</artifactId>
-      <version>${mime4j.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.james</groupId>
-      <artifactId>apache-mime4j-dom</artifactId>
-      <version>${mime4j.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>commons-io</groupId>
-      <artifactId>commons-io</artifactId>
-      <version>${commons.io.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-text-module</artifactId>
-      <version>${project.version}</version>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-package-module</artifactId>
-      <version>${project.version}</version>
-      <scope>test</scope>
-    </dependency>
-  </dependencies>
-  
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
-</project>
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-modules</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-web-module</artifactId>
+  <name>Apache Tika parser web module</name>
+  <url>http://tika.apache.org/</url>
+  
+  <properties>
+    <mime4j.version>0.7.2</mime4j.version>
+  </properties>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.ccil.cowan.tagsoup</groupId>
+      <artifactId>tagsoup</artifactId>
+      <version>1.2.1</version>
+    </dependency>
+    <dependency>
+      <groupId>de.l3s.boilerpipe</groupId>
+      <artifactId>boilerpipe</artifactId>
+      <version>1.1.0</version>
+    </dependency>
+    <dependency>
+      <groupId>com.rometools</groupId>
+      <artifactId>rome</artifactId>
+      <version>1.5.1</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.james</groupId>
+      <artifactId>apache-mime4j-core</artifactId>
+      <version>${mime4j.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.james</groupId>
+      <artifactId>apache-mime4j-dom</artifactId>
+      <version>${mime4j.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <version>${commons.io.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-text-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-package-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+  
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
+</project>

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/module/web/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/module/web/internal/Activator.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/module/web/internal/Activator.java
index 53e28ca..4c728cc 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/module/web/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/module/web/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.web.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
-    @Override
-    public void start(BundleContext context) throws Exception {
-
-        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
-    }
-
-    @Override
-    public void stop(BundleContext context) throws Exception {
-
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.web.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+    @Override
+    public void start(BundleContext context) throws Exception {
+
+        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+    }
+
+    @Override
+    public void stop(BundleContext context) throws Exception {
+
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/feed/FeedParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/feed/FeedParser.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/feed/FeedParser.java
index b69e677..428ff83 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/feed/FeedParser.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/feed/FeedParser.java
@@ -1,127 +1,127 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.feed;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Set;
-
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
-
-import com.rometools.rome.feed.synd.SyndContent;
-import com.rometools.rome.feed.synd.SyndEntry;
-import com.rometools.rome.feed.synd.SyndFeed;
-import com.rometools.rome.io.FeedException;
-import com.rometools.rome.io.SyndFeedInput;
-
-/**
- * Feed parser.
- * <p>
- * Uses Rome for parsing the feeds. A feed description is put in a paragraph
- * with its link and title in an anchor.
- */
-public class FeedParser extends AbstractParser {
-
-    /** Serial version UID */
-    private static final long serialVersionUID = -3785361933034525186L;
-
-    private static final Set<MediaType> SUPPORTED_TYPES =
-            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
-                    MediaType.application("rss+xml"),
-                    MediaType.application("atom+xml"))));
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-    public void parse(
-            InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-        // set the encoding?
-        try {
-            SyndFeed feed = new SyndFeedInput().build(
-                    new InputSource(new CloseShieldInputStream(stream)));
-
-            String title = stripTags(feed.getTitleEx());
-            String description = stripTags(feed.getDescriptionEx());
-
-            metadata.set(TikaCoreProperties.TITLE, title);
-            metadata.set(TikaCoreProperties.DESCRIPTION, description);
-            // store the other fields in the metadata
-
-            XHTMLContentHandler xhtml =
-                new XHTMLContentHandler(handler, metadata);
-            xhtml.startDocument();
-
-            xhtml.element("h1", title);
-            xhtml.element("p", description);
-
-            xhtml.startElement("ul");
-            for (Object e : feed.getEntries()) {
-                SyndEntry entry = (SyndEntry) e;
-                String link = entry.getLink();
-                if (link != null) {
-                    xhtml.startElement("li");
-                    xhtml.startElement("a", "href", link);
-                    xhtml.characters(stripTags(entry.getTitleEx()));
-                    xhtml.endElement("a");
-                    SyndContent content = entry.getDescription();
-                    if (content != null) {
-                        xhtml.newline();
-                        xhtml.characters(stripTags(content));
-                    }
-                    xhtml.endElement("li");
-                }
-            }
-            xhtml.endElement("ul");
-
-            xhtml.endDocument();
-        } catch (FeedException e) {
-            throw new TikaException("RSS parse error", e);
-        }
-
-    }
-
-    private static String stripTags(SyndContent c) {
-        if (c == null)
-            return "";
-
-        String value = c.getValue();
-
-        String[] parts = value.split("<[^>]*>");
-        StringBuffer buf = new StringBuffer();
-
-        for (String part : parts)
-            buf.append(part);
-
-        return buf.toString().trim();
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.feed;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import com.rometools.rome.feed.synd.SyndContent;
+import com.rometools.rome.feed.synd.SyndEntry;
+import com.rometools.rome.feed.synd.SyndFeed;
+import com.rometools.rome.io.FeedException;
+import com.rometools.rome.io.SyndFeedInput;
+
+/**
+ * Feed parser.
+ * <p>
+ * Uses Rome for parsing the feeds. A feed description is put in a paragraph
+ * with its link and title in an anchor.
+ */
+public class FeedParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = -3785361933034525186L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.application("rss+xml"),
+                    MediaType.application("atom+xml"))));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // set the encoding?
+        try {
+            SyndFeed feed = new SyndFeedInput().build(
+                    new InputSource(new CloseShieldInputStream(stream)));
+
+            String title = stripTags(feed.getTitleEx());
+            String description = stripTags(feed.getDescriptionEx());
+
+            metadata.set(TikaCoreProperties.TITLE, title);
+            metadata.set(TikaCoreProperties.DESCRIPTION, description);
+            // store the other fields in the metadata
+
+            XHTMLContentHandler xhtml =
+                new XHTMLContentHandler(handler, metadata);
+            xhtml.startDocument();
+
+            xhtml.element("h1", title);
+            xhtml.element("p", description);
+
+            xhtml.startElement("ul");
+            for (Object e : feed.getEntries()) {
+                SyndEntry entry = (SyndEntry) e;
+                String link = entry.getLink();
+                if (link != null) {
+                    xhtml.startElement("li");
+                    xhtml.startElement("a", "href", link);
+                    xhtml.characters(stripTags(entry.getTitleEx()));
+                    xhtml.endElement("a");
+                    SyndContent content = entry.getDescription();
+                    if (content != null) {
+                        xhtml.newline();
+                        xhtml.characters(stripTags(content));
+                    }
+                    xhtml.endElement("li");
+                }
+            }
+            xhtml.endElement("ul");
+
+            xhtml.endDocument();
+        } catch (FeedException e) {
+            throw new TikaException("RSS parse error", e);
+        }
+
+    }
+
+    private static String stripTags(SyndContent c) {
+        if (c == null)
+            return "";
+
+        String value = c.getValue();
+
+        String[] parts = value.split("<[^>]*>");
+        StringBuffer buf = new StringBuffer();
+
+        for (String part : parts)
+            buf.append(part);
+
+        return buf.toString().trim();
+    }
+}

[20/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
index cf92406..6c86765 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
@@ -1,1423 +1,1423 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.rtf;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.PushbackInputStream;
-import java.nio.ByteBuffer;
-import java.nio.CharBuffer;
-import java.nio.charset.Charset;
-import java.nio.charset.CharsetDecoder;
-import java.nio.charset.CoderResult;
-import java.nio.charset.CodingErrorAction;
-import java.util.Calendar;
-import java.util.HashMap;
-import java.util.LinkedList;
-import java.util.Locale;
-import java.util.Map;
-import java.util.TimeZone;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLCore;
-import org.apache.tika.metadata.OfficeOpenXMLExtended;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.apache.tika.utils.CharsetUtils;
-import org.xml.sax.SAXException;
-
-/* Tokenizes and performs a "shallow" parse of the RTF
- * document, just enough to properly decode the text.
- *
- * TODO: we should cutover to a "real" tokenizer (eg JFlex);
- * it should give better perf, by replacing the excessive
- * "else if" string compares with FSA traversal. */
-
-final class TextExtractor {
-
-    private static final Charset ASCII = Charset.forName("US-ASCII");
-    private static final Charset WINDOWS_1252 = getCharset("WINDOWS-1252");
-    private static final Charset MAC_ROMAN = getCharset("MacRoman");
-    private static final Charset SHIFT_JIS = getCharset("Shift_JIS");
-    private static final Charset WINDOWS_57011 = getCharset("windows-57011");
-    private static final Charset WINDOWS_57010 = getCharset("windows-57010");
-    private static final Charset WINDOWS_57009 = getCharset("windows-57009");
-    private static final Charset WINDOWS_57008 = getCharset("windows-57008");
-    private static final Charset WINDOWS_57007 = getCharset("windows-57007");
-    private static final Charset WINDOWS_57006 = getCharset("windows-57006");
-    private static final Charset WINDOWS_57005 = getCharset("windows-57005");
-    private static final Charset WINDOWS_57004 = getCharset("windows-57004");
-    private static final Charset WINDOWS_57003 = getCharset("windows-57003");
-    private static final Charset X_ISCII91 = getCharset("x-ISCII91");
-    private static final Charset X_MAC_CENTRAL_EUROPE = getCharset("x-MacCentralEurope");
-    private static final Charset MAC_CYRILLIC = getCharset("MacCyrillic");
-    private static final Charset X_JOHAB = getCharset("x-Johab");
-    private static final Charset CP12582 = getCharset("CP1258");
-    private static final Charset CP12572 = getCharset("CP1257");
-    private static final Charset CP12562 = getCharset("CP1256");
-    private static final Charset CP12552 = getCharset("CP1255");
-    private static final Charset CP12542 = getCharset("CP1254");
-    private static final Charset CP12532 = getCharset("CP1253");
-    private static final Charset CP1252 = getCharset("CP1252");
-    private static final Charset CP12512 = getCharset("CP1251");
-    private static final Charset CP12502 = getCharset("CP1250");
-    private static final Charset CP950 = getCharset("CP950");
-    private static final Charset CP949 = getCharset("CP949");
-    private static final Charset MS9362 = getCharset("MS936");
-    private static final Charset MS8742 = getCharset("MS874");
-    private static final Charset CP866 = getCharset("CP866");
-    private static final Charset CP865 = getCharset("CP865");
-    private static final Charset CP864 = getCharset("CP864");
-    private static final Charset CP863 = getCharset("CP863");
-    private static final Charset CP862 = getCharset("CP862");
-    private static final Charset CP860 = getCharset("CP860");
-    private static final Charset CP852 = getCharset("CP852");
-    private static final Charset CP8502 = getCharset("CP850");
-    private static final Charset CP819 = getCharset("CP819");
-    private static final Charset WINDOWS_720 = getCharset("windows-720");
-    private static final Charset WINDOWS_711 = getCharset("windows-711");
-    private static final Charset WINDOWS_710 = getCharset("windows-710");
-    private static final Charset WINDOWS_709 = getCharset("windows-709");
-    private static final Charset ISO_8859_6 = getCharset("ISO-8859-6");
-    private static final Charset CP4372 = getCharset("CP437");
-    private static final Charset CP850 = getCharset("cp850");
-    private static final Charset CP437 = getCharset("cp437");
-    private static final Charset MS874 = getCharset("ms874");
-    private static final Charset CP1257 = getCharset("cp1257");
-    private static final Charset CP1256 = getCharset("cp1256");
-    private static final Charset CP1255 = getCharset("cp1255");
-    private static final Charset CP1258 = getCharset("cp1258");
-    private static final Charset CP1254 = getCharset("cp1254");
-    private static final Charset CP1253 = getCharset("cp1253");
-    private static final Charset MS950 = getCharset("ms950");
-    private static final Charset MS936 = getCharset("ms936");
-    private static final Charset MS1361 = getCharset("ms1361");
-    private static final Charset MS932 = getCharset("MS932");
-    private static final Charset CP1251 = getCharset("cp1251");
-    private static final Charset CP1250 = getCharset("cp1250");
-    private static final Charset MAC_THAI = getCharset("MacThai");
-    private static final Charset MAC_TURKISH = getCharset("MacTurkish");
-    private static final Charset MAC_GREEK = getCharset("MacGreek");
-    private static final Charset MAC_ARABIC = getCharset("MacArabic");
-    private static final Charset MAC_HEBREW = getCharset("MacHebrew");
-    private static final Charset JOHAB = getCharset("johab");
-    private static final Charset BIG5 = getCharset("Big5");
-    private static final Charset GB2312 = getCharset("GB2312");
-    private static final Charset MS949 = getCharset("ms949");
-    // The RTF doc has a "font table" that assigns ords
-    // (f0, f1, f2, etc.) to fonts and charsets, using the
-    // \fcharsetN control word.  This mapping maps from the
-    // N to corresponding Java charset:
-    private static final Map<Integer, Charset> FCHARSET_MAP =
-            new HashMap<Integer, Charset>();
-    // The RTF may specify the \ansicpgN charset in the
-    // header; this maps the N to the corresponding Java
-    // character set:
-    private static final Map<Integer, Charset> ANSICPG_MAP =
-            new HashMap<Integer, Charset>();
-
-    static {
-        FCHARSET_MAP.put(0, WINDOWS_1252); // ANSI
-        // charset 1 is Default
-        // charset 2 is Symbol
-
-        FCHARSET_MAP.put(77, MAC_ROMAN); // Mac Roman
-        FCHARSET_MAP.put(78, SHIFT_JIS); // Mac Shift Jis
-        FCHARSET_MAP.put(79, MS949); // Mac Hangul
-        FCHARSET_MAP.put(80, GB2312); // Mac GB2312
-        FCHARSET_MAP.put(81, BIG5); // Mac Big5
-        FCHARSET_MAP.put(82, JOHAB); // Mac Johab (old)
-        FCHARSET_MAP.put(83, MAC_HEBREW); // Mac Hebrew
-        FCHARSET_MAP.put(84, MAC_ARABIC); // Mac Arabic
-        FCHARSET_MAP.put(85, MAC_GREEK); // Mac Greek
-        FCHARSET_MAP.put(86, MAC_TURKISH); // Mac Turkish
-        FCHARSET_MAP.put(87, MAC_THAI); // Mac Thai
-        FCHARSET_MAP.put(88, CP1250); // Mac East Europe
-        FCHARSET_MAP.put(89, CP1251); // Mac Russian
-
-        FCHARSET_MAP.put(128, MS932); // Shift JIS
-        FCHARSET_MAP.put(129, MS949); // Hangul
-        FCHARSET_MAP.put(130, MS1361); // Johab
-        FCHARSET_MAP.put(134, MS936); // GB2312
-        FCHARSET_MAP.put(136, MS950); // Big5
-        FCHARSET_MAP.put(161, CP1253); // Greek
-        FCHARSET_MAP.put(162, CP1254); // Turkish
-        FCHARSET_MAP.put(163, CP1258); // Vietnamese
-        FCHARSET_MAP.put(177, CP1255); // Hebrew
-        FCHARSET_MAP.put(178, CP1256); // Arabic
-        // FCHARSET_MAP.put( 179, "" ); // Arabic Traditional
-        // FCHARSET_MAP.put( 180, "" ); // Arabic user
-        // FCHARSET_MAP.put( 181, "" ); // Hebrew user
-        FCHARSET_MAP.put(186, CP1257); // Baltic
-
-        FCHARSET_MAP.put(204, CP1251); // Russian
-        FCHARSET_MAP.put(222, MS874); // Thai
-        FCHARSET_MAP.put(238, CP1250); // Eastern European
-        FCHARSET_MAP.put(254, CP437); // PC 437
-        FCHARSET_MAP.put(255, CP850); // OEM
-    }
-
-    static {
-        ANSICPG_MAP.put(437, CP4372);   // US IBM
-        ANSICPG_MAP.put(708, ISO_8859_6);   // Arabic (ASMO 708)
-
-        ANSICPG_MAP.put(709, WINDOWS_709);  // Arabic (ASMO 449+, BCON V4)
-        ANSICPG_MAP.put(710, WINDOWS_710);  // Arabic (transparent Arabic)
-        ANSICPG_MAP.put(710, WINDOWS_711);  // Arabic (Nafitha Enhanced)
-        ANSICPG_MAP.put(710, WINDOWS_720);  // Arabic (transparent ASMO)
-        ANSICPG_MAP.put(819, CP819);  // Windows 3.1 (US & Western Europe)
-        ANSICPG_MAP.put(819, CP819);  // Windows 3.1 (US & Western Europe)
-
-        ANSICPG_MAP.put(819, CP819);  // Windows 3.1 (US & Western Europe)
-        ANSICPG_MAP.put(850, CP8502);  // IBM Multilingual
-        ANSICPG_MAP.put(852, CP852);  // Eastern European
-        ANSICPG_MAP.put(860, CP860);  // Portuguese
-        ANSICPG_MAP.put(862, CP862);  // Hebrew
-        ANSICPG_MAP.put(863, CP863);  // French Canadian
-        ANSICPG_MAP.put(864, CP864);  // Arabic
-        ANSICPG_MAP.put(865, CP865);  // Norwegian
-        ANSICPG_MAP.put(866, CP866);  // Soviet Union
-        ANSICPG_MAP.put(874, MS8742);  // Thai
-        ANSICPG_MAP.put(932, MS932);  // Japanese
-        ANSICPG_MAP.put(936, MS9362);  // Simplified Chinese
-        ANSICPG_MAP.put(949, CP949);  // Korean
-        ANSICPG_MAP.put(950, CP950);  // Traditional Chinese
-        ANSICPG_MAP.put(1250, CP12502);  // Eastern European
-        ANSICPG_MAP.put(1251, CP12512);  // Cyrillic
-        ANSICPG_MAP.put(1252, CP1252);  // Western European
-        ANSICPG_MAP.put(1253, CP12532);  // Greek
-        ANSICPG_MAP.put(1254, CP12542);  // Turkish
-        ANSICPG_MAP.put(1255, CP12552);  // Hebrew
-        ANSICPG_MAP.put(1256, CP12562);  // Arabic
-        ANSICPG_MAP.put(1257, CP12572);  // Baltic
-        ANSICPG_MAP.put(1258, CP12582);  // Vietnamese
-        ANSICPG_MAP.put(1361, X_JOHAB);  // Johab
-        ANSICPG_MAP.put(10000, MAC_ROMAN);  // Mac Roman
-        ANSICPG_MAP.put(10001, SHIFT_JIS);  // Mac Japan
-        ANSICPG_MAP.put(10004, MAC_ARABIC);  // Mac Arabic
-        ANSICPG_MAP.put(10005, MAC_HEBREW);  // Mac Hebrew
-        ANSICPG_MAP.put(10006, MAC_GREEK);  // Mac Hebrew
-        ANSICPG_MAP.put(10007, MAC_CYRILLIC);  // Mac Cyrillic
-        ANSICPG_MAP.put(10029, X_MAC_CENTRAL_EUROPE);  // MAC Latin2
-        ANSICPG_MAP.put(10081, MAC_TURKISH);  // Mac Turkish
-        ANSICPG_MAP.put(57002, X_ISCII91);   // Devanagari
-
-        // TODO: in theory these other charsets are simple
-        // shifts off of Devanagari, so we could impl that
-        // here:
-        ANSICPG_MAP.put(57003, WINDOWS_57003);   // Bengali
-        ANSICPG_MAP.put(57004, WINDOWS_57004);   // Tamil
-        ANSICPG_MAP.put(57005, WINDOWS_57005);   // Telugu
-        ANSICPG_MAP.put(57006, WINDOWS_57006);   // Assamese
-        ANSICPG_MAP.put(57007, WINDOWS_57007);   // Oriya
-        ANSICPG_MAP.put(57008, WINDOWS_57008);   // Kannada
-        ANSICPG_MAP.put(57009, WINDOWS_57009);   // Malayalam
-        ANSICPG_MAP.put(57010, WINDOWS_57010);   // Gujariti
-        ANSICPG_MAP.put(57011, WINDOWS_57011);   // Punjabi
-    }
-
-    // Used when we decode bytes -> chars using CharsetDecoder:
-    private final char[] outputArray = new char[128];
-    private final CharBuffer outputBuffer = CharBuffer.wrap(outputArray);
-    // Holds the font table from this RTF doc, mapping
-    // the font number (from \fN control word) to the
-    // corresponding charset:
-    private final Map<Integer, Charset> fontToCharset =
-            new HashMap<Integer, Charset>();
-    // Group stack: when we open a new group, we push
-    // the previous group state onto the stack; when we
-    // close the group, we restore it
-    private final LinkedList<GroupState> groupStates = new LinkedList<GroupState>();
-    private final StringBuilder pendingBuffer = new StringBuilder();
-    private final XHTMLContentHandler out;
-    private final Metadata metadata;
-    private final RTFEmbObjHandler embObjHandler;
-    // How many next ansi chars we should skip; this
-    // is 0 except when we are still in the "ansi
-    // shadow" after seeing a unicode escape, at which
-    // point it's set to the last ucN skip we had seen:
-    int ansiSkip = 0;
-    private int written = 0;
-    // Hold pending bytes (encoded in the current charset)
-    // for text output:
-    private byte[] pendingBytes = new byte[16];
-    private int pendingByteCount;
-    private ByteBuffer pendingByteBuffer = ByteBuffer.wrap(pendingBytes);
-    // Holds pending chars for text output
-    private char[] pendingChars = new char[10];
-    private int pendingCharCount;
-    // Holds chars for a still-being-tokenized control word
-    private byte[] pendingControl = new byte[10];
-    private int pendingControlCount;
-    // Reused when possible:
-    private CharsetDecoder decoder;
-    private Charset lastCharset;
-    private Charset globalCharset = WINDOWS_1252;
-    private int globalDefaultFont = -1;
-    private int curFontID = -1;
-    // Current group state; in theory this initial
-    // GroupState is unused because the RTF doc should
-    // immediately open the top group (start with {):
-    private GroupState groupState = new GroupState();
-    private boolean inHeader = true;
-    private int fontTableState;
-    private int fontTableDepth;
-    // Non null if we are processing metadata (title,
-    // keywords, etc.) inside the info group:
-    private Property nextMetaData;
-    private boolean inParagraph;
-    // Non-zero if we are processing inside a field destination:
-    private int fieldState;
-    // Non-zero list index
-    private int pendingListEnd;
-    private Map<Integer, ListDescriptor> listTable = new HashMap<Integer, ListDescriptor>();
-    private Map<Integer, ListDescriptor> listOverrideTable = new HashMap<Integer, ListDescriptor>();
-    private Map<Integer, ListDescriptor> currentListTable;
-    private ListDescriptor currentList;
-    private int listTableLevel = -1;
-    private boolean ignoreLists;
-    // Non-null if we've seen the url for a HYPERLINK but not yet
-    // its text:
-    private String pendingURL;
-    // Used to process the sub-groups inside the upr
-    // group:
-    private int uprState = -1;
-    // Used when extracting CREATION date:
-    private int year, month, day, hour, minute;
-
-    public TextExtractor(XHTMLContentHandler out, Metadata metadata,
-                         RTFEmbObjHandler embObjHandler) {
-        this.metadata = metadata;
-        this.out = out;
-        this.embObjHandler = embObjHandler;
-    }
-
-    private static Charset getCharset(String name) {
-        try {
-            return CharsetUtils.forName(name);
-        } catch (Exception e) {
-            return ASCII;
-        }
-    }
-
-    protected static boolean isHexChar(int ch) {
-        return (ch >= '0' && ch <= '9') ||
-                (ch >= 'a' && ch <= 'f') ||
-                (ch >= 'A' && ch <= 'F');
-    }
-
-    private static boolean isAlpha(int ch) {
-        return (ch >= 'a' && ch <= 'z') ||
-                (ch >= 'A' && ch <= 'Z');
-    }
-
-    private static boolean isDigit(int ch) {
-        return ch >= '0' && ch <= '9';
-    }
-
-    protected static int hexValue(int ch) {
-        if (ch >= '0' && ch <= '9') {
-            return ch - '0';
-        } else if (ch >= 'a' && ch <= 'z') {
-            return 10 + (ch - 'a');
-        } else {
-            assert ch >= 'A' && ch <= 'Z';
-            return 10 + (ch - 'A');
-        }
-    }
-
-    public boolean isIgnoringLists() {
-        return ignoreLists;
-    }
-
-    public void setIgnoreLists(boolean ignore) {
-        this.ignoreLists = ignore;
-    }
-
-    // Push pending bytes or pending chars:
-    private void pushText() throws IOException, SAXException, TikaException {
-        if (pendingByteCount != 0) {
-            assert pendingCharCount == 0;
-            pushBytes();
-        } else {
-            pushChars();
-        }
-    }
-
-    // Buffers the byte (unit in the current charset) for
-    // output:
-    private void addOutputByte(int b) throws IOException, SAXException, TikaException {
-        assert b >= 0 && b < 256 : "byte value out of range: " + b;
-
-        if (pendingCharCount != 0) {
-            pushChars();
-        }
-        if (groupState.pictDepth > 0) {
-            embObjHandler.writeMetadataChar((char) b);
-        } else {
-            // Save the byte in pending buffer:
-            if (pendingByteCount == pendingBytes.length) {
-                // Gradual but exponential growth:
-                final byte[] newArray = new byte[(int) (pendingBytes.length * 1.25)];
-                System.arraycopy(pendingBytes, 0, newArray, 0, pendingBytes.length);
-                pendingBytes = newArray;
-                pendingByteBuffer = ByteBuffer.wrap(pendingBytes);
-            }
-            pendingBytes[pendingByteCount++] = (byte) b;
-        }
-    }
-
-    // Buffers a byte as part of a control word:
-    private void addControl(int b) {
-        assert isAlpha(b);
-        // Save the byte in pending buffer:
-        if (pendingControlCount == pendingControl.length) {
-            // Gradual but exponential growth:
-            final byte[] newArray = new byte[(int) (pendingControl.length * 1.25)];
-            System.arraycopy(pendingControl, 0, newArray, 0, pendingControl.length);
-            pendingControl = newArray;
-        }
-        pendingControl[pendingControlCount++] = (byte) b;
-    }
-
-    // Buffers a UTF16 code unit for output
-    private void addOutputChar(char ch) throws IOException, SAXException, TikaException {
-        if (pendingByteCount != 0) {
-            pushBytes();
-        }
-
-        if (inHeader || fieldState == 1) {
-            pendingBuffer.append(ch);
-        } else if (groupState.sn == true || groupState.sv == true) {
-            embObjHandler.writeMetadataChar(ch);
-        } else {
-            if (pendingCharCount == pendingChars.length) {
-                // Gradual but exponential growth:
-                final char[] newArray = new char[(int) (pendingChars.length * 1.25)];
-                System.arraycopy(pendingChars, 0, newArray, 0, pendingChars.length);
-                pendingChars = newArray;
-            }
-            pendingChars[pendingCharCount++] = ch;
-        }
-    }
-
-    // Shallow parses the entire doc, writing output to
-    // this.out and this.metadata
-    public void extract(InputStream in) throws IOException, SAXException, TikaException {
-//        in = new FilterInputStream(in) {
-//            public int read() throws IOException {
-//                int r = super.read();
-//                System.out.write(r);
-//                System.out.flush();
-//                return r;
-//            }
-//            public int read(byte b[], int off, int len) throws IOException {
-//                int r = super.read(b, off, len);
-//                System.out.write(b, off, r);
-//                System.out.flush();
-//                return r;
-//            }
-//        };
-        extract(new PushbackInputStream(in, 2));
-    }
-
-    private void extract(PushbackInputStream in) throws IOException, SAXException, TikaException {
-        out.startDocument();
-
-        while (true) {
-            final int b = in.read();
-            if (b == -1) {
-                break;
-            } else if (b == '\\') {
-                parseControlToken(in);
-            } else if (b == '{') {
-                pushText();
-                processGroupStart(in);
-            } else if (b == '}') {
-                pushText();
-                processGroupEnd();
-                if (groupStates.isEmpty()) {
-                    // parsed document closing brace
-                    break;
-                }
-            } else if (groupState.objdata == true ||
-                    groupState.pictDepth == 1) {
-                embObjHandler.writeHexChar(b);
-            } else if (b != '\r' && b != '\n'
-                    && (!groupState.ignore || nextMetaData != null ||
-                    groupState.sn == true || groupState.sv == true)) {
-                // Linefeed and carriage return are not
-                // significant
-                if (ansiSkip != 0) {
-                    ansiSkip--;
-                } else {
-                    addOutputByte(b);
-                }
-            }
-        }
-
-        endParagraph(false);
-        out.endDocument();
-    }
-
-    private void parseControlToken(PushbackInputStream in) throws IOException, SAXException, TikaException {
-        int b = in.read();
-        if (b == '\'') {
-            // escaped hex char
-            parseHexChar(in);
-        } else if (isAlpha(b)) {
-            // control word
-            parseControlWord((char) b, in);
-        } else if (b == '{' || b == '}' || b == '\\' || b == '\r' || b == '\n') {
-            // escaped char
-            addOutputByte(b);
-        } else if (b != -1) {
-            // control symbol, eg \* or \~
-            processControlSymbol((char) b);
-        }
-    }
-
-    private void parseHexChar(PushbackInputStream in) throws IOException, SAXException, TikaException {
-        int hex1 = in.read();
-        if (!isHexChar(hex1)) {
-            // DOC ERROR (malformed hex escape): ignore 
-            in.unread(hex1);
-            return;
-        }
-
-        int hex2 = in.read();
-        if (!isHexChar(hex2)) {
-            // TODO: log a warning here, somehow?
-            // DOC ERROR (malformed hex escape):
-            // ignore
-            in.unread(hex2);
-            return;
-        }
-
-        if (ansiSkip != 0) {
-            // Skip this ansi char since we are
-            // still in the shadow of a unicode
-            // escape:
-            ansiSkip--;
-        } else {
-            // Unescape:
-            addOutputByte(16 * hexValue(hex1) + hexValue(hex2));
-        }
-    }
-
-    private void parseControlWord(int firstChar, PushbackInputStream in) throws IOException, SAXException, TikaException {
-        addControl(firstChar);
-
-        int b = in.read();
-        while (isAlpha(b)) {
-            addControl(b);
-            b = in.read();
-        }
-
-        boolean hasParam = false;
-        boolean negParam = false;
-        if (b == '-') {
-            negParam = true;
-            hasParam = true;
-            b = in.read();
-        }
-
-        int param = 0;
-        while (isDigit(b)) {
-            param *= 10;
-            param += (b - '0');
-            hasParam = true;
-            b = in.read();
-        }
-
-        // space is consumed as part of the
-        // control word, but is not added to the
-        // control word
-        if (b != ' ') {
-            in.unread(b);
-        }
-
-        if (hasParam) {
-            if (negParam) {
-                param = -param;
-            }
-            processControlWord(param, in);
-        } else {
-            processControlWord();
-        }
-
-        pendingControlCount = 0;
-    }
-
-    private void lazyStartParagraph() throws IOException, SAXException, TikaException {
-        if (!inParagraph) {
-            // Ensure </i></b> order
-            if (groupState.italic) {
-                end("i");
-            }
-            if (groupState.bold) {
-                end("b");
-            }
-            if (pendingListEnd != 0 && groupState.list != pendingListEnd) {
-                endList(pendingListEnd);
-                pendingListEnd = 0;
-            }
-            if (inList() && pendingListEnd != groupState.list) {
-                startList(groupState.list);
-            }
-            if (inList()) {
-                out.startElement("li");
-            } else {
-                out.startElement("p");
-            }
-
-            // Ensure <b><i> order
-            if (groupState.bold) {
-                start("b");
-            }
-            if (groupState.italic) {
-                start("i");
-            }
-            inParagraph = true;
-        }
-    }
-
-    private void endParagraph(boolean preserveStyles) throws IOException, SAXException, TikaException {
-        pushText();
-        //maintain consecutive new lines
-        if (!inParagraph) {
-            lazyStartParagraph();
-        }
-        if (inParagraph) {
-            if (groupState.italic) {
-                end("i");
-                groupState.italic = preserveStyles;
-            }
-            if (groupState.bold) {
-                end("b");
-                groupState.bold = preserveStyles;
-            }
-            if (inList()) {
-                out.endElement("li");
-            } else {
-                out.endElement("p");
-            }
-
-            if (preserveStyles && (groupState.bold || groupState.italic)) {
-                start("p");
-                if (groupState.bold) {
-                    start("b");
-                }
-                if (groupState.italic) {
-                    start("i");
-                }
-                inParagraph = true;
-            } else {
-                inParagraph = false;
-            }
-        }
-
-        // Ensure closing the list at document end
-        if (!preserveStyles && pendingListEnd != 0) {
-            endList(pendingListEnd);
-            pendingListEnd = 0;
-        }
-    }
-
-    // Push pending UTF16 units to out ContentHandler
-    private void pushChars() throws IOException, SAXException, TikaException {
-        if (pendingCharCount != 0) {
-            lazyStartParagraph();
-            out.characters(pendingChars, 0, pendingCharCount);
-            pendingCharCount = 0;
-        }
-    }
-
-    // Decodes the buffered bytes in pendingBytes
-    // into UTF16 code units, and sends the characters
-    // to the out ContentHandler, if we are in the body,
-    // else appends the characters to the pendingBuffer
-    private void pushBytes() throws IOException, SAXException, TikaException {
-        if (pendingByteCount > 0 && (!groupState.ignore || nextMetaData != null)) {
-
-            final CharsetDecoder decoder = getDecoder();
-            pendingByteBuffer.limit(pendingByteCount);
-            assert pendingByteBuffer.position() == 0;
-            assert outputBuffer.position() == 0;
-
-            while (true) {
-                // We pass true for endOfInput because, when
-                // we are called, we should have seen a
-                // complete sequence of characters for this
-                // charset:
-                final CoderResult result = decoder.decode(pendingByteBuffer, outputBuffer, true);
-
-                final int pos = outputBuffer.position();
-                if (pos > 0) {
-                    if (inHeader || fieldState == 1) {
-                        pendingBuffer.append(outputArray, 0, pos);
-                    } else {
-                        lazyStartParagraph();
-                        out.characters(outputArray, 0, pos);
-                    }
-                    outputBuffer.position(0);
-                }
-
-                if (result == CoderResult.UNDERFLOW) {
-                    break;
-                }
-            }
-
-            while (true) {
-                final CoderResult result = decoder.flush(outputBuffer);
-
-                final int pos = outputBuffer.position();
-                if (pos > 0) {
-                    if (inHeader || fieldState == 1) {
-                        pendingBuffer.append(outputArray, 0, pos);
-                    } else {
-                        lazyStartParagraph();
-                        out.characters(outputArray, 0, pos);
-                    }
-                    outputBuffer.position(0);
-                }
-
-                if (result == CoderResult.UNDERFLOW) {
-                    break;
-                }
-            }
-
-            // Reset for next decode
-            decoder.reset();
-            pendingByteBuffer.position(0);
-        }
-
-        pendingByteCount = 0;
-    }
-
-    // NOTE: s must be ascii alpha only
-    private boolean equals(String s) {
-        if (pendingControlCount != s.length()) {
-            return false;
-        }
-        for (int idx = 0; idx < pendingControlCount; idx++) {
-            assert isAlpha(s.charAt(idx));
-            if (((byte) s.charAt(idx)) != pendingControl[idx]) {
-                return false;
-            }
-        }
-        return true;
-    }
-
-    private void processControlSymbol(char ch) throws IOException, SAXException, TikaException {
-        switch (ch) {
-            case '~':
-                // Non-breaking space -> unicode NON-BREAKING SPACE
-                addOutputChar('\u00a0');
-                break;
-            case '*':
-                // Ignorable destination (control words defined after
-                // the 1987 RTF spec). These are already handled by
-                // processGroupStart()
-                break;
-            case '-':
-                // Optional hyphen -> unicode SOFT HYPHEN
-                addOutputChar('\u00ad');
-                break;
-            case '_':
-                // Non-breaking hyphen -> unicode NON-BREAKING HYPHEN
-                addOutputChar('\u2011');
-                break;
-            default:
-                break;
-        }
-    }
-
-    private CharsetDecoder getDecoder() throws TikaException {
-        Charset charset = getCharset();
-
-        // Common case: charset is same as last time, so
-        // just reuse it:
-        if (lastCharset == null || !charset.equals(lastCharset)) {
-            decoder = charset.newDecoder();
-            decoder.onMalformedInput(CodingErrorAction.REPLACE);
-            decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
-            lastCharset = charset;
-        }
-
-        return decoder;
-    }
-
-    // Return current charset in-use
-    private Charset getCharset() throws TikaException {
-        // If a specific font (fN) was set, use its charset
-        if (groupState.fontCharset != null) {
-            return groupState.fontCharset;
-        }
-
-        // Else, if global default font (defN) was set, use that one
-        if (globalDefaultFont != -1 && !inHeader) {
-            Charset cs = fontToCharset.get(globalDefaultFont);
-            if (cs != null) {
-                return cs;
-            }
-        }
-
-        // Else, use the global charset
-        if (globalCharset == null) {
-            throw new TikaException("unable to determine charset");
-        }
-
-        return globalCharset;
-    }
-
-    // Handle control word that takes a parameter:
-    private void processControlWord(int param, PushbackInputStream in) throws IOException, SAXException, TikaException {
-
-        // TODO: afN?  (associated font number)
-
-        // TODO: do these alter text output...?
-        /*
-            } else if (equals("stshfdbch")) {
-                // font to be used by default in
-                // style sheet for East Asian chars
-                // arg N is font table entry
-            } else if (equals("stshfloch")) {
-                // font to be used by default in
-                // style sheet for ASCII chars
-                // arg N is font table entry
-            } else if (equals("stshfhich")) {
-                // font to be used by default in
-                // style sheet for High Ansi chars
-                // arg N is font table entry
-            } else if (equals("stshfbi")) {
-                // style sheet for Complex Scripts (BIDI) chars
-                // arg N is font table entry
-                */
-
-        // TODO: inefficient that we check equals N times;
-        // we'd get better perf w/ real lexer (eg
-        // JFlex), which uses single-pass FSM to do cmp:
-        if (inHeader) {
-            if (equals("ansicpg")) {
-                // ANSI codepage
-                Charset cs = ANSICPG_MAP.get(param);
-                if (cs != null) {
-                    globalCharset = cs;
-                }
-            } else if (equals("deff")) {
-                // Default font
-                globalDefaultFont = param;
-            } else if (equals("nofpages")) {
-                metadata.add(Office.PAGE_COUNT, Integer.toString(param));
-            } else if (equals("nofwords")) {
-                metadata.add(Office.WORD_COUNT, Integer.toString(param));
-            } else if (equals("nofchars")) {
-                metadata.add(Office.CHARACTER_COUNT, Integer.toString(param));
-            } else if (equals("yr")) {
-                year = param;
-            } else if (equals("mo")) {
-                month = param;
-            } else if (equals("dy")) {
-                day = param;
-            } else if (equals("hr")) {
-                hour = param;
-            } else if (equals("min")) {
-                minute = param;
-            }
-
-            if (fontTableState == 1) {
-                // Still inside font table -- record the
-                // mappings of fN to the fcharset:
-                if (groupState.depth < fontTableDepth) {
-                    fontTableState = 2;
-                } else {
-                    if (equals("f")) {
-                        // Start new font definition
-                        curFontID = param;
-                    } else if (equals("fcharset")) {
-                        Charset cs = FCHARSET_MAP.get(param);
-                        if (cs != null) {
-                            fontToCharset.put(curFontID, cs);
-                        }
-                    }
-                }
-            }
-
-            if (currentList != null) {
-                if (equals("listid")) {
-                    currentList.id = param;
-                    currentListTable.put(currentList.id, currentList);
-                } else if (equals("listtemplateid")) {
-                    currentList.templateID = param;
-                } else if (equals("levelnfc") || equals("levelnfcn")) {
-                    //sanity check to make sure list information isn't corrupt
-                    if (listTableLevel > -1 &&
-                            listTableLevel < currentList.numberType.length) {
-                        currentList.numberType[listTableLevel] = param;
-                    }
-                }
-            }
-        } else {
-            // In document
-            if (equals("b")) {
-                // b0
-                assert param == 0;
-                if (groupState.bold) {
-                    pushText();
-                    if (groupState.italic) {
-                        end("i");
-                    }
-                    end("b");
-                    if (groupState.italic) {
-                        start("i");
-                    }
-                    groupState.bold = false;
-                }
-            } else if (equals("i")) {
-                // i0
-                assert param == 0;
-                if (groupState.italic) {
-                    pushText();
-                    end("i");
-                    groupState.italic = false;
-                }
-            } else if (equals("f")) {
-                // Change current font
-                Charset fontCharset = fontToCharset.get(param);
-
-                // Push any buffered text before changing
-                // font:
-                pushText();
-
-                if (fontCharset != null) {
-                    groupState.fontCharset = fontCharset;
-                } else {
-                    // DOC ERROR: font change referenced a
-                    // non-table'd font number
-                    // TODO: log a warning?  Throw an exc?
-                    groupState.fontCharset = null;
-                }
-            } else if (equals("ls")) {
-                groupState.list = param;
-            } else if (equals("lslvl")) {
-                groupState.listLevel = param;
-            }
-        }
-
-        // Process unicode escape. This can appear in doc
-        // or in header, since the metadata (info) fields
-        // in the header can be unicode escaped as well:
-        if (equals("u")) {
-            // Unicode escape
-            if (!groupState.ignore || groupState.sv || groupState.sn) {
-                final char utf16CodeUnit = (char) (param & 0xffff);
-                addOutputChar(utf16CodeUnit);
-            }
-
-            // After seeing a unicode escape we must
-            // skip the next ucSkip ansi chars (the
-            // "unicode shadow")
-            ansiSkip = groupState.ucSkip;
-        } else if (equals("uc")) {
-            // Change unicode shadow length
-            groupState.ucSkip = param;
-        } else if (equals("bin")) {
-            if (param >= 0) {
-                if (groupState.pictDepth == 1) {
-                    try {
-                        embObjHandler.writeBytes(in, param);
-                    } catch (IOException e) {
-                        //param was out of bounds or something went wrong during writing.
-                        //skip this obj and move on
-                        //TODO: log.warn
-                        embObjHandler.reset();
-                    }
-                } else {
-                    IOUtils.skipFully(in, param);
-                }
-            } else {
-                // log some warning?
-            }
-        }
-    }
-
-    private boolean inList() {
-        return !ignoreLists && groupState.list != 0;
-    }
-
-    /**
-     * Marks the current list as pending to end. This is done to be able to merge list items of
-     * the same list within the same enclosing list tag (ie. either <code>"ul"</code>, or
-     * <code>"ol"</code>).
-     */
-    private void pendingListEnd() {
-        pendingListEnd = groupState.list;
-        groupState.list = 0;
-    }
-
-    /**
-     * Emits the end tag of a list. Uses {@link #isUnorderedList(int)} to determine the list
-     * type for the given <code>listID</code>.
-     *
-     * @param listID The ID of the list.
-     * @throws IOException
-     * @throws SAXException
-     * @throws TikaException
-     */
-    private void endList(int listID) throws IOException, SAXException, TikaException {
-        if (!ignoreLists) {
-            out.endElement(isUnorderedList(listID) ? "ul" : "ol");
-        }
-    }
-
-    /**
-     * Emits the start tag of a list. Uses {@link #isUnorderedList(int)} to determine the list
-     * type for the given <code>listID</code>.
-     *
-     * @param listID The ID of the list.
-     * @throws IOException
-     * @throws SAXException
-     * @throws TikaException
-     */
-    private void startList(int listID) throws IOException, SAXException, TikaException {
-        if (!ignoreLists) {
-            out.startElement(isUnorderedList(listID) ? "ul" : "ol");
-        }
-    }
-
-    private boolean isUnorderedList(int listID) {
-        ListDescriptor list = listTable.get(listID);
-        if (list != null) {
-            return list.isUnordered(groupState.listLevel);
-        }
-        return true;
-    }
-
-    private void end(String tag) throws IOException, SAXException, TikaException {
-        out.endElement(tag);
-    }
-
-    private void start(String tag) throws IOException, SAXException, TikaException {
-        out.startElement(tag);
-    }
-
-    // Handle non-parameter control word:
-    private void processControlWord() throws IOException, SAXException, TikaException {
-        if (inHeader) {
-            if (equals("ansi")) {
-                globalCharset = WINDOWS_1252;
-            } else if (equals("pca")) {
-                globalCharset = CP850;
-            } else if (equals("pc")) {
-                globalCharset = CP437;
-            } else if (equals("mac")) {
-                globalCharset = MAC_ROMAN;
-            }
-
-            if (equals("colortbl") || equals("stylesheet") || equals("fonttbl")) {
-                groupState.ignore = true;
-            } else if (equals("listtable")) {
-                currentListTable = listTable;
-            } else if (equals("listoverridetable")) {
-                currentListTable = listOverrideTable;
-            }
-
-            if (uprState == -1) {
-                // TODO: we can also parse \creatim, \revtim,
-                // \printim, \version, etc.
-                if (equals("author")) {
-                    nextMetaData = TikaCoreProperties.CREATOR;
-                } else if (equals("title")) {
-                    nextMetaData = TikaCoreProperties.TITLE;
-                } else if (equals("subject")) {
-                    // TODO: Move to OO subject in Tika 2.0
-                    nextMetaData = TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT;
-                } else if (equals("keywords")) {
-                    nextMetaData = TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT;
-                } else if (equals("category")) {
-                    nextMetaData = OfficeOpenXMLCore.CATEGORY;
-                } else if (equals("comment")) {
-                    nextMetaData = TikaCoreProperties.COMMENTS;
-                } else if (equals("company")) {
-                    nextMetaData = OfficeOpenXMLExtended.COMPANY;
-                } else if (equals("manager")) {
-                    nextMetaData = OfficeOpenXMLExtended.MANAGER;
-                } else if (equals("template")) {
-                    nextMetaData = OfficeOpenXMLExtended.TEMPLATE;
-                } else if (equals("creatim")) {
-                    nextMetaData = TikaCoreProperties.CREATED;
-                }
-            }
-
-            if (fontTableState == 0) {
-                // Didn't see font table yet
-                if (equals("fonttbl")) {
-                    fontTableState = 1;
-                    fontTableDepth = groupState.depth;
-                }
-            } else if (fontTableState == 1) {
-                // Inside font table
-                if (groupState.depth < fontTableDepth) {
-                    fontTableState = 2;
-                }
-            }
-
-            // List table handling
-            if (currentListTable != null) {
-                if (equals("list") || equals("listoverride")) {
-                    currentList = new ListDescriptor();
-                    listTableLevel = -1;
-                } else if (currentList != null) {
-                    if (equals("liststylename")) {
-                        currentList.isStyle = true;
-                    } else if (equals("listlevel")) {
-                        listTableLevel++;
-                    }
-                }
-            }
-
-            if (!groupState.ignore && (equals("par") || equals("pard") || equals("sect") || equals("sectd") || equals("plain") || equals("ltrch") || equals("rtlch"))) {
-                inHeader = false;
-            }
-        } else {
-            if (equals("b")) {
-                if (!groupState.bold) {
-                    pushText();
-                    lazyStartParagraph();
-                    if (groupState.italic) {
-                        // Make sure nesting is always <b><i>
-                        end("i");
-                    }
-                    groupState.bold = true;
-                    start("b");
-                    if (groupState.italic) {
-                        start("i");
-                    }
-                }
-            } else if (equals("i")) {
-                if (!groupState.italic) {
-                    pushText();
-                    lazyStartParagraph();
-                    groupState.italic = true;
-                    start("i");
-                }
-            }
-        }
-
-        final boolean ignored = groupState.ignore;
-
-        if (equals("pard")) {
-            // Reset styles
-            pushText();
-            if (groupState.italic) {
-                end("i");
-                groupState.italic = false;
-            }
-            if (groupState.bold) {
-                end("b");
-                groupState.bold = false;
-            }
-            if (inList()) { // && (groupStates.size() == 1 || groupStates.peekLast().list < 0))
-                pendingListEnd();
-            }
-        } else if (equals("par")) {
-            if (!ignored) {
-                endParagraph(true);
-            }
-        } else if (equals("shptxt")) {
-            pushText();
-            // Text inside a shape
-            groupState.ignore = false;
-        } else if (equals("atnid")) {
-            pushText();
-            // Annotation ID
-            groupState.ignore = false;
-        } else if (equals("atnauthor")) {
-            pushText();
-            // Annotation author
-            groupState.ignore = false;
-        } else if (equals("annotation")) {
-            pushText();
-            // Annotation
-            groupState.ignore = false;
-        } else if (equals("listtext")) {
-            groupState.ignore = true;
-        } else if (equals("cell")) {
-            // TODO: we should produce a table output here?
-            //addOutputChar(' ');
-            endParagraph(true);
-        } else if (equals("sp")) {
-            groupState.sp = true;
-        } else if (equals("sn")) {
-            embObjHandler.startSN();
-            groupState.sn = true;
-        } else if (equals("sv")) {
-            embObjHandler.startSV();
-            groupState.sv = true;
-        } else if (equals("object")) {
-            pushText();
-            embObjHandler.setInObject(true);
-            groupState.object = true;
-        } else if (equals("objdata")) {
-            groupState.objdata = true;
-            embObjHandler.startObjData();
-        } else if (equals("pict")) {
-            pushText();
-            // TODO: create img tag?  but can that support
-            // embedded image data?
-            groupState.pictDepth = 1;
-            embObjHandler.startPict();
-        } else if (equals("line")) {
-            if (!ignored) {
-                addOutputChar('\n');
-            }
-        } else if (equals("column")) {
-            if (!ignored) {
-                addOutputChar(' ');
-            }
-        } else if (equals("page")) {
-            if (!ignored) {
-                addOutputChar('\n');
-            }
-        } else if (equals("softline")) {
-            if (!ignored) {
-                addOutputChar('\n');
-            }
-        } else if (equals("softcolumn")) {
-            if (!ignored) {
-                addOutputChar(' ');
-            }
-        } else if (equals("softpage")) {
-            if (!ignored) {
-                addOutputChar('\n');
-            }
-        } else if (equals("tab")) {
-            if (!ignored) {
-                addOutputChar('\t');
-            }
-        } else if (equals("upr")) {
-            uprState = 0;
-        } else if (equals("ud") && uprState == 1) {
-            uprState = -1;
-            // 2nd group inside the upr destination, which
-            // contains the unicode encoding of the text, so
-            // we want to keep that:
-            groupState.ignore = false;
-        } else if (equals("bullet")) {
-            if (!ignored) {
-                // unicode BULLET
-                addOutputChar('\u2022');
-            }
-        } else if (equals("endash")) {
-            if (!ignored) {
-                // unicode EN DASH
-                addOutputChar('\u2013');
-            }
-        } else if (equals("emdash")) {
-            if (!ignored) {
-                // unicode EM DASH
-                addOutputChar('\u2014');
-            }
-        } else if (equals("enspace")) {
-            if (!ignored) {
-                // unicode EN SPACE
-                addOutputChar('\u2002');
-            }
-        } else if (equals("qmspace")) {
-            if (!ignored) {
-                // quarter em space -> unicode FOUR-PER-EM SPACE
-                addOutputChar('\u2005');
-            }
-        } else if (equals("emspace")) {
-            if (!ignored) {
-                // unicode EM SPACE
-                addOutputChar('\u2003');
-            }
-        } else if (equals("lquote")) {
-            if (!ignored) {
-                // unicode LEFT SINGLE QUOTATION MARK
-                addOutputChar('\u2018');
-            }
-        } else if (equals("rquote")) {
-            if (!ignored) {
-                // unicode RIGHT SINGLE QUOTATION MARK
-                addOutputChar('\u2019');
-            }
-        } else if (equals("ldblquote")) {
-            if (!ignored) {
-                // unicode LEFT DOUBLE QUOTATION MARK
-                addOutputChar('\u201C');
-            }
-        } else if (equals("rdblquote")) {
-            if (!ignored) {
-                // unicode RIGHT DOUBLE QUOTATION MARK
-                addOutputChar('\u201D');
-            }
-        } else if (equals("fldinst")) {
-            fieldState = 1;
-            groupState.ignore = false;
-        } else if (equals("fldrslt") && fieldState == 2) {
-            assert pendingURL != null;
-            lazyStartParagraph();
-            out.startElement("a", "href", pendingURL);
-            pendingURL = null;
-            fieldState = 3;
-            groupState.ignore = false;
-        }
-    }
-
-    // Push new GroupState
-    private void processGroupStart(PushbackInputStream in) throws IOException {
-        ansiSkip = 0;
-        // Push current groupState onto the stack
-        groupStates.add(groupState);
-
-        // Make new GroupState
-        groupState = new GroupState(groupState);
-        assert groupStates.size() == groupState.depth : "size=" + groupStates.size() + " depth=" + groupState.depth;
-
-        if (uprState == 0) {
-            uprState = 1;
-            groupState.ignore = true;
-        }
-
-        // Check for ignorable groups. Note that
-        // sometimes we un-ignore within this group, eg
-        // when handling upr escape.
-        int b2 = in.read();
-        if (b2 == '\\') {
-            int b3 = in.read();
-            if (b3 == '*') {
-                groupState.ignore = true;
-            }
-            in.unread(b3);
-        }
-        in.unread(b2);
-    }
-
-    // Pop current GroupState
-    private void processGroupEnd() throws IOException, SAXException, TikaException {
-        if (inHeader) {
-            if (nextMetaData != null) {
-                if (nextMetaData == TikaCoreProperties.CREATED) {
-                    Calendar cal = Calendar.getInstance(TimeZone.getDefault(), Locale.ROOT);
-                    cal.set(year, month - 1, day, hour, minute, 0);
-                    metadata.set(nextMetaData, cal.getTime());
-                } else if (nextMetaData.isMultiValuePermitted()) {
-                    metadata.add(nextMetaData, pendingBuffer.toString());
-                } else {
-                    metadata.set(nextMetaData, pendingBuffer.toString());
-                }
-                nextMetaData = null;
-            }
-            pendingBuffer.setLength(0);
-        }
-
-        assert groupState.depth > 0;
-        ansiSkip = 0;
-
-        if (groupState.objdata == true) {
-            embObjHandler.handleCompletedObject();
-            groupState.objdata = false;
-        } else if (groupState.pictDepth > 0) {
-            if (groupState.sn == true) {
-                embObjHandler.endSN();
-            } else if (groupState.sv == true) {
-                embObjHandler.endSV();
-            } else if (groupState.sp == true) {
-                embObjHandler.endSP();
-            } else if (groupState.pictDepth == 1) {
-                embObjHandler.handleCompletedObject();
-            }
-        }
-
-        if (groupState.object == true) {
-            embObjHandler.setInObject(false);
-        }
-
-        // Be robust if RTF doc is corrupt (has too many
-        // closing }s):
-        // TODO: log a warning?
-        if (groupStates.size() > 0) {
-            // Restore group state:
-            final GroupState outerGroupState = groupStates.removeLast();
-
-            // Close italic, if outer does not have italic or
-            // bold changed:
-            if (groupState.italic) {
-                if (!outerGroupState.italic ||
-                        groupState.bold != outerGroupState.bold) {
-                    end("i");
-                    groupState.italic = false;
-                }
-            }
-
-            // Close bold
-            if (groupState.bold && !outerGroupState.bold) {
-                end("b");
-            }
-
-            // Open bold
-            if (!groupState.bold && outerGroupState.bold) {
-                start("b");
-            }
-
-            // Open italic
-            if (!groupState.italic && outerGroupState.italic) {
-                start("i");
-            }
-            groupState = outerGroupState;
-        }
-        assert groupStates.size() == groupState.depth;
-
-        if (fieldState == 1) {
-            String s = pendingBuffer.toString().trim();
-            pendingBuffer.setLength(0);
-            if (s.startsWith("HYPERLINK")) {
-                s = s.substring(9).trim();
-                // TODO: what other instructions can be in a
-                // HYPERLINK destination?
-                final boolean isLocalLink = s.contains("\\l ");
-                int idx = s.indexOf('"');
-                if (idx != -1) {
-                    int idx2 = s.indexOf('"', 1 + idx);
-                    if (idx2 != -1) {
-                        s = s.substring(1 + idx, idx2);
-                    }
-                }
-                pendingURL = (isLocalLink ? "#" : "") + s;
-                fieldState = 2;
-            } else {
-                fieldState = 0;
-            }
-
-            // TODO: we could process the other known field
-            // types.  Right now, we will extract their text
-            // inlined, but fail to record them in metadata
-            // as a field value.
-        } else if (fieldState == 3) {
-            out.endElement("a");
-            fieldState = 0;
-        }
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.rtf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.CodingErrorAction;
+import java.util.Calendar;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Locale;
+import java.util.Map;
+import java.util.TimeZone;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.CharsetUtils;
+import org.xml.sax.SAXException;
+
+/* Tokenizes and performs a "shallow" parse of the RTF
+ * document, just enough to properly decode the text.
+ *
+ * TODO: we should cutover to a "real" tokenizer (eg JFlex);
+ * it should give better perf, by replacing the excessive
+ * "else if" string compares with FSA traversal. */
+
+final class TextExtractor {
+
+    // Charset constants used to populate FCHARSET_MAP and ANSICPG_MAP.
+    // Every constant except ASCII is resolved through getCharset(), which
+    // substitutes ASCII when the lookup throws, so ASCII must stay the first
+    // declaration. Several code pages are declared twice under different
+    // spellings of the same name (e.g. CP12582 for "CP1258" and CP1258 for
+    // "cp1258").
+    private static final Charset ASCII = Charset.forName("US-ASCII");
+    private static final Charset WINDOWS_1252 = getCharset("WINDOWS-1252");
+    private static final Charset MAC_ROMAN = getCharset("MacRoman");
+    private static final Charset SHIFT_JIS = getCharset("Shift_JIS");
+    private static final Charset WINDOWS_57011 = getCharset("windows-57011");
+    private static final Charset WINDOWS_57010 = getCharset("windows-57010");
+    private static final Charset WINDOWS_57009 = getCharset("windows-57009");
+    private static final Charset WINDOWS_57008 = getCharset("windows-57008");
+    private static final Charset WINDOWS_57007 = getCharset("windows-57007");
+    private static final Charset WINDOWS_57006 = getCharset("windows-57006");
+    private static final Charset WINDOWS_57005 = getCharset("windows-57005");
+    private static final Charset WINDOWS_57004 = getCharset("windows-57004");
+    private static final Charset WINDOWS_57003 = getCharset("windows-57003");
+    private static final Charset X_ISCII91 = getCharset("x-ISCII91");
+    private static final Charset X_MAC_CENTRAL_EUROPE = getCharset("x-MacCentralEurope");
+    private static final Charset MAC_CYRILLIC = getCharset("MacCyrillic");
+    private static final Charset X_JOHAB = getCharset("x-Johab");
+    private static final Charset CP12582 = getCharset("CP1258");
+    private static final Charset CP12572 = getCharset("CP1257");
+    private static final Charset CP12562 = getCharset("CP1256");
+    private static final Charset CP12552 = getCharset("CP1255");
+    private static final Charset CP12542 = getCharset("CP1254");
+    private static final Charset CP12532 = getCharset("CP1253");
+    private static final Charset CP1252 = getCharset("CP1252");
+    private static final Charset CP12512 = getCharset("CP1251");
+    private static final Charset CP12502 = getCharset("CP1250");
+    private static final Charset CP950 = getCharset("CP950");
+    private static final Charset CP949 = getCharset("CP949");
+    private static final Charset MS9362 = getCharset("MS936");
+    private static final Charset MS8742 = getCharset("MS874");
+    private static final Charset CP866 = getCharset("CP866");
+    private static final Charset CP865 = getCharset("CP865");
+    private static final Charset CP864 = getCharset("CP864");
+    private static final Charset CP863 = getCharset("CP863");
+    private static final Charset CP862 = getCharset("CP862");
+    private static final Charset CP860 = getCharset("CP860");
+    private static final Charset CP852 = getCharset("CP852");
+    private static final Charset CP8502 = getCharset("CP850");
+    private static final Charset CP819 = getCharset("CP819");
+    private static final Charset WINDOWS_720 = getCharset("windows-720");
+    private static final Charset WINDOWS_711 = getCharset("windows-711");
+    private static final Charset WINDOWS_710 = getCharset("windows-710");
+    private static final Charset WINDOWS_709 = getCharset("windows-709");
+    private static final Charset ISO_8859_6 = getCharset("ISO-8859-6");
+    private static final Charset CP4372 = getCharset("CP437");
+    private static final Charset CP850 = getCharset("cp850");
+    private static final Charset CP437 = getCharset("cp437");
+    private static final Charset MS874 = getCharset("ms874");
+    private static final Charset CP1257 = getCharset("cp1257");
+    private static final Charset CP1256 = getCharset("cp1256");
+    private static final Charset CP1255 = getCharset("cp1255");
+    private static final Charset CP1258 = getCharset("cp1258");
+    private static final Charset CP1254 = getCharset("cp1254");
+    private static final Charset CP1253 = getCharset("cp1253");
+    private static final Charset MS950 = getCharset("ms950");
+    private static final Charset MS936 = getCharset("ms936");
+    private static final Charset MS1361 = getCharset("ms1361");
+    private static final Charset MS932 = getCharset("MS932");
+    private static final Charset CP1251 = getCharset("cp1251");
+    private static final Charset CP1250 = getCharset("cp1250");
+    private static final Charset MAC_THAI = getCharset("MacThai");
+    private static final Charset MAC_TURKISH = getCharset("MacTurkish");
+    private static final Charset MAC_GREEK = getCharset("MacGreek");
+    private static final Charset MAC_ARABIC = getCharset("MacArabic");
+    private static final Charset MAC_HEBREW = getCharset("MacHebrew");
+    private static final Charset JOHAB = getCharset("johab");
+    private static final Charset BIG5 = getCharset("Big5");
+    private static final Charset GB2312 = getCharset("GB2312");
+    private static final Charset MS949 = getCharset("ms949");
+    // The RTF doc has a "font table" that assigns ords
+    // (f0, f1, f2, etc.) to fonts and charsets, using the
+    // \fcharsetN control word.  This mapping maps from the
+    // N to corresponding Java charset:
+    private static final Map<Integer, Charset> FCHARSET_MAP =
+            new HashMap<Integer, Charset>();
+    // The RTF may specify the \ansicpgN charset in the
+    // header; this maps the N to the corresponding Java
+    // character set:
+    private static final Map<Integer, Charset> ANSICPG_MAP =
+            new HashMap<Integer, Charset>();
+
+    // Fills FCHARSET_MAP: \fcharsetN value -> Java charset.
+    static {
+        final Map<Integer, Charset> byFcharset = FCHARSET_MAP;
+
+        // 0 is ANSI; 1 (Default) and 2 (Symbol) are not mapped here.
+        byFcharset.put(0, WINDOWS_1252);
+
+        // Mac charsets (77-89)
+        byFcharset.put(77, MAC_ROMAN);   // Mac Roman
+        byFcharset.put(78, SHIFT_JIS);   // Mac Shift Jis
+        byFcharset.put(79, MS949);       // Mac Hangul
+        byFcharset.put(80, GB2312);      // Mac GB2312
+        byFcharset.put(81, BIG5);        // Mac Big5
+        byFcharset.put(82, JOHAB);       // Mac Johab (old)
+        byFcharset.put(83, MAC_HEBREW);  // Mac Hebrew
+        byFcharset.put(84, MAC_ARABIC);  // Mac Arabic
+        byFcharset.put(85, MAC_GREEK);   // Mac Greek
+        byFcharset.put(86, MAC_TURKISH); // Mac Turkish
+        byFcharset.put(87, MAC_THAI);    // Mac Thai
+        byFcharset.put(88, CP1250);      // Mac East Europe
+        byFcharset.put(89, CP1251);      // Mac Russian
+
+        // Windows charsets (128-186)
+        byFcharset.put(128, MS932);      // Shift JIS
+        byFcharset.put(129, MS949);      // Hangul
+        byFcharset.put(130, MS1361);     // Johab
+        byFcharset.put(134, MS936);      // GB2312
+        byFcharset.put(136, MS950);      // Big5
+        byFcharset.put(161, CP1253);     // Greek
+        byFcharset.put(162, CP1254);     // Turkish
+        byFcharset.put(163, CP1258);     // Vietnamese
+        byFcharset.put(177, CP1255);     // Hebrew
+        byFcharset.put(178, CP1256);     // Arabic
+        // No mapping for 179 (Arabic Traditional), 180 (Arabic user)
+        // or 181 (Hebrew user).
+        byFcharset.put(186, CP1257);     // Baltic
+
+        byFcharset.put(204, CP1251);     // Russian
+        byFcharset.put(222, MS874);      // Thai
+        byFcharset.put(238, CP1250);     // Eastern European
+        byFcharset.put(254, CP437);      // PC 437
+        byFcharset.put(255, CP850);      // OEM
+    }
+
+    // Fills ANSICPG_MAP: \ansicpgN code page number -> Java charset.
+    // Each code page is registered under its own number; the Arabic pages
+    // 710, 711 and 720 must not share a key, or later puts overwrite
+    // earlier ones.
+    static {
+        ANSICPG_MAP.put(437, CP4372);   // US IBM
+        ANSICPG_MAP.put(708, ISO_8859_6);   // Arabic (ASMO 708)
+
+        ANSICPG_MAP.put(709, WINDOWS_709);  // Arabic (ASMO 449+, BCON V4)
+        ANSICPG_MAP.put(710, WINDOWS_710);  // Arabic (transparent Arabic)
+        ANSICPG_MAP.put(711, WINDOWS_711);  // Arabic (Nafitha Enhanced)
+        ANSICPG_MAP.put(720, WINDOWS_720);  // Arabic (transparent ASMO)
+
+        ANSICPG_MAP.put(819, CP819);  // Windows 3.1 (US & Western Europe)
+        ANSICPG_MAP.put(850, CP8502);  // IBM Multilingual
+        ANSICPG_MAP.put(852, CP852);  // Eastern European
+        ANSICPG_MAP.put(860, CP860);  // Portuguese
+        ANSICPG_MAP.put(862, CP862);  // Hebrew
+        ANSICPG_MAP.put(863, CP863);  // French Canadian
+        ANSICPG_MAP.put(864, CP864);  // Arabic
+        ANSICPG_MAP.put(865, CP865);  // Norwegian
+        ANSICPG_MAP.put(866, CP866);  // Soviet Union
+        ANSICPG_MAP.put(874, MS8742);  // Thai
+        ANSICPG_MAP.put(932, MS932);  // Japanese
+        ANSICPG_MAP.put(936, MS9362);  // Simplified Chinese
+        ANSICPG_MAP.put(949, CP949);  // Korean
+        ANSICPG_MAP.put(950, CP950);  // Traditional Chinese
+        ANSICPG_MAP.put(1250, CP12502);  // Eastern European
+        ANSICPG_MAP.put(1251, CP12512);  // Cyrillic
+        ANSICPG_MAP.put(1252, CP1252);  // Western European
+        ANSICPG_MAP.put(1253, CP12532);  // Greek
+        ANSICPG_MAP.put(1254, CP12542);  // Turkish
+        ANSICPG_MAP.put(1255, CP12552);  // Hebrew
+        ANSICPG_MAP.put(1256, CP12562);  // Arabic
+        ANSICPG_MAP.put(1257, CP12572);  // Baltic
+        ANSICPG_MAP.put(1258, CP12582);  // Vietnamese
+        ANSICPG_MAP.put(1361, X_JOHAB);  // Johab
+        ANSICPG_MAP.put(10000, MAC_ROMAN);  // Mac Roman
+        ANSICPG_MAP.put(10001, SHIFT_JIS);  // Mac Japan
+        ANSICPG_MAP.put(10004, MAC_ARABIC);  // Mac Arabic
+        ANSICPG_MAP.put(10005, MAC_HEBREW);  // Mac Hebrew
+        ANSICPG_MAP.put(10006, MAC_GREEK);  // Mac Greek
+        ANSICPG_MAP.put(10007, MAC_CYRILLIC);  // Mac Cyrillic
+        ANSICPG_MAP.put(10029, X_MAC_CENTRAL_EUROPE);  // MAC Latin2
+        ANSICPG_MAP.put(10081, MAC_TURKISH);  // Mac Turkish
+        ANSICPG_MAP.put(57002, X_ISCII91);   // Devanagari
+
+        // TODO: in theory these other charsets are simple
+        // shifts off of Devanagari, so we could impl that
+        // here:
+        ANSICPG_MAP.put(57003, WINDOWS_57003);   // Bengali
+        ANSICPG_MAP.put(57004, WINDOWS_57004);   // Tamil
+        ANSICPG_MAP.put(57005, WINDOWS_57005);   // Telugu
+        ANSICPG_MAP.put(57006, WINDOWS_57006);   // Assamese
+        ANSICPG_MAP.put(57007, WINDOWS_57007);   // Oriya
+        ANSICPG_MAP.put(57008, WINDOWS_57008);   // Kannada
+        ANSICPG_MAP.put(57009, WINDOWS_57009);   // Malayalam
+        ANSICPG_MAP.put(57010, WINDOWS_57010);   // Gujarati
+        ANSICPG_MAP.put(57011, WINDOWS_57011);   // Punjabi
+    }
+
+    // Per-document parser state. Everything below is instance state of a
+    // single extraction run.
+
+    // Used when we decode bytes -> chars using CharsetDecoder:
+    private final char[] outputArray = new char[128];
+    private final CharBuffer outputBuffer = CharBuffer.wrap(outputArray);
+    // Holds the font table from this RTF doc, mapping
+    // the font number (from \fN control word) to the
+    // corresponding charset:
+    private final Map<Integer, Charset> fontToCharset =
+            new HashMap<Integer, Charset>();
+    // Group stack: when we open a new group, we push
+    // the previous group state onto the stack; when we
+    // close the group, we restore it
+    private final LinkedList<GroupState> groupStates = new LinkedList<GroupState>();
+    // Collects decoded text while in the header or inside a field
+    // instruction (fieldState == 1) instead of sending it to the handler:
+    private final StringBuilder pendingBuffer = new StringBuilder();
+    private final XHTMLContentHandler out;
+    private final Metadata metadata;
+    private final RTFEmbObjHandler embObjHandler;
+    // How many next ansi chars we should skip; this
+    // is 0 except when we are still in the "ansi
+    // shadow" after seeing a unicode escape, at which
+    // point it's set to the last ucN skip we had seen:
+    int ansiSkip = 0;
+    private int written = 0;
+    // Hold pending bytes (encoded in the current charset)
+    // for text output:
+    private byte[] pendingBytes = new byte[16];
+    private int pendingByteCount;
+    // View over pendingBytes; must be re-wrapped whenever the array is replaced:
+    private ByteBuffer pendingByteBuffer = ByteBuffer.wrap(pendingBytes);
+    // Holds pending chars for text output
+    private char[] pendingChars = new char[10];
+    private int pendingCharCount;
+    // Holds chars for a still-being-tokenized control word
+    private byte[] pendingControl = new byte[10];
+    private int pendingControlCount;
+    // Reused when possible:
+    private CharsetDecoder decoder;
+    private Charset lastCharset;
+    private Charset globalCharset = WINDOWS_1252;
+    private int globalDefaultFont = -1;
+    private int curFontID = -1;
+    // Current group state; in theory this initial
+    // GroupState is unused because the RTF doc should
+    // immediately open the top group (start with {):
+    private GroupState groupState = new GroupState();
+    private boolean inHeader = true;
+    private int fontTableState;
+    private int fontTableDepth;
+    // Non null if we are processing metadata (title,
+    // keywords, etc.) inside the info group:
+    private Property nextMetaData;
+    // True while a <p> or <li> opened by lazyStartParagraph is still open:
+    private boolean inParagraph;
+    // Non-zero if we are processing inside a field destination:
+    // 1 = inside \fldinst (text is collected in pendingBuffer),
+    // 2 = HYPERLINK target captured in pendingURL, waiting for \fldrslt,
+    // 3 = inside \fldrslt with an open <a> element; reset to 0 when done.
+    private int fieldState;
+    // Non-zero list index
+    private int pendingListEnd;
+    private Map<Integer, ListDescriptor> listTable = new HashMap<Integer, ListDescriptor>();
+    private Map<Integer, ListDescriptor> listOverrideTable = new HashMap<Integer, ListDescriptor>();
+    private Map<Integer, ListDescriptor> currentListTable;
+    private ListDescriptor currentList;
+    private int listTableLevel = -1;
+    private boolean ignoreLists;
+    // Non-null if we've seen the url for a HYPERLINK but not yet
+    // its text:
+    private String pendingURL;
+    // Used to process the sub-groups inside the upr
+    // group: starts at -1; when it is 0 at a group start, that group is
+    // marked ignored and the state moves to 1.
+    private int uprState = -1;
+    // Used when extracting CREATION date:
+    private int year, month, day, hour, minute;
+
+    /**
+     * Creates an extractor that writes XHTML events to {@code out}, records
+     * document properties in {@code metadata} and passes embedded-object
+     * data to {@code embObjHandler}.
+     */
+    public TextExtractor(XHTMLContentHandler out, Metadata metadata,
+                         RTFEmbObjHandler embObjHandler) {
+        this.out = out;
+        this.embObjHandler = embObjHandler;
+        this.metadata = metadata;
+    }
+
+    // Resolves a charset by name through CharsetUtils. Any failure yields
+    // ASCII instead of an exception, so the static charset constants can
+    // always be initialized.
+    private static Charset getCharset(String name) {
+        Charset resolved;
+        try {
+            resolved = CharsetUtils.forName(name);
+        } catch (Exception ignored) {
+            // Best effort: fall back to ASCII.
+            resolved = ASCII;
+        }
+        return resolved;
+    }
+
+    // True when ch is an ASCII hexadecimal digit (0-9, a-f or A-F).
+    protected static boolean isHexChar(int ch) {
+        if (ch >= '0' && ch <= '9') {
+            return true;
+        }
+        if (ch >= 'a' && ch <= 'f') {
+            return true;
+        }
+        return ch >= 'A' && ch <= 'F';
+    }
+
+    // True when ch is an ASCII letter (a-z or A-Z).
+    private static boolean isAlpha(int ch) {
+        final boolean lower = ch >= 'a' && ch <= 'z';
+        final boolean upper = ch >= 'A' && ch <= 'Z';
+        return lower || upper;
+    }
+
+    // True when ch is an ASCII decimal digit.
+    private static boolean isDigit(int ch) {
+        return '0' <= ch && ch <= '9';
+    }
+
+    // Returns the numeric value (0-15) of a hexadecimal digit. Callers are
+    // expected to have validated ch with isHexChar first; the accepted
+    // ranges here match isHexChar (a-f / A-F, not the whole alphabet), and
+    // anything else trips the assertion when assertions are enabled.
+    protected static int hexValue(int ch) {
+        if (ch >= '0' && ch <= '9') {
+            return ch - '0';
+        } else if (ch >= 'a' && ch <= 'f') {
+            return 10 + (ch - 'a');
+        } else {
+            assert ch >= 'A' && ch <= 'F' : "not a hex digit: " + ch;
+            return 10 + (ch - 'A');
+        }
+    }
+
+    /** Returns the current value of the ignoreLists flag. */
+    public boolean isIgnoringLists() {
+        return this.ignoreLists;
+    }
+
+    /** Sets the ignoreLists flag to {@code ignore}. */
+    public void setIgnoreLists(boolean ignore) {
+        ignoreLists = ignore;
+    }
+
+    // Push pending bytes or pending chars:
+    private void pushText() throws IOException, SAXException, TikaException {
+        if (pendingByteCount == 0) {
+            // No undecoded bytes: only chars can be pending.
+            pushChars();
+            return;
+        }
+        // Bytes and chars are never buffered at the same time.
+        assert pendingCharCount == 0;
+        pushBytes();
+    }
+
+    // Buffers the byte (unit in the current charset) for
+    // output:
+    private void addOutputByte(int b) throws IOException, SAXException, TikaException {
+        assert b >= 0 && b < 256 : "byte value out of range: " + b;
+
+        // Flush chars first so only one kind of pending text exists.
+        if (pendingCharCount != 0) {
+            pushChars();
+        }
+
+        if (groupState.pictDepth > 0) {
+            // Inside a pict group the byte goes to the embedded-object
+            // handler instead of the text buffer.
+            embObjHandler.writeMetadataChar((char) b);
+            return;
+        }
+
+        final int capacity = pendingBytes.length;
+        if (pendingByteCount == capacity) {
+            // Buffer full: grow by 25% and re-wrap, because the
+            // ByteBuffer is a view over the old array.
+            final byte[] grown = new byte[(int) (capacity * 1.25)];
+            System.arraycopy(pendingBytes, 0, grown, 0, capacity);
+            pendingBytes = grown;
+            pendingByteBuffer = ByteBuffer.wrap(grown);
+        }
+        pendingBytes[pendingByteCount++] = (byte) b;
+    }
+
+    // Buffers a byte as part of a control word:
+    private void addControl(int b) {
+        assert isAlpha(b);
+
+        final int capacity = pendingControl.length;
+        if (pendingControlCount == capacity) {
+            // Buffer full: grow by 25% before appending.
+            final byte[] grown = new byte[(int) (capacity * 1.25)];
+            System.arraycopy(pendingControl, 0, grown, 0, capacity);
+            pendingControl = grown;
+        }
+        pendingControl[pendingControlCount++] = (byte) b;
+    }
+
+    // Buffers a UTF16 code unit for output
+    private void addOutputChar(char ch) throws IOException, SAXException, TikaException {
+        // Decode and emit any pending bytes first so output order is kept.
+        if (pendingByteCount != 0) {
+            pushBytes();
+        }
+
+        if (inHeader || fieldState == 1) {
+            // Header text and field instructions are collected, not emitted.
+            pendingBuffer.append(ch);
+            return;
+        }
+        if (groupState.sn == true || groupState.sv == true) {
+            embObjHandler.writeMetadataChar(ch);
+            return;
+        }
+
+        final int capacity = pendingChars.length;
+        if (pendingCharCount == capacity) {
+            // Buffer full: grow by 25% before appending.
+            final char[] grown = new char[(int) (capacity * 1.25)];
+            System.arraycopy(pendingChars, 0, grown, 0, capacity);
+            pendingChars = grown;
+        }
+        pendingChars[pendingCharCount++] = ch;
+    }
+
+    // Shallow parses the entire doc, writing output to
+    // this.out and this.metadata
+    public void extract(InputStream in) throws IOException, SAXException, TikaException {
+        // Two bytes of pushback are enough for the group-start lookahead,
+        // which reads and then unreads up to two bytes.
+        final PushbackInputStream pushback = new PushbackInputStream(in, 2);
+        extract(pushback);
+    }
+
+    // Main tokenizer loop: reads the stream one byte at a time and
+    // dispatches on it until end of stream or until the outermost group
+    // has been closed, then closes the last paragraph and the document.
+    private void extract(PushbackInputStream in) throws IOException, SAXException, TikaException {
+        out.startDocument();
+
+        while (true) {
+            final int b = in.read();
+            if (b == -1) {
+                // End of stream
+                break;
+            } else if (b == '\\') {
+                // Control word, control symbol or escaped char
+                parseControlToken(in);
+            } else if (b == '{') {
+                // Flush buffered text before the group state changes
+                pushText();
+                processGroupStart(in);
+            } else if (b == '}') {
+                pushText();
+                processGroupEnd();
+                if (groupStates.isEmpty()) {
+                    // parsed document closing brace
+                    break;
+                }
+            } else if (groupState.objdata == true ||
+                    groupState.pictDepth == 1) {
+                // Raw object/picture data goes to the embedded-object handler
+                embObjHandler.writeHexChar(b);
+            } else if (b != '\r' && b != '\n'
+                    && (!groupState.ignore || nextMetaData != null ||
+                    groupState.sn == true || groupState.sv == true)) {
+                // Linefeed and carriage return are not
+                // significant
+                if (ansiSkip != 0) {
+                    // Still skipping the ansi fallback of a unicode escape
+                    ansiSkip--;
+                } else {
+                    addOutputByte(b);
+                }
+            }
+        }
+
+        endParagraph(false);
+        out.endDocument();
+    }
+
+    // Handles the token following a backslash: a hex escape, a control
+    // word, an escaped literal char, or a control symbol.
+    private void parseControlToken(PushbackInputStream in) throws IOException, SAXException, TikaException {
+        final int next = in.read();
+        if (next == -1) {
+            // Stream ended right after the backslash: nothing to process.
+            return;
+        }
+        if (next == '\'') {
+            // escaped hex char
+            parseHexChar(in);
+            return;
+        }
+        if (isAlpha(next)) {
+            // control word
+            parseControlWord((char) next, in);
+            return;
+        }
+
+        switch (next) {
+            case '{':
+            case '}':
+            case '\\':
+            case '\r':
+            case '\n':
+                // escaped char
+                addOutputByte(next);
+                break;
+            default:
+                // control symbol, eg \* or \~
+                processControlSymbol((char) next);
+                break;
+        }
+    }
+
+    // Parses the two hex digits of a \'hh escape and buffers the resulting
+    // byte, unless it falls in the ansi shadow of a unicode escape. A
+    // malformed escape is dropped and the offending byte is pushed back so
+    // the main loop sees it again.
+    private void parseHexChar(PushbackInputStream in) throws IOException, SAXException, TikaException {
+        int hex1 = in.read();
+        if (!isHexChar(hex1)) {
+            // DOC ERROR (malformed hex escape): ignore.
+            // At end of stream there is nothing to push back; unread(-1)
+            // would insert a bogus 0xFF byte.
+            if (hex1 != -1) {
+                in.unread(hex1);
+            }
+            return;
+        }
+
+        int hex2 = in.read();
+        if (!isHexChar(hex2)) {
+            // TODO: log a warning here, somehow?
+            // DOC ERROR (malformed hex escape):
+            // ignore
+            if (hex2 != -1) {
+                in.unread(hex2);
+            }
+            return;
+        }
+
+        if (ansiSkip != 0) {
+            // Skip this ansi char since we are
+            // still in the shadow of a unicode
+            // escape:
+            ansiSkip--;
+        } else {
+            // Unescape:
+            addOutputByte(16 * hexValue(hex1) + hexValue(hex2));
+        }
+    }
+
+    // Tokenizes a control word whose first letter has already been read:
+    // collects the remaining letters, an optional signed decimal parameter
+    // and the optional single-space delimiter, then dispatches to
+    // processControlWord with or without the parameter.
+    private void parseControlWord(int firstChar, PushbackInputStream in) throws IOException, SAXException, TikaException {
+        addControl(firstChar);
+
+        int b = in.read();
+        while (isAlpha(b)) {
+            addControl(b);
+            b = in.read();
+        }
+
+        boolean hasParam = false;
+        boolean negParam = false;
+        if (b == '-') {
+            negParam = true;
+            hasParam = true;
+            b = in.read();
+        }
+
+        int param = 0;
+        while (isDigit(b)) {
+            param *= 10;
+            param += (b - '0');
+            hasParam = true;
+            b = in.read();
+        }
+
+        // space is consumed as part of the
+        // control word, but is not added to the
+        // control word. At end of stream (-1) there is
+        // nothing to push back; unread(-1) would insert
+        // a bogus 0xFF byte.
+        if (b != ' ' && b != -1) {
+            in.unread(b);
+        }
+
+        if (hasParam) {
+            if (negParam) {
+                param = -param;
+            }
+            processControlWord(param, in);
+        } else {
+            processControlWord();
+        }
+
+        // Reset the control-word buffer for the next token.
+        pendingControlCount = 0;
+    }
+
+    // Opens a paragraph (<p>, or <li> when in a list) if none is open,
+    // closing and reopening bold/italic around it and starting or ending
+    // list elements as needed.
+    private void lazyStartParagraph() throws IOException, SAXException, TikaException {
+        if (inParagraph) {
+            return;
+        }
+
+        // Ensure </i></b> order
+        if (groupState.italic) {
+            end("i");
+        }
+        if (groupState.bold) {
+            end("b");
+        }
+
+        // Close a list that was left pending, unless we are still in it.
+        if (pendingListEnd != 0 && groupState.list != pendingListEnd) {
+            endList(pendingListEnd);
+            pendingListEnd = 0;
+        }
+        if (inList() && pendingListEnd != groupState.list) {
+            startList(groupState.list);
+        }
+
+        out.startElement(inList() ? "li" : "p");
+
+        // Ensure <b><i> order
+        if (groupState.bold) {
+            start("b");
+        }
+        if (groupState.italic) {
+            start("i");
+        }
+        inParagraph = true;
+    }
+
+    // Flushes pending text and closes the current paragraph (<p> or <li>),
+    // opening one first if none is open so that empty paragraphs are still
+    // emitted. When preserveStyles is true and bold or italic was active, a
+    // new <p> is opened immediately with those styles re-applied; otherwise
+    // the styles are cleared and any pending list is closed.
+    private void endParagraph(boolean preserveStyles) throws IOException, SAXException, TikaException {
+        pushText();
+        //maintain consecutive new lines
+        if (!inParagraph) {
+            lazyStartParagraph();
+        }
+        if (inParagraph) {
+            // Close styles in </i></b> order; the flags keep their value
+            // only when styles are to be carried into the next paragraph.
+            if (groupState.italic) {
+                end("i");
+                groupState.italic = preserveStyles;
+            }
+            if (groupState.bold) {
+                end("b");
+                groupState.bold = preserveStyles;
+            }
+            if (inList()) {
+                out.endElement("li");
+            } else {
+                out.endElement("p");
+            }
+
+            if (preserveStyles && (groupState.bold || groupState.italic)) {
+                // Reopen a paragraph right away with <b><i> order
+                start("p");
+                if (groupState.bold) {
+                    start("b");
+                }
+                if (groupState.italic) {
+                    start("i");
+                }
+                inParagraph = true;
+            } else {
+                inParagraph = false;
+            }
+        }
+
+        // Ensure closing the list at document end
+        if (!preserveStyles && pendingListEnd != 0) {
+            endList(pendingListEnd);
+            pendingListEnd = 0;
+        }
+    }
+
+    // Push pending UTF16 units to out ContentHandler
+    private void pushChars() throws IOException, SAXException, TikaException {
+        if (pendingCharCount == 0) {
+            // Nothing buffered.
+            return;
+        }
+        lazyStartParagraph();
+        out.characters(pendingChars, 0, pendingCharCount);
+        pendingCharCount = 0;
+    }
+
+    // Decodes the buffered bytes in pendingBytes
+    // into UTF16 code units, and sends the characters
+    // to the out ContentHandler, if we are in the body,
+    // else appends the characters to the pendingBuffer
+    private void pushBytes() throws IOException, SAXException, TikaException {
+        if (pendingByteCount > 0 && (!groupState.ignore || nextMetaData != null)) {
+
+            final CharsetDecoder decoder = getDecoder();
+            pendingByteBuffer.limit(pendingByteCount);
+            assert pendingByteBuffer.position() == 0;
+            assert outputBuffer.position() == 0;
+
+            while (true) {
+                // We pass true for endOfInput because, when
+                // we are called, we should have seen a
+                // complete sequence of characters for this
+                // charset:
+                final CoderResult result = decoder.decode(pendingByteBuffer, outputBuffer, true);
+
+                final int pos = outputBuffer.position();
+                if (pos > 0) {
+                    if (inHeader || fieldState == 1) {
+                        pendingBuffer.append(outputArray, 0, pos);
+                    } else {
+                        lazyStartParagraph();
+                        out.characters(outputArray, 0, pos);
+                    }
+                    outputBuffer.position(0);
+                }
+
+                if (result == CoderResult.UNDERFLOW) {
+                    break;
+                }
+            }
+
+            while (true) {
+                final CoderResult result = decoder.flush(outputBuffer);
+
+                final int pos = outputBuffer.position();
+                if (pos > 0) {
+                    if (inHeader || fieldState == 1) {
+                        pendingBuffer.append(outputArray, 0, pos);
+                    } else {
+                        lazyStartParagraph();
+                        out.characters(outputArray, 0, pos);
+                    }
+                    outputBuffer.position(0);
+                }
+
+                if (result == CoderResult.UNDERFLOW) {
+                    break;
+                }
+            }
+
+            // Reset for next decode
+            decoder.reset();
+            pendingByteBuffer.position(0);
+        }
+
+        pendingByteCount = 0;
+    }
+
+    // NOTE: s must be ascii alpha only
+    private boolean equals(String s) {
+        if (pendingControlCount != s.length()) {
+            return false;
+        }
+        for (int idx = 0; idx < pendingControlCount; idx++) {
+            assert isAlpha(s.charAt(idx));
+            if (((byte) s.charAt(idx)) != pendingControl[idx]) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    private void processControlSymbol(char ch) throws IOException, SAXException, TikaException {
+        switch (ch) {
+            case '~':
+                // Non-breaking space -> unicode NON-BREAKING SPACE
+                addOutputChar('\u00a0');
+                break;
+            case '*':
+                // Ignorable destination (control words defined after
+                // the 1987 RTF spec). These are already handled by
+                // processGroupStart()
+                break;
+            case '-':
+                // Optional hyphen -> unicode SOFT HYPHEN
+                addOutputChar('\u00ad');
+                break;
+            case '_':
+                // Non-breaking hyphen -> unicode NON-BREAKING HYPHEN
+                addOutputChar('\u2011');
+                break;
+            default:
+                break;
+        }
+    }
+
+    private CharsetDecoder getDecoder() throws TikaException {
+        Charset charset = getCharset();
+
+        // Common case: charset is same as last time, so
+        // just reuse it:
+        if (lastCharset == null || !charset.equals(lastCharset)) {
+            decoder = charset.newDecoder();
+            decoder.onMalformedInput(CodingErrorAction.REPLACE);
+            decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
+            lastCharset = charset;
+        }
+
+        return decoder;
+    }
+
+    // Return current charset in-use
+    private Charset getCharset() throws TikaException {
+        // If a specific font (fN) was set, use its charset
+        if (groupState.fontCharset != null) {
+            return groupState.fontCharset;
+        }
+
+        // Else, if global default font (defN) was set, use that one
+        if (globalDefaultFont != -1 && !inHeader) {
+            Charset cs = fontToCharset.get(globalDefaultFont);
+            if (cs != null) {
+                return cs;
+            }
+        }
+
+        /

<TRUNCATED>

[13/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java
index 6336258..c60f955 100644
--- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java
+++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java
@@ -1,174 +1,174 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.iwork;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-class KeynoteContentHandler extends DefaultHandler {
-
-    public final static String PRESENTATION_WIDTH = "slides-width";
-    public final static String PRESENTATION_HEIGHT = "slides-height";
-
-    private final XHTMLContentHandler xhtml;
-    private final Metadata metadata;
-
-    private boolean inSlide = false;
-    private boolean inTheme = false;
-    private boolean inTitle = false;
-    private boolean inBody = false;
-    private String tableId;
-    private Integer numberOfColumns = null;
-    private Integer currentColumn = null;
-
-    private boolean inMetadata = false;
-    private boolean inMetaDataTitle = false;
-    private boolean inMetaDataAuthors = false;
-
-    private boolean inParsableText = false;
-
-    private int numberOfSlides = 0;
-
-    KeynoteContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
-        this.xhtml = xhtml;
-        this.metadata = metadata;
-    }
-
-    @Override
-    public void endDocument() throws SAXException {
-        metadata.set(Metadata.SLIDE_COUNT, String.valueOf(numberOfSlides));
-    }
-
-    @Override
-    public void startElement(
-            String uri, String localName, String qName, Attributes attributes)
-            throws SAXException {
-        if ("key:theme".equals(qName)) {
-            inTheme = true;
-        } else if ("key:slide".equals(qName)) {
-            inSlide = true;
-            numberOfSlides++;
-            xhtml.startElement("div");
-        } else if ("key:master-slide".equals(qName)) {
-            inSlide = true;
-            xhtml.startElement("div");
-        } else if ("key:title-placeholder".equals(qName) && inSlide) {
-            inTitle = true;
-            xhtml.startElement("h1");
-        } else if ("sf:sticky-note".equals(qName) && inSlide) {
-            xhtml.startElement("p");
-        } else if ("key:notes".equals(qName) && inSlide) {
-            xhtml.startElement("p");
-        } else if ("key:body-placeholder".equals(qName) && inSlide) {
-            xhtml.startElement("p");
-            inBody = true;
-        } else if ("key:size".equals(qName) && !inTheme) {
-            String width = attributes.getValue("sfa:w");
-            String height = attributes.getValue("sfa:h");
-            metadata.set(PRESENTATION_WIDTH, width);
-            metadata.set(PRESENTATION_HEIGHT, height);
-        } else if ("sf:text-body".equals(qName)) {
-            inParsableText = true;
-        } else if ("key:metadata".equals(qName)) {
-            inMetadata = true;
-        } else if (inMetadata && "key:title".equals(qName)) {
-            inMetaDataTitle = true;
-        } else if (inMetadata && "key:authors".equals(qName)) {
-            inMetaDataAuthors = true;
-        } else if (inMetaDataTitle && "key:string".equals(qName)) {
-            metadata.set(TikaCoreProperties.TITLE, attributes.getValue("sfa:string"));
-        } else if (inMetaDataAuthors && "key:string".equals(qName)) {
-            metadata.add(TikaCoreProperties.CREATOR, attributes.getValue("sfa:string"));
-        } else if (inSlide && "sf:tabular-model".equals(qName)) {
-            tableId = attributes.getValue("sfa:ID");
-            xhtml.startElement("table");
-        } else if (tableId != null && "sf:columns".equals(qName)) {
-            numberOfColumns = Integer.parseInt(attributes.getValue("sf:count"));
-            currentColumn = 0;
-        } else if (tableId != null && "sf:ct".equals(qName)) {
-            parseTableData(attributes.getValue("sfa:s"));
-        } else if (tableId != null && "sf:n".equals(qName)) {
-            parseTableData(attributes.getValue("sf:v"));
-        } else if ("sf:p".equals(qName)) {
-            xhtml.startElement("p");
-        }
-    }
-
-    @Override
-    public void endElement(String uri, String localName, String qName)
-            throws SAXException {
-        if ("key:theme".equals(qName)) {
-            inTheme = false;
-        } else if ("key:slide".equals(qName)) {
-            inSlide = false;
-            xhtml.endElement("div");
-        } else if ("key:master-slide".equals(qName)) {
-            inSlide = false;
-            xhtml.endElement("div");
-        } else if ("key:title-placeholder".equals(qName) && inSlide) {
-            inTitle = false;
-            xhtml.endElement("h1");
-        } else if ("sf:sticky-note".equals(qName) && inSlide) {
-            xhtml.endElement("p");
-        } else if ("key:notes".equals(qName) && inSlide) {
-            xhtml.endElement("p");
-        } else if ("key:body-placeholder".equals(qName) && inSlide) {
-            xhtml.endElement("p");
-            inBody = false;
-        } else if ("sf:text-body".equals(qName)) {
-            inParsableText = false;
-        } else if ("key:metadata".equals(qName)) {
-            inMetadata = false;
-        } else if (inMetadata && "key:title".equals(qName)) {
-            inMetaDataTitle = false;
-        } else if (inMetadata && "key:authors".equals(qName)) {
-            inMetaDataAuthors = false;
-        } else if (inSlide && "sf:tabular-model".equals(qName)) {
-            xhtml.endElement("table");
-            tableId = null;
-            numberOfColumns = null;
-            currentColumn = null;
-        } else if ("sf:p".equals(qName)) {
-            xhtml.endElement("p");
-        }
-    }
-
-    @Override
-    public void characters(char[] ch, int start, int length)
-            throws SAXException {
-        if (inParsableText && inSlide && length != 0) {
-            xhtml.characters(ch, start, length);
-        }
-    }
-
-    private void parseTableData(String value) throws SAXException {
-      if (currentColumn == 0) {
-          xhtml.startElement("tr");
-      }
-
-      xhtml.element("td", value);
-
-      if (currentColumn.equals(numberOfColumns)) {
-          xhtml.endElement("tr");
-      }
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+class KeynoteContentHandler extends DefaultHandler {
+
+    public final static String PRESENTATION_WIDTH = "slides-width";
+    public final static String PRESENTATION_HEIGHT = "slides-height";
+
+    private final XHTMLContentHandler xhtml;
+    private final Metadata metadata;
+
+    private boolean inSlide = false;
+    private boolean inTheme = false;
+    private boolean inTitle = false;
+    private boolean inBody = false;
+    private String tableId;
+    private Integer numberOfColumns = null;
+    private Integer currentColumn = null;
+
+    private boolean inMetadata = false;
+    private boolean inMetaDataTitle = false;
+    private boolean inMetaDataAuthors = false;
+
+    private boolean inParsableText = false;
+
+    private int numberOfSlides = 0;
+
+    KeynoteContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
+        this.xhtml = xhtml;
+        this.metadata = metadata;
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+        metadata.set(Metadata.SLIDE_COUNT, String.valueOf(numberOfSlides));
+    }
+
+    @Override
+    public void startElement(
+            String uri, String localName, String qName, Attributes attributes)
+            throws SAXException {
+        if ("key:theme".equals(qName)) {
+            inTheme = true;
+        } else if ("key:slide".equals(qName)) {
+            inSlide = true;
+            numberOfSlides++;
+            xhtml.startElement("div");
+        } else if ("key:master-slide".equals(qName)) {
+            inSlide = true;
+            xhtml.startElement("div");
+        } else if ("key:title-placeholder".equals(qName) && inSlide) {
+            inTitle = true;
+            xhtml.startElement("h1");
+        } else if ("sf:sticky-note".equals(qName) && inSlide) {
+            xhtml.startElement("p");
+        } else if ("key:notes".equals(qName) && inSlide) {
+            xhtml.startElement("p");
+        } else if ("key:body-placeholder".equals(qName) && inSlide) {
+            xhtml.startElement("p");
+            inBody = true;
+        } else if ("key:size".equals(qName) && !inTheme) {
+            String width = attributes.getValue("sfa:w");
+            String height = attributes.getValue("sfa:h");
+            metadata.set(PRESENTATION_WIDTH, width);
+            metadata.set(PRESENTATION_HEIGHT, height);
+        } else if ("sf:text-body".equals(qName)) {
+            inParsableText = true;
+        } else if ("key:metadata".equals(qName)) {
+            inMetadata = true;
+        } else if (inMetadata && "key:title".equals(qName)) {
+            inMetaDataTitle = true;
+        } else if (inMetadata && "key:authors".equals(qName)) {
+            inMetaDataAuthors = true;
+        } else if (inMetaDataTitle && "key:string".equals(qName)) {
+            metadata.set(TikaCoreProperties.TITLE, attributes.getValue("sfa:string"));
+        } else if (inMetaDataAuthors && "key:string".equals(qName)) {
+            metadata.add(TikaCoreProperties.CREATOR, attributes.getValue("sfa:string"));
+        } else if (inSlide && "sf:tabular-model".equals(qName)) {
+            tableId = attributes.getValue("sfa:ID");
+            xhtml.startElement("table");
+        } else if (tableId != null && "sf:columns".equals(qName)) {
+            numberOfColumns = Integer.parseInt(attributes.getValue("sf:count"));
+            currentColumn = 0;
+        } else if (tableId != null && "sf:ct".equals(qName)) {
+            parseTableData(attributes.getValue("sfa:s"));
+        } else if (tableId != null && "sf:n".equals(qName)) {
+            parseTableData(attributes.getValue("sf:v"));
+        } else if ("sf:p".equals(qName)) {
+            xhtml.startElement("p");
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName)
+            throws SAXException {
+        if ("key:theme".equals(qName)) {
+            inTheme = false;
+        } else if ("key:slide".equals(qName)) {
+            inSlide = false;
+            xhtml.endElement("div");
+        } else if ("key:master-slide".equals(qName)) {
+            inSlide = false;
+            xhtml.endElement("div");
+        } else if ("key:title-placeholder".equals(qName) && inSlide) {
+            inTitle = false;
+            xhtml.endElement("h1");
+        } else if ("sf:sticky-note".equals(qName) && inSlide) {
+            xhtml.endElement("p");
+        } else if ("key:notes".equals(qName) && inSlide) {
+            xhtml.endElement("p");
+        } else if ("key:body-placeholder".equals(qName) && inSlide) {
+            xhtml.endElement("p");
+            inBody = false;
+        } else if ("sf:text-body".equals(qName)) {
+            inParsableText = false;
+        } else if ("key:metadata".equals(qName)) {
+            inMetadata = false;
+        } else if (inMetadata && "key:title".equals(qName)) {
+            inMetaDataTitle = false;
+        } else if (inMetadata && "key:authors".equals(qName)) {
+            inMetaDataAuthors = false;
+        } else if (inSlide && "sf:tabular-model".equals(qName)) {
+            xhtml.endElement("table");
+            tableId = null;
+            numberOfColumns = null;
+            currentColumn = null;
+        } else if ("sf:p".equals(qName)) {
+            xhtml.endElement("p");
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length)
+            throws SAXException {
+        if (inParsableText && inSlide && length != 0) {
+            xhtml.characters(ch, start, length);
+        }
+    }
+
+    private void parseTableData(String value) throws SAXException {
+      if (currentColumn == 0) {
+          xhtml.startElement("tr");
+      }
+
+      xhtml.element("td", value);
+
+      if (currentColumn.equals(numberOfColumns)) {
+          xhtml.endElement("tr");
+      }
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java
index 5dc57ae..0d3dfd1 100644
--- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java
+++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java
@@ -1,231 +1,231 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.iwork;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-import java.util.HashMap;
-import java.util.Map;
-
-class NumbersContentHandler extends DefaultHandler {
-
-    private final XHTMLContentHandler xhtml;
-    private final Metadata metadata;
-
-    private boolean inSheet = false;
-
-    private boolean inText = false;
-    private boolean parseText = false;
-
-    private boolean inMetadata = false;
-    private Property metadataKey;
-    private String metadataPropertyQName;
-
-    private boolean inTable = false;
-    private int numberOfSheets = 0;
-    private int numberOfColumns = -1;
-    private int currentColumn = 0;
-
-    private Map<String, String> menuItems = new HashMap<String, String>();
-    private String currentMenuItemId;
-
-    NumbersContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
-        this.xhtml = xhtml;
-        this.metadata = metadata;
-    }
-
-    @Override
-    public void endDocument() throws SAXException {
-        metadata.set(Metadata.PAGE_COUNT, String.valueOf(numberOfSheets));
-    }
-
-    @Override
-    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
-        if ("ls:workspace".equals(qName)) {
-            inSheet = true;
-            numberOfSheets++;
-            xhtml.startElement("div");
-            String sheetName = attributes.getValue("ls:workspace-name");
-            metadata.add("sheetNames", sheetName);
-        }
-
-        if ("sf:text".equals(qName)) {
-            inText = true;
-            xhtml.startElement("p");
-        }
-
-        if ("sf:p".equals(qName)) {
-            parseText = true;
-        }
-
-        if ("sf:metadata".equals(qName)) {
-            inMetadata = true;
-            return;
-        }
-
-        if (inMetadata && metadataKey == null) {
-            metadataKey = resolveMetadataKey(localName);
-            metadataPropertyQName = qName;
-        }
-
-        if (inMetadata && metadataKey != null && "sf:string".equals(qName)) {
-            metadata.add(metadataKey, attributes.getValue("sfa:string"));
-        }
-
-        if (!inSheet) {
-            return;
-        }
-
-        if ("sf:tabular-model".equals(qName)) {
-            String tableName = attributes.getValue("sf:name");
-            xhtml.startElement("div");
-            xhtml.characters(tableName);
-            xhtml.endElement("div");
-            inTable = true;
-            xhtml.startElement("table");
-            xhtml.startElement("tr");
-            currentColumn = 0;
-        }
-
-        if ("sf:menu-choices".equals(qName)) {
-            menuItems = new HashMap<String, String>();
-        }
-
-        if (inTable && "sf:grid".equals(qName)) {
-            numberOfColumns = Integer.parseInt(attributes.getValue("sf:numcols"));
-        }
-
-        if (menuItems != null && "sf:t".equals(qName)) {
-            currentMenuItemId = attributes.getValue("sfa:ID");
-        }
-
-        if (currentMenuItemId != null && "sf:ct".equals(qName)) {
-            menuItems.put(currentMenuItemId, attributes.getValue("sfa:s"));
-        }
-
-        if (inTable && "sf:ct".equals(qName)) {
-            if (currentColumn >= numberOfColumns) {
-                currentColumn = 0;
-                xhtml.endElement("tr");
-                xhtml.startElement("tr");
-            }
-
-            xhtml.element("td", attributes.getValue("sfa:s"));
-            currentColumn++;
-        }
-
-        if (inTable && ("sf:n".equals(qName) || "sf:rn".equals(qName))) {
-            if (currentColumn >= numberOfColumns) {
-                currentColumn = 0;
-                xhtml.endElement("tr");
-                xhtml.startElement("tr");
-            }
-
-            xhtml.element("td", attributes.getValue("sf:v"));
-            currentColumn++;
-        }
-
-        if (inTable && "sf:proxied-cell-ref".equals(qName)) {
-            if (currentColumn >= numberOfColumns) {
-                currentColumn = 0;
-                xhtml.endElement("tr");
-                xhtml.startElement("tr");
-            }
-
-            xhtml.element("td", menuItems.get(attributes.getValue("sfa:IDREF")));
-            currentColumn++;
-        }
-
-        if ("sf:chart-name".equals(qName)) {
-            // Extract chart name:
-            xhtml.startElement("div", "class", "chart");
-            xhtml.startElement("h1");
-            xhtml.characters(attributes.getValue("sfa:string"));
-            xhtml.endElement("h1");
-            xhtml.endElement("div");
-        }
-    }
-
-    @Override
-    public void characters(char[] ch, int start, int length) throws SAXException {
-        if (parseText && length > 0) {
-            xhtml.characters(ch, start, length);
-        }
-    }
-
-    @Override
-    public void endElement(String uri, String localName, String qName) throws SAXException {
-        if ("ls:workspace".equals(qName)) {
-            inSheet = false;
-            xhtml.endElement("div");
-        }
-
-        if ("sf:text".equals(qName)) {
-            inText = false;
-            xhtml.endElement("p");
-        }
-
-        if ("sf:p".equals(qName)) {
-            parseText = false;
-        }
-
-        if ("sf:metadata".equals(qName)) {
-            inMetadata = false;
-        }
-
-        if (inMetadata && qName.equals(metadataPropertyQName)) {
-            metadataPropertyQName = null;
-            metadataKey = null;
-        }
-
-        if (!inSheet) {
-            return;
-        }
-
-        if ("sf:menu-choices".equals(qName)) {
-        }
-
-        if ("sf:tabular-model".equals(qName)) {
-            inTable = false;
-            xhtml.endElement("tr");
-            xhtml.endElement("table");
-        }
-
-        if (currentMenuItemId != null && "sf:t".equals(qName)) {
-            currentMenuItemId = null;
-        }
-    }
-
-    private Property resolveMetadataKey(String localName) {
-        if ("authors".equals(localName)) {
-            return TikaCoreProperties.CREATOR;
-        }
-        if ("title".equals(localName)) {
-            return TikaCoreProperties.TITLE;
-        }
-        if ("comment".equals(localName)) {
-            return TikaCoreProperties.COMMENTS;
-        }
-        return Property.internalText(localName);
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.util.HashMap;
+import java.util.Map;
+
+class NumbersContentHandler extends DefaultHandler {
+
+    private final XHTMLContentHandler xhtml;
+    private final Metadata metadata;
+
+    private boolean inSheet = false;
+
+    private boolean inText = false;
+    private boolean parseText = false;
+
+    private boolean inMetadata = false;
+    private Property metadataKey;
+    private String metadataPropertyQName;
+
+    private boolean inTable = false;
+    private int numberOfSheets = 0;
+    private int numberOfColumns = -1;
+    private int currentColumn = 0;
+
+    private Map<String, String> menuItems = new HashMap<String, String>();
+    private String currentMenuItemId;
+
+    NumbersContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
+        this.xhtml = xhtml;
+        this.metadata = metadata;
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+        metadata.set(Metadata.PAGE_COUNT, String.valueOf(numberOfSheets));
+    }
+
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
+        if ("ls:workspace".equals(qName)) {
+            inSheet = true;
+            numberOfSheets++;
+            xhtml.startElement("div");
+            String sheetName = attributes.getValue("ls:workspace-name");
+            metadata.add("sheetNames", sheetName);
+        }
+
+        if ("sf:text".equals(qName)) {
+            inText = true;
+            xhtml.startElement("p");
+        }
+
+        if ("sf:p".equals(qName)) {
+            parseText = true;
+        }
+
+        if ("sf:metadata".equals(qName)) {
+            inMetadata = true;
+            return;
+        }
+
+        if (inMetadata && metadataKey == null) {
+            metadataKey = resolveMetadataKey(localName);
+            metadataPropertyQName = qName;
+        }
+
+        if (inMetadata && metadataKey != null && "sf:string".equals(qName)) {
+            metadata.add(metadataKey, attributes.getValue("sfa:string"));
+        }
+
+        if (!inSheet) {
+            return;
+        }
+
+        if ("sf:tabular-model".equals(qName)) {
+            String tableName = attributes.getValue("sf:name");
+            xhtml.startElement("div");
+            xhtml.characters(tableName);
+            xhtml.endElement("div");
+            inTable = true;
+            xhtml.startElement("table");
+            xhtml.startElement("tr");
+            currentColumn = 0;
+        }
+
+        if ("sf:menu-choices".equals(qName)) {
+            menuItems = new HashMap<String, String>();
+        }
+
+        if (inTable && "sf:grid".equals(qName)) {
+            numberOfColumns = Integer.parseInt(attributes.getValue("sf:numcols"));
+        }
+
+        if (menuItems != null && "sf:t".equals(qName)) {
+            currentMenuItemId = attributes.getValue("sfa:ID");
+        }
+
+        if (currentMenuItemId != null && "sf:ct".equals(qName)) {
+            menuItems.put(currentMenuItemId, attributes.getValue("sfa:s"));
+        }
+
+        if (inTable && "sf:ct".equals(qName)) {
+            if (currentColumn >= numberOfColumns) {
+                currentColumn = 0;
+                xhtml.endElement("tr");
+                xhtml.startElement("tr");
+            }
+
+            xhtml.element("td", attributes.getValue("sfa:s"));
+            currentColumn++;
+        }
+
+        if (inTable && ("sf:n".equals(qName) || "sf:rn".equals(qName))) {
+            if (currentColumn >= numberOfColumns) {
+                currentColumn = 0;
+                xhtml.endElement("tr");
+                xhtml.startElement("tr");
+            }
+
+            xhtml.element("td", attributes.getValue("sf:v"));
+            currentColumn++;
+        }
+
+        if (inTable && "sf:proxied-cell-ref".equals(qName)) {
+            if (currentColumn >= numberOfColumns) {
+                currentColumn = 0;
+                xhtml.endElement("tr");
+                xhtml.startElement("tr");
+            }
+
+            xhtml.element("td", menuItems.get(attributes.getValue("sfa:IDREF")));
+            currentColumn++;
+        }
+
+        if ("sf:chart-name".equals(qName)) {
+            // Extract chart name:
+            xhtml.startElement("div", "class", "chart");
+            xhtml.startElement("h1");
+            xhtml.characters(attributes.getValue("sfa:string"));
+            xhtml.endElement("h1");
+            xhtml.endElement("div");
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        if (parseText && length > 0) {
+            xhtml.characters(ch, start, length);
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+        if ("ls:workspace".equals(qName)) {
+            inSheet = false;
+            xhtml.endElement("div");
+        }
+
+        if ("sf:text".equals(qName)) {
+            inText = false;
+            xhtml.endElement("p");
+        }
+
+        if ("sf:p".equals(qName)) {
+            parseText = false;
+        }
+
+        if ("sf:metadata".equals(qName)) {
+            inMetadata = false;
+        }
+
+        if (inMetadata && qName.equals(metadataPropertyQName)) {
+            metadataPropertyQName = null;
+            metadataKey = null;
+        }
+
+        if (!inSheet) {
+            return;
+        }
+
+        if ("sf:menu-choices".equals(qName)) {
+        }
+
+        if ("sf:tabular-model".equals(qName)) {
+            inTable = false;
+            xhtml.endElement("tr");
+            xhtml.endElement("table");
+        }
+
+        if (currentMenuItemId != null && "sf:t".equals(qName)) {
+            currentMenuItemId = null;
+        }
+    }
+
+    private Property resolveMetadataKey(String localName) {
+        if ("authors".equals(localName)) {
+            return TikaCoreProperties.CREATOR;
+        }
+        if ("title".equals(localName)) {
+            return TikaCoreProperties.TITLE;
+        }
+        if ("comment".equals(localName)) {
+            return TikaCoreProperties.COMMENTS;
+        }
+        return Property.internalText(localName);
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
index b09b36f..9b45769 100644
--- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
+++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
@@ -1,448 +1,448 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.iwork;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-class PagesContentHandler extends DefaultHandler {
-
-    private final XHTMLContentHandler xhtml;
-    private final Metadata metadata;
-
-    /** The (interesting) part of the document we're in. Should be more structured... */
-    private enum DocumentPart {
-       METADATA, PARSABLE_TEXT, 
-       HEADERS, HEADER_ODD, HEADER_EVEN, HEADER_FIRST,
-       FOOTERS, FOOTER_ODD, FOOTER_EVEN, FOOTER_FIRST,
-       FOOTNOTES, ANNOTATIONS;
-    }
-    private DocumentPart inPart = null;
-    private boolean ghostText;
-    
-    private static String alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
-
-    private boolean parseProperty = false;
-    private int pageCount = 0;
-    private int slPageCount = 0;
-
-    private HeaderFooter headers = null;
-    private HeaderFooter footers = null;
-    private Footnotes footnotes = null; 
-    private Annotations annotations = null; 
-    
-    private Map<String, List<List<String>>> tableData =
-        new HashMap<String, List<List<String>>>();
-    private String activeTableId;
-    private int numberOfColumns = 0;
-    private List<String> activeRow = new ArrayList<String>();
-
-    private String metaDataLocalName;
-    private String metaDataQName;
-
-    PagesContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
-        this.xhtml = xhtml;
-        this.metadata = metadata;
-    }
-
-    @Override
-    public void endDocument() throws SAXException {
-        metadata.set(Metadata.PAGE_COUNT, String.valueOf(pageCount));
-        if (pageCount > 0) {
-            doFooter();
-            xhtml.endElement("div");
-        }
-    }
-
-    @Override
-    public void startElement(
-            String uri, String localName, String qName, Attributes attributes)
-            throws SAXException {
-        if (parseProperty) {
-            String value = parsePrimitiveElementValue(qName, attributes);
-            if (value != null) {
-                Object metaDataKey = resolveMetaDataKey(metaDataLocalName);
-                if(metaDataKey instanceof Property) {
-                    metadata.set((Property)metaDataKey, value);
-                } else {
-                    metadata.add((String)metaDataKey, value);
-                }
-            }
-        }
-
-        if ("sl:publication-info".equals(qName)) {
-            inPart = DocumentPart.METADATA;
-        } else if ("sf:metadata".equals(qName)) {
-           inPart = DocumentPart.METADATA;
-        } else if ("sf:page-start".equals(qName) || "sl:page-group".equals(qName)) {
-            if (pageCount > 0) {
-                doFooter();
-                xhtml.endElement("div");
-            }
-            xhtml.startElement("div");
-            if ("sl:page-group".equals(qName)) {
-                slPageCount++;
-            } else {
-                pageCount++;
-            }
-            doHeader();
-        } else if ("sf:p".equals(qName)) {
-          if (pageCount+slPageCount > 0) {
-            inPart = DocumentPart.PARSABLE_TEXT;
-            xhtml.startElement("p");
-          }
-        } else if ("sf:attachment".equals(qName)) {
-            String kind = attributes.getValue("sf:kind");
-            if ("tabular-attachment".equals(kind)) {
-                activeTableId = attributes.getValue("sfa:ID");
-                tableData.put(activeTableId, new ArrayList<List<String>>());
-            }
-        } else if ("sf:attachment-ref".equals(qName)) {
-            String idRef = attributes.getValue("sfa:IDREF");
-            outputTable(idRef);
-        } else if ("sf:headers".equals(qName)) {
-            headers = new HeaderFooter(qName);
-            inPart = DocumentPart.HEADERS;
-        } else if ("sf:footers".equals(qName)) {
-           footers = new HeaderFooter(qName);
-           inPart = DocumentPart.FOOTERS;
-        } else if ("sf:header".equals(qName)) {
-            inPart = headers.identifyPart(attributes.getValue("sf:name"));
-        } else if ("sf:footer".equals(qName)) {
-           inPart = footers.identifyPart(attributes.getValue("sf:name"));
-        } else if ("sf:page-number".equals(qName)) {	
-        	if (inPart == DocumentPart.FOOTER_ODD
-        		|| inPart == DocumentPart.FOOTER_FIRST
-        		|| inPart == DocumentPart.FOOTER_EVEN) {
-        		// We are in a footer
-        		footers.hasAutoPageNumber = true;
-        		footers.autoPageNumberFormat = attributes.getValue("sf:format");   
-        	} else {
-        		headers.hasAutoPageNumber = true;
-        		headers.autoPageNumberFormat = attributes.getValue("sf:format");   
-        	}
-
-        	xhtml.characters(Integer.toString(this.pageCount));
-        } else if ("sf:footnotes".equals(qName)) {
-           footnotes = new Footnotes();
-           inPart = DocumentPart.FOOTNOTES;
-        } else if ("sf:footnote-mark".equals(qName)) {
-           footnotes.recordMark(attributes.getValue("sf:mark"));
-        } else if ("sf:footnote".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
-           // What about non auto-numbered?
-           String footnoteMark = attributes.getValue("sf:autonumber");
-           if (footnotes != null) {
-              String footnoteText = footnotes.footnotes.get(footnoteMark);
-              if (footnoteText != null) {
-                 xhtml.startElement("div", "style", "footnote");
-                 xhtml.characters("Footnote:" ); // As shown in Pages
-                 xhtml.characters(footnoteText);
-                 xhtml.endElement("div");
-              }
-           }
-        } else if ("sf:annotations".equals(qName)) {
-           annotations = new Annotations();
-           inPart = DocumentPart.ANNOTATIONS;
-        } else if ("sf:annotation".equals(qName) && inPart == DocumentPart.ANNOTATIONS) {
-           annotations.start(attributes.getValue("sf:target"));
-        } else if ("sf:annotation-field".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
-           xhtml.startElement("div", "style", "annotated");
-           
-           String annotationText = annotations.annotations.get(attributes.getValue("sfa:ID"));
-           if (annotationText != null) {
-              xhtml.startElement("div", "style", "annotation");
-              xhtml.characters(annotationText);
-              xhtml.endElement("div");
-           }
-        } else if ("sf:ghost-text".equals(qName)) {
-            ghostText = true;
-        }
-
-        if (activeTableId != null) {
-            parseTableData(qName, attributes);
-        }
-
-        if (inPart == DocumentPart.METADATA) {
-            metaDataLocalName = localName;
-            metaDataQName = qName;
-            parseProperty = true;
-        }
-    }
-
-    @Override
-    public void endElement(String uri, String localName, String qName)
-            throws SAXException {
-        if (metaDataLocalName != null && metaDataLocalName.equals(localName)) {
-            metaDataLocalName = null;
-            parseProperty = false;
-        }
-
-        if ("sl:publication-info".equals(qName)) {
-            inPart = null;
-        } else if ("sf:metadata".equals(qName)) {
-            inPart = null;
-        } else if ("sf:p".equals(qName) && (pageCount+slPageCount) > 0) {
-            inPart = null;
-            xhtml.endElement("p");
-        } else if ("sf:attachment".equals(qName)) {
-            activeTableId = null;
-        } else if ("sf:annotation".equals(qName) && inPart == DocumentPart.ANNOTATIONS) {
-            annotations.end();
-        } else if ("sf:annotation-field".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
-            xhtml.endElement("div");
-        } else if ("sf:ghost-text".equals(qName)) {
-            ghostText = false;
-        }
-    }
-
-    @Override
-    public void characters(char[] ch, int start, int length) throws SAXException {
-        if (length > 0) {
-           if (inPart == DocumentPart.PARSABLE_TEXT) {
-               if (!ghostText) {
-                   xhtml.characters(ch, start, length);
-               }
-          } else if(inPart != null) {
-              String str = new String(ch, start, length);
-              if (inPart == DocumentPart.HEADER_FIRST) headers.defaultFirst = str;
-              if (inPart == DocumentPart.HEADER_EVEN)  headers.defaultEven = str;
-              if (inPart == DocumentPart.HEADER_ODD)   headers.defaultOdd = str;
-              if (inPart == DocumentPart.FOOTER_FIRST) footers.defaultFirst = str;
-              if (inPart == DocumentPart.FOOTER_EVEN)  footers.defaultEven = str;
-              if (inPart == DocumentPart.FOOTER_ODD)   footers.defaultOdd = str;
-              if (inPart == DocumentPart.FOOTNOTES)    footnotes.text(str);
-              if (inPart == DocumentPart.ANNOTATIONS)  annotations.text(str);
-          }
-        }
-    }
-
-    private void parseTableData(String qName, Attributes attributes) {
-        if ("sf:grid".equals(qName)) {
-            String numberOfColumns = attributes.getValue("sf:numcols");
-            this.numberOfColumns = Integer.parseInt(numberOfColumns);
-        } else if ("sf:ct".equals(qName)) {
-            activeRow.add(attributes.getValue("sfa:s"));
-
-            if (activeRow.size() >= 3) {
-                tableData.get(activeTableId).add(activeRow);
-                activeRow = new ArrayList<String>();
-            }
-        }
-    }
-
-    private void outputTable(String idRef) throws SAXException {
-        List<List<String>> tableData = this.tableData.get(idRef);
-        if (tableData != null) {
-            xhtml.startElement("table");
-            for (List<String> row : tableData) {
-                xhtml.startElement("tr");
-                for (String cell : row) {
-                    xhtml.element("td", cell);
-                }
-                xhtml.endElement("tr");
-            }
-            xhtml.endElement("table");
-        }
-    }
-
-    /**
-     * Returns a resolved key that is common in other document types or
-     * returns the specified metaDataLocalName if no common key could be found.
-     * The key could be a simple String key, or could be a {@link Property}
-     *
-     * @param metaDataLocalName The localname of the element containing metadata
-     * @return a resolved key that is common in other document types
-     */
-    private Object resolveMetaDataKey(String metaDataLocalName) {
-        Object metaDataKey = metaDataLocalName;
-        if ("sf:authors".equals(metaDataQName)) {
-            metaDataKey = TikaCoreProperties.CREATOR;
-        } else if ("sf:title".equals(metaDataQName)) {
-            metaDataKey = TikaCoreProperties.TITLE;
-        } else if ("sl:SLCreationDateProperty".equals(metaDataQName)) {
-            metaDataKey = TikaCoreProperties.CREATED;
-        } else if ("sl:SLLastModifiedDateProperty".equals(metaDataQName)) {
-            metaDataKey = Metadata.LAST_MODIFIED;
-        } else if ("sl:language".equals(metaDataQName)) {
-            metaDataKey = TikaCoreProperties.LANGUAGE;
-        }
-        return metaDataKey;
-    }
-
-    /**
-     * Returns the value of a primitive element e.g.:
-     * &lt;sl:number sfa:number="0" sfa:type="f"/&gt; - the number attribute
-     * &lt;sl:string sfa:string="en"/&gt; = the string attribute
-     * <p>
-     * Returns <code>null</code> if the value could not be extracted from
-     * the list of attributes.
-     *
-     * @param qName      The fully qualified name of the element containing
-     *                   the value to extract
-     * @param attributes The list of attributes of which one contains the
-     *                   value to be extracted
-     * @return the value of a primitive element
-     */
-    private String parsePrimitiveElementValue(
-            String qName, Attributes attributes) {
-        if ("sl:string".equals(qName) || "sf:string".equals(qName)) {
-            return attributes.getValue("sfa:string");
-        } else if ("sl:number".equals(qName)) {
-            return attributes.getValue("sfa:number");
-        } else if ("sl:date".equals(qName)) {
-            return attributes.getValue("sf:val");
-        }
-
-        return null;
-    }
-    
-    private void doHeader() throws SAXException {
-       if (headers != null) {
-          headers.output("header");
-       }
-    }
-    private void doFooter() throws SAXException {
-       if (footers != null) {
-          footers.output("footer");
-       }
-    }
-
-    /**
-     * Represents the Headers or Footers in a document
-     */
-    private class HeaderFooter {
-       private String type; // sf:headers or sf:footers
-       private String defaultOdd;
-       private String defaultEven;
-       private String defaultFirst;
-       private boolean hasAutoPageNumber;
-       private String autoPageNumberFormat;
-       // TODO Can there be custom ones?
-       
-       private HeaderFooter(String type) {
-          this.type = type; 
-       }
-       private DocumentPart identifyPart(String name) {
-          if("SFWPDefaultOddHeaderIdentifier".equals(name))
-             return DocumentPart.HEADER_ODD;
-          if("SFWPDefaultEvenHeaderIdentifier".equals(name))
-             return DocumentPart.HEADER_EVEN;
-          if("SFWPDefaultFirstHeaderIdentifier".equals(name))
-             return DocumentPart.HEADER_FIRST;
-          
-          if("SFWPDefaultOddFooterIdentifier".equals(name))
-             return DocumentPart.FOOTER_ODD;
-          if("SFWPDefaultEvenFooterIdentifier".equals(name))
-             return DocumentPart.FOOTER_EVEN;
-          if("SFWPDefaultFirstFooterIdentifier".equals(name))
-             return DocumentPart.FOOTER_FIRST;
-          
-          return null;
-       }
-       private void output(String what) throws SAXException {
-          String text = null;
-          if (pageCount == 1 && defaultFirst != null) {
-             text = defaultFirst;
-          } else if (pageCount % 2 == 0 && defaultEven != null) {
-             text = defaultEven;
-          } else {
-             text = defaultOdd;
-          }
-          
-          if (text != null) {
-             xhtml.startElement("div", "class", "header");
-             xhtml.characters(text);
-             if (hasAutoPageNumber) {
-            	 if (autoPageNumberFormat == null) { // raw number
-            		 xhtml.characters("\t" + pageCount);
-            	 } else if (autoPageNumberFormat.equals("upper-roman")){
-            		 xhtml.characters("\t" + AutoPageNumberUtils.asRomanNumerals(pageCount));
-            	 } else if (autoPageNumberFormat.equals("lower-roman")){
-            		 xhtml.characters("\t" + AutoPageNumberUtils.asRomanNumeralsLower(pageCount));
-            	 } else if (autoPageNumberFormat.equals("upper-alpha")){
-            		 xhtml.characters("\t" + AutoPageNumberUtils.asAlphaNumeric(pageCount));
-            	 } else if (autoPageNumberFormat.equals("lower-alpha")){
-            		 xhtml.characters("\t" + AutoPageNumberUtils.asAlphaNumericLower(pageCount));
-            	 }
-             }
-             xhtml.endElement("div");
-          }
-       }
-    }
-    /**
-     * Represents Footnotes in a document. The way these work
-     *  in the file format isn't very clean...
-     */
-    private static class Footnotes {
-       /** Mark -> Text */
-       Map<String,String> footnotes = new HashMap<String, String>();
-       String lastSeenMark = null;
-       
-       /**
-        * Normally happens before the text of the mark
-        */
-       private void recordMark(String mark) {
-          lastSeenMark = mark;
-       }
-       private void text(String text) {
-          if (lastSeenMark != null) {
-             if (footnotes.containsKey(lastSeenMark)) {
-                text = footnotes.get(lastSeenMark) + text;
-             }
-             footnotes.put(lastSeenMark, text);
-          }
-       }
-    }
-    /**
-     * Represents Annotations in a document. We currently
-     *  just grab all the sf:p text in each one 
-     */
-    private class Annotations {
-       /** ID -> Text */
-       Map<String,String> annotations = new HashMap<String, String>();
-       String currentID = null;
-       StringBuffer currentText = null;
-       
-       private void start(String id) {
-          currentID = id;
-          currentText = new StringBuffer();
-       }
-       private void text(String text) {
-          if (text != null && text.length() > 0 && currentText != null) {
-             currentText.append(text);
-          }
-       }
-       private void end() {
-          if (currentText.length() > 0) {
-             annotations.put(currentID, currentText.toString());
-             currentID = null;
-             currentText = null;
-          }
-       }
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+class PagesContentHandler extends DefaultHandler {
+
+    private final XHTMLContentHandler xhtml;
+    private final Metadata metadata;
+
+    /** The (interesting) part of the document we're in. Should be more structured... */
+    private enum DocumentPart {
+       METADATA, PARSABLE_TEXT, 
+       HEADERS, HEADER_ODD, HEADER_EVEN, HEADER_FIRST,
+       FOOTERS, FOOTER_ODD, FOOTER_EVEN, FOOTER_FIRST,
+       FOOTNOTES, ANNOTATIONS;
+    }
+    private DocumentPart inPart = null;
+    private boolean ghostText;
+    
+    private static String alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+
+    private boolean parseProperty = false;
+    private int pageCount = 0;
+    private int slPageCount = 0;
+
+    private HeaderFooter headers = null;
+    private HeaderFooter footers = null;
+    private Footnotes footnotes = null; 
+    private Annotations annotations = null; 
+    
+    private Map<String, List<List<String>>> tableData =
+        new HashMap<String, List<List<String>>>();
+    private String activeTableId;
+    private int numberOfColumns = 0;
+    private List<String> activeRow = new ArrayList<String>();
+
+    private String metaDataLocalName;
+    private String metaDataQName;
+
+    PagesContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
+        this.xhtml = xhtml;
+        this.metadata = metadata;
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+        metadata.set(Metadata.PAGE_COUNT, String.valueOf(pageCount));
+        if (pageCount > 0) {
+            doFooter();
+            xhtml.endElement("div");
+        }
+    }
+
+    @Override
+    public void startElement(
+            String uri, String localName, String qName, Attributes attributes)
+            throws SAXException {
+        if (parseProperty) {
+            String value = parsePrimitiveElementValue(qName, attributes);
+            if (value != null) {
+                Object metaDataKey = resolveMetaDataKey(metaDataLocalName);
+                if(metaDataKey instanceof Property) {
+                    metadata.set((Property)metaDataKey, value);
+                } else {
+                    metadata.add((String)metaDataKey, value);
+                }
+            }
+        }
+
+        if ("sl:publication-info".equals(qName)) {
+            inPart = DocumentPart.METADATA;
+        } else if ("sf:metadata".equals(qName)) {
+           inPart = DocumentPart.METADATA;
+        } else if ("sf:page-start".equals(qName) || "sl:page-group".equals(qName)) {
+            if (pageCount > 0) {
+                doFooter();
+                xhtml.endElement("div");
+            }
+            xhtml.startElement("div");
+            if ("sl:page-group".equals(qName)) {
+                slPageCount++;
+            } else {
+                pageCount++;
+            }
+            doHeader();
+        } else if ("sf:p".equals(qName)) {
+          if (pageCount+slPageCount > 0) {
+            inPart = DocumentPart.PARSABLE_TEXT;
+            xhtml.startElement("p");
+          }
+        } else if ("sf:attachment".equals(qName)) {
+            String kind = attributes.getValue("sf:kind");
+            if ("tabular-attachment".equals(kind)) {
+                activeTableId = attributes.getValue("sfa:ID");
+                tableData.put(activeTableId, new ArrayList<List<String>>());
+            }
+        } else if ("sf:attachment-ref".equals(qName)) {
+            String idRef = attributes.getValue("sfa:IDREF");
+            outputTable(idRef);
+        } else if ("sf:headers".equals(qName)) {
+            headers = new HeaderFooter(qName);
+            inPart = DocumentPart.HEADERS;
+        } else if ("sf:footers".equals(qName)) {
+           footers = new HeaderFooter(qName);
+           inPart = DocumentPart.FOOTERS;
+        } else if ("sf:header".equals(qName)) {
+            inPart = headers.identifyPart(attributes.getValue("sf:name"));
+        } else if ("sf:footer".equals(qName)) {
+           inPart = footers.identifyPart(attributes.getValue("sf:name"));
+        } else if ("sf:page-number".equals(qName)) {	
+        	if (inPart == DocumentPart.FOOTER_ODD
+        		|| inPart == DocumentPart.FOOTER_FIRST
+        		|| inPart == DocumentPart.FOOTER_EVEN) {
+        		// We are in a footer
+        		footers.hasAutoPageNumber = true;
+        		footers.autoPageNumberFormat = attributes.getValue("sf:format");   
+        	} else {
+        		headers.hasAutoPageNumber = true;
+        		headers.autoPageNumberFormat = attributes.getValue("sf:format");   
+        	}
+
+        	xhtml.characters(Integer.toString(this.pageCount));
+        } else if ("sf:footnotes".equals(qName)) {
+           footnotes = new Footnotes();
+           inPart = DocumentPart.FOOTNOTES;
+        } else if ("sf:footnote-mark".equals(qName)) {
+           footnotes.recordMark(attributes.getValue("sf:mark"));
+        } else if ("sf:footnote".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
+           // What about non auto-numbered?
+           String footnoteMark = attributes.getValue("sf:autonumber");
+           if (footnotes != null) {
+              String footnoteText = footnotes.footnotes.get(footnoteMark);
+              if (footnoteText != null) {
+                 xhtml.startElement("div", "style", "footnote");
+                 xhtml.characters("Footnote:" ); // As shown in Pages
+                 xhtml.characters(footnoteText);
+                 xhtml.endElement("div");
+              }
+           }
+        } else if ("sf:annotations".equals(qName)) {
+           annotations = new Annotations();
+           inPart = DocumentPart.ANNOTATIONS;
+        } else if ("sf:annotation".equals(qName) && inPart == DocumentPart.ANNOTATIONS) {
+           annotations.start(attributes.getValue("sf:target"));
+        } else if ("sf:annotation-field".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
+           xhtml.startElement("div", "style", "annotated");
+           
+           String annotationText = annotations.annotations.get(attributes.getValue("sfa:ID"));
+           if (annotationText != null) {
+              xhtml.startElement("div", "style", "annotation");
+              xhtml.characters(annotationText);
+              xhtml.endElement("div");
+           }
+        } else if ("sf:ghost-text".equals(qName)) {
+            ghostText = true;
+        }
+
+        if (activeTableId != null) {
+            parseTableData(qName, attributes);
+        }
+
+        if (inPart == DocumentPart.METADATA) {
+            metaDataLocalName = localName;
+            metaDataQName = qName;
+            parseProperty = true;
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName)
+            throws SAXException {
+        if (metaDataLocalName != null && metaDataLocalName.equals(localName)) {
+            metaDataLocalName = null;
+            parseProperty = false;
+        }
+
+        if ("sl:publication-info".equals(qName)) {
+            inPart = null;
+        } else if ("sf:metadata".equals(qName)) {
+            inPart = null;
+        } else if ("sf:p".equals(qName) && (pageCount+slPageCount) > 0) {
+            inPart = null;
+            xhtml.endElement("p");
+        } else if ("sf:attachment".equals(qName)) {
+            activeTableId = null;
+        } else if ("sf:annotation".equals(qName) && inPart == DocumentPart.ANNOTATIONS) {
+            annotations.end();
+        } else if ("sf:annotation-field".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
+            xhtml.endElement("div");
+        } else if ("sf:ghost-text".equals(qName)) {
+            ghostText = false;
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        if (length > 0) {
+           if (inPart == DocumentPart.PARSABLE_TEXT) {
+               if (!ghostText) {
+                   xhtml.characters(ch, start, length);
+               }
+          } else if(inPart != null) {
+              String str = new String(ch, start, length);
+              if (inPart == DocumentPart.HEADER_FIRST) headers.defaultFirst = str;
+              if (inPart == DocumentPart.HEADER_EVEN)  headers.defaultEven = str;
+              if (inPart == DocumentPart.HEADER_ODD)   headers.defaultOdd = str;
+              if (inPart == DocumentPart.FOOTER_FIRST) footers.defaultFirst = str;
+              if (inPart == DocumentPart.FOOTER_EVEN)  footers.defaultEven = str;
+              if (inPart == DocumentPart.FOOTER_ODD)   footers.defaultOdd = str;
+              if (inPart == DocumentPart.FOOTNOTES)    footnotes.text(str);
+              if (inPart == DocumentPart.ANNOTATIONS)  annotations.text(str);
+          }
+        }
+    }
+
+    private void parseTableData(String qName, Attributes attributes) {
+        if ("sf:grid".equals(qName)) {
+            String numberOfColumns = attributes.getValue("sf:numcols");
+            this.numberOfColumns = Integer.parseInt(numberOfColumns);
+        } else if ("sf:ct".equals(qName)) {
+            activeRow.add(attributes.getValue("sfa:s"));
+
+            if (activeRow.size() >= 3) {
+                tableData.get(activeTableId).add(activeRow);
+                activeRow = new ArrayList<String>();
+            }
+        }
+    }
+
+    private void outputTable(String idRef) throws SAXException {
+        List<List<String>> tableData = this.tableData.get(idRef);
+        if (tableData != null) {
+            xhtml.startElement("table");
+            for (List<String> row : tableData) {
+                xhtml.startElement("tr");
+                for (String cell : row) {
+                    xhtml.element("td", cell);
+                }
+                xhtml.endElement("tr");
+            }
+            xhtml.endElement("table");
+        }
+    }
+
+    /**
+     * Returns a resolved key that is common in other document types or
+     * returns the specified metaDataLocalName if no common key could be found.
+     * The key could be a simple String key, or could be a {@link Property}
+     *
+     * @param metaDataLocalName The localname of the element containing metadata
+     * @return a resolved key that is common in other document types
+     */
+    private Object resolveMetaDataKey(String metaDataLocalName) {
+        Object metaDataKey = metaDataLocalName;
+        if ("sf:authors".equals(metaDataQName)) {
+            metaDataKey = TikaCoreProperties.CREATOR;
+        } else if ("sf:title".equals(metaDataQName)) {
+            metaDataKey = TikaCoreProperties.TITLE;
+        } else if ("sl:SLCreationDateProperty".equals(metaDataQName)) {
+            metaDataKey = TikaCoreProperties.CREATED;
+        } else if ("sl:SLLastModifiedDateProperty".equals(metaDataQName)) {
+            metaDataKey = Metadata.LAST_MODIFIED;
+        } else if ("sl:language".equals(metaDataQName)) {
+            metaDataKey = TikaCoreProperties.LANGUAGE;
+        }
+        return metaDataKey;
+    }
+
+    /**
+     * Returns the value of a primitive element e.g.:
+     * &lt;sl:number sfa:number="0" sfa:type="f"/&gt; - the number attribute
+     * &lt;sl:string sfa:string="en"/&gt; = the string attribute
+     * <p>
+     * Returns <code>null</code> if the value could not be extracted from
+     * the list of attributes.
+     *
+     * @param qName      The fully qualified name of the element containing
+     *                   the value to extract
+     * @param attributes The list of attributes of which one contains the
+     *                   value to be extracted
+     * @return the value of a primitive element
+     */
+    private String parsePrimitiveElementValue(
+            String qName, Attributes attributes) {
+        if ("sl:string".equals(qName) || "sf:string".equals(qName)) {
+            return attributes.getValue("sfa:string");
+        } else if ("sl:number".equals(qName)) {
+            return attributes.getValue("sfa:number");
+        } else if ("sl:date".equals(qName)) {
+            return attributes.getValue("sf:val");
+        }
+
+        return null;
+    }
+    
+    private void doHeader() throws SAXException {
+       if (headers != null) {
+          headers.output("header");
+       }
+    }
+    private void doFooter() throws SAXException {
+       if (footers != null) {
+          footers.output("footer");
+       }
+    }
+
+    /**
+     * Represents the Headers or Footers in a document
+     */
+    private class HeaderFooter {
+       private String type; // sf:headers or sf:footers
+       private String defaultOdd;
+       private String defaultEven;
+       private String defaultFirst;
+       private boolean hasAutoPageNumber;
+       private String autoPageNumberFormat;
+       // TODO Can there be custom ones?
+       
+       private HeaderFooter(String type) {
+          this.type = type; 
+       }
+       private DocumentPart identifyPart(String name) {
+          if("SFWPDefaultOddHeaderIdentifier".equals(name))
+             return DocumentPart.HEADER_ODD;
+          if("SFWPDefaultEvenHeaderIdentifier".equals(name))
+             return DocumentPart.HEADER_EVEN;
+          if("SFWPDefaultFirstHeaderIdentifier".equals(name))
+             return DocumentPart.HEADER_FIRST;
+          
+          if("SFWPDefaultOddFooterIdentifier".equals(name))
+             return DocumentPart.FOOTER_ODD;
+          if("SFWPDefaultEvenFooterIdentifier".equals(name))
+             return DocumentPart.FOOTER_EVEN;
+          if("SFWPDefaultFirstFooterIdentifier".equals(name))
+             return DocumentPart.FOOTER_FIRST;
+          
+          return null;
+       }
+       private void output(String what) throws SAXException {
+          String text = null;
+          if (pageCount == 1 && defaultFirst != null) {
+             text = defaultFirst;
+          } else if (pageCount % 2 == 0 && defaultEven != null) {
+             text = defaultEven;
+          } else {
+             text = defaultOdd;
+          }
+          
+          if (text != null) {
+             xhtml.startElement("div", "class", "header");
+             xhtml.characters(text);
+             if (hasAutoPageNumber) {
+            	 if (autoPageNumberFormat == null) { // raw number
+            		 xhtml.characters("\t" + pageCount);
+            	 } else if (autoPageNumberFormat.equals("upper-roman")){
+            		 xhtml.characters("\t" + AutoPageNumberUtils.asRomanNumerals(pageCount));
+            	 } else if (autoPageNumberFormat.equals("lower-roman")){
+            		 xhtml.characters("\t" + AutoPageNumberUtils.asRomanNumeralsLower(pageCount));
+            	 } else if (autoPageNumberFormat.equals("upper-alpha")){
+            		 xhtml.characters("\t" + AutoPageNumberUtils.asAlphaNumeric(pageCount));
+            	 } else if (autoPageNumberFormat.equals("lower-alpha")){
+            		 xhtml.characters("\t" + AutoPageNumberUtils.asAlphaNumericLower(pageCount));
+            	 }
+             }
+             xhtml.endElement("div");
+          }
+       }
+    }
+    /**
+     * Represents Footnotes in a document. The way these work
+     *  in the file format isn't very clean...
+     */
+    private static class Footnotes {
+       /** Mark -> Text */
+       Map<String,String> footnotes = new HashMap<String, String>();
+       String lastSeenMark = null;
+       
+       /**
+        * Normally happens before the text of the mark
+        */
+       private void recordMark(String mark) {
+          lastSeenMark = mark;
+       }
+       private void text(String text) {
+          if (lastSeenMark != null) {
+             if (footnotes.containsKey(lastSeenMark)) {
+                text = footnotes.get(lastSeenMark) + text;
+             }
+             footnotes.put(lastSeenMark, text);
+          }
+       }
+    }
+    /**
+     * Represents Annotations in a document. We currently
+     *  just grab all the sf:p text in each one 
+     */
+    private class Annotations {
+       /** ID -> Text */
+       Map<String,String> annotations = new HashMap<String, String>();
+       String currentID = null;
+       StringBuffer currentText = null;
+       
+       private void start(String id) {
+          currentID = id;
+          currentText = new StringBuffer();
+       }
+       private void text(String text) {
+          if (text != null && text.length() > 0 && currentText != null) {
+             currentText.append(text);
+          }
+       }
+       private void end() {
+          if (currentText.length() > 0) {
+             annotations.put(currentID, currentText.toString());
+             currentID = null;
+             currentText = null;
+          }
+       }
+    }
+
+}

[30/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
index 98970d9..83d72c9 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
@@ -1,162 +1,162 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.image;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.junit.Test;
-import org.xml.sax.helpers.DefaultHandler;
-
-public class ImageParserTest {
-
-    private final Parser parser = new ImageParser();
-
-    @Test
-    public void testBMP() throws Exception {
-        Metadata metadata = new Metadata();
-        metadata.set(Metadata.CONTENT_TYPE, "image/bmp");
-        InputStream stream =
-                getClass().getResourceAsStream("/test-documents/testBMP.bmp");
-        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
-        assertEquals("75", metadata.get("height"));
-        assertEquals("100", metadata.get("width"));
-        assertEquals("8 8 8", metadata.get("Data BitsPerSample"));
-        assertEquals("1.0", metadata.get("Dimension PixelAspectRatio"));
-        //TODO: figure out why we're getting 0.35273367 in Ubuntu, but not Windows
-        //assertEquals("0", metadata.get("Dimension VerticalPhysicalPixelSpacing"));
-        //assertEquals("0", metadata.get("Dimension HorizontalPhysicalPixelSpacing"));
-        assertEquals("BI_RGB", metadata.get("Compression CompressionTypeName"));
-        assertEquals("image/bmp", metadata.get("Content-Type"));
-
-        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
-        assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
-        assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE));
-    }
-
-    @Test
-    public void testGIF() throws Exception {
-        Metadata metadata = new Metadata();
-        metadata.set(Metadata.CONTENT_TYPE, "image/gif");
-        InputStream stream =
-                getClass().getResourceAsStream("/test-documents/testGIF.gif");
-        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
-        assertEquals("75", metadata.get("height"));
-        assertEquals("100", metadata.get("width"));
-        assertEquals("true", metadata.get("Compression Lossless"));
-        assertEquals("Normal", metadata.get("Dimension ImageOrientation"));
-        assertEquals("lzw", metadata.get("Compression CompressionTypeName"));
-        assertEquals("0", metadata.get("Dimension HorizontalPixelOffset"));
-        assertEquals("imageLeftPosition=0, imageTopPosition=0, imageWidth=100, imageHeight=75, interlaceFlag=false", metadata.get("ImageDescriptor"));
-        assertEquals("Index", metadata.get("Data SampleFormat"));
-        assertEquals("3", metadata.get("Chroma NumChannels"));
-        assertEquals("1", metadata.get("Compression NumProgressiveScans"));
-        assertEquals("RGB", metadata.get("Chroma ColorSpaceType"));
-        assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("CommentExtensions CommentExtension"));
-        assertEquals("value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership., encoding=ISO-8859-1, compression=none", metadata.get("Text TextEntry"));
-        assertEquals("true", metadata.get("Chroma BlackIsZero"));
-        assertEquals("disposalMethod=none, userInputFlag=false, transparentColorFlag=false, delayTime=0, transparentColorIndex=0", metadata.get("GraphicControlExtension"));
-        assertEquals("0", metadata.get("Dimension VerticalPixelOffset"));
-        assertEquals("image/gif", metadata.get("Content-Type"));
-
-        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
-        assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
-        assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get(TikaCoreProperties.COMMENTS));
-    }
-
-    @Test
-    public void testJPEG() throws Exception {
-        Metadata metadata = new Metadata();
-        metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
-        InputStream stream =
-                getClass().getResourceAsStream("/test-documents/testJPEG.jpg");
-        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
-        assertEquals("75", metadata.get("height"));
-        assertEquals("100", metadata.get("width"));
-        assertEquals("0.35277778", metadata.get("Dimension VerticalPixelSize"));
-        assertEquals("false", metadata.get("Compression Lossless"));
-        assertEquals("class=0, htableId=0", metadata.get("markerSequence dht dhtable"));
-        assertEquals("majorVersion=1, minorVersion=1, resUnits=1, Xdensity=72, Ydensity=72, thumbWidth=0, thumbHeight=0", metadata.get("JPEGvariety app0JFIF"));
-        assertEquals("225", metadata.get("markerSequence unknown"));
-        assertEquals("componentSelector=1, dcHuffTable=0, acHuffTable=0", metadata.get("markerSequence sos scanComponentSpec"));
-        assertEquals("normal", metadata.get("Dimension ImageOrientation"));
-        assertEquals("1.0", metadata.get("Dimension PixelAspectRatio"));
-        assertEquals("elementPrecision=0, qtableId=0", metadata.get("markerSequence dqt dqtable"));
-        assertEquals("numScanComponents=3, startSpectralSelection=0, endSpectralSelection=63, approxHigh=0, approxLow=0", metadata.get("markerSequence sos"));
-        assertEquals("componentId=1, HsamplingFactor=1, VsamplingFactor=1, QtableSelector=0", metadata.get("markerSequence sof componentSpec"));
-        assertEquals("JPEG", metadata.get("Compression CompressionTypeName"));
-        assertEquals("0.35277778", metadata.get("Dimension HorizontalPixelSize"));
-        assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("markerSequence com"));
-        assertEquals("3", metadata.get("Chroma NumChannels"));
-        assertEquals("1", metadata.get("Compression NumProgressiveScans"));
-        assertEquals("YCbCr", metadata.get("Chroma ColorSpaceType"));
-        assertEquals("keyword=comment, value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("Text TextEntry"));
-        assertEquals("image/jpeg", metadata.get("Content-Type"));
-        assertEquals("process=0, samplePrecision=8, numLines=75, samplesPerLine=100, numFrameComponents=3", metadata.get("markerSequence sof"));
-
-        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
-        assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
-        assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get(TikaCoreProperties.COMMENTS));
-    }
-
-    @Test
-    public void testPNG() throws Exception {
-        Metadata metadata = new Metadata();
-        metadata.set(Metadata.CONTENT_TYPE, "image/png");
-        InputStream stream =
-                getClass().getResourceAsStream("/test-documents/testPNG.png");
-        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
-        assertEquals("75", metadata.get("height"));
-        assertEquals("100", metadata.get("width"));
-        assertEquals("0.35273367", metadata.get("Dimension VerticalPixelSize"));
-        assertEquals("8 8 8", metadata.get("Data BitsPerSample"));
-        assertEquals("Perceptual", metadata.get("sRGB"));
-        assertEquals("true", metadata.get("Compression Lossless"));
-        assertEquals("year=2008, month=5, day=6, hour=6, minute=18, second=47", metadata.get("tIME"));
-        assertEquals("Normal", metadata.get("Dimension ImageOrientation"));
-        assertEquals("1.0", metadata.get("Dimension PixelAspectRatio"));
-        assertEquals("keyword=Comment, value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("tEXt tEXtEntry"));
-        assertEquals("deflate", metadata.get("Compression CompressionTypeName"));
-        assertEquals("UnsignedIntegral", metadata.get("Data SampleFormat"));
-        assertEquals("0.35273367", metadata.get("Dimension HorizontalPixelSize"));
-        assertEquals("none", metadata.get("Transparency Alpha"));
-        assertEquals("pixelsPerUnitXAxis=2835, pixelsPerUnitYAxis=2835, unitSpecifier=meter", metadata.get("pHYs"));
-        assertEquals("3", metadata.get("Chroma NumChannels"));
-        assertEquals("1", metadata.get("Compression NumProgressiveScans"));
-        assertEquals("RGB", metadata.get("Chroma ColorSpaceType"));
-        assertEquals("keyword=Comment, value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership., encoding=ISO-8859-1, compression=none", metadata.get("Text TextEntry"));
-        assertEquals("PixelInterleaved", metadata.get("Data PlanarConfiguration"));
-        assertEquals("width=100, height=75, bitDepth=8, colorType=RGB, compressionMethod=deflate, filterMethod=adaptive, interlaceMethod=none", metadata.get("IHDR"));
-        assertEquals("true", metadata.get("Chroma BlackIsZero"));
-        assertEquals("year=2008, month=5, day=6, hour=6, minute=18, second=47", metadata.get("Document ImageModificationTime"));
-        assertEquals("image/png", metadata.get("Content-Type"));
-
-        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
-        assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
-        assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE));
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.junit.Test;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class ImageParserTest {
+
+    private final Parser parser = new ImageParser();
+
+    @Test
+    public void testBMP() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "image/bmp");
+        InputStream stream =
+                getClass().getResourceAsStream("/test-documents/testBMP.bmp");
+        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+        assertEquals("75", metadata.get("height"));
+        assertEquals("100", metadata.get("width"));
+        assertEquals("8 8 8", metadata.get("Data BitsPerSample"));
+        assertEquals("1.0", metadata.get("Dimension PixelAspectRatio"));
+        //TODO: figure out why we're getting 0.35273367 in Ubuntu, but not Windows
+        //assertEquals("0", metadata.get("Dimension VerticalPhysicalPixelSpacing"));
+        //assertEquals("0", metadata.get("Dimension HorizontalPhysicalPixelSpacing"));
+        assertEquals("BI_RGB", metadata.get("Compression CompressionTypeName"));
+        assertEquals("image/bmp", metadata.get("Content-Type"));
+
+        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE));
+    }
+
+    @Test
+    public void testGIF() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "image/gif");
+        InputStream stream =
+                getClass().getResourceAsStream("/test-documents/testGIF.gif");
+        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+        assertEquals("75", metadata.get("height"));
+        assertEquals("100", metadata.get("width"));
+        assertEquals("true", metadata.get("Compression Lossless"));
+        assertEquals("Normal", metadata.get("Dimension ImageOrientation"));
+        assertEquals("lzw", metadata.get("Compression CompressionTypeName"));
+        assertEquals("0", metadata.get("Dimension HorizontalPixelOffset"));
+        assertEquals("imageLeftPosition=0, imageTopPosition=0, imageWidth=100, imageHeight=75, interlaceFlag=false", metadata.get("ImageDescriptor"));
+        assertEquals("Index", metadata.get("Data SampleFormat"));
+        assertEquals("3", metadata.get("Chroma NumChannels"));
+        assertEquals("1", metadata.get("Compression NumProgressiveScans"));
+        assertEquals("RGB", metadata.get("Chroma ColorSpaceType"));
+        assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("CommentExtensions CommentExtension"));
+        assertEquals("value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership., encoding=ISO-8859-1, compression=none", metadata.get("Text TextEntry"));
+        assertEquals("true", metadata.get("Chroma BlackIsZero"));
+        assertEquals("disposalMethod=none, userInputFlag=false, transparentColorFlag=false, delayTime=0, transparentColorIndex=0", metadata.get("GraphicControlExtension"));
+        assertEquals("0", metadata.get("Dimension VerticalPixelOffset"));
+        assertEquals("image/gif", metadata.get("Content-Type"));
+
+        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get(TikaCoreProperties.COMMENTS));
+    }
+
+    @Test
+    public void testJPEG() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+        InputStream stream =
+                getClass().getResourceAsStream("/test-documents/testJPEG.jpg");
+        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+        assertEquals("75", metadata.get("height"));
+        assertEquals("100", metadata.get("width"));
+        assertEquals("0.35277778", metadata.get("Dimension VerticalPixelSize"));
+        assertEquals("false", metadata.get("Compression Lossless"));
+        assertEquals("class=0, htableId=0", metadata.get("markerSequence dht dhtable"));
+        assertEquals("majorVersion=1, minorVersion=1, resUnits=1, Xdensity=72, Ydensity=72, thumbWidth=0, thumbHeight=0", metadata.get("JPEGvariety app0JFIF"));
+        assertEquals("225", metadata.get("markerSequence unknown"));
+        assertEquals("componentSelector=1, dcHuffTable=0, acHuffTable=0", metadata.get("markerSequence sos scanComponentSpec"));
+        assertEquals("normal", metadata.get("Dimension ImageOrientation"));
+        assertEquals("1.0", metadata.get("Dimension PixelAspectRatio"));
+        assertEquals("elementPrecision=0, qtableId=0", metadata.get("markerSequence dqt dqtable"));
+        assertEquals("numScanComponents=3, startSpectralSelection=0, endSpectralSelection=63, approxHigh=0, approxLow=0", metadata.get("markerSequence sos"));
+        assertEquals("componentId=1, HsamplingFactor=1, VsamplingFactor=1, QtableSelector=0", metadata.get("markerSequence sof componentSpec"));
+        assertEquals("JPEG", metadata.get("Compression CompressionTypeName"));
+        assertEquals("0.35277778", metadata.get("Dimension HorizontalPixelSize"));
+        assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("markerSequence com"));
+        assertEquals("3", metadata.get("Chroma NumChannels"));
+        assertEquals("1", metadata.get("Compression NumProgressiveScans"));
+        assertEquals("YCbCr", metadata.get("Chroma ColorSpaceType"));
+        assertEquals("keyword=comment, value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("Text TextEntry"));
+        assertEquals("image/jpeg", metadata.get("Content-Type"));
+        assertEquals("process=0, samplePrecision=8, numLines=75, samplesPerLine=100, numFrameComponents=3", metadata.get("markerSequence sof"));
+
+        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get(TikaCoreProperties.COMMENTS));
+    }
+
+    @Test
+    public void testPNG() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "image/png");
+        InputStream stream =
+                getClass().getResourceAsStream("/test-documents/testPNG.png");
+        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+        assertEquals("75", metadata.get("height"));
+        assertEquals("100", metadata.get("width"));
+        assertEquals("0.35273367", metadata.get("Dimension VerticalPixelSize"));
+        assertEquals("8 8 8", metadata.get("Data BitsPerSample"));
+        assertEquals("Perceptual", metadata.get("sRGB"));
+        assertEquals("true", metadata.get("Compression Lossless"));
+        assertEquals("year=2008, month=5, day=6, hour=6, minute=18, second=47", metadata.get("tIME"));
+        assertEquals("Normal", metadata.get("Dimension ImageOrientation"));
+        assertEquals("1.0", metadata.get("Dimension PixelAspectRatio"));
+        assertEquals("keyword=Comment, value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("tEXt tEXtEntry"));
+        assertEquals("deflate", metadata.get("Compression CompressionTypeName"));
+        assertEquals("UnsignedIntegral", metadata.get("Data SampleFormat"));
+        assertEquals("0.35273367", metadata.get("Dimension HorizontalPixelSize"));
+        assertEquals("none", metadata.get("Transparency Alpha"));
+        assertEquals("pixelsPerUnitXAxis=2835, pixelsPerUnitYAxis=2835, unitSpecifier=meter", metadata.get("pHYs"));
+        assertEquals("3", metadata.get("Chroma NumChannels"));
+        assertEquals("1", metadata.get("Compression NumProgressiveScans"));
+        assertEquals("RGB", metadata.get("Chroma ColorSpaceType"));
+        assertEquals("keyword=Comment, value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership., encoding=ISO-8859-1, compression=none", metadata.get("Text TextEntry"));
+        assertEquals("PixelInterleaved", metadata.get("Data PlanarConfiguration"));
+        assertEquals("width=100, height=75, bitDepth=8, colorType=RGB, compressionMethod=deflate, filterMethod=adaptive, interlaceMethod=none", metadata.get("IHDR"));
+        assertEquals("true", metadata.get("Chroma BlackIsZero"));
+        assertEquals("year=2008, month=5, day=6, hour=6, minute=18, second=47", metadata.get("Document ImageModificationTime"));
+        assertEquals("image/png", metadata.get("Content-Type"));
+
+        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE));
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/MetadataFieldsTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/MetadataFieldsTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/MetadataFieldsTest.java
index b78a831..7e3a123 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/MetadataFieldsTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/MetadataFieldsTest.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.image;
-
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
-import org.apache.tika.metadata.TIFF;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.junit.Test;
-
-public class MetadataFieldsTest {
-
-    @Test
-    public void testIsMetadataField() {
-        assertFalse(MetadataFields.isMetadataField("random string that is not a field"));
-        assertFalse(MetadataFields.isMetadataField("xyz"));
-        assertTrue(MetadataFields.isMetadataField(TikaCoreProperties.KEYWORDS));
-        assertTrue(MetadataFields.isMetadataField(TIFF.F_NUMBER.getName()));
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.metadata.TIFF;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Test;
+
+public class MetadataFieldsTest {
+
+    @Test
+    public void testIsMetadataField() {
+        assertFalse(MetadataFields.isMetadataField("random string that is not a field"));
+        assertFalse(MetadataFields.isMetadataField("xyz"));
+        assertTrue(MetadataFields.isMetadataField(TikaCoreProperties.KEYWORDS));
+        assertTrue(MetadataFields.isMetadataField(TIFF.F_NUMBER.getName()));
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/TiffParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
index 239c160..d506c33 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
@@ -1,66 +1,66 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.image;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.junit.Test;
-import org.xml.sax.helpers.DefaultHandler;
-
-public class TiffParserTest {
-    private final Parser parser = new TiffParser();
-
-    @Test
-    public void testTIFF() throws Exception {
-        Metadata metadata = new Metadata();
-        metadata.set(Metadata.CONTENT_TYPE, "image/tiff");
-        InputStream stream =
-                getClass().getResourceAsStream("/test-documents/testTIFF.tif");
-        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
-        assertEquals("Licensed to the Apache Software Foundation (ASF) under one or " +
-                "more contributor license agreements.  See the NOTICE file " +
-                "distributed with this work for additional information regarding " +
-                "copyright ownership.", metadata.get(TikaCoreProperties.DESCRIPTION));
-
-        // All EXIF/TIFF tags
-        assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
-
-        // Core EXIF/TIFF tags
-        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
-        assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
-        assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
-        assertEquals("3", metadata.get(Metadata.SAMPLES_PER_PIXEL));
-
-        // Embedded XMP
-        List<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
-        assertTrue("got " + keywords, keywords.contains("cat"));
-        assertTrue("got " + keywords, keywords.contains("garden"));
-        List<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
-        assertTrue("got " + subject, subject.contains("cat"));
-        assertTrue("got " + subject, subject.contains("garden"));
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.junit.Test;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class TiffParserTest {
+    private final Parser parser = new TiffParser();
+
+    @Test
+    public void testTIFF() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "image/tiff");
+        InputStream stream =
+                getClass().getResourceAsStream("/test-documents/testTIFF.tif");
+        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+        assertEquals("Licensed to the Apache Software Foundation (ASF) under one or " +
+                "more contributor license agreements.  See the NOTICE file " +
+                "distributed with this work for additional information regarding " +
+                "copyright ownership.", metadata.get(TikaCoreProperties.DESCRIPTION));
+
+        // All EXIF/TIFF tags
+        assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
+
+        // Core EXIF/TIFF tags
+        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
+        assertEquals("3", metadata.get(Metadata.SAMPLES_PER_PIXEL));
+
+        // Embedded XMP
+        List<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
+        assertTrue("got " + keywords, keywords.contains("cat"));
+        assertTrue("got " + keywords, keywords.contains("garden"));
+        List<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
+        assertTrue("got " + subject, subject.contains("cat"));
+        assertTrue("got " + subject, subject.contains("garden"));
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
index f65c797..b189fd7 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
@@ -1,284 +1,284 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.jpeg;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.List;
-import java.util.TimeZone;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TIFF;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.XMPMM;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.junit.AfterClass;
-import org.junit.BeforeClass;
-import org.junit.Test;
-import org.xml.sax.helpers.DefaultHandler;
-
-public class JpegParserTest {
-
-    private final Parser parser = new JpegParser();
-    static TimeZone CURR_TIME_ZONE = TimeZone.getDefault();
-
-    //As of Drew Noakes' metadata-extractor 2.8.1,
-    //unspecified timezones appear to be set to
-    //TimeZone.getDefault().  We need to normalize this
-    //for testing across different time zones.
-    //We also appear to have to specify it in the surefire config:
-    //<argLine>-Duser.timezone=UTC</argLine>
-    @BeforeClass
-    public static void setDefaultTimeZone() {
-        TimeZone.setDefault(TimeZone.getTimeZone("UTC"));
-    }
-    @AfterClass
-    public static void resetDefaultTimeZone() {
-        TimeZone.setDefault(CURR_TIME_ZONE);
-    }
-    @Test
-    public void testJPEG() throws Exception {
-        Metadata metadata = new Metadata();
-        metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
-        InputStream stream =
-                getClass().getResourceAsStream("/test-documents/testJPEG_EXIF.jpg");
-        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
-        // Core EXIF/TIFF tags
-        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
-        assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH));
-        assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
-        assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
-
-        assertEquals("6.25E-4", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1600
-        assertEquals("5.6", metadata.get(Metadata.F_NUMBER));
-        assertEquals("false", metadata.get(Metadata.FLASH_FIRED));
-        assertEquals("194.0", metadata.get(Metadata.FOCAL_LENGTH));
-        assertEquals("400", metadata.get(Metadata.ISO_SPEED_RATINGS));
-        assertEquals("Canon", metadata.get(Metadata.EQUIPMENT_MAKE));
-        assertEquals("Canon EOS 40D", metadata.get(Metadata.EQUIPMENT_MODEL));
-        assertEquals("Adobe Photoshop CS3 Macintosh", metadata.get(Metadata.SOFTWARE));
-        assertEquals(null, metadata.get(Metadata.ORIENTATION)); // Not present
-        assertEquals("240.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
-        assertEquals("240.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
-        assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
-
-        // Check that EXIF/TIFF tags come through with their raw values too
-        // (This may be removed for Tika 1.0, as we support more of them
-        //  with explicit Metadata entries)
-        assertEquals("Canon EOS 40D", metadata.get("Model"));
-
-        // Common tags
-        assertEquals("2009-10-02T23:02:49", metadata.get(Metadata.LAST_MODIFIED));
-        assertEquals("Date/Time Original for when the photo was taken, unspecified time zone",
-                "2009-08-11T09:09:45", metadata.get(TikaCoreProperties.CREATED));
-        List<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
-        assertTrue("'canon-55-250' expected in " + keywords, keywords.contains("canon-55-250"));
-        assertTrue("'moscow-birds' expected in " + keywords, keywords.contains("moscow-birds"));
-        assertTrue("'serbor' expected in " + keywords, keywords.contains("serbor"));
-        assertFalse(keywords.contains("canon-55-250 moscow-birds serbor"));
-        List<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
-        assertTrue("'canon-55-250' expected in " + subject, subject.contains("canon-55-250"));
-        assertTrue("'moscow-birds' expected in " + subject, subject.contains("moscow-birds"));
-        assertTrue("'serbor' expected in " + subject, subject.contains("serbor"));
-        assertFalse(subject.contains("canon-55-250 moscow-birds serbor"));
-    }
-
-    /**
-     * Test for a file with Geographic information (lat, long etc) in it
-     */
-    @Test
-    public void testJPEGGeo() throws Exception {
-        Metadata metadata = new Metadata();
-        metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
-        InputStream stream =
-                getClass().getResourceAsStream("/test-documents/testJPEG_GEO.jpg");
-        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
-        // Geo tags
-        assertEquals("12.54321", metadata.get(Metadata.LATITUDE));
-        assertEquals("-54.1234", metadata.get(Metadata.LONGITUDE));
-
-        // Core EXIF/TIFF tags
-        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
-        assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH));
-        assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
-        assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
-
-        assertEquals("6.25E-4", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1600
-        assertEquals("5.6", metadata.get(Metadata.F_NUMBER));
-        assertEquals("false", metadata.get(Metadata.FLASH_FIRED));
-        assertEquals("194.0", metadata.get(Metadata.FOCAL_LENGTH));
-        assertEquals("400", metadata.get(Metadata.ISO_SPEED_RATINGS));
-        assertEquals("Canon", metadata.get(Metadata.EQUIPMENT_MAKE));
-        assertEquals("Canon EOS 40D", metadata.get(Metadata.EQUIPMENT_MODEL));
-        assertEquals("Adobe Photoshop CS3 Macintosh", metadata.get(Metadata.SOFTWARE));
-        assertEquals(null, metadata.get(Metadata.ORIENTATION)); // Not present
-        assertEquals("240.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
-        assertEquals("240.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
-        assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
-
-        // Common tags
-        assertEquals("Date/Time Original for when the photo was taken, unspecified time zone",
-                "2009-08-11T09:09:45", metadata.get(TikaCoreProperties.CREATED));
-        assertEquals("This image has different Date/Time than Date/Time Original, so it is probably modification date",
-                "2009-10-02T23:02:49", metadata.get(Metadata.LAST_MODIFIED));
-        assertEquals("Date/Time Original should be stored in EXIF field too",
-                "2009-08-11T09:09:45", metadata.get(TIFF.ORIGINAL_DATE));
-        assertEquals("canon-55-250", metadata.getValues(TikaCoreProperties.KEYWORDS)[0]);
-        assertEquals("canon-55-250", metadata.getValues(Metadata.KEYWORDS)[0]);
-    }
-
-    /**
-     * Test for an image with the geographic information stored in a slightly
-     * different way, see TIKA-915 for details
-     * Disabled for now, pending a fix to the underlying library
-     */
-    @Test
-    public void testJPEGGeo2() throws Exception {
-        Metadata metadata = new Metadata();
-        metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
-        InputStream stream =
-                getClass().getResourceAsStream("/test-documents/testJPEG_GEO_2.jpg");
-        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
-        // Geo tags should be there with 5dp, and not rounded
-        assertEquals("51.575762", metadata.get(Metadata.LATITUDE));
-        assertEquals("-1.567886", metadata.get(Metadata.LONGITUDE));
-    }
-
-    @Test
-    public void testJPEGTitleAndDescription() throws Exception {
-        Metadata metadata = new Metadata();
-        metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
-        InputStream stream =
-                getClass().getResourceAsStream("/test-documents/testJPEG_commented.jpg");
-        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
-        // embedded comments with non-ascii characters
-        assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
-        assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR)); // Dublin Core
-        // xmp handles spaces in keywords, returns "bird watching, nature reserve, coast, grazelands"
-        // but we have to replace them with underscore
-
-        List<String> keywords = Arrays.asList(metadata.getValues(Metadata.KEYWORDS));
-        assertTrue(keywords.contains("coast"));
-        assertTrue(keywords.contains("bird watching"));
-        assertEquals(keywords, Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)));
-
-        // Core EXIF/TIFF tags
-        assertEquals("103", metadata.get(Metadata.IMAGE_WIDTH));
-        assertEquals("77", metadata.get(Metadata.IMAGE_LENGTH));
-        assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
-        assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
-
-        assertEquals("1.0E-6", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1000000
-        assertEquals("2.8", metadata.get(Metadata.F_NUMBER));
-        assertEquals("4.6", metadata.get(Metadata.FOCAL_LENGTH));
-        assertEquals("114", metadata.get(Metadata.ISO_SPEED_RATINGS));
-        assertEquals(null, metadata.get(Metadata.EQUIPMENT_MAKE));
-        assertEquals(null, metadata.get(Metadata.EQUIPMENT_MODEL));
-        assertEquals(null, metadata.get(Metadata.SOFTWARE));
-        assertEquals("1", metadata.get(Metadata.ORIENTATION)); // Not present
-        assertEquals("300.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
-        assertEquals("300.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
-        assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
-    }
-
-    @Test
-    public void testJPEGTitleAndDescriptionPhotoshop() throws Exception {
-        Metadata metadata = new Metadata();
-        metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
-        InputStream stream =
-                getClass().getResourceAsStream("/test-documents/testJPEG_commented_pspcs2mac.jpg");
-        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
-        // embedded comments with non-ascii characters
-        assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
-        assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR));
-        List<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
-        assertTrue("got " + keywords, keywords.contains("bird watching"));
-        List<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
-        assertTrue("got " + subject, subject.contains("bird watching"));
-    }
-
-    @Test
-    public void testJPEGTitleAndDescriptionXnviewmp() throws Exception {
-        Metadata metadata = new Metadata();
-        metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
-        InputStream stream =
-                getClass().getResourceAsStream("/test-documents/testJPEG_commented_xnviewmp026.jpg");
-        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
-        // XnViewMp's default comment dialog has only comment, not headline.
-        // Comment is embedded only if "Write comments in XMP" is enabled in settings
-        assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
-        // xmp handles spaces in keywords, returns "bird watching, nature reserve, coast, grazelands"
-        // but we have to replace them with underscore
-        String[] subject = metadata.getValues(TikaCoreProperties.KEYWORDS);
-        List<String> keywords = Arrays.asList(subject);
-        assertTrue("'coast'" + " not in " + keywords, keywords.contains("coast"));
-        assertTrue("'nature reserve'" + " not in " + keywords, keywords.contains("nature reserve"));
-    }
-
-    @Test
-    public void testJPEGoddTagComponent() throws Exception {
-        Metadata metadata = new Metadata();
-        metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
-        InputStream stream =
-                getClass().getResourceAsStream("/test-documents/testJPEG_oddTagComponent.jpg");
-        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
-        assertEquals(null, metadata.get(TikaCoreProperties.TITLE));
-        assertEquals(null, metadata.get(TikaCoreProperties.DESCRIPTION));
-        assertEquals("251", metadata.get(Metadata.IMAGE_WIDTH));
-        assertEquals("384", metadata.get(Metadata.IMAGE_LENGTH));
-    }
-
-    @Test
-    public void testJPEGEmptyEXIFDateTime() throws Exception {
-        Metadata metadata = new Metadata();
-        metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
-        InputStream stream =
-                getClass().getResourceAsStream("/test-documents/testJPEG_EXIF_emptyDateTime.jpg");
-        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-        assertEquals("300.0", metadata.get(TIFF.RESOLUTION_HORIZONTAL));
-        assertEquals("300.0", metadata.get(TIFF.RESOLUTION_VERTICAL));
-    }
-
-    @Test
-    public void testJPEGXMPMM() throws Exception {
-        Metadata metadata = new Metadata();
-        metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
-        InputStream stream =
-                getClass().getResourceAsStream("/test-documents/testJPEG_EXIF_emptyDateTime.jpg");
-        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
-        //TODO: when jempbox is fixed/xmpbox is used
-        //add tests for history...currently not extracted
-        assertEquals("xmp.did:49E997348D4911E1AB62EBF9B374B234",
-                metadata.get(XMPMM.DOCUMENTID));
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.jpeg;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+import java.util.TimeZone;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TIFF;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMPMM;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class JpegParserTest {
+
+    private final Parser parser = new JpegParser();
+    static TimeZone CURR_TIME_ZONE = TimeZone.getDefault();
+
+    //As of Drew Noakes' metadata-extractor 2.8.1,
+    //unspecified timezones appear to be set to
+    //TimeZone.getDefault().  We need to normalize this
+    //for testing across different time zones.
+    //We also appear to have to specify it in the surefire config:
+    //<argLine>-Duser.timezone=UTC</argLine>
+    @BeforeClass
+    public static void setDefaultTimeZone() {
+        TimeZone.setDefault(TimeZone.getTimeZone("UTC"));
+    }
+    @AfterClass
+    public static void resetDefaultTimeZone() {
+        TimeZone.setDefault(CURR_TIME_ZONE);
+    }
+    @Test
+    public void testJPEG() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+        InputStream stream =
+                getClass().getResourceAsStream("/test-documents/testJPEG_EXIF.jpg");
+        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+        // Core EXIF/TIFF tags
+        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
+        assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
+
+        assertEquals("6.25E-4", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1600
+        assertEquals("5.6", metadata.get(Metadata.F_NUMBER));
+        assertEquals("false", metadata.get(Metadata.FLASH_FIRED));
+        assertEquals("194.0", metadata.get(Metadata.FOCAL_LENGTH));
+        assertEquals("400", metadata.get(Metadata.ISO_SPEED_RATINGS));
+        assertEquals("Canon", metadata.get(Metadata.EQUIPMENT_MAKE));
+        assertEquals("Canon EOS 40D", metadata.get(Metadata.EQUIPMENT_MODEL));
+        assertEquals("Adobe Photoshop CS3 Macintosh", metadata.get(Metadata.SOFTWARE));
+        assertEquals(null, metadata.get(Metadata.ORIENTATION)); // Not present
+        assertEquals("240.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
+        assertEquals("240.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
+        assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
+
+        // Check that EXIF/TIFF tags come through with their raw values too
+        // (This may be removed for Tika 1.0, as we support more of them
+        //  with explicit Metadata entries)
+        assertEquals("Canon EOS 40D", metadata.get("Model"));
+
+        // Common tags
+        assertEquals("2009-10-02T23:02:49", metadata.get(Metadata.LAST_MODIFIED));
+        assertEquals("Date/Time Original for when the photo was taken, unspecified time zone",
+                "2009-08-11T09:09:45", metadata.get(TikaCoreProperties.CREATED));
+        List<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
+        assertTrue("'canon-55-250' expected in " + keywords, keywords.contains("canon-55-250"));
+        assertTrue("'moscow-birds' expected in " + keywords, keywords.contains("moscow-birds"));
+        assertTrue("'serbor' expected in " + keywords, keywords.contains("serbor"));
+        assertFalse(keywords.contains("canon-55-250 moscow-birds serbor"));
+        List<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
+        assertTrue("'canon-55-250' expected in " + subject, subject.contains("canon-55-250"));
+        assertTrue("'moscow-birds' expected in " + subject, subject.contains("moscow-birds"));
+        assertTrue("'serbor' expected in " + subject, subject.contains("serbor"));
+        assertFalse(subject.contains("canon-55-250 moscow-birds serbor"));
+    }
+
+    /**
+     * Test for a file with Geographic information (lat, long etc) in it
+     */
+    @Test
+    public void testJPEGGeo() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+        InputStream stream =
+                getClass().getResourceAsStream("/test-documents/testJPEG_GEO.jpg");
+        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+        // Geo tags
+        assertEquals("12.54321", metadata.get(Metadata.LATITUDE));
+        assertEquals("-54.1234", metadata.get(Metadata.LONGITUDE));
+
+        // Core EXIF/TIFF tags
+        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
+        assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
+
+        assertEquals("6.25E-4", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1600
+        assertEquals("5.6", metadata.get(Metadata.F_NUMBER));
+        assertEquals("false", metadata.get(Metadata.FLASH_FIRED));
+        assertEquals("194.0", metadata.get(Metadata.FOCAL_LENGTH));
+        assertEquals("400", metadata.get(Metadata.ISO_SPEED_RATINGS));
+        assertEquals("Canon", metadata.get(Metadata.EQUIPMENT_MAKE));
+        assertEquals("Canon EOS 40D", metadata.get(Metadata.EQUIPMENT_MODEL));
+        assertEquals("Adobe Photoshop CS3 Macintosh", metadata.get(Metadata.SOFTWARE));
+        assertEquals(null, metadata.get(Metadata.ORIENTATION)); // Not present
+        assertEquals("240.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
+        assertEquals("240.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
+        assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
+
+        // Common tags
+        assertEquals("Date/Time Original for when the photo was taken, unspecified time zone",
+                "2009-08-11T09:09:45", metadata.get(TikaCoreProperties.CREATED));
+        assertEquals("This image has different Date/Time than Date/Time Original, so it is probably modification date",
+                "2009-10-02T23:02:49", metadata.get(Metadata.LAST_MODIFIED));
+        assertEquals("Date/Time Original should be stored in EXIF field too",
+                "2009-08-11T09:09:45", metadata.get(TIFF.ORIGINAL_DATE));
+        assertEquals("canon-55-250", metadata.getValues(TikaCoreProperties.KEYWORDS)[0]);
+        assertEquals("canon-55-250", metadata.getValues(Metadata.KEYWORDS)[0]);
+    }
+
+    /**
+     * Test for an image with the geographic information stored in a slightly
+     * different way, see TIKA-915 for details
+     * Disabled for now, pending a fix to the underlying library
+     */
+    @Test
+    public void testJPEGGeo2() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+        InputStream stream =
+                getClass().getResourceAsStream("/test-documents/testJPEG_GEO_2.jpg");
+        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+        // Geo tags should be there with 5dp, and not rounded
+        assertEquals("51.575762", metadata.get(Metadata.LATITUDE));
+        assertEquals("-1.567886", metadata.get(Metadata.LONGITUDE));
+    }
+
+    @Test
+    public void testJPEGTitleAndDescription() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+        InputStream stream =
+                getClass().getResourceAsStream("/test-documents/testJPEG_commented.jpg");
+        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+        // embedded comments with non-ascii characters
+        assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
+        assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR)); // Dublin Core
+        // xmp handles spaces in keywords, returns "bird watching, nature reserve, coast, grazelands"
+        // but we have to replace them with underscore
+
+        List<String> keywords = Arrays.asList(metadata.getValues(Metadata.KEYWORDS));
+        assertTrue(keywords.contains("coast"));
+        assertTrue(keywords.contains("bird watching"));
+        assertEquals(keywords, Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)));
+
+        // Core EXIF/TIFF tags
+        assertEquals("103", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("77", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
+        assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
+
+        assertEquals("1.0E-6", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1000000
+        assertEquals("2.8", metadata.get(Metadata.F_NUMBER));
+        assertEquals("4.6", metadata.get(Metadata.FOCAL_LENGTH));
+        assertEquals("114", metadata.get(Metadata.ISO_SPEED_RATINGS));
+        assertEquals(null, metadata.get(Metadata.EQUIPMENT_MAKE));
+        assertEquals(null, metadata.get(Metadata.EQUIPMENT_MODEL));
+        assertEquals(null, metadata.get(Metadata.SOFTWARE));
+        assertEquals("1", metadata.get(Metadata.ORIENTATION)); // Not present
+        assertEquals("300.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
+        assertEquals("300.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
+        assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
+    }
+
+    @Test
+    public void testJPEGTitleAndDescriptionPhotoshop() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+        InputStream stream =
+                getClass().getResourceAsStream("/test-documents/testJPEG_commented_pspcs2mac.jpg");
+        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+        // embedded comments with non-ascii characters
+        assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
+        assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR));
+        List<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
+        assertTrue("got " + keywords, keywords.contains("bird watching"));
+        List<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
+        assertTrue("got " + subject, subject.contains("bird watching"));
+    }
+
+    @Test
+    public void testJPEGTitleAndDescriptionXnviewmp() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+        InputStream stream =
+                getClass().getResourceAsStream("/test-documents/testJPEG_commented_xnviewmp026.jpg");
+        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+        // XnViewMp's default comment dialog has only comment, not headline.
+        // Comment is embedded only if "Write comments in XMP" is enabled in settings
+        assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
+        // xmp handles spaces in keywords, returns "bird watching, nature reserve, coast, grazelands"
+        // but we have to replace them with underscore
+        String[] subject = metadata.getValues(TikaCoreProperties.KEYWORDS);
+        List<String> keywords = Arrays.asList(subject);
+        assertTrue("'coast'" + " not in " + keywords, keywords.contains("coast"));
+        assertTrue("'nature reserve'" + " not in " + keywords, keywords.contains("nature reserve"));
+    }
+
+    @Test
+    public void testJPEGoddTagComponent() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+        InputStream stream =
+                getClass().getResourceAsStream("/test-documents/testJPEG_oddTagComponent.jpg");
+        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+        assertEquals(null, metadata.get(TikaCoreProperties.TITLE));
+        assertEquals(null, metadata.get(TikaCoreProperties.DESCRIPTION));
+        assertEquals("251", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("384", metadata.get(Metadata.IMAGE_LENGTH));
+    }
+
+    @Test
+    public void testJPEGEmptyEXIFDateTime() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+        InputStream stream =
+                getClass().getResourceAsStream("/test-documents/testJPEG_EXIF_emptyDateTime.jpg");
+        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+        assertEquals("300.0", metadata.get(TIFF.RESOLUTION_HORIZONTAL));
+        assertEquals("300.0", metadata.get(TIFF.RESOLUTION_VERTICAL));
+    }
+
+    @Test
+    public void testJPEGXMPMM() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+        InputStream stream =
+                getClass().getResourceAsStream("/test-documents/testJPEG_EXIF_emptyDateTime.jpg");
+        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+        //TODO: when jempbox is fixed/xmpbox is used
+        //add tests for history...currently not extracted
+        assertEquals("xmp.did:49E997348D4911E1AB62EBF9B374B234",
+                metadata.get(XMPMM.DOCUMENTID));
+    }
+
+}

[18/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestDirectoryListingEntry.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestDirectoryListingEntry.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestDirectoryListingEntry.java
index 65894e3..e337c15 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestDirectoryListingEntry.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestDirectoryListingEntry.java
@@ -1,85 +1,85 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-
-import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
-import org.junit.Before;
-import org.junit.Test;
-
-/**
- * Tests public methods of the DirectoryListingEntry class
- * 
- * @author olegt
- * 
- */
-public class TestDirectoryListingEntry {
-    private DirectoryListingEntry dle = null;
-
-    @Before
-    public void setUp() throws Exception {
-        dle = new DirectoryListingEntry(TestParameters.nameLength,
-                TestParameters.entryName, TestParameters.entryType,
-                TestParameters.offset, TestParameters.length);
-    }
-
-    @Test
-    public void testDefaultConstructor() {
-        assertNotNull(dle);
-    }
-
-    @Test
-    public void testParamConstructor() {
-        assertEquals(TestParameters.nameLength, dle.getNameLength());
-        assertEquals(TestParameters.entryName, dle.getName());
-        assertEquals(TestParameters.entryType, dle.getEntryType());
-        assertEquals(TestParameters.offset, dle.getOffset());
-        assertEquals(TestParameters.length, dle.getLength());
-    }
-
-    @Test
-    public void testToString() {
-        assertNotNull(dle.toString());
-    }
-
-    @Test
-    public void testGetNameLength() {
-        assertEquals(TestParameters.nameLength, dle.getNameLength());
-    }
-
-    @Test
-    public void testGetName() {
-        assertEquals(TestParameters.entryName, dle.getName());
-    }
-
-    @Test
-    public void testGetEntryType() {
-        assertEquals(TestParameters.entryType, dle.getEntryType());
-    }
-
-    @Test
-    public void testGetOffset() {
-        assertEquals(TestParameters.offset, dle.getOffset());
-    }
-
-    @Test
-    public void testGetLength() {
-        assertEquals(TestParameters.length, dle.getLength());
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Tests public methods of the DirectoryListingEntry class
+ * 
+ * @author olegt
+ * 
+ */
+public class TestDirectoryListingEntry {
+    private DirectoryListingEntry dle = null;
+
+    @Before
+    public void setUp() throws Exception {
+        dle = new DirectoryListingEntry(TestParameters.nameLength,
+                TestParameters.entryName, TestParameters.entryType,
+                TestParameters.offset, TestParameters.length);
+    }
+
+    @Test
+    public void testDefaultConstructor() {
+        assertNotNull(dle);
+    }
+
+    @Test
+    public void testParamConstructor() {
+        assertEquals(TestParameters.nameLength, dle.getNameLength());
+        assertEquals(TestParameters.entryName, dle.getName());
+        assertEquals(TestParameters.entryType, dle.getEntryType());
+        assertEquals(TestParameters.offset, dle.getOffset());
+        assertEquals(TestParameters.length, dle.getLength());
+    }
+
+    @Test
+    public void testToString() {
+        assertNotNull(dle.toString());
+    }
+
+    @Test
+    public void testGetNameLength() {
+        assertEquals(TestParameters.nameLength, dle.getNameLength());
+    }
+
+    @Test
+    public void testGetName() {
+        assertEquals(TestParameters.entryName, dle.getName());
+    }
+
+    @Test
+    public void testGetEntryType() {
+        assertEquals(TestParameters.entryType, dle.getEntryType());
+    }
+
+    @Test
+    public void testGetOffset() {
+        assertEquals(TestParameters.offset, dle.getOffset());
+    }
+
+    @Test
+    public void testGetLength() {
+        assertEquals(TestParameters.length, dle.getLength());
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestParameters.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestParameters.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestParameters.java
index 2512e85..5937d18 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestParameters.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestParameters.java
@@ -1,104 +1,104 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.parser.chm.core.ChmCommons.EntryType;
-
-/**
- * Holds test parameters such as verification points
- */
-public class TestParameters {
-    /* Prevents initialization */
-    private TestParameters() {
-    }
-
-    /* Tests values */
-    static final int nameLength = 5;
-    static final String entryName = TestParameters.class.getName();
-    static EntryType entryType = EntryType.COMPRESSED;
-    static final int offset = 3;
-    static final int length = 20;
-    static final int NTHREADS = 2;
-
-    static final int BUFFER_SIZE = 16384;
-
-    static final byte[] chmData = readResource("/test-documents/testChm.chm");
-
-    private static byte[] readResource(String name) {
-        try {
-            try (InputStream stream = TestParameters.class.getResourceAsStream(name)) {
-                return IOUtils.toByteArray(stream);
-            }
-        } catch (IOException e) {
-            throw new RuntimeException(e);
-        }
-    }
-
-    /* Verification points */
-    static final String VP_CHM_MIME_TYPE = "Content-Type=application/x-chm";
-    static final String VP_EXTRACTED_TEXT = "The TCard method accepts only numeric arguments";
-    static final String VP_ISTF_SIGNATURE = "ITSF";
-    static final String VP_ISTP_SIGNATURE = "ITSP";
-    static final String VP_PMGL_SIGNATURE = "PMGL";
-    static final String VP_CONTROL_DATA_SIGNATURE = "LZXC";
-
-    static final int VP_DIRECTORY_LENGTH = 4180;
-    static final int VP_DATA_OFFSET_LENGTH = 4300;
-    static final int VP_DIRECTORY_OFFSET = 120;
-    static final int VP_ITSF_HEADER_LENGTH = 96;
-    static final int VP_LANGUAGE_ID = 1033;
-    static final int VP_LAST_MODIFIED = 1042357880;
-    static final int VP_UNKNOWN_000C = 1;
-    static final int VP_UNKNOWN_LEN = 24;
-    static final int VP_UNKNOWN_OFFSET = 96;
-    static final int VP_VERSION = 3;
-    static final int VP_BLOCK_LENGTH = 4096;
-    static final int VP_BLOCK_INDEX_INTERVAL = 2;
-    static final int VP_ITSP_HEADER_LENGTH = 84;
-    static final int VP_INDEX_DEPTH = 1;
-    static final int VP_INDEX_HEAD = 0;
-    static final int VP_INDEX_ROOT = -1;
-    static final int VP_UNKNOWN_NUM_BLOCKS = -1;
-    static final int VP_ITSP_UNKNOWN_000C = 10;
-    static final int VP_ITSP_UNKNOWN_0024 = 0;
-    static final int VP_ITSP_UNKNOWN_002C = 1;
-    static final int VP_ITSP_BYTEARR_LEN = 16;
-    static final int VP_ITSP_VERSION = 1;
-    static final int VP_RESET_INTERVAL = 2;
-    static final int VP_CONTROL_DATA_SIZE = 6;
-    static final int VP_UNKNOWN_18 = 0;
-    static final int VP_CONTROL_DATA_VERSION = 2;
-    static final int VP_WINDOW_SIZE = 65536;
-    static final int VP_WINDOWS_PER_RESET = 1;
-    static final int VP_CHM_ENTITIES_NUMBER = 100; //updated  by Hawking
-    static final int VP_PMGI_FREE_SPACE = 3;
-    static final int VP_PMGL_BLOCK_NEXT = -1;
-    static final int VP_PMGL_BLOCK_PREV = -1;
-    static final int VP_PMGL_FREE_SPACE = 1644;
-    static final int VP_PMGL_UNKNOWN_008 = 0;
-    static final int VP_RESET_TABLE_BA = 12;
-    static final int VP_RES_TBL_BLOCK_LENGTH = 32768;
-    static final int VP_RES_TBL_COMPR_LENGTH = 177408;
-    static final int VP_RES_TBL_UNCOMP_LENGTH = 383786;
-    static final int VP_TBL_OFFSET = 40;
-    static final int VP_RES_TBL_UNKNOWN = 8;
-    static final int VP_RES_TBL_VERSION = 2;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.parser.chm.core.ChmCommons.EntryType;
+
+/**
+ * Holds test parameters such as verification points
+ */
+public class TestParameters {
+    /* Prevents initialization */
+    private TestParameters() {
+    }
+
+    /* Tests values */
+    static final int nameLength = 5;
+    static final String entryName = TestParameters.class.getName();
+    static EntryType entryType = EntryType.COMPRESSED;
+    static final int offset = 3;
+    static final int length = 20;
+    static final int NTHREADS = 2;
+
+    static final int BUFFER_SIZE = 16384;
+
+    static final byte[] chmData = readResource("/test-documents/testChm.chm");
+
+    private static byte[] readResource(String name) {
+        try {
+            try (InputStream stream = TestParameters.class.getResourceAsStream(name)) {
+                return IOUtils.toByteArray(stream);
+            }
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /* Verification points */
+    static final String VP_CHM_MIME_TYPE = "Content-Type=application/x-chm";
+    static final String VP_EXTRACTED_TEXT = "The TCard method accepts only numeric arguments";
+    static final String VP_ISTF_SIGNATURE = "ITSF";
+    static final String VP_ISTP_SIGNATURE = "ITSP";
+    static final String VP_PMGL_SIGNATURE = "PMGL";
+    static final String VP_CONTROL_DATA_SIGNATURE = "LZXC";
+
+    static final int VP_DIRECTORY_LENGTH = 4180;
+    static final int VP_DATA_OFFSET_LENGTH = 4300;
+    static final int VP_DIRECTORY_OFFSET = 120;
+    static final int VP_ITSF_HEADER_LENGTH = 96;
+    static final int VP_LANGUAGE_ID = 1033;
+    static final int VP_LAST_MODIFIED = 1042357880;
+    static final int VP_UNKNOWN_000C = 1;
+    static final int VP_UNKNOWN_LEN = 24;
+    static final int VP_UNKNOWN_OFFSET = 96;
+    static final int VP_VERSION = 3;
+    static final int VP_BLOCK_LENGTH = 4096;
+    static final int VP_BLOCK_INDEX_INTERVAL = 2;
+    static final int VP_ITSP_HEADER_LENGTH = 84;
+    static final int VP_INDEX_DEPTH = 1;
+    static final int VP_INDEX_HEAD = 0;
+    static final int VP_INDEX_ROOT = -1;
+    static final int VP_UNKNOWN_NUM_BLOCKS = -1;
+    static final int VP_ITSP_UNKNOWN_000C = 10;
+    static final int VP_ITSP_UNKNOWN_0024 = 0;
+    static final int VP_ITSP_UNKNOWN_002C = 1;
+    static final int VP_ITSP_BYTEARR_LEN = 16;
+    static final int VP_ITSP_VERSION = 1;
+    static final int VP_RESET_INTERVAL = 2;
+    static final int VP_CONTROL_DATA_SIZE = 6;
+    static final int VP_UNKNOWN_18 = 0;
+    static final int VP_CONTROL_DATA_VERSION = 2;
+    static final int VP_WINDOW_SIZE = 65536;
+    static final int VP_WINDOWS_PER_RESET = 1;
+    static final int VP_CHM_ENTITIES_NUMBER = 100; //updated  by Hawking
+    static final int VP_PMGI_FREE_SPACE = 3;
+    static final int VP_PMGL_BLOCK_NEXT = -1;
+    static final int VP_PMGL_BLOCK_PREV = -1;
+    static final int VP_PMGL_FREE_SPACE = 1644;
+    static final int VP_PMGL_UNKNOWN_008 = 0;
+    static final int VP_RESET_TABLE_BA = 12;
+    static final int VP_RES_TBL_BLOCK_LENGTH = 32768;
+    static final int VP_RES_TBL_COMPR_LENGTH = 177408;
+    static final int VP_RES_TBL_UNCOMP_LENGTH = 383786;
+    static final int VP_TBL_OFFSET = 40;
+    static final int VP_RES_TBL_UNKNOWN = 8;
+    static final int VP_RES_TBL_VERSION = 2;
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestPmgiHeader.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestPmgiHeader.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestPmgiHeader.java
index 493c03e..070583b 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestPmgiHeader.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestPmgiHeader.java
@@ -1,45 +1,45 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import org.apache.tika.parser.chm.accessor.ChmPmgiHeader;
-import org.junit.Before;
-import org.junit.Test;
-
-public class TestPmgiHeader {
-    ChmPmgiHeader chmPmgiHeader = null;
-
-    @Before
-    public void setUp() throws Exception {
-        byte[] data = TestParameters.chmData;
-        chmPmgiHeader = new ChmPmgiHeader();
-        chmPmgiHeader.parse(data, chmPmgiHeader);
-    }
-
-    @Test
-    public void testToString() {
-        assertTrue((chmPmgiHeader != null) && (chmPmgiHeader.toString().length() > 0));
-    }
-
-    @Test
-    public void testGetFreeSpace() {
-        assertEquals(TestParameters.VP_PMGI_FREE_SPACE, chmPmgiHeader.getFreeSpace());
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.parser.chm.accessor.ChmPmgiHeader;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestPmgiHeader {
+    ChmPmgiHeader chmPmgiHeader = null;
+
+    @Before
+    public void setUp() throws Exception {
+        byte[] data = TestParameters.chmData;
+        chmPmgiHeader = new ChmPmgiHeader();
+        chmPmgiHeader.parse(data, chmPmgiHeader);
+    }
+
+    @Test
+    public void testToString() {
+        assertTrue((chmPmgiHeader != null) && (chmPmgiHeader.toString().length() > 0));
+    }
+
+    @Test
+    public void testGetFreeSpace() {
+        assertEquals(TestParameters.VP_PMGI_FREE_SPACE, chmPmgiHeader.getFreeSpace());
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestPmglHeader.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestPmglHeader.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestPmglHeader.java
index f8652da..55c08f2 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestPmglHeader.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestPmglHeader.java
@@ -1,76 +1,76 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import org.apache.tika.parser.chm.accessor.ChmPmglHeader;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.junit.Before;
-import org.junit.Test;
-
-public class TestPmglHeader {
-    ChmPmglHeader chmPmglHeader = null;
-
-    @Before
-    public void setUp() throws Exception {
-        byte[] data = TestParameters.chmData;
-        chmPmglHeader = new ChmPmglHeader();
-        chmPmglHeader.parse(ChmCommons.copyOfRange(data,
-                ChmConstants.START_PMGL, ChmConstants.START_PMGL
-                        + ChmConstants.CHM_PMGL_LEN + 10), chmPmglHeader);
-    }
-
-    @Test
-    public void testToString() {
-        assertTrue((chmPmglHeader != null)
-                && chmPmglHeader.toString().length() > 0);
-    }
-
-    @Test
-    public void testChmPmglHeaderGet() {
-        assertEquals(TestParameters.VP_PMGL_SIGNATURE, new String(
-                chmPmglHeader.getSignature(), UTF_8));
-    }
-
-    @Test
-    public void testGetBlockNext() {
-        assertEquals(TestParameters.VP_PMGL_BLOCK_NEXT,
-                chmPmglHeader.getBlockNext());
-    }
-
-    @Test
-    public void testGetBlockPrev() {
-        assertEquals(TestParameters.VP_PMGL_BLOCK_PREV,
-                chmPmglHeader.getBlockPrev());
-    }
-
-    @Test
-    public void testGetFreeSpace() {
-        assertEquals(TestParameters.VP_PMGL_FREE_SPACE,
-                chmPmglHeader.getFreeSpace());
-    }
-
-    @Test
-    public void testGetUnknown0008() {
-        assertEquals(TestParameters.VP_PMGL_UNKNOWN_008,
-                chmPmglHeader.getUnknown0008());
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.parser.chm.accessor.ChmPmglHeader;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestPmglHeader {
+    ChmPmglHeader chmPmglHeader = null;
+
+    @Before
+    public void setUp() throws Exception {
+        byte[] data = TestParameters.chmData;
+        chmPmglHeader = new ChmPmglHeader();
+        chmPmglHeader.parse(ChmCommons.copyOfRange(data,
+                ChmConstants.START_PMGL, ChmConstants.START_PMGL
+                        + ChmConstants.CHM_PMGL_LEN + 10), chmPmglHeader);
+    }
+
+    @Test
+    public void testToString() {
+        assertTrue((chmPmglHeader != null)
+                && chmPmglHeader.toString().length() > 0);
+    }
+
+    @Test
+    public void testChmPmglHeaderGet() {
+        assertEquals(TestParameters.VP_PMGL_SIGNATURE, new String(
+                chmPmglHeader.getSignature(), UTF_8));
+    }
+
+    @Test
+    public void testGetBlockNext() {
+        assertEquals(TestParameters.VP_PMGL_BLOCK_NEXT,
+                chmPmglHeader.getBlockNext());
+    }
+
+    @Test
+    public void testGetBlockPrev() {
+        assertEquals(TestParameters.VP_PMGL_BLOCK_PREV,
+                chmPmglHeader.getBlockPrev());
+    }
+
+    @Test
+    public void testGetFreeSpace() {
+        assertEquals(TestParameters.VP_PMGL_FREE_SPACE,
+                chmPmglHeader.getFreeSpace());
+    }
+
+    @Test
+    public void testGetUnknown0008() {
+        assertEquals(TestParameters.VP_PMGL_UNKNOWN_008,
+                chmPmglHeader.getUnknown0008());
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
index 78761fe..6ef803d 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
@@ -1,156 +1,156 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mbox;
-
-import static org.apache.tika.TikaTest.assertContains;
-import static org.junit.Assert.assertEquals;
-
-import java.io.InputStream;
-import java.util.Map;
-
-import org.apache.tika.detect.TypeDetector;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Before;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class MboxParserTest {
-
-    protected ParseContext recursingContext;
-    private Parser autoDetectParser;
-    private TypeDetector typeDetector;
-    private MboxParser mboxParser;
-
-    private static InputStream getStream(String name) {
-        return MboxParserTest.class.getClass().getResourceAsStream(name);
-    }
-
-    @Before
-    public void setUp() throws Exception {
-        typeDetector = new TypeDetector();
-        autoDetectParser = new AutoDetectParser(typeDetector);
-        recursingContext = new ParseContext();
-        recursingContext.set(Parser.class, autoDetectParser);
-
-        mboxParser = new MboxParser();
-        mboxParser.setTracking(true);
-    }
-
-    @Test
-    public void testSimple() throws Exception {
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = getStream("/test-documents/simple.mbox")) {
-            mboxParser.parse(stream, handler, metadata, recursingContext);
-        }
-
-        String content = handler.toString();
-        assertContains("Test content 1", content);
-        assertContains("Test content 2", content);
-        assertEquals("application/mbox", metadata.get(Metadata.CONTENT_TYPE));
-
-        Map<Integer, Metadata> mailsMetadata = mboxParser.getTrackingMetadata();
-        assertEquals("Nb. Of mails", 2, mailsMetadata.size());
-
-        Metadata mail1 = mailsMetadata.get(0);
-        assertEquals("message/rfc822", mail1.get(Metadata.CONTENT_TYPE));
-        assertEquals("envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009", mail1.get("MboxParser-from"));
-
-        Metadata mail2 = mailsMetadata.get(1);
-        assertEquals("message/rfc822", mail2.get(Metadata.CONTENT_TYPE));
-        assertEquals("envelope-sender-mailbox-name Mon Jun 01 11:00:00 2010", mail2.get("MboxParser-from"));
-    }
-
-    @Test
-    public void testHeaders() throws Exception {
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = getStream("/test-documents/headers.mbox")) {
-            mboxParser.parse(stream, handler, metadata, recursingContext);
-        }
-
-        assertContains("Test content", handler.toString());
-        assertEquals("Nb. Of mails", 1, mboxParser.getTrackingMetadata().size());
-
-        Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0);
-
-        assertEquals("2009-06-10T03:58:45Z", mailMetadata.get(TikaCoreProperties.CREATED));
-        assertEquals("<au...@domain.com>", mailMetadata.get(TikaCoreProperties.CREATOR));
-        assertEquals("subject", mailMetadata.get(Metadata.SUBJECT));
-        assertEquals("<au...@domain.com>", mailMetadata.get(Metadata.AUTHOR));
-        assertEquals("message/rfc822", mailMetadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("author@domain.com", mailMetadata.get("Message-From"));
-        assertEquals("<na...@domain.com>", mailMetadata.get("MboxParser-return-path"));
-    }
-
-    @Test
-    public void testMultilineHeader() throws Exception {
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = getStream("/test-documents/multiline.mbox")) {
-            mboxParser.parse(stream, handler, metadata, recursingContext);
-        }
-
-        assertEquals("Nb. Of mails", 1, mboxParser.getTrackingMetadata().size());
-
-        Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0);
-        assertEquals("from xxx by xxx with xxx; date", mailMetadata.get("MboxParser-received"));
-    }
-
-    @Test
-    public void testQuoted() throws Exception {
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = getStream("/test-documents/quoted.mbox")) {
-            mboxParser.parse(stream, handler, metadata, recursingContext);
-        }
-
-        assertContains("Test content", handler.toString());
-        assertContains("> quoted stuff", handler.toString());
-    }
-
-    @Test
-    public void testComplex() throws Exception {
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = getStream("/test-documents/complex.mbox")) {
-            mboxParser.parse(stream, handler, metadata, recursingContext);
-        }
-
-        assertEquals("Nb. Of mails", 3, mboxParser.getTrackingMetadata().size());
-
-        Metadata firstMail = mboxParser.getTrackingMetadata().get(0);
-        assertEquals("Re: question about when shuffle/sort start working", firstMail.get(Metadata.SUBJECT));
-        assertEquals("Re: question about when shuffle/sort start working", firstMail.get(TikaCoreProperties.TITLE));
-        assertEquals("Jothi Padmanabhan <jo...@yahoo-inc.com>", firstMail.get(Metadata.AUTHOR));
-        assertEquals("Jothi Padmanabhan <jo...@yahoo-inc.com>", firstMail.get(TikaCoreProperties.CREATOR));
-        assertEquals("core-user@hadoop.apache.org", firstMail.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
-
-        assertContains("When a Mapper completes", handler.toString());
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+import java.util.Map;
+
+import org.apache.tika.detect.TypeDetector;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Before;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class MboxParserTest {
+
+    protected ParseContext recursingContext;
+    private Parser autoDetectParser;
+    private TypeDetector typeDetector;
+    private MboxParser mboxParser;
+
+    private static InputStream getStream(String name) {
+        return MboxParserTest.class.getClass().getResourceAsStream(name);
+    }
+
+    @Before
+    public void setUp() throws Exception {
+        typeDetector = new TypeDetector();
+        autoDetectParser = new AutoDetectParser(typeDetector);
+        recursingContext = new ParseContext();
+        recursingContext.set(Parser.class, autoDetectParser);
+
+        mboxParser = new MboxParser();
+        mboxParser.setTracking(true);
+    }
+
+    @Test
+    public void testSimple() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = getStream("/test-documents/simple.mbox")) {
+            mboxParser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        String content = handler.toString();
+        assertContains("Test content 1", content);
+        assertContains("Test content 2", content);
+        assertEquals("application/mbox", metadata.get(Metadata.CONTENT_TYPE));
+
+        Map<Integer, Metadata> mailsMetadata = mboxParser.getTrackingMetadata();
+        assertEquals("Nb. Of mails", 2, mailsMetadata.size());
+
+        Metadata mail1 = mailsMetadata.get(0);
+        assertEquals("message/rfc822", mail1.get(Metadata.CONTENT_TYPE));
+        assertEquals("envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009", mail1.get("MboxParser-from"));
+
+        Metadata mail2 = mailsMetadata.get(1);
+        assertEquals("message/rfc822", mail2.get(Metadata.CONTENT_TYPE));
+        assertEquals("envelope-sender-mailbox-name Mon Jun 01 11:00:00 2010", mail2.get("MboxParser-from"));
+    }
+
+    @Test
+    public void testHeaders() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = getStream("/test-documents/headers.mbox")) {
+            mboxParser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertContains("Test content", handler.toString());
+        assertEquals("Nb. Of mails", 1, mboxParser.getTrackingMetadata().size());
+
+        Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0);
+
+        assertEquals("2009-06-10T03:58:45Z", mailMetadata.get(TikaCoreProperties.CREATED));
+        assertEquals("<au...@domain.com>", mailMetadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("subject", mailMetadata.get(Metadata.SUBJECT));
+        assertEquals("<au...@domain.com>", mailMetadata.get(Metadata.AUTHOR));
+        assertEquals("message/rfc822", mailMetadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("author@domain.com", mailMetadata.get("Message-From"));
+        assertEquals("<na...@domain.com>", mailMetadata.get("MboxParser-return-path"));
+    }
+
+    @Test
+    public void testMultilineHeader() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = getStream("/test-documents/multiline.mbox")) {
+            mboxParser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertEquals("Nb. Of mails", 1, mboxParser.getTrackingMetadata().size());
+
+        Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0);
+        assertEquals("from xxx by xxx with xxx; date", mailMetadata.get("MboxParser-received"));
+    }
+
+    @Test
+    public void testQuoted() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = getStream("/test-documents/quoted.mbox")) {
+            mboxParser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertContains("Test content", handler.toString());
+        assertContains("> quoted stuff", handler.toString());
+    }
+
+    @Test
+    public void testComplex() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = getStream("/test-documents/complex.mbox")) {
+            mboxParser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertEquals("Nb. Of mails", 3, mboxParser.getTrackingMetadata().size());
+
+        Metadata firstMail = mboxParser.getTrackingMetadata().get(0);
+        assertEquals("Re: question about when shuffle/sort start working", firstMail.get(Metadata.SUBJECT));
+        assertEquals("Re: question about when shuffle/sort start working", firstMail.get(TikaCoreProperties.TITLE));
+        assertEquals("Jothi Padmanabhan <jo...@yahoo-inc.com>", firstMail.get(Metadata.AUTHOR));
+        assertEquals("Jothi Padmanabhan <jo...@yahoo-inc.com>", firstMail.get(TikaCoreProperties.CREATOR));
+        assertEquals("core-user@hadoop.apache.org", firstMail.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
+
+        assertContains("When a Mapper completes", handler.toString());
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java
index 1d2904c..89a1b86 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java
@@ -1,110 +1,110 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mbox;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.ToHTMLContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-public class OutlookPSTParserTest extends TikaTest {
-
-  private Parser parser = new OutlookPSTParser();
-
-  @Test
-  public void testAccept() throws Exception {
-    assertTrue((parser.getSupportedTypes(null).contains(MediaType.application("vnd.ms-outlook-pst"))));
-  }
-
-  @Test
-  public void testParse() throws Exception {
-    Parser pstParser = new AutoDetectParser();
-    Metadata metadata = new Metadata();
-    ContentHandler handler = new ToHTMLContentHandler();
-
-    ParseContext context = new ParseContext();
-    EmbeddedTrackingExtrator trackingExtrator = new EmbeddedTrackingExtrator(context);
-    context.set(EmbeddedDocumentExtractor.class, trackingExtrator);
-    context.set(Parser.class, new AutoDetectParser());
-
-    pstParser.parse(getResourceAsStream("/test-documents/testPST.pst"), handler, metadata, context);
-
-    String output = handler.toString();
-
-    assertFalse(output.isEmpty());
-    assertTrue(output.contains("<meta name=\"Content-Length\" content=\"271360\">"));
-    assertTrue(output.contains("<meta name=\"Content-Type\" content=\"application/vnd.ms-outlook-pst\">"));
-
-    assertTrue(output.contains("<body><div class=\"email-folder\"><h1>"));
-    assertTrue(output.contains("<div class=\"embedded\" id=\"&lt;530D9CAC.5080901@gmail.com&gt;\"><h1>Re: Feature Generators</h1>"));
-    assertTrue(output.contains("<div class=\"embedded\" id=\"&lt;1393363252.28814.YahooMailNeo@web140906.mail.bf1.yahoo.com&gt;\"><h1>Re: init tokenizer fails: \"Bad type in putfield/putstatic\"</h1>"));
-    assertTrue(output.contains("Gary Murphy commented on TIKA-1250:"));
-
-    assertTrue(output.contains("<div class=\"email-folder\"><h1>Racine (pour la recherche)</h1>"));
-
-
-    List<Metadata> metaList = trackingExtrator.trackingMetadata;
-    assertEquals(6, metaList.size());
-
-    Metadata firstMail = metaList.get(0);
-    assertEquals("Jörn Kottmann", firstMail.get(TikaCoreProperties.CREATOR));
-    assertEquals("Re: Feature Generators", firstMail.get(TikaCoreProperties.TITLE));
-    assertEquals("kottmann@gmail.com", firstMail.get("senderEmailAddress"));
-    assertEquals("users@opennlp.apache.org", firstMail.get("displayTo"));
-    assertEquals("", firstMail.get("displayCC"));
-    assertEquals("", firstMail.get("displayBCC"));
-  }
-
-
-  private class EmbeddedTrackingExtrator extends ParsingEmbeddedDocumentExtractor {
-    List<Metadata> trackingMetadata = new ArrayList<Metadata>();
-
-    public EmbeddedTrackingExtrator(ParseContext context) {
-      super(context);
-    }
-
-    @Override
-    public boolean shouldParseEmbedded(Metadata metadata) {
-      return true;
-    }
-
-    @Override
-    public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml) throws SAXException, IOException {
-      this.trackingMetadata.add(metadata);
-      super.parseEmbedded(stream, handler, metadata, outputHtml);
-    }
-
-  }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.ToHTMLContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class OutlookPSTParserTest extends TikaTest {
+
+  private Parser parser = new OutlookPSTParser();
+
+  @Test
+  public void testAccept() throws Exception {
+    assertTrue((parser.getSupportedTypes(null).contains(MediaType.application("vnd.ms-outlook-pst"))));
+  }
+
+  @Test
+  public void testParse() throws Exception {
+    Parser pstParser = new AutoDetectParser();
+    Metadata metadata = new Metadata();
+    ContentHandler handler = new ToHTMLContentHandler();
+
+    ParseContext context = new ParseContext();
+    EmbeddedTrackingExtrator trackingExtrator = new EmbeddedTrackingExtrator(context);
+    context.set(EmbeddedDocumentExtractor.class, trackingExtrator);
+    context.set(Parser.class, new AutoDetectParser());
+
+    pstParser.parse(getResourceAsStream("/test-documents/testPST.pst"), handler, metadata, context);
+
+    String output = handler.toString();
+
+    assertFalse(output.isEmpty());
+    assertTrue(output.contains("<meta name=\"Content-Length\" content=\"271360\">"));
+    assertTrue(output.contains("<meta name=\"Content-Type\" content=\"application/vnd.ms-outlook-pst\">"));
+
+    assertTrue(output.contains("<body><div class=\"email-folder\"><h1>"));
+    assertTrue(output.contains("<div class=\"embedded\" id=\"&lt;530D9CAC.5080901@gmail.com&gt;\"><h1>Re: Feature Generators</h1>"));
+    assertTrue(output.contains("<div class=\"embedded\" id=\"&lt;1393363252.28814.YahooMailNeo@web140906.mail.bf1.yahoo.com&gt;\"><h1>Re: init tokenizer fails: \"Bad type in putfield/putstatic\"</h1>"));
+    assertTrue(output.contains("Gary Murphy commented on TIKA-1250:"));
+
+    assertTrue(output.contains("<div class=\"email-folder\"><h1>Racine (pour la recherche)</h1>"));
+
+
+    List<Metadata> metaList = trackingExtrator.trackingMetadata;
+    assertEquals(6, metaList.size());
+
+    Metadata firstMail = metaList.get(0);
+    assertEquals("Jörn Kottmann", firstMail.get(TikaCoreProperties.CREATOR));
+    assertEquals("Re: Feature Generators", firstMail.get(TikaCoreProperties.TITLE));
+    assertEquals("kottmann@gmail.com", firstMail.get("senderEmailAddress"));
+    assertEquals("users@opennlp.apache.org", firstMail.get("displayTo"));
+    assertEquals("", firstMail.get("displayCC"));
+    assertEquals("", firstMail.get("displayBCC"));
+  }
+
+
+  private class EmbeddedTrackingExtrator extends ParsingEmbeddedDocumentExtractor {
+    List<Metadata> trackingMetadata = new ArrayList<Metadata>();
+
+    public EmbeddedTrackingExtrator(ParseContext context) {
+      super(context);
+    }
+
+    @Override
+    public boolean shouldParseEmbedded(Metadata metadata) {
+      return true;
+    }
+
+    @Override
+    public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml) throws SAXException, IOException {
+      this.trackingMetadata.add(metadata);
+      super.parseEmbedded(stream, handler, metadata, outputHtml);
+    }
+
+  }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java
index 2b3d141..f454446 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java
@@ -1,75 +1,75 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-
-import java.net.URL;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.extractor.ContainerExtractor;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.mime.MediaType;
-
-/**
- * Parent class of tests that the various POI powered parsers are
- * able to extract their embedded contents.
- */
-public abstract class AbstractPOIContainerExtractionTest extends TikaTest {
-    public static final MediaType TYPE_DOC = MediaType.application("msword");
-    public static final MediaType TYPE_PPT = MediaType.application("vnd.ms-powerpoint");
-    public static final MediaType TYPE_XLS = MediaType.application("vnd.ms-excel");
-    public static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
-    public static final MediaType TYPE_PPTX = MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation");
-    public static final MediaType TYPE_XLSX = MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet");
-    public static final MediaType TYPE_MSG = MediaType.application("vnd.ms-outlook");
-
-    public static final MediaType TYPE_TXT = MediaType.text("plain");
-    public static final MediaType TYPE_PDF = MediaType.application("pdf");
-
-    public static final MediaType TYPE_JPG = MediaType.image("jpeg");
-    public static final MediaType TYPE_GIF = MediaType.image("gif");
-    public static final MediaType TYPE_PNG = MediaType.image("png");
-    public static final MediaType TYPE_EMF = MediaType.application("x-emf");
-    public static final MediaType TYPE_WMF = MediaType.application("x-msmetafile");
-
-    protected static TikaInputStream getTestFile(String filename) throws Exception {
-        URL input = AbstractPOIContainerExtractionTest.class.getResource(
-                "/test-documents/" + filename);
-        assertNotNull(filename + " not found", input);
-
-        return TikaInputStream.get(input);
-    }
-
-    protected TrackingHandler process(String filename, ContainerExtractor extractor, boolean recurse) throws Exception {
-        try (TikaInputStream stream = getTestFile(filename)) {
-            assertEquals(true, extractor.isSupported(stream));
-
-            // Process it
-            TrackingHandler handler = new TrackingHandler();
-            if (recurse) {
-                extractor.extract(stream, extractor, handler);
-            } else {
-                extractor.extract(stream, null, handler);
-            }
-
-            // So they can check what happened
-            return handler;
-        }
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+import java.net.URL;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Parent class of tests that the various POI powered parsers are
+ * able to extract their embedded contents.
+ */
+public abstract class AbstractPOIContainerExtractionTest extends TikaTest {
+    public static final MediaType TYPE_DOC = MediaType.application("msword");
+    public static final MediaType TYPE_PPT = MediaType.application("vnd.ms-powerpoint");
+    public static final MediaType TYPE_XLS = MediaType.application("vnd.ms-excel");
+    public static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
+    public static final MediaType TYPE_PPTX = MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation");
+    public static final MediaType TYPE_XLSX = MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+    public static final MediaType TYPE_MSG = MediaType.application("vnd.ms-outlook");
+
+    public static final MediaType TYPE_TXT = MediaType.text("plain");
+    public static final MediaType TYPE_PDF = MediaType.application("pdf");
+
+    public static final MediaType TYPE_JPG = MediaType.image("jpeg");
+    public static final MediaType TYPE_GIF = MediaType.image("gif");
+    public static final MediaType TYPE_PNG = MediaType.image("png");
+    public static final MediaType TYPE_EMF = MediaType.application("x-emf");
+    public static final MediaType TYPE_WMF = MediaType.application("x-msmetafile");
+
+    protected static TikaInputStream getTestFile(String filename) throws Exception {
+        URL input = AbstractPOIContainerExtractionTest.class.getResource(
+                "/test-documents/" + filename);
+        assertNotNull(filename + " not found", input);
+
+        return TikaInputStream.get(input);
+    }
+
+    protected TrackingHandler process(String filename, ContainerExtractor extractor, boolean recurse) throws Exception {
+        try (TikaInputStream stream = getTestFile(filename)) {
+            assertEquals(true, extractor.isSupported(stream));
+
+            // Process it
+            TrackingHandler handler = new TrackingHandler();
+            if (recurse) {
+                extractor.extract(stream, extractor, handler);
+            } else {
+                extractor.extract(stream, null, handler);
+            }
+
+            // So they can check what happened
+            return handler;
+        }
+    }
+}

[34/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/audio/AudioParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/audio/AudioParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/audio/AudioParser.java
index 16dd37f..a6c2e9d 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/audio/AudioParser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/audio/AudioParser.java
@@ -1,139 +1,139 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.audio;
-
-import java.io.BufferedInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Map.Entry;
-import java.util.Set;
-
-import javax.sound.sampled.AudioFileFormat;
-import javax.sound.sampled.AudioFileFormat.Type;
-import javax.sound.sampled.AudioFormat;
-import javax.sound.sampled.AudioSystem;
-import javax.sound.sampled.UnsupportedAudioFileException;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.XMPDM;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-public class AudioParser extends AbstractParser {
-
-    /** Serial version UID */
-    private static final long serialVersionUID = -6015684081240882695L;
-
-    private static final Set<MediaType> SUPPORTED_TYPES =
-        Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
-                MediaType.audio("basic"),
-                MediaType.audio("x-wav"),
-                MediaType.audio("x-aiff"))));
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-    public void parse(
-            InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-        // AudioSystem expects the stream to support the mark feature
-        if (!stream.markSupported()) {
-            stream = new BufferedInputStream(stream);
-        }
-        try {
-            AudioFileFormat fileFormat = AudioSystem.getAudioFileFormat(stream);
-            Type type = fileFormat.getType();
-            if (type == Type.AIFC || type == Type.AIFF) {
-                metadata.set(Metadata.CONTENT_TYPE, "audio/x-aiff");
-            } else if (type == Type.AU || type == Type.SND) {
-                metadata.set(Metadata.CONTENT_TYPE, "audio/basic");
-            } else if (type == Type.WAVE) {
-                metadata.set(Metadata.CONTENT_TYPE, "audio/x-wav");
-            }
-
-            AudioFormat audioFormat = fileFormat.getFormat();
-            int channels = audioFormat.getChannels();
-            if (channels != AudioSystem.NOT_SPECIFIED) {
-                metadata.set("channels", String.valueOf(channels));
-                // TODO: Use XMPDM.TRACKS? (see also frame rate in AudioFormat)
-            }
-            float rate = audioFormat.getSampleRate();
-            if (rate != AudioSystem.NOT_SPECIFIED) {
-                metadata.set("samplerate", String.valueOf(rate));
-                metadata.set(
-                        XMPDM.AUDIO_SAMPLE_RATE,
-                        Integer.toString((int) rate));
-            }
-            int bits = audioFormat.getSampleSizeInBits();
-            if (bits != AudioSystem.NOT_SPECIFIED) {
-                metadata.set("bits", String.valueOf(bits));
-                if (bits == 8) {
-                    metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, "8Int");
-                } else if (bits == 16) {
-                    metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, "16Int");
-                } else if (bits == 32) {
-                    metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, "32Int");
-                }
-            }
-            metadata.set("encoding", audioFormat.getEncoding().toString());
-
-            // Javadoc suggests that some of the following properties might
-            // be available, but I had no success in finding any:
-
-            // "duration" Long playback duration of the file in microseconds
-            // "author" String name of the author of this file
-            // "title" String title of this file
-            // "copyright" String copyright message
-            // "date" Date date of the recording or release
-            // "comment" String an arbitrary text
-
-            addMetadata(metadata, fileFormat.properties());
-            addMetadata(metadata, audioFormat.properties());
-        } catch (UnsupportedAudioFileException e) {
-            // There is no way to know whether this exception was
-            // caused by the document being corrupted or by the format
-            // just being unsupported. So we do nothing.
-        }
-
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-        xhtml.startDocument();
-        xhtml.endDocument();
-    }
-
-    private void addMetadata(Metadata metadata, Map<String, Object> properties) {
-        if (properties != null) {
-            for (Entry<String, Object> entry : properties.entrySet()) {
-                Object value = entry.getValue();
-                if (value != null) {
-                    metadata.set(entry.getKey(), value.toString());
-                }
-            }
-        }
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.audio;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import javax.sound.sampled.AudioFileFormat;
+import javax.sound.sampled.AudioFileFormat.Type;
+import javax.sound.sampled.AudioFormat;
+import javax.sound.sampled.AudioSystem;
+import javax.sound.sampled.UnsupportedAudioFileException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.XMPDM;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class AudioParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = -6015684081240882695L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+        Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                MediaType.audio("basic"),
+                MediaType.audio("x-wav"),
+                MediaType.audio("x-aiff"))));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // AudioSystem expects the stream to support the mark feature
+        if (!stream.markSupported()) {
+            stream = new BufferedInputStream(stream);
+        }
+        try {
+            AudioFileFormat fileFormat = AudioSystem.getAudioFileFormat(stream);
+            Type type = fileFormat.getType();
+            if (type == Type.AIFC || type == Type.AIFF) {
+                metadata.set(Metadata.CONTENT_TYPE, "audio/x-aiff");
+            } else if (type == Type.AU || type == Type.SND) {
+                metadata.set(Metadata.CONTENT_TYPE, "audio/basic");
+            } else if (type == Type.WAVE) {
+                metadata.set(Metadata.CONTENT_TYPE, "audio/x-wav");
+            }
+
+            AudioFormat audioFormat = fileFormat.getFormat();
+            int channels = audioFormat.getChannels();
+            if (channels != AudioSystem.NOT_SPECIFIED) {
+                metadata.set("channels", String.valueOf(channels));
+                // TODO: Use XMPDM.TRACKS? (see also frame rate in AudioFormat)
+            }
+            float rate = audioFormat.getSampleRate();
+            if (rate != AudioSystem.NOT_SPECIFIED) {
+                metadata.set("samplerate", String.valueOf(rate));
+                metadata.set(
+                        XMPDM.AUDIO_SAMPLE_RATE,
+                        Integer.toString((int) rate));
+            }
+            int bits = audioFormat.getSampleSizeInBits();
+            if (bits != AudioSystem.NOT_SPECIFIED) {
+                metadata.set("bits", String.valueOf(bits));
+                if (bits == 8) {
+                    metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, "8Int");
+                } else if (bits == 16) {
+                    metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, "16Int");
+                } else if (bits == 32) {
+                    metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, "32Int");
+                }
+            }
+            metadata.set("encoding", audioFormat.getEncoding().toString());
+
+            // Javadoc suggests that some of the following properties might
+            // be available, but I had no success in finding any:
+
+            // "duration" Long playback duration of the file in microseconds
+            // "author" String name of the author of this file
+            // "title" String title of this file
+            // "copyright" String copyright message
+            // "date" Date date of the recording or release
+            // "comment" String an arbitrary text
+
+            addMetadata(metadata, fileFormat.properties());
+            addMetadata(metadata, audioFormat.properties());
+        } catch (UnsupportedAudioFileException e) {
+            // There is no way to know whether this exception was
+            // caused by the document being corrupted or by the format
+            // just being unsupported. So we do nothing.
+        }
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.endDocument();
+    }
+
+    private void addMetadata(Metadata metadata, Map<String, Object> properties) {
+        if (properties != null) {
+            for (Entry<String, Object> entry : properties.entrySet()) {
+                Object value = entry.getValue();
+                if (value != null) {
+                    metadata.set(entry.getKey(), value.toString());
+                }
+            }
+        }
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/audio/MidiParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/audio/MidiParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/audio/MidiParser.java
index c777287..656d1aa 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/audio/MidiParser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/audio/MidiParser.java
@@ -1,121 +1,121 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.audio;
-
-import java.io.BufferedInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Set;
-
-import javax.sound.midi.InvalidMidiDataException;
-import javax.sound.midi.MetaMessage;
-import javax.sound.midi.MidiMessage;
-import javax.sound.midi.MidiSystem;
-import javax.sound.midi.Patch;
-import javax.sound.midi.Sequence;
-import javax.sound.midi.Track;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import static java.nio.charset.StandardCharsets.ISO_8859_1;
-
-public class MidiParser extends AbstractParser {
-
-    /** Serial version UID */
-    private static final long serialVersionUID = 6343278584336189432L;
-
-    private static final Set<MediaType> SUPPORTED_TYPES =
-        Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
-                MediaType.application("x-midi"),
-                MediaType.audio("midi"))));
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-    public void parse(
-            InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-        metadata.set(Metadata.CONTENT_TYPE, "audio/midi");
-
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-        xhtml.startDocument();
-
-        // MidiSystem expects the stream to support the mark feature
-        InputStream buffered = new BufferedInputStream(stream);
-        try {
-            Sequence sequence = MidiSystem.getSequence(buffered);
-
-            Track[] tracks = sequence.getTracks();
-            metadata.set("tracks", String.valueOf(tracks.length));
-            // TODO: Use XMPDM.TRACKS?
-
-            Patch[] patches = sequence.getPatchList();
-            metadata.set("patches", String.valueOf(patches.length));
-
-            float type = sequence.getDivisionType();
-            if (type == Sequence.PPQ) {
-                metadata.set("divisionType", "PPQ");
-            } else if (type == Sequence.SMPTE_24) {
-                metadata.set("divisionType", "SMPTE_24");
-            } else if (type == Sequence.SMPTE_25) {
-                metadata.set("divisionType", "SMPTE_25");
-            } else if (type == Sequence.SMPTE_30) {
-                metadata.set("divisionType", "SMPTE_30");
-            } else if (type == Sequence.SMPTE_30DROP) {
-                metadata.set("divisionType", "SMPTE_30DROP");
-            } else if (type == Sequence.SMPTE_24) {
-                metadata.set("divisionType", String.valueOf(type));
-            }
-
-            for (Track track : tracks) {
-                xhtml.startElement("p");
-                for (int i = 0; i < track.size(); i++) {
-                    MidiMessage message = track.get(i).getMessage();
-                    if (message instanceof MetaMessage) {
-                        MetaMessage meta = (MetaMessage) message;
-                        // Types 1-15 are reserved for text events
-                        if (meta.getType() >= 1 && meta.getType() <= 15) {
-                            // FIXME: What's the encoding?
-                            xhtml.characters(
-                                    new String(meta.getData(), ISO_8859_1));
-                        }
-                    }
-                }
-                xhtml.endElement("p");
-            }
-        } catch (InvalidMidiDataException ignore) {
-            // There is no way to know whether this exception was
-            // caused by the document being corrupted or by the format
-            // just being unsupported. So we do nothing.
-        }
-
-        xhtml.endDocument();
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.audio;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import javax.sound.midi.InvalidMidiDataException;
+import javax.sound.midi.MetaMessage;
+import javax.sound.midi.MidiMessage;
+import javax.sound.midi.MidiSystem;
+import javax.sound.midi.Patch;
+import javax.sound.midi.Sequence;
+import javax.sound.midi.Track;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+
+public class MidiParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 6343278584336189432L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+        Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                MediaType.application("x-midi"),
+                MediaType.audio("midi"))));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        metadata.set(Metadata.CONTENT_TYPE, "audio/midi");
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        // MidiSystem expects the stream to support the mark feature
+        InputStream buffered = new BufferedInputStream(stream);
+        try {
+            Sequence sequence = MidiSystem.getSequence(buffered);
+
+            Track[] tracks = sequence.getTracks();
+            metadata.set("tracks", String.valueOf(tracks.length));
+            // TODO: Use XMPDM.TRACKS?
+
+            Patch[] patches = sequence.getPatchList();
+            metadata.set("patches", String.valueOf(patches.length));
+
+            float type = sequence.getDivisionType();
+            if (type == Sequence.PPQ) {
+                metadata.set("divisionType", "PPQ");
+            } else if (type == Sequence.SMPTE_24) {
+                metadata.set("divisionType", "SMPTE_24");
+            } else if (type == Sequence.SMPTE_25) {
+                metadata.set("divisionType", "SMPTE_25");
+            } else if (type == Sequence.SMPTE_30) {
+                metadata.set("divisionType", "SMPTE_30");
+            } else if (type == Sequence.SMPTE_30DROP) {
+                metadata.set("divisionType", "SMPTE_30DROP");
+            } else if (type == Sequence.SMPTE_24) {
+                metadata.set("divisionType", String.valueOf(type));
+            }
+
+            for (Track track : tracks) {
+                xhtml.startElement("p");
+                for (int i = 0; i < track.size(); i++) {
+                    MidiMessage message = track.get(i).getMessage();
+                    if (message instanceof MetaMessage) {
+                        MetaMessage meta = (MetaMessage) message;
+                        // Types 1-15 are reserved for text events
+                        if (meta.getType() >= 1 && meta.getType() <= 15) {
+                            // FIXME: What's the encoding?
+                            xhtml.characters(
+                                    new String(meta.getData(), ISO_8859_1));
+                        }
+                    }
+                }
+                xhtml.endElement("p");
+            }
+        } catch (InvalidMidiDataException ignore) {
+            // There is no way to know whether this exception was
+            // caused by the document being corrupted or by the format
+            // just being unsupported. So we do nothing.
+        }
+
+        xhtml.endDocument();
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java
index d8a6539..c207e0b 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java
@@ -1,111 +1,111 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.font;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.fontbox.ttf.NameRecord;
-import org.apache.fontbox.ttf.NamingTable;
-import org.apache.fontbox.ttf.TTFParser;
-import org.apache.fontbox.ttf.TrueTypeFont;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Parser for TrueType font files (TTF).
- */
-public class TrueTypeParser extends AbstractParser {
-
-    /** Serial version UID */
-    private static final long serialVersionUID = 44788554612243032L;
-
-    private static final MediaType TYPE =
-        MediaType.application("x-font-ttf");
-
-    private static final Set<MediaType> SUPPORTED_TYPES =
-        Collections.singleton(TYPE);
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-    public void parse(
-            InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-        TikaInputStream tis = TikaInputStream.cast(stream);
-        
-        // Ask FontBox to parse the file for us
-        TrueTypeFont font;
-        TTFParser parser = new TTFParser();
-        if (tis != null && tis.hasFile()) {
-            font = parser.parse(tis.getFile());
-        } else {
-            font = parser.parse(stream);
-        }
-
-        // Report the details of the font
-        metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
-        metadata.set(TikaCoreProperties.CREATED, 
-                font.getHeader().getCreated());
-        metadata.set(TikaCoreProperties.MODIFIED,
-                font.getHeader().getModified());
-        metadata.set(AdobeFontMetricParser.MET_DOC_VERSION,
-                Float.toString(font.getHeader().getVersion()));
-        
-        // Pull out the naming info
-        NamingTable fontNaming = font.getNaming();
-        for (NameRecord nr : fontNaming.getNameRecords()) {
-            if (nr.getNameId() == NameRecord.NAME_FONT_FAMILY_NAME) {
-                metadata.set(AdobeFontMetricParser.MET_FONT_FAMILY_NAME, nr.getString());
-            }
-            if (nr.getNameId() == NameRecord.NAME_FONT_SUB_FAMILY_NAME) {
-                metadata.set(AdobeFontMetricParser.MET_FONT_SUB_FAMILY_NAME, nr.getString());
-            }
-            if (nr.getNameId() == NameRecord.NAME_FULL_FONT_NAME) {
-                metadata.set(AdobeFontMetricParser.MET_FONT_NAME, nr.getString());
-                metadata.set(TikaCoreProperties.TITLE, nr.getString());
-            }
-            if (nr.getNameId() == NameRecord.NAME_POSTSCRIPT_NAME) {
-                metadata.set(AdobeFontMetricParser.MET_PS_NAME, nr.getString());
-            }
-            if (nr.getNameId() == NameRecord.NAME_COPYRIGHT) {
-                metadata.set("Copyright", nr.getString());
-            }
-            if (nr.getNameId() == NameRecord.NAME_TRADEMARK) {
-                metadata.set("Trademark", nr.getString());
-            }
-        }
-        
-        // For now, we only output metadata, no textual contents
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-        xhtml.startDocument();
-        xhtml.endDocument();
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.font;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.fontbox.ttf.NameRecord;
+import org.apache.fontbox.ttf.NamingTable;
+import org.apache.fontbox.ttf.TTFParser;
+import org.apache.fontbox.ttf.TrueTypeFont;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for TrueType font files (TTF).
+ */
+public class TrueTypeParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 44788554612243032L;
+
+    private static final MediaType TYPE =
+        MediaType.application("x-font-ttf");
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+        Collections.singleton(TYPE);
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        TikaInputStream tis = TikaInputStream.cast(stream);
+        
+        // Ask FontBox to parse the file for us
+        TrueTypeFont font;
+        TTFParser parser = new TTFParser();
+        if (tis != null && tis.hasFile()) {
+            font = parser.parse(tis.getFile());
+        } else {
+            font = parser.parse(stream);
+        }
+
+        // Report the details of the font
+        metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+        metadata.set(TikaCoreProperties.CREATED, 
+                font.getHeader().getCreated());
+        metadata.set(TikaCoreProperties.MODIFIED,
+                font.getHeader().getModified());
+        metadata.set(AdobeFontMetricParser.MET_DOC_VERSION,
+                Float.toString(font.getHeader().getVersion()));
+        
+        // Pull out the naming info
+        NamingTable fontNaming = font.getNaming();
+        for (NameRecord nr : fontNaming.getNameRecords()) {
+            if (nr.getNameId() == NameRecord.NAME_FONT_FAMILY_NAME) {
+                metadata.set(AdobeFontMetricParser.MET_FONT_FAMILY_NAME, nr.getString());
+            }
+            if (nr.getNameId() == NameRecord.NAME_FONT_SUB_FAMILY_NAME) {
+                metadata.set(AdobeFontMetricParser.MET_FONT_SUB_FAMILY_NAME, nr.getString());
+            }
+            if (nr.getNameId() == NameRecord.NAME_FULL_FONT_NAME) {
+                metadata.set(AdobeFontMetricParser.MET_FONT_NAME, nr.getString());
+                metadata.set(TikaCoreProperties.TITLE, nr.getString());
+            }
+            if (nr.getNameId() == NameRecord.NAME_POSTSCRIPT_NAME) {
+                metadata.set(AdobeFontMetricParser.MET_PS_NAME, nr.getString());
+            }
+            if (nr.getNameId() == NameRecord.NAME_COPYRIGHT) {
+                metadata.set("Copyright", nr.getString());
+            }
+            if (nr.getNameId() == NameRecord.NAME_TRADEMARK) {
+                metadata.set("Trademark", nr.getString());
+            }
+        }
+        
+        // For now, we only output metadata, no textual contents
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.endDocument();
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
index 39044d3..bf29d0b 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
@@ -1,562 +1,562 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.image;
-
-import java.io.ByteArrayInputStream;
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.text.DecimalFormat;
-import java.text.DecimalFormatSymbols;
-import java.text.SimpleDateFormat;
-import java.util.Date;
-import java.util.Iterator;
-import java.util.Locale;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import com.drew.imaging.jpeg.JpegMetadataReader;
-import com.drew.imaging.jpeg.JpegProcessingException;
-import com.drew.imaging.riff.RiffProcessingException;
-import com.drew.imaging.tiff.TiffMetadataReader;
-import com.drew.imaging.tiff.TiffProcessingException;
-import com.drew.imaging.webp.WebpMetadataReader;
-import com.drew.lang.ByteArrayReader;
-import com.drew.lang.GeoLocation;
-import com.drew.lang.Rational;
-import com.drew.metadata.Directory;
-import com.drew.metadata.MetadataException;
-import com.drew.metadata.Tag;
-import com.drew.metadata.exif.ExifIFD0Directory;
-import com.drew.metadata.exif.ExifReader;
-import com.drew.metadata.exif.ExifSubIFDDirectory;
-import com.drew.metadata.exif.ExifThumbnailDirectory;
-import com.drew.metadata.exif.GpsDirectory;
-import com.drew.metadata.iptc.IptcDirectory;
-import com.drew.metadata.jpeg.JpegCommentDirectory;
-import com.drew.metadata.jpeg.JpegDirectory;
-import org.apache.commons.io.IOUtils;
-import org.apache.jempbox.xmp.XMPMetadata;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.IPTC;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.xmp.JempboxExtractor;
-import org.w3c.dom.Document;
-import org.xml.sax.SAXException;
-
-/**
- * Uses the <a href="http://www.drewnoakes.com/code/exif/">Metadata Extractor</a> library
- * to read EXIF and IPTC image metadata and map to Tika fields.
- * <p/>
- * As of 2.4.0 the library supports jpeg and tiff.
- * As of 2.8.0 the library supports webp.
- */
-public class ImageMetadataExtractor {
-
-    private static final String GEO_DECIMAL_FORMAT_STRING = "#.######"; // 6 dp seems to be reasonable
-    private final Metadata metadata;
-    private DirectoryHandler[] handlers;
-
-    /**
-     * @param metadata to extract to, using default directory handlers
-     */
-    public ImageMetadataExtractor(Metadata metadata) {
-        this(metadata,
-                new CopyUnknownFieldsHandler(),
-                new JpegCommentHandler(),
-                new ExifHandler(),
-                new DimensionsHandler(),
-                new GeotagHandler(),
-                new IptcHandler()
-        );
-    }
-
-    /**
-     * @param metadata to extract to
-     * @param handlers handlers in order, note that handlers may override values from earlier handlers
-     */
-    public ImageMetadataExtractor(Metadata metadata, DirectoryHandler... handlers) {
-        this.metadata = metadata;
-        this.handlers = handlers;
-    }
-
-    private static String trimPixels(String s) {
-        //if height/width appears as "100 pixels", trim " pixels"
-        if (s != null) {
-            int i = s.lastIndexOf(" pixels");
-            s = s.substring(0, i);
-        }
-        return s;
-    }
-
-    public void parseJpeg(File file)
-            throws IOException, SAXException, TikaException {
-        try {
-            com.drew.metadata.Metadata jpegMetadata = JpegMetadataReader.readMetadata(file);
-            handle(jpegMetadata);
-        } catch (JpegProcessingException e) {
-            throw new TikaException("Can't read JPEG metadata", e);
-        } catch (MetadataException e) {
-            throw new TikaException("Can't read JPEG metadata", e);
-        }
-    }
-
-    public void parseTiff(File file)
-            throws IOException, SAXException, TikaException {
-        try {
-            com.drew.metadata.Metadata tiffMetadata = TiffMetadataReader.readMetadata(file);
-            handle(tiffMetadata);
-        } catch (MetadataException e) {
-            throw new TikaException("Can't read TIFF metadata", e);
-        } catch (TiffProcessingException e) {
-            throw new TikaException("Can't read TIFF metadata", e);
-        }
-    }
-
-    public void parseWebP(File file) throws IOException, TikaException {
-
-        try {
-            com.drew.metadata.Metadata webPMetadata = new com.drew.metadata.Metadata();
-            webPMetadata = WebpMetadataReader.readMetadata(file);
-            handle(webPMetadata);
-        } catch (IOException e) {
-            throw e;
-        } catch (RiffProcessingException e) {
-            throw new TikaException("Can't process Riff data", e);
-        } catch (MetadataException e) {
-            throw new TikaException("Can't process Riff data", e);
-        }
-    }
-
-    public void parseRawExif(InputStream stream, int length, boolean needsExifHeader)
-            throws IOException, SAXException, TikaException {
-        byte[] exif;
-        if (needsExifHeader) {
-            exif = new byte[length + 6];
-            exif[0] = (byte) 'E';
-            exif[1] = (byte) 'x';
-            exif[2] = (byte) 'i';
-            exif[3] = (byte) 'f';
-            IOUtils.readFully(stream, exif, 6, length);
-        } else {
-            exif = new byte[length];
-            IOUtils.readFully(stream, exif, 0, length);
-        }
-        parseRawExif(exif);
-    }
-
-    public void parseRawExif(byte[] exifData)
-            throws IOException, SAXException, TikaException {
-        com.drew.metadata.Metadata metadata = new com.drew.metadata.Metadata();
-        ExifReader reader = new ExifReader();
-        reader.extract(new ByteArrayReader(exifData), metadata, ExifReader.JPEG_SEGMENT_PREAMBLE.length());
-
-        try {
-            handle(metadata);
-        } catch (MetadataException e) {
-            throw new TikaException("Can't process the EXIF Data", e);
-        }
-    }
-
-    public void parseRawXMP(byte[] xmpData)
-            throws IOException, SAXException, TikaException {
-        XMPMetadata xmp = null;
-        try (InputStream decoded =
-                     new ByteArrayInputStream(xmpData)
-        ) {
-            Document dom = new ParseContext().getDocumentBuilder().parse(decoded);
-            if (dom != null) {
-                xmp = new XMPMetadata(dom);
-            }
-        } catch (IOException|SAXException e) {
-            //
-        }
-        if (xmp != null) {
-            JempboxExtractor.extractDublinCore(xmp, metadata);
-            JempboxExtractor.extractXMPMM(xmp, metadata);
-        }
-    }
-
-    /**
-     * Copies extracted tags to tika metadata using registered handlers.
-     *
-     * @param metadataExtractor Tag directories from a Metadata Extractor "reader"
-     * @throws MetadataException This method does not handle exceptions from Metadata Extractor
-     */
-    protected void handle(com.drew.metadata.Metadata metadataExtractor)
-            throws MetadataException {
-        handle(metadataExtractor.getDirectories().iterator());
-    }
-
-    /**
-     * Copies extracted tags to tika metadata using registered handlers.
-     *
-     * @param directories Metadata Extractor {@link com.drew.metadata.Directory} instances.
-     * @throws MetadataException This method does not handle exceptions from Metadata Extractor
-     */
-    protected void handle(Iterator<Directory> directories) throws MetadataException {
-        while (directories.hasNext()) {
-            Directory directory = directories.next();
-            for (DirectoryHandler handler : handlers) {
-                if (handler.supports(directory.getClass())) {
-                    handler.handle(directory, metadata);
-                }
-            }
-        }
-    }
-
-    /**
-     * Reads one or more type of Metadata Extractor fields.
-     */
-    static interface DirectoryHandler {
-        /**
-         * @param directoryType A Metadata Extractor directory class
-         * @return true if the directory type is supported by this handler
-         */
-        boolean supports(Class<? extends Directory> directoryType);
-
-        /**
-         * @param directory extracted tags
-         * @param metadata  current tika metadata
-         * @throws MetadataException typically field extraction error, aborts all further extraction
-         */
-        void handle(Directory directory, Metadata metadata)
-                throws MetadataException;
-    }
-
-    /**
-     * Mimics the behavior from TIKA-314 of copying all extracted tags
-     * to tika metadata using field names from Metadata Extractor.
-     */
-    static class CopyAllFieldsHandler implements DirectoryHandler {
-        public boolean supports(Class<? extends Directory> directoryType) {
-            return true;
-        }
-
-        public void handle(Directory directory, Metadata metadata)
-                throws MetadataException {
-            if (directory.getTags() != null) {
-                for (Tag tag : directory.getTags()) {
-                    metadata.set(tag.getTagName(), tag.getDescription());
-                }
-            }
-        }
-    }
-
-    /**
-     * Copies all fields regardless of directory, if the tag name
-     * is not identical to a known Metadata field name.
-     * This leads to more predictable behavior than {@link CopyAllFieldsHandler}.
-     */
-    static class CopyUnknownFieldsHandler implements DirectoryHandler {
-        public boolean supports(Class<? extends Directory> directoryType) {
-            return true;
-        }
-
-        public void handle(Directory directory, Metadata metadata)
-                throws MetadataException {
-            if (directory.getTags() != null) {
-                for (Tag tag : directory.getTags()) {
-                    String name = tag.getTagName();
-                    if (!MetadataFields.isMetadataField(name) && tag.getDescription() != null) {
-                        String value = tag.getDescription().trim();
-                        if (Boolean.TRUE.toString().equalsIgnoreCase(value)) {
-                            value = Boolean.TRUE.toString();
-                        } else if (Boolean.FALSE.toString().equalsIgnoreCase(value)) {
-                            value = Boolean.FALSE.toString();
-                        }
-                        metadata.set(name, value);
-                    }
-                }
-            }
-        }
-    }
-
-    /**
-     * Basic image properties for TIFF and JPEG, at least.
-     */
-    static class DimensionsHandler implements DirectoryHandler {
-        private final Pattern LEADING_NUMBERS = Pattern.compile("(\\d+)\\s*.*");
-
-        public boolean supports(Class<? extends Directory> directoryType) {
-            return directoryType == JpegDirectory.class ||
-                    directoryType == ExifSubIFDDirectory.class ||
-                    directoryType == ExifThumbnailDirectory.class ||
-                    directoryType == ExifIFD0Directory.class;
-        }
-
-        public void handle(Directory directory, Metadata metadata) throws MetadataException {
-            // The test TIFF has width and height stored as follows according to exiv2
-            //Exif.Image.ImageWidth                        Short       1  100
-            //Exif.Image.ImageLength                       Short       1  75
-            // and the values are found in "Thumbnail Image Width" (and Height) from Metadata Extractor
-            set(directory, metadata, JpegDirectory.TAG_IMAGE_WIDTH, Metadata.IMAGE_WIDTH);
-            set(directory, metadata, JpegDirectory.TAG_IMAGE_HEIGHT, Metadata.IMAGE_LENGTH);
-            // Bits per sample, two methods of extracting, exif overrides jpeg
-            set(directory, metadata, JpegDirectory.TAG_DATA_PRECISION, Metadata.BITS_PER_SAMPLE);
-            set(directory, metadata, ExifSubIFDDirectory.TAG_BITS_PER_SAMPLE, Metadata.BITS_PER_SAMPLE);
-            // Straightforward
-            set(directory, metadata, ExifSubIFDDirectory.TAG_SAMPLES_PER_PIXEL, Metadata.SAMPLES_PER_PIXEL);
-        }
-
-        private void set(Directory directory, Metadata metadata, int extractTag, Property metadataField) {
-            if (directory.containsTag(extractTag)) {
-                Matcher m = LEADING_NUMBERS.matcher(directory.getString(extractTag));
-                if (m.matches()) {
-                    metadata.set(metadataField, m.group(1));
-                }
-            }
-        }
-    }
-
-    static class JpegCommentHandler implements DirectoryHandler {
-        public boolean supports(Class<? extends Directory> directoryType) {
-            return directoryType == JpegCommentDirectory.class;
-        }
-
-        public void handle(Directory directory, Metadata metadata) throws MetadataException {
-            if (directory.containsTag(JpegCommentDirectory.TAG_COMMENT)) {
-                metadata.add(TikaCoreProperties.COMMENTS, directory.getString(JpegCommentDirectory.TAG_COMMENT));
-            }
-        }
-    }
-
-    static class ExifHandler implements DirectoryHandler {
-        // There's a new ExifHandler for each file processed, so this is thread safe
-        private static final ThreadLocal<SimpleDateFormat> DATE_UNSPECIFIED_TZ = new ThreadLocal<SimpleDateFormat>() {
-            @Override
-            protected SimpleDateFormat initialValue() {
-                return new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.US);
-            }
-        };
-
-        public boolean supports(Class<? extends Directory> directoryType) {
-            return directoryType == ExifIFD0Directory.class ||
-                    directoryType == ExifSubIFDDirectory.class;
-        }
-
-        public void handle(Directory directory, Metadata metadata) {
-            try {
-                handleDateTags(directory, metadata);
-                handlePhotoTags(directory, metadata);
-                handleCommentTags(directory, metadata);
-            } catch (MetadataException e) {
-                // ignore date parse errors and proceed with other tags
-            }
-        }
-
-        /**
-         * EXIF may contain image description, although with undefined encoding.
-         * Use IPTC for other annotation fields, and XMP for unicode support.
-         */
-        public void handleCommentTags(Directory directory, Metadata metadata) {
-            if (metadata.get(TikaCoreProperties.DESCRIPTION) == null &&
-                    directory.containsTag(ExifIFD0Directory.TAG_IMAGE_DESCRIPTION)) {
-                metadata.set(TikaCoreProperties.DESCRIPTION,
-                        directory.getString(ExifIFD0Directory.TAG_IMAGE_DESCRIPTION));
-            }
-        }
-
-        /**
-         * Maps common TIFF and EXIF tags onto the Tika
-         * TIFF image metadata namespace.
-         */
-        public void handlePhotoTags(Directory directory, Metadata metadata) {
-            if (directory.containsTag(ExifSubIFDDirectory.TAG_EXPOSURE_TIME)) {
-                Object exposure = directory.getObject(ExifSubIFDDirectory.TAG_EXPOSURE_TIME);
-                if (exposure instanceof Rational) {
-                    metadata.set(Metadata.EXPOSURE_TIME, ((Rational) exposure).doubleValue());
-                } else {
-                    metadata.set(Metadata.EXPOSURE_TIME, directory.getString(ExifSubIFDDirectory.TAG_EXPOSURE_TIME));
-                }
-            }
-
-            if (directory.containsTag(ExifSubIFDDirectory.TAG_FLASH)) {
-                String flash = directory.getDescription(ExifSubIFDDirectory.TAG_FLASH);
-                if (flash != null) {
-                    if (flash.contains("Flash fired")) {
-                        metadata.set(Metadata.FLASH_FIRED, Boolean.TRUE.toString());
-                    } else if (flash.contains("Flash did not fire")) {
-                        metadata.set(Metadata.FLASH_FIRED, Boolean.FALSE.toString());
-                    } else {
-                        metadata.set(Metadata.FLASH_FIRED, flash);
-                    }
-                }
-            }
-
-            if (directory.containsTag(ExifSubIFDDirectory.TAG_FNUMBER)) {
-                Object fnumber = directory.getObject(ExifSubIFDDirectory.TAG_FNUMBER);
-                if (fnumber instanceof Rational) {
-                    metadata.set(Metadata.F_NUMBER, ((Rational) fnumber).doubleValue());
-                } else {
-                    metadata.set(Metadata.F_NUMBER, directory.getString(ExifSubIFDDirectory.TAG_FNUMBER));
-                }
-            }
-
-            if (directory.containsTag(ExifSubIFDDirectory.TAG_FOCAL_LENGTH)) {
-                Object length = directory.getObject(ExifSubIFDDirectory.TAG_FOCAL_LENGTH);
-                if (length instanceof Rational) {
-                    metadata.set(Metadata.FOCAL_LENGTH, ((Rational) length).doubleValue());
-                } else {
-                    metadata.set(Metadata.FOCAL_LENGTH, directory.getString(ExifSubIFDDirectory.TAG_FOCAL_LENGTH));
-                }
-            }
-
-            if (directory.containsTag(ExifSubIFDDirectory.TAG_ISO_EQUIVALENT)) {
-                metadata.set(Metadata.ISO_SPEED_RATINGS, directory.getString(ExifSubIFDDirectory.TAG_ISO_EQUIVALENT));
-            }
-
-            if (directory.containsTag(ExifIFD0Directory.TAG_MAKE)) {
-                metadata.set(Metadata.EQUIPMENT_MAKE, directory.getString(ExifIFD0Directory.TAG_MAKE));
-            }
-            if (directory.containsTag(ExifIFD0Directory.TAG_MODEL)) {
-                metadata.set(Metadata.EQUIPMENT_MODEL, directory.getString(ExifIFD0Directory.TAG_MODEL));
-            }
-
-            if (directory.containsTag(ExifIFD0Directory.TAG_ORIENTATION)) {
-                Object length = directory.getObject(ExifIFD0Directory.TAG_ORIENTATION);
-                if (length instanceof Integer) {
-                    metadata.set(Metadata.ORIENTATION, Integer.toString((Integer) length));
-                } else {
-                    metadata.set(Metadata.ORIENTATION, directory.getString(ExifIFD0Directory.TAG_ORIENTATION));
-                }
-            }
-
-            if (directory.containsTag(ExifIFD0Directory.TAG_SOFTWARE)) {
-                metadata.set(Metadata.SOFTWARE, directory.getString(ExifIFD0Directory.TAG_SOFTWARE));
-            }
-
-            if (directory.containsTag(ExifIFD0Directory.TAG_X_RESOLUTION)) {
-                Object resolution = directory.getObject(ExifIFD0Directory.TAG_X_RESOLUTION);
-                if (resolution instanceof Rational) {
-                    metadata.set(Metadata.RESOLUTION_HORIZONTAL, ((Rational) resolution).doubleValue());
-                } else {
-                    metadata.set(Metadata.RESOLUTION_HORIZONTAL, directory.getString(ExifIFD0Directory.TAG_X_RESOLUTION));
-                }
-            }
-            if (directory.containsTag(ExifIFD0Directory.TAG_Y_RESOLUTION)) {
-                Object resolution = directory.getObject(ExifIFD0Directory.TAG_Y_RESOLUTION);
-                if (resolution instanceof Rational) {
-                    metadata.set(Metadata.RESOLUTION_VERTICAL, ((Rational) resolution).doubleValue());
-                } else {
-                    metadata.set(Metadata.RESOLUTION_VERTICAL, directory.getString(ExifIFD0Directory.TAG_Y_RESOLUTION));
-                }
-            }
-            if (directory.containsTag(ExifIFD0Directory.TAG_RESOLUTION_UNIT)) {
-                metadata.set(Metadata.RESOLUTION_UNIT, directory.getDescription(ExifIFD0Directory.TAG_RESOLUTION_UNIT));
-            }
-            if (directory.containsTag(ExifThumbnailDirectory.TAG_IMAGE_WIDTH)) {
-                metadata.set(Metadata.IMAGE_WIDTH,
-                        trimPixels(directory.getDescription(ExifThumbnailDirectory.TAG_IMAGE_WIDTH)));
-            }
-            if (directory.containsTag(ExifThumbnailDirectory.TAG_IMAGE_HEIGHT)) {
-                metadata.set(Metadata.IMAGE_LENGTH,
-                        trimPixels(directory.getDescription(ExifThumbnailDirectory.TAG_IMAGE_HEIGHT)));
-            }
-        }
-
-        /**
-         * Maps exif dates to metadata fields.
-         */
-        public void handleDateTags(Directory directory, Metadata metadata)
-                throws MetadataException {
-            //TODO: should we try to process ExifSubIFDDirectory.TAG_TIME_ZONE_OFFSET
-            //if it exists?
-            // Date/Time Original overrides value from ExifDirectory.TAG_DATETIME
-            Date original = null;
-            if (directory.containsTag(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)) {
-                original = directory.getDate(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL);
-                // Unless we have GPS time we don't know the time zone so date must be set
-                // as ISO 8601 datetime without timezone suffix (no Z or +/-)
-                if (original != null) {
-                    String datetimeNoTimeZone = DATE_UNSPECIFIED_TZ.get().format(original); // Same time zone as Metadata Extractor uses
-                    metadata.set(TikaCoreProperties.CREATED, datetimeNoTimeZone);
-                    metadata.set(Metadata.ORIGINAL_DATE, datetimeNoTimeZone);
-                }
-            }
-            if (directory.containsTag(ExifIFD0Directory.TAG_DATETIME)) {
-                Date datetime = directory.getDate(ExifIFD0Directory.TAG_DATETIME);
-                if (datetime != null) {
-                    String datetimeNoTimeZone = DATE_UNSPECIFIED_TZ.get().format(datetime);
-                    metadata.set(TikaCoreProperties.MODIFIED, datetimeNoTimeZone);
-                    // If Date/Time Original does not exist this might be creation date
-                    if (metadata.get(TikaCoreProperties.CREATED) == null) {
-                        metadata.set(TikaCoreProperties.CREATED, datetimeNoTimeZone);
-                    }
-                }
-            }
-        }
-    }
-
-    /**
-     * Reads image comments, originally TIKA-472.
-     * Metadata Extractor does not read XMP so we need to use the values from Iptc or EXIF
-     */
-    static class IptcHandler implements DirectoryHandler {
-        public boolean supports(Class<? extends Directory> directoryType) {
-            return directoryType == IptcDirectory.class;
-        }
-
-        public void handle(Directory directory, Metadata metadata)
-                throws MetadataException {
-            if (directory.containsTag(IptcDirectory.TAG_KEYWORDS)) {
-                String[] keywords = directory.getStringArray(IptcDirectory.TAG_KEYWORDS);
-                for (String k : keywords) {
-                    metadata.add(TikaCoreProperties.KEYWORDS, k);
-                }
-            }
-            if (directory.containsTag(IptcDirectory.TAG_HEADLINE)) {
-                metadata.set(TikaCoreProperties.TITLE, directory.getString(IptcDirectory.TAG_HEADLINE));
-            } else if (directory.containsTag(IptcDirectory.TAG_OBJECT_NAME)) {
-                metadata.set(TikaCoreProperties.TITLE, directory.getString(IptcDirectory.TAG_OBJECT_NAME));
-            }
-            if (directory.containsTag(IptcDirectory.TAG_BY_LINE)) {
-                metadata.set(TikaCoreProperties.CREATOR, directory.getString(IptcDirectory.TAG_BY_LINE));
-                metadata.set(IPTC.CREATOR, directory.getString(IptcDirectory.TAG_BY_LINE));
-            }
-            if (directory.containsTag(IptcDirectory.TAG_CAPTION)) {
-                metadata.set(TikaCoreProperties.DESCRIPTION,
-                        // Looks like metadata extractor returns IPTC newlines as a single carriage return,
-                        // but the exiv2 command does not so we change to line feed here because that is less surprising to users                        
-                        directory.getString(IptcDirectory.TAG_CAPTION).replaceAll("\r\n?", "\n"));
-            }
-        }
-    }
-
-    /**
-     * Maps EXIF Geo Tags onto the Tika Geo metadata namespace.
-     */
-    static class GeotagHandler implements DirectoryHandler {
-        public boolean supports(Class<? extends Directory> directoryType) {
-            return directoryType == GpsDirectory.class;
-        }
-
-        public void handle(Directory directory, Metadata metadata) throws MetadataException {
-            GeoLocation geoLocation = ((GpsDirectory) directory).getGeoLocation();
-            if (geoLocation != null) {
-                DecimalFormat geoDecimalFormat = new DecimalFormat(GEO_DECIMAL_FORMAT_STRING,
-                        new DecimalFormatSymbols(Locale.ENGLISH));
-                metadata.set(TikaCoreProperties.LATITUDE, geoDecimalFormat.format(geoLocation.getLatitude()));
-                metadata.set(TikaCoreProperties.LONGITUDE, geoDecimalFormat.format(geoLocation.getLongitude()));
-            }
-        }
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.DecimalFormat;
+import java.text.DecimalFormatSymbols;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import com.drew.imaging.jpeg.JpegMetadataReader;
+import com.drew.imaging.jpeg.JpegProcessingException;
+import com.drew.imaging.riff.RiffProcessingException;
+import com.drew.imaging.tiff.TiffMetadataReader;
+import com.drew.imaging.tiff.TiffProcessingException;
+import com.drew.imaging.webp.WebpMetadataReader;
+import com.drew.lang.ByteArrayReader;
+import com.drew.lang.GeoLocation;
+import com.drew.lang.Rational;
+import com.drew.metadata.Directory;
+import com.drew.metadata.MetadataException;
+import com.drew.metadata.Tag;
+import com.drew.metadata.exif.ExifIFD0Directory;
+import com.drew.metadata.exif.ExifReader;
+import com.drew.metadata.exif.ExifSubIFDDirectory;
+import com.drew.metadata.exif.ExifThumbnailDirectory;
+import com.drew.metadata.exif.GpsDirectory;
+import com.drew.metadata.iptc.IptcDirectory;
+import com.drew.metadata.jpeg.JpegCommentDirectory;
+import com.drew.metadata.jpeg.JpegDirectory;
+import org.apache.commons.io.IOUtils;
+import org.apache.jempbox.xmp.XMPMetadata;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.IPTC;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.xmp.JempboxExtractor;
+import org.w3c.dom.Document;
+import org.xml.sax.SAXException;
+
+/**
+ * Uses the <a href="http://www.drewnoakes.com/code/exif/">Metadata Extractor</a> library
+ * to read EXIF and IPTC image metadata and map to Tika fields.
+ * <p/>
+ * As of 2.4.0 the library supports jpeg and tiff.
+ * As of 2.8.0 the library supports webp.
+ */
+public class ImageMetadataExtractor {
+
+    private static final String GEO_DECIMAL_FORMAT_STRING = "#.######"; // 6 dp seems to be reasonable
+    private final Metadata metadata;
+    private DirectoryHandler[] handlers;
+
+    /**
+     * @param metadata to extract to, using default directory handlers
+     */
+    public ImageMetadataExtractor(Metadata metadata) {
+        this(metadata,
+                new CopyUnknownFieldsHandler(),
+                new JpegCommentHandler(),
+                new ExifHandler(),
+                new DimensionsHandler(),
+                new GeotagHandler(),
+                new IptcHandler()
+        );
+    }
+
+    /**
+     * @param metadata to extract to
+     * @param handlers handlers in order, note that handlers may override values from earlier handlers
+     */
+    public ImageMetadataExtractor(Metadata metadata, DirectoryHandler... handlers) {
+        this.metadata = metadata;
+        this.handlers = handlers;
+    }
+
+    private static String trimPixels(String s) {
+        //if height/width appears as "100 pixels", trim " pixels"
+        if (s != null) {
+            int i = s.lastIndexOf(" pixels");
+            s = s.substring(0, i);
+        }
+        return s;
+    }
+
+    public void parseJpeg(File file)
+            throws IOException, SAXException, TikaException {
+        try {
+            com.drew.metadata.Metadata jpegMetadata = JpegMetadataReader.readMetadata(file);
+            handle(jpegMetadata);
+        } catch (JpegProcessingException e) {
+            throw new TikaException("Can't read JPEG metadata", e);
+        } catch (MetadataException e) {
+            throw new TikaException("Can't read JPEG metadata", e);
+        }
+    }
+
+    public void parseTiff(File file)
+            throws IOException, SAXException, TikaException {
+        try {
+            com.drew.metadata.Metadata tiffMetadata = TiffMetadataReader.readMetadata(file);
+            handle(tiffMetadata);
+        } catch (MetadataException e) {
+            throw new TikaException("Can't read TIFF metadata", e);
+        } catch (TiffProcessingException e) {
+            throw new TikaException("Can't read TIFF metadata", e);
+        }
+    }
+
+    public void parseWebP(File file) throws IOException, TikaException {
+
+        try {
+            com.drew.metadata.Metadata webPMetadata = new com.drew.metadata.Metadata();
+            webPMetadata = WebpMetadataReader.readMetadata(file);
+            handle(webPMetadata);
+        } catch (IOException e) {
+            throw e;
+        } catch (RiffProcessingException e) {
+            throw new TikaException("Can't process Riff data", e);
+        } catch (MetadataException e) {
+            throw new TikaException("Can't process Riff data", e);
+        }
+    }
+
+    /**
+     * Reads {@code length} bytes of raw EXIF data from the stream and parses them.
+     *
+     * @param stream          source of the EXIF bytes
+     * @param length          number of bytes to read from the stream
+     * @param needsExifHeader if true, the data is prefixed with a 6 byte header:
+     *                        the characters "Exif" followed by two zero bytes
+     */
+    public void parseRawExif(InputStream stream, int length, boolean needsExifHeader)
+            throws IOException, SAXException, TikaException {
+        final int headerLength = needsExifHeader ? 6 : 0;
+        final byte[] exif = new byte[length + headerLength];
+        if (needsExifHeader) {
+            // Bytes 4 and 5 of the header keep the array's default value of zero
+            exif[0] = (byte) 'E';
+            exif[1] = (byte) 'x';
+            exif[2] = (byte) 'i';
+            exif[3] = (byte) 'f';
+        }
+        IOUtils.readFully(stream, exif, headerLength, length);
+        parseRawExif(exif);
+    }
+
+    public void parseRawExif(byte[] exifData)
+            throws IOException, SAXException, TikaException {
+        com.drew.metadata.Metadata metadata = new com.drew.metadata.Metadata();
+        ExifReader reader = new ExifReader();
+        reader.extract(new ByteArrayReader(exifData), metadata, ExifReader.JPEG_SEGMENT_PREAMBLE.length());
+
+        try {
+            handle(metadata);
+        } catch (MetadataException e) {
+            throw new TikaException("Can't process the EXIF Data", e);
+        }
+    }
+
+    /**
+     * Parses a raw XMP packet as an XML document and, when that succeeds, hands it
+     * to {@code JempboxExtractor.extractDublinCore} and
+     * {@code JempboxExtractor.extractXMPMM} to fill the Tika metadata.
+     * An {@link IOException} or {@link SAXException} raised while parsing is
+     * swallowed, in which case nothing is extracted.
+     *
+     * @param xmpData bytes of the XMP packet
+     */
+    public void parseRawXMP(byte[] xmpData)
+            throws IOException, SAXException, TikaException {
+        XMPMetadata xmp = null;
+        try (InputStream decoded =
+                     new ByteArrayInputStream(xmpData)
+        ) {
+            Document dom = new ParseContext().getDocumentBuilder().parse(decoded);
+            if (dom != null) {
+                xmp = new XMPMetadata(dom);
+            }
+        } catch (IOException|SAXException e) {
+            // Swallowed: xmp stays null, so the extraction below is skipped.
+            // NOTE(review): presumably best-effort for malformed XMP - confirm.
+        }
+        if (xmp != null) {
+            JempboxExtractor.extractDublinCore(xmp, metadata);
+            JempboxExtractor.extractXMPMM(xmp, metadata);
+        }
+    }
+
+    /**
+     * Copies extracted tags to tika metadata using registered handlers.
+     * Delegates to {@link #handle(Iterator)} with the directories of the given object.
+     *
+     * @param metadataExtractor Tag directories from a Metadata Extractor "reader"
+     * @throws MetadataException not caught here; exceptions from the handlers
+     *                           are passed on to the caller
+     */
+    protected void handle(com.drew.metadata.Metadata metadataExtractor)
+            throws MetadataException {
+        handle(metadataExtractor.getDirectories().iterator());
+    }
+
+    /**
+     * Copies extracted tags to tika metadata using registered handlers.
+     *
+     * @param directories Metadata Extractor {@link com.drew.metadata.Directory} instances.
+     * @throws MetadataException This method does not handle exceptions from Metadata Extractor
+     */
+    protected void handle(Iterator<Directory> directories) throws MetadataException {
+        while (directories.hasNext()) {
+            Directory directory = directories.next();
+            for (DirectoryHandler handler : handlers) {
+                if (handler.supports(directory.getClass())) {
+                    handler.handle(directory, metadata);
+                }
+            }
+        }
+    }
+
+    /**
+     * Reads one or more types of Metadata Extractor directories and copies
+     * their tags into tika metadata.
+     */
+    static interface DirectoryHandler {
+        /**
+         * @param directoryType A Metadata Extractor directory class
+         * @return true if the directory type is supported by this handler
+         */
+        boolean supports(Class<? extends Directory> directoryType);
+
+        /**
+         * @param directory extracted tags
+         * @param metadata  current tika metadata
+         * @throws MetadataException typically field extraction error, aborts all further extraction
+         */
+        void handle(Directory directory, Metadata metadata)
+                throws MetadataException;
+    }
+
+    /**
+     * Mimics the behavior from TIKA-314: every extracted tag of every directory
+     * is copied to tika metadata under the tag name Metadata Extractor reports.
+     */
+    static class CopyAllFieldsHandler implements DirectoryHandler {
+        @Override
+        public boolean supports(Class<? extends Directory> directoryType) {
+            // Accepts any directory type
+            return true;
+        }
+
+        @Override
+        public void handle(Directory directory, Metadata metadata)
+                throws MetadataException {
+            if (directory.getTags() == null) {
+                return;
+            }
+            for (Tag entry : directory.getTags()) {
+                metadata.set(entry.getTagName(), entry.getDescription());
+            }
+        }
+    }
+
+    /**
+     * Copies tags from any directory, skipping those whose tag name is
+     * identical to a known Metadata field name or whose description is null.
+     * Values are trimmed, and "true"/"false" in any letter case are stored in lower case.
+     * This leads to more predictable behavior than {@link CopyAllFieldsHandler}.
+     */
+    static class CopyUnknownFieldsHandler implements DirectoryHandler {
+        @Override
+        public boolean supports(Class<? extends Directory> directoryType) {
+            // Accepts any directory type
+            return true;
+        }
+
+        @Override
+        public void handle(Directory directory, Metadata metadata)
+                throws MetadataException {
+            if (directory.getTags() == null) {
+                return;
+            }
+            for (Tag entry : directory.getTags()) {
+                final String fieldName = entry.getTagName();
+                if (MetadataFields.isMetadataField(fieldName) || entry.getDescription() == null) {
+                    continue;
+                }
+                String text = entry.getDescription().trim();
+                if (Boolean.TRUE.toString().equalsIgnoreCase(text)) {
+                    text = Boolean.TRUE.toString();
+                } else if (Boolean.FALSE.toString().equalsIgnoreCase(text)) {
+                    text = Boolean.FALSE.toString();
+                }
+                metadata.set(fieldName, text);
+            }
+        }
+    }
+
+    /**
+     * Basic image properties for TIFF and JPEG, at least: width, height,
+     * bits per sample and samples per pixel.
+     */
+    static class DimensionsHandler implements DirectoryHandler {
+        // Compiled once for the class rather than once per handler instance.
+        // Group 1 captures the leading run of digits of a tag value.
+        private static final Pattern LEADING_NUMBERS = Pattern.compile("(\\d+)\\s*.*");
+
+        public boolean supports(Class<? extends Directory> directoryType) {
+            return directoryType == JpegDirectory.class ||
+                    directoryType == ExifSubIFDDirectory.class ||
+                    directoryType == ExifThumbnailDirectory.class ||
+                    directoryType == ExifIFD0Directory.class;
+        }
+
+        public void handle(Directory directory, Metadata metadata) throws MetadataException {
+            // The test TIFF has width and height stored as follows according to exiv2
+            //Exif.Image.ImageWidth                        Short       1  100
+            //Exif.Image.ImageLength                       Short       1  75
+            // and the values are found in "Thumbnail Image Width" (and Height) from Metadata Extractor
+            set(directory, metadata, JpegDirectory.TAG_IMAGE_WIDTH, Metadata.IMAGE_WIDTH);
+            set(directory, metadata, JpegDirectory.TAG_IMAGE_HEIGHT, Metadata.IMAGE_LENGTH);
+            // Bits per sample, two methods of extracting, exif overrides jpeg
+            set(directory, metadata, JpegDirectory.TAG_DATA_PRECISION, Metadata.BITS_PER_SAMPLE);
+            set(directory, metadata, ExifSubIFDDirectory.TAG_BITS_PER_SAMPLE, Metadata.BITS_PER_SAMPLE);
+            // Straightforward
+            set(directory, metadata, ExifSubIFDDirectory.TAG_SAMPLES_PER_PIXEL, Metadata.SAMPLES_PER_PIXEL);
+        }
+
+        /**
+         * Stores the leading digits of a tag value in the given metadata field.
+         * Nothing is stored when the tag is absent, has no string value, or
+         * does not start with a digit.
+         */
+        private void set(Directory directory, Metadata metadata, int extractTag, Property metadataField) {
+            if (!directory.containsTag(extractTag)) {
+                return;
+            }
+            String raw = directory.getString(extractTag);
+            if (raw == null) {
+                // Pattern.matcher(null) would throw a NullPointerException
+                return;
+            }
+            Matcher m = LEADING_NUMBERS.matcher(raw);
+            if (m.matches()) {
+                metadata.set(metadataField, m.group(1));
+            }
+        }
+    }
+
+    static class JpegCommentHandler implements DirectoryHandler {
+        public boolean supports(Class<? extends Directory> directoryType) {
+            return directoryType == JpegCommentDirectory.class;
+        }
+
+        public void handle(Directory directory, Metadata metadata) throws MetadataException {
+            if (directory.containsTag(JpegCommentDirectory.TAG_COMMENT)) {
+                metadata.add(TikaCoreProperties.COMMENTS, directory.getString(JpegCommentDirectory.TAG_COMMENT));
+            }
+        }
+    }
+
+    static class ExifHandler implements DirectoryHandler {
+        // SimpleDateFormat is not thread safe, so each thread gets its own instance
+        // through this ThreadLocal. The field is static, i.e. shared by all
+        // ExifHandler instances, so thread safety comes from the ThreadLocal and
+        // not from how many handlers are created.
+        // The pattern is an ISO 8601 style date-time without any time zone suffix.
+        private static final ThreadLocal<SimpleDateFormat> DATE_UNSPECIFIED_TZ = new ThreadLocal<SimpleDateFormat>() {
+            @Override
+            protected SimpleDateFormat initialValue() {
+                return new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.US);
+            }
+        };
+
+        /**
+         * @param directoryType A Metadata Extractor directory class
+         * @return true only for {@code ExifIFD0Directory} and {@code ExifSubIFDDirectory}
+         */
+        public boolean supports(Class<? extends Directory> directoryType) {
+            return directoryType == ExifIFD0Directory.class ||
+                    directoryType == ExifSubIFDDirectory.class;
+        }
+
+        /**
+         * Copies date, photo and comment tags of an EXIF directory to tika metadata.
+         * A failure while reading the date tags is ignored and does not prevent
+         * the photo and comment tags from being copied.
+         *
+         * @param directory extracted tags
+         * @param metadata  current tika metadata
+         */
+        public void handle(Directory directory, Metadata metadata) {
+            try {
+                handleDateTags(directory, metadata);
+            } catch (MetadataException e) {
+                // ignore date parse errors and proceed with other tags
+            }
+            // Kept outside the try block: only handleDateTags declares
+            // MetadataException, and a date failure must not skip these.
+            handlePhotoTags(directory, metadata);
+            handleCommentTags(directory, metadata);
+        }
+
+        /**
+         * EXIF may contain image description, although with undefined encoding.
+         * Use IPTC for other annotation fields, and XMP for unicode support.
+         */
+        public void handleCommentTags(Directory directory, Metadata metadata) {
+            if (metadata.get(TikaCoreProperties.DESCRIPTION) == null &&
+                    directory.containsTag(ExifIFD0Directory.TAG_IMAGE_DESCRIPTION)) {
+                metadata.set(TikaCoreProperties.DESCRIPTION,
+                        directory.getString(ExifIFD0Directory.TAG_IMAGE_DESCRIPTION));
+            }
+        }
+
+        /**
+         * Maps common TIFF and EXIF tags onto the Tika
+         * TIFF image metadata namespace.
+         */
+        public void handlePhotoTags(Directory directory, Metadata metadata) {
+            if (directory.containsTag(ExifSubIFDDirectory.TAG_EXPOSURE_TIME)) {
+                Object exposure = directory.getObject(ExifSubIFDDirectory.TAG_EXPOSURE_TIME);
+                if (exposure instanceof Rational) {
+                    metadata.set(Metadata.EXPOSURE_TIME, ((Rational) exposure).doubleValue());
+                } else {
+                    metadata.set(Metadata.EXPOSURE_TIME, directory.getString(ExifSubIFDDirectory.TAG_EXPOSURE_TIME));
+                }
+            }
+
+            if (directory.containsTag(ExifSubIFDDirectory.TAG_FLASH)) {
+                String flash = directory.getDescription(ExifSubIFDDirectory.TAG_FLASH);
+                if (flash != null) {
+                    if (flash.contains("Flash fired")) {
+                        metadata.set(Metadata.FLASH_FIRED, Boolean.TRUE.toString());
+                    } else if (flash.contains("Flash did not fire")) {
+                        metadata.set(Metadata.FLASH_FIRED, Boolean.FALSE.toString());
+                    } else {
+                        metadata.set(Metadata.FLASH_FIRED, flash);
+                    }
+                }
+            }
+
+            if (directory.containsTag(ExifSubIFDDirectory.TAG_FNUMBER)) {
+                Object fnumber = directory.getObject(ExifSubIFDDirectory.TAG_FNUMBER);
+                if (fnumber instanceof Rational) {
+                    metadata.set(Metadata.F_NUMBER, ((Rational) fnumber).doubleValue());
+                } else {
+                    metadata.set(Metadata.F_NUMBER, directory.getString(ExifSubIFDDirectory.TAG_FNUMBER));
+                }
+            }
+
+            if (directory.containsTag(ExifSubIFDDirectory.TAG_FOCAL_LENGTH)) {
+                Object length = directory.getObject(ExifSubIFDDirectory.TAG_FOCAL_LENGTH);
+                if (length instanceof Rational) {
+                    metadata.set(Metadata.FOCAL_LENGTH, ((Rational) length).doubleValue());
+                } else {
+                    metadata.set(Metadata.FOCAL_LENGTH, directory.getString(ExifSubIFDDirectory.TAG_FOCAL_LENGTH));
+                }
+            }
+
+            if (directory.containsTag(ExifSubIFDDirectory.TAG_ISO_EQUIVALENT)) {
+                metadata.set(Metadata.ISO_SPEED_RATINGS, directory.getString(ExifSubIFDDirectory.TAG_ISO_EQUIVALENT));
+            }
+
+            if (directory.containsTag(ExifIFD0Directory.TAG_MAKE)) {
+                metadata.set(Metadata.EQUIPMENT_MAKE, directory.getString(ExifIFD0Directory.TAG_MAKE));
+            }
+            if (directory.containsTag(ExifIFD0Directory.TAG_MODEL)) {
+                metadata.set(Metadata.EQUIPMENT_MODEL, directory.getString(ExifIFD0Directory.TAG_MODEL));
+            }
+
+            if (directory.containsTag(ExifIFD0Directory.TAG_ORIENTATION)) {
+                Object length = directory.getObject(ExifIFD0Directory.TAG_ORIENTATION);
+                if (length instanceof Integer) {
+                    metadata.set(Metadata.ORIENTATION, Integer.toString((Integer) length));
+                } else {
+                    metadata.set(Metadata.ORIENTATION, directory.getString(ExifIFD0Directory.TAG_ORIENTATION));
+                }
+            }
+
+            if (directory.containsTag(ExifIFD0Directory.TAG_SOFTWARE)) {
+                metadata.set(Metadata.SOFTWARE, directory.getString(ExifIFD0Directory.TAG_SOFTWARE));
+            }
+
+            if (directory.containsTag(ExifIFD0Directory.TAG_X_RESOLUTION)) {
+                Object resolution = directory.getObject(ExifIFD0Directory.TAG_X_RESOLUTION);
+                if (resolution instanceof Rational) {
+                    metadata.set(Metadata.RESOLUTION_HORIZONTAL, ((Rational) resolution).doubleValue());
+                } else {
+                    metadata.set(Metadata.RESOLUTION_HORIZONTAL, directory.getString(ExifIFD0Directory.TAG_X_RESOLUTION));
+                }
+            }
+            if (directory.containsTag(ExifIFD0Directory.TAG_Y_RESOLUTION)) {
+                Object resolution = directory.getObject(ExifIFD0Directory.TAG_Y_RESOLUTION);
+                if (resolution instanceof Rational) {
+                    metadata.set(Metadata.RESOLUTION_VERTICAL, ((Rational) resolution).doubleValue());
+                } else {
+                    metadata.set(Metadata.RESOLUTION_VERTICAL, directory.getString(ExifIFD0Directory.TAG_Y_RESOLUTION));
+                }
+            }
+            if (directory.containsTag(ExifIFD0Directory.TAG_RESOLUTION_UNIT)) {
+                metadata.set(Metadata.RESOLUTION_UNIT, directory.getDescription(ExifIFD0Directory.TAG_RESOLUTION_UNIT));
+            }
+            if (directory.containsTag(ExifThumbnailDirectory.TAG_IMAGE_WIDTH)) {
+                metadata.set(Metadata.IMAGE_WIDTH,
+                        trimPixels(directory.getDescription(ExifThumbnailDirectory.TAG_IMAGE_WIDTH)));
+            }
+            if (directory.containsTag(ExifThumbnailDirectory.TAG_IMAGE_HEIGHT)) {
+                metadata.set(Metadata.IMAGE_LENGTH,
+                        trimPixels(directory.getDescription(ExifThumbnailDirectory.TAG_IMAGE_HEIGHT)));
+            }
+        }
+
+        /**
+         * Maps exif dates to metadata fields: Date/Time Original becomes the
+         * created and original date, Date/Time becomes the modified date and is
+         * also used as created date when no created date has been set.
+         */
+        public void handleDateTags(Directory directory, Metadata metadata)
+                throws MetadataException {
+            //TODO: should we try to process ExifSubIFDDirectory.TAG_TIME_ZONE_OFFSET
+            //if it exists?
+            // Date/Time Original overrides value from ExifDirectory.TAG_DATETIME
+            if (directory.containsTag(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)) {
+                Date taken = directory.getDate(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL);
+                // Unless we have GPS time we don't know the time zone so date must be set
+                // as ISO 8601 datetime without timezone suffix (no Z or +/-)
+                if (taken != null) {
+                    // Same time zone as Metadata Extractor uses
+                    String takenNoTimeZone = DATE_UNSPECIFIED_TZ.get().format(taken);
+                    metadata.set(TikaCoreProperties.CREATED, takenNoTimeZone);
+                    metadata.set(Metadata.ORIGINAL_DATE, takenNoTimeZone);
+                }
+            }
+
+            if (!directory.containsTag(ExifIFD0Directory.TAG_DATETIME)) {
+                return;
+            }
+            Date changed = directory.getDate(ExifIFD0Directory.TAG_DATETIME);
+            if (changed == null) {
+                return;
+            }
+            String changedNoTimeZone = DATE_UNSPECIFIED_TZ.get().format(changed);
+            metadata.set(TikaCoreProperties.MODIFIED, changedNoTimeZone);
+            // If Date/Time Original does not exist this might be creation date
+            if (metadata.get(TikaCoreProperties.CREATED) == null) {
+                metadata.set(TikaCoreProperties.CREATED, changedNoTimeZone);
+            }
+        }
+    }
+
+    /**
+     * Reads image comments, originally TIKA-472: copies IPTC keywords, title,
+     * creator and caption to tika metadata.
+     * Metadata Extractor does not read XMP so we need to use the values from Iptc or EXIF
+     */
+    static class IptcHandler implements DirectoryHandler {
+        @Override
+        public boolean supports(Class<? extends Directory> directoryType) {
+            return IptcDirectory.class == directoryType;
+        }
+
+        @Override
+        public void handle(Directory directory, Metadata metadata)
+                throws MetadataException {
+            if (directory.containsTag(IptcDirectory.TAG_KEYWORDS)) {
+                for (String keyword : directory.getStringArray(IptcDirectory.TAG_KEYWORDS)) {
+                    metadata.add(TikaCoreProperties.KEYWORDS, keyword);
+                }
+            }
+
+            // The headline is preferred as title, the object name is the fallback
+            if (directory.containsTag(IptcDirectory.TAG_HEADLINE)) {
+                String headline = directory.getString(IptcDirectory.TAG_HEADLINE);
+                metadata.set(TikaCoreProperties.TITLE, headline);
+            } else if (directory.containsTag(IptcDirectory.TAG_OBJECT_NAME)) {
+                String objectName = directory.getString(IptcDirectory.TAG_OBJECT_NAME);
+                metadata.set(TikaCoreProperties.TITLE, objectName);
+            }
+
+            if (directory.containsTag(IptcDirectory.TAG_BY_LINE)) {
+                metadata.set(TikaCoreProperties.CREATOR, directory.getString(IptcDirectory.TAG_BY_LINE));
+                metadata.set(IPTC.CREATOR, directory.getString(IptcDirectory.TAG_BY_LINE));
+            }
+
+            if (directory.containsTag(IptcDirectory.TAG_CAPTION)) {
+                // Looks like metadata extractor returns IPTC newlines as a single carriage return,
+                // but the exiv2 command does not so we change to line feed here because that is less surprising to users
+                String caption = directory.getString(IptcDirectory.TAG_CAPTION).replaceAll("\r\n?", "\n");
+                metadata.set(TikaCoreProperties.DESCRIPTION, caption);
+            }
+        }
+    }
+
+    /**
+     * Maps EXIF Geo Tags onto the Tika Geo metadata namespace: latitude and
+     * longitude are formatted with English decimal symbols.
+     */
+    static class GeotagHandler implements DirectoryHandler {
+        @Override
+        public boolean supports(Class<? extends Directory> directoryType) {
+            return GpsDirectory.class == directoryType;
+        }
+
+        @Override
+        public void handle(Directory directory, Metadata metadata) throws MetadataException {
+            GeoLocation position = ((GpsDirectory) directory).getGeoLocation();
+            if (position == null) {
+                // No location in this directory, nothing to copy
+                return;
+            }
+            DecimalFormat coordinateFormat = new DecimalFormat(GEO_DECIMAL_FORMAT_STRING,
+                    new DecimalFormatSymbols(Locale.ENGLISH));
+            String latitude = coordinateFormat.format(position.getLatitude());
+            metadata.set(TikaCoreProperties.LATITUDE, latitude);
+            String longitude = coordinateFormat.format(position.getLongitude());
+            metadata.set(TikaCoreProperties.LONGITUDE, longitude);
+        }
+    }
+
+}

[33/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageParser.java
index e42f542..8fd23eb 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageParser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageParser.java
@@ -1,203 +1,203 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.image;
-
-import javax.imageio.IIOException;
-import javax.imageio.ImageIO;
-import javax.imageio.ImageReader;
-import javax.imageio.metadata.IIOMetadata;
-import javax.imageio.stream.ImageInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Set;
-
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.w3c.dom.NamedNodeMap;
-import org.w3c.dom.Node;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-public class ImageParser extends AbstractParser {
-
-    /**
-     * Serial version UID
-     */
-    private static final long serialVersionUID = 7852529269245520335L;
-
-    private static final MediaType CANONICAL_BMP_TYPE = MediaType.image("x-ms-bmp");
-    private static final MediaType JAVA_BMP_TYPE = MediaType.image("bmp");
-
-    private static final Set<MediaType> SUPPORTED_TYPES =
-            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
-                    CANONICAL_BMP_TYPE,
-                    JAVA_BMP_TYPE,
-                    MediaType.image("gif"),
-                    MediaType.image("png"),
-                    MediaType.image("vnd.wap.wbmp"),
-                    MediaType.image("x-icon"),
-                    MediaType.image("x-xcf"))));
-
-    private static void setIfPresent(Metadata metadata, String imageIOkey, String tikaKey) {
-        if (metadata.get(imageIOkey) != null) {
-            metadata.set(tikaKey, metadata.get(imageIOkey));
-        }
-    }
-
-    private static void setIfPresent(Metadata metadata, String imageIOkey, Property tikaProp) {
-        if (metadata.get(imageIOkey) != null) {
-            String v = metadata.get(imageIOkey);
-            if (v.endsWith(" ")) {
-                v = v.substring(0, v.lastIndexOf(' '));
-            }
-            metadata.set(tikaProp, v);
-        }
-    }
-
-    private static void loadMetadata(IIOMetadata imageMetadata, Metadata metadata) {
-        String[] names = imageMetadata.getMetadataFormatNames();
-        if (names == null) {
-            return;
-        }
-        for (String name : names) {
-            loadNode(metadata, imageMetadata.getAsTree(name), "", false);
-        }
-    }
-
-    private static void loadNode(
-            Metadata metadata, Node node, String parents,
-            boolean addThisNodeName) {
-        if (addThisNodeName) {
-            if (parents.length() > 0) {
-                parents += " ";
-            }
-            parents += node.getNodeName();
-        }
-        NamedNodeMap map = node.getAttributes();
-        if (map != null) {
-
-            int length = map.getLength();
-            if (length == 1) {
-                metadata.add(parents, normalize(map.item(0).getNodeValue()));
-            } else if (length > 1) {
-                StringBuilder value = new StringBuilder();
-                for (int i = 0; i < length; i++) {
-                    if (i > 0) {
-                        value.append(", ");
-                    }
-                    Node attr = map.item(i);
-                    value.append(attr.getNodeName());
-                    value.append("=");
-                    value.append(normalize(attr.getNodeValue()));
-                }
-                metadata.add(parents, value.toString());
-            }
-        }
-
-        Node child = node.getFirstChild();
-        while (child != null) {
-            // print children recursively
-            loadNode(metadata, child, parents, true);
-            child = child.getNextSibling();
-        }
-    }
-
-    private static String normalize(String value) {
-        if (value != null) {
-            value = value.trim();
-        } else {
-            value = "";
-        }
-        if (Boolean.TRUE.toString().equalsIgnoreCase(value)) {
-            return Boolean.TRUE.toString();
-        } else if (Boolean.FALSE.toString().equalsIgnoreCase(value)) {
-            return Boolean.FALSE.toString();
-        }
-        return value;
-    }
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-    public void parse(
-            InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-        String type = metadata.get(Metadata.CONTENT_TYPE);
-        if (type != null) {
-            // Java has a different idea of the BMP mime type to
-            //  what the canonical one is, fix this up.
-            if (CANONICAL_BMP_TYPE.toString().equals(type)) {
-                type = JAVA_BMP_TYPE.toString();
-            }
-
-            try {
-                Iterator<ImageReader> iterator =
-                        ImageIO.getImageReadersByMIMEType(type);
-                if (iterator.hasNext()) {
-                    ImageReader reader = iterator.next();
-                    try {
-                        try (ImageInputStream imageStream = ImageIO.createImageInputStream(
-                                new CloseShieldInputStream(stream))) {
-                            reader.setInput(imageStream);
-
-                            metadata.set(Metadata.IMAGE_WIDTH, Integer.toString(reader.getWidth(0)));
-                            metadata.set(Metadata.IMAGE_LENGTH, Integer.toString(reader.getHeight(0)));
-                            metadata.set("height", Integer.toString(reader.getHeight(0)));
-                            metadata.set("width", Integer.toString(reader.getWidth(0)));
-
-                            loadMetadata(reader.getImageMetadata(0), metadata);
-                        }
-                    } finally {
-                        reader.dispose();
-                    }
-                }
-
-                // Translate certain Metadata tags from the ImageIO
-                //  specific namespace into the general Tika one
-                setIfPresent(metadata, "CommentExtensions CommentExtension", TikaCoreProperties.COMMENTS);
-                setIfPresent(metadata, "markerSequence com", TikaCoreProperties.COMMENTS);
-                setIfPresent(metadata, "Data BitsPerSample", Metadata.BITS_PER_SAMPLE);
-            } catch (IIOException e) {
-                // TIKA-619: There is a known bug in the Sun API when dealing with GIF images
-                //  which Tika will just ignore.
-                if (!(e.getMessage() != null &&
-                        e.getMessage().equals("Unexpected block type 0!") &&
-                        type.equals("image/gif"))) {
-                    throw new TikaException(type + " parse error", e);
-                }
-            }
-        }
-
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-        xhtml.startDocument();
-        xhtml.endDocument();
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import javax.imageio.IIOException;
+import javax.imageio.ImageIO;
+import javax.imageio.ImageReader;
+import javax.imageio.metadata.IIOMetadata;
+import javax.imageio.stream.ImageInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class ImageParser extends AbstractParser {
+
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = 7852529269245520335L;
+
+    private static final MediaType CANONICAL_BMP_TYPE = MediaType.image("x-ms-bmp");
+    private static final MediaType JAVA_BMP_TYPE = MediaType.image("bmp");
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    CANONICAL_BMP_TYPE,
+                    JAVA_BMP_TYPE,
+                    MediaType.image("gif"),
+                    MediaType.image("png"),
+                    MediaType.image("vnd.wap.wbmp"),
+                    MediaType.image("x-icon"),
+                    MediaType.image("x-xcf"))));
+
+    /**
+     * Copies the value stored under an ImageIO key to a tika key, if there is one.
+     *
+     * @param metadata   metadata to read from and write to
+     * @param imageIOkey name the value is currently stored under
+     * @param tikaKey    name to store the value under as well
+     */
+    private static void setIfPresent(Metadata metadata, String imageIOkey, String tikaKey) {
+        String current = metadata.get(imageIOkey);
+        if (current == null) {
+            return;
+        }
+        metadata.set(tikaKey, current);
+    }
+
+    /**
+     * Copies the value stored under an ImageIO key to a tika property, if there
+     * is one. A single trailing space is removed from the value first.
+     *
+     * @param metadata   metadata to read from and write to
+     * @param imageIOkey name the value is currently stored under
+     * @param tikaProp   property to store the value under as well
+     */
+    private static void setIfPresent(Metadata metadata, String imageIOkey, Property tikaProp) {
+        String current = metadata.get(imageIOkey);
+        if (current == null) {
+            return;
+        }
+        if (current.endsWith(" ")) {
+            // The last space is the final character, so this drops exactly one character
+            current = current.substring(0, current.length() - 1);
+        }
+        metadata.set(tikaProp, current);
+    }
+
+    private static void loadMetadata(IIOMetadata imageMetadata, Metadata metadata) {
+        String[] names = imageMetadata.getMetadataFormatNames();
+        if (names == null) {
+            return;
+        }
+        for (String name : names) {
+            loadNode(metadata, imageMetadata.getAsTree(name), "", false);
+        }
+    }
+
+    private static void loadNode(
+            Metadata metadata, Node node, String parents,
+            boolean addThisNodeName) {
+        if (addThisNodeName) {
+            if (parents.length() > 0) {
+                parents += " ";
+            }
+            parents += node.getNodeName();
+        }
+        NamedNodeMap map = node.getAttributes();
+        if (map != null) {
+
+            int length = map.getLength();
+            if (length == 1) {
+                metadata.add(parents, normalize(map.item(0).getNodeValue()));
+            } else if (length > 1) {
+                StringBuilder value = new StringBuilder();
+                for (int i = 0; i < length; i++) {
+                    if (i > 0) {
+                        value.append(", ");
+                    }
+                    Node attr = map.item(i);
+                    value.append(attr.getNodeName());
+                    value.append("=");
+                    value.append(normalize(attr.getNodeValue()));
+                }
+                metadata.add(parents, value.toString());
+            }
+        }
+
+        Node child = node.getFirstChild();
+        while (child != null) {
+            // print children recursively
+            loadNode(metadata, child, parents, true);
+            child = child.getNextSibling();
+        }
+    }
+
+    private static String normalize(String value) {
+        if (value != null) {
+            value = value.trim();
+        } else {
+            value = "";
+        }
+        if (Boolean.TRUE.toString().equalsIgnoreCase(value)) {
+            return Boolean.TRUE.toString();
+        } else if (Boolean.FALSE.toString().equalsIgnoreCase(value)) {
+            return Boolean.FALSE.toString();
+        }
+        return value;
+    }
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        String type = metadata.get(Metadata.CONTENT_TYPE);
+        if (type != null) {
+            // Java has a different idea of the BMP mime type to
+            //  what the canonical one is, fix this up.
+            if (CANONICAL_BMP_TYPE.toString().equals(type)) {
+                type = JAVA_BMP_TYPE.toString();
+            }
+
+            try {
+                Iterator<ImageReader> iterator =
+                        ImageIO.getImageReadersByMIMEType(type);
+                if (iterator.hasNext()) {
+                    ImageReader reader = iterator.next();
+                    try {
+                        try (ImageInputStream imageStream = ImageIO.createImageInputStream(
+                                new CloseShieldInputStream(stream))) {
+                            reader.setInput(imageStream);
+
+                            metadata.set(Metadata.IMAGE_WIDTH, Integer.toString(reader.getWidth(0)));
+                            metadata.set(Metadata.IMAGE_LENGTH, Integer.toString(reader.getHeight(0)));
+                            metadata.set("height", Integer.toString(reader.getHeight(0)));
+                            metadata.set("width", Integer.toString(reader.getWidth(0)));
+
+                            loadMetadata(reader.getImageMetadata(0), metadata);
+                        }
+                    } finally {
+                        reader.dispose();
+                    }
+                }
+
+                // Translate certain Metadata tags from the ImageIO
+                //  specific namespace into the general Tika one
+                setIfPresent(metadata, "CommentExtensions CommentExtension", TikaCoreProperties.COMMENTS);
+                setIfPresent(metadata, "markerSequence com", TikaCoreProperties.COMMENTS);
+                setIfPresent(metadata, "Data BitsPerSample", Metadata.BITS_PER_SAMPLE);
+            } catch (IIOException e) {
+                // TIKA-619: There is a known bug in the Sun API when dealing with GIF images
+                //  which Tika will just ignore.
+                if (!(e.getMessage() != null &&
+                        e.getMessage().equals("Unexpected block type 0!") &&
+                        type.equals("image/gif"))) {
+                    throw new TikaException(type + " parse error", e);
+                }
+            }
+        }
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.endDocument();
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/MetadataFields.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/MetadataFields.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/MetadataFields.java
index c3b0fce..5238751 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/MetadataFields.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/MetadataFields.java
@@ -1,84 +1,84 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.image;
-
-import java.lang.reflect.Field;
-import java.lang.reflect.Modifier;
-import java.util.HashSet;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-
-/**
- * Knowns about all declared {@link Metadata} fields.
- * Didn't find this functionality anywhere so it was added for
- * ImageMetadataExtractor, but it can be generalized.
- */
-public abstract class MetadataFields {
-
-    private static HashSet<String> known;
-
-    static {
-        known = new HashSet<String>();
-        setKnownForClass(TikaCoreProperties.class);
-        setKnownForClass(Metadata.class);
-    }
-
-    private static void setKnownForClass(Class<?> clazz) {
-        Field[] fields = clazz.getFields();
-        for (Field f : fields) {
-            int mod = f.getModifiers();
-            if (Modifier.isPublic(mod) && Modifier.isStatic(mod) && Modifier.isFinal(mod)) {
-                Class<?> c = f.getType();
-                if (String.class.equals(c)) {
-                    try {
-                        String p = (String) f.get(null);
-                        if (p != null) {
-                            known.add(p);
-                        }
-                    } catch (IllegalArgumentException e) {
-                        e.printStackTrace();
-                    } catch (IllegalAccessException e) {
-                        e.printStackTrace();
-                    }
-                }
-                if (Property.class.isAssignableFrom(c)) {
-                    try {
-                        Property p = (Property) f.get(null);
-                        if (p != null) {
-                            known.add(p.getName());
-                        }
-                    } catch (IllegalArgumentException e) {
-                        e.printStackTrace();
-                    } catch (IllegalAccessException e) {
-                        e.printStackTrace();
-                    }
-                }
-            }
-        }
-    }
-
-    public static boolean isMetadataField(String name) {
-        return known.contains(name);
-    }
-
-    public static boolean isMetadataField(Property property) {
-        return known.contains(property.getName());
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import java.lang.reflect.Field;
+import java.lang.reflect.Modifier;
+import java.util.HashSet;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+
+/**
+ * Knows about all declared {@link Metadata} fields.
+ * <p>
+ * The names are gathered once, when this class is initialized, by reflecting
+ * over the public static final {@code String} and {@link Property} constants
+ * of {@link TikaCoreProperties} and {@link Metadata}.
+ * Didn't find this functionality anywhere so it was added for
+ * ImageMetadataExtractor, but it can be generalized.
+ */
+public abstract class MetadataFields {
+
+    /** Names of all metadata constants found by reflection. */
+    private static final HashSet<String> KNOWN_NAMES = new HashSet<String>();
+
+    static {
+        setKnownForClass(TikaCoreProperties.class);
+        setKnownForClass(Metadata.class);
+    }
+
+    /**
+     * Registers the name carried by every public static final field of the
+     * given class whose declared type is {@code String} or a {@link Property}.
+     * Fields that cannot be read are reported on standard error and skipped.
+     *
+     * @param clazz the class whose constants are inspected
+     */
+    private static void setKnownForClass(Class<?> clazz) {
+        for (Field field : clazz.getFields()) {
+            int modifiers = field.getModifiers();
+            boolean isConstant = Modifier.isPublic(modifiers)
+                    && Modifier.isStatic(modifiers)
+                    && Modifier.isFinal(modifiers);
+            if (!isConstant) {
+                continue;
+            }
+
+            // Decide on the declared type, not on the runtime type of the value.
+            Class<?> declaredType = field.getType();
+            boolean holdsString = String.class.equals(declaredType);
+            boolean holdsProperty = Property.class.isAssignableFrom(declaredType);
+            if (!holdsString && !holdsProperty) {
+                continue;
+            }
+
+            try {
+                Object constant = field.get(null);
+                if (constant == null) {
+                    continue;
+                }
+                if (holdsString) {
+                    KNOWN_NAMES.add((String) constant);
+                } else {
+                    KNOWN_NAMES.add(((Property) constant).getName());
+                }
+            } catch (IllegalArgumentException | IllegalAccessException e) {
+                e.printStackTrace();
+            }
+        }
+    }
+
+    /**
+     * @param name a metadata key
+     * @return {@code true} if the key matches one of the collected constants
+     */
+    public static boolean isMetadataField(String name) {
+        return KNOWN_NAMES.contains(name);
+    }
+
+    /**
+     * @param property a metadata property
+     * @return {@code true} if the property's name matches one of the
+     *         collected constants
+     */
+    public static boolean isMetadataField(Property property) {
+        return KNOWN_NAMES.contains(property.getName());
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java
index c98ce69..05dee1f 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java
@@ -1,68 +1,68 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.image;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.xmp.JempboxExtractor;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-public class TiffParser extends AbstractParser {
-
-    /**
-     * Serial version UID
-     */
-    private static final long serialVersionUID = -3941143576535464926L;
-
-    private static final Set<MediaType> SUPPORTED_TYPES =
-            Collections.singleton(MediaType.image("tiff"));
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-    public void parse(
-            InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-        TemporaryResources tmp = new TemporaryResources();
-        try {
-            TikaInputStream tis = TikaInputStream.get(stream, tmp);
-            new ImageMetadataExtractor(metadata).parseTiff(tis.getFile());
-            new JempboxExtractor(metadata).parse(tis);
-        } finally {
-            tmp.dispose();
-        }
-
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-        xhtml.startDocument();
-        xhtml.endDocument();
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.xmp.JempboxExtractor;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for {@code image/tiff}. Metadata is gathered by
+ * {@link ImageMetadataExtractor#parseTiff} and by {@link JempboxExtractor};
+ * the XHTML output is an empty document.
+ */
+public class TiffParser extends AbstractParser {
+
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -3941143576535464926L;
+
+    /** The single media type handled by this parser. */
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.image("tiff"));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Runs both metadata extractors over the stream and then emits an empty
+     * XHTML document. Resources registered while wrapping the stream are
+     * disposed of whether or not extraction succeeds.
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        final TemporaryResources resources = new TemporaryResources();
+        try {
+            final TikaInputStream tikaStream = TikaInputStream.get(stream, resources);
+
+            // First pass works on the File view of the stream.
+            final ImageMetadataExtractor imageExtractor = new ImageMetadataExtractor(metadata);
+            imageExtractor.parseTiff(tikaStream.getFile());
+
+            // Second pass reads the stream itself.
+            final JempboxExtractor xmpExtractor = new JempboxExtractor(metadata);
+            xmpExtractor.parse(tikaStream);
+        } finally {
+            resources.dispose();
+        }
+
+        final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
+        document.startDocument();
+        document.endDocument();
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
index 247194e..7ec666c 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
@@ -1,69 +1,69 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.jpeg;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.image.ImageMetadataExtractor;
-import org.apache.tika.parser.xmp.JempboxExtractor;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-public class JpegParser extends AbstractParser {
-
-    /**
-     * Serial version UID
-     */
-    private static final long serialVersionUID = -1355028253756234603L;
-
-    private static final Set<MediaType> SUPPORTED_TYPES =
-            Collections.singleton(MediaType.image("jpeg"));
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-    public void parse(
-            InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-        TemporaryResources tmp = new TemporaryResources();
-        try {
-            TikaInputStream tis = TikaInputStream.get(stream, tmp);
-            new ImageMetadataExtractor(metadata).parseJpeg(tis.getFile());
-            new JempboxExtractor(metadata).parse(tis);
-        } finally {
-            tmp.dispose();
-        }
-
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-        xhtml.startDocument();
-        xhtml.endDocument();
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.jpeg;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.image.ImageMetadataExtractor;
+import org.apache.tika.parser.xmp.JempboxExtractor;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for {@code image/jpeg}. Metadata is gathered by
+ * {@link ImageMetadataExtractor#parseJpeg} and by {@link JempboxExtractor};
+ * the XHTML output is an empty document.
+ */
+public class JpegParser extends AbstractParser {
+
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -1355028253756234603L;
+
+    /** The single media type handled by this parser. */
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.image("jpeg"));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Runs both metadata extractors over the stream and then emits an empty
+     * XHTML document. Resources registered while wrapping the stream are
+     * disposed of whether or not extraction succeeds.
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        final TemporaryResources resources = new TemporaryResources();
+        try {
+            final TikaInputStream tikaStream = TikaInputStream.get(stream, resources);
+
+            // First pass works on the File view of the stream.
+            final ImageMetadataExtractor imageExtractor = new ImageMetadataExtractor(metadata);
+            imageExtractor.parseJpeg(tikaStream.getFile());
+
+            // Second pass reads the stream itself.
+            final JempboxExtractor xmpExtractor = new JempboxExtractor(metadata);
+            xmpExtractor.parse(tikaStream);
+        } finally {
+            resources.dispose();
+        }
+
+        final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
+        document.startDocument();
+        document.endDocument();
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java
index 03dc833..abc4235 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java
@@ -1,252 +1,252 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mp3;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.tika.exception.TikaException;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * An Audio Frame in an MP3 file. These come after the ID3v2 tags in the file.
- * Currently, only the header is processed, not the raw audio data.
- */
-public class AudioFrame implements MP3Frame {
-    /** Constant for the MPEG version 1. */
-    public static final int MPEG_V1 = 3;
-
-    /** Constant for the MPEG version 2. */
-    public static final int MPEG_V2 = 2;
-
-    /** Constant for the MPEG version 2.5. */
-    public static final int MPEG_V2_5 = 0;
-
-    /** Constant for audio layer 1. */
-    public static final int LAYER_1 = 3;
-    
-    /** Constant for audio layer 2. */
-    public static final int LAYER_2 = 2;
-    
-    /** Constant for audio layer 3. */
-    public static final int LAYER_3 = 1;
-    
-    private final String version;
-    private final int versionCode;
-    private final int layer;
-    private final int sampleRate;
-    private final int channels;
-    private final int bitRate;
-    private final int length;
-    private final float duration;
-
-    public String getVersion() {
-        return version;
-    }
-
-    /**
-     * Get the sampling rate, in Hz
-     */
-    public int getSampleRate() {
-        return sampleRate;
-    }
-
-    /**
-     * Get the number of channels (1=mono, 2=stereo)
-     */
-    public int getChannels() {
-        return channels;
-    }
-
-    /**
-     * Get the version code.
-     * @return the version code (one of the {@code MPEG} constants)
-     */
-    public int getVersionCode()
-    {
-        return versionCode;
-    }
-
-    /**
-     * Get the audio layer code.
-     * @return the audio layer (one of the {@code LAYER} constants)
-     */
-    public int getLayer()
-    {
-        return layer;
-    }
-
-    /**
-     * Get the bit rate in bit per second.
-     * @return the bit rate
-     */
-    public int getBitRate()
-    {
-        return bitRate;
-    }
-
-    /**
-     * Returns the frame length in bytes.
-     * @return the frame length
-     */
-    public int getLength()
-    {
-        return length;
-    }
-
-    /**
-     * Returns the duration in milliseconds.
-     * @return the duration
-     */
-    public float getDuration()
-    {
-        return duration;
-    }
-
-    /**
-     * Does this appear to be a 4 byte audio frame header?
-     */
-    public static boolean isAudioHeader(int h1, int h2, int h3, int h4) {
-        if (h1 == -1 || h2 == -1 || h3 == -1 || h4 == -1) {
-            return false;
-        }
-        // Check for the magic 11 bits set at the start
-        // Note - doesn't do a CRC check
-        if (h1 == 0xff && (h2 & 0x60) == 0x60) {
-            return true;
-        }
-        return false;
-    }
-
-    /**
-     * @deprecated Use the constructor which is passed all values directly.
-     */
-    @Deprecated
-    public AudioFrame(InputStream stream, ContentHandler handler)
-            throws IOException, SAXException, TikaException {
-        this(-2, -2, -2, -2, stream);
-    }
-
-    /**
-     * @deprecated Use the constructor which is passed all values directly.
-     */
-    @Deprecated
-    public AudioFrame(int h1, int h2, int h3, int h4, InputStream in)
-            throws IOException {
-        if (h1 == -2 && h2 == -2 && h3 == -2 && h4 == -2) {
-            h1 = in.read();
-            h2 = in.read();
-            h3 = in.read();
-            h4 = in.read();
-        }
-
-        if (isAudioHeader(h1, h2, h3, h4)) {
-            layer = (h2 >> 1) & 0x03;
-            versionCode = (h2 >> 3) & 0x03;
-            version = generateVersionStr(versionCode, layer);
-
-            int rateCode = (h3 >> 2) & 0x03;
-            int rate;
-            switch (rateCode) {
-            case 0:
-                rate = 11025;
-                break;
-            case 1:
-                rate = 12000;
-                break;
-            default:
-                rate = 8000;
-            }
-            if (versionCode == MPEG_V2) {
-                rate *= 2;
-            } else if(versionCode == MPEG_V1) {
-                rate *= 4;
-            }
-            sampleRate = rate;
-
-            int chans = h4 & 0x192;
-            if (chans < 3) {
-                // Stereo, joint stereo, dual channel
-                channels = 2;
-            } else {
-                channels = 1;
-            }
-            bitRate = 0;
-            duration = 0;
-            length = 0;
-        } else {
-            throw new IllegalArgumentException("Magic Audio Frame Header not found");
-        }
-    }
-    
-    /**
-     * 
-     * Creates a new instance of {@code AudioFrame} and initializes all properties.
-     * @param mpegVersion the code for the MPEG version
-     * @param layer the code for the layer
-     * @param bitRate the bit rate (in bps)
-     * @param sampleRate the sample rate (in samples per second)
-     * @param channels the number of channels
-     * @param length the frame length (in bytes)
-     * @param duration the duration of this frame (in milliseconds)
-     */
-    public AudioFrame(int mpegVersion, int layer, int bitRate, int sampleRate,
-            int channels, int length, float duration) {
-        versionCode = mpegVersion;
-        this.layer = layer;
-        this.bitRate = bitRate;
-        this.sampleRate = sampleRate;
-        this.channels = channels;
-        this.length = length;
-        this.duration = duration;
-        version = generateVersionStr(mpegVersion, layer);
-    }
-
-    /**
-     * Generates a string for the version of this audio frame.
-     * @param version the code for the MPEG version
-     * @param layer the code for the layer
-     * @return a string for the version
-     */
-    private static String generateVersionStr(int version, int layer) {
-        StringBuilder buf = new StringBuilder(64);
-        buf.append("MPEG 3 Layer ");
-        if (layer == LAYER_3) {
-            buf.append("III");
-        } else if (layer == LAYER_2) {
-            buf.append("II");
-        } else if (layer == LAYER_1) {
-            buf.append("I");
-        } else {
-            buf.append("(reserved)");
-        }
-
-        buf.append(" Version ");
-        if (version == MPEG_V2_5) {
-            buf.append("2.5");
-        } else if(version == MPEG_V2) {
-            buf.append("2");
-        } else if(version == MPEG_V1) {
-            buf.append("1");
-        } else {
-            buf.append("(reseved)");
-        }
-        
-        return buf.toString();
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * An Audio Frame in an MP3 file. These come after the ID3v2 tags in the file.
+ * Currently, only the header is processed, not the raw audio data.
+ */
+public class AudioFrame implements MP3Frame {
+    /** Constant for the MPEG version 1. */
+    public static final int MPEG_V1 = 3;
+
+    /** Constant for the MPEG version 2. */
+    public static final int MPEG_V2 = 2;
+
+    /** Constant for the MPEG version 2.5. */
+    public static final int MPEG_V2_5 = 0;
+
+    /** Constant for audio layer 1. */
+    public static final int LAYER_1 = 3;
+    
+    /** Constant for audio layer 2. */
+    public static final int LAYER_2 = 2;
+    
+    /** Constant for audio layer 3. */
+    public static final int LAYER_3 = 1;
+    
+    private final String version;
+    private final int versionCode;
+    private final int layer;
+    private final int sampleRate;
+    private final int channels;
+    private final int bitRate;
+    private final int length;
+    private final float duration;
+
+    public String getVersion() {
+        return version;
+    }
+
+    /**
+     * Get the sampling rate, in Hz
+     */
+    public int getSampleRate() {
+        return sampleRate;
+    }
+
+    /**
+     * Get the number of channels (1=mono, 2=stereo)
+     */
+    public int getChannels() {
+        return channels;
+    }
+
+    /**
+     * Get the version code.
+     * @return the version code (one of the {@code MPEG} constants)
+     */
+    public int getVersionCode()
+    {
+        return versionCode;
+    }
+
+    /**
+     * Get the audio layer code.
+     * @return the audio layer (one of the {@code LAYER} constants)
+     */
+    public int getLayer()
+    {
+        return layer;
+    }
+
+    /**
+     * Get the bit rate in bit per second.
+     * @return the bit rate
+     */
+    public int getBitRate()
+    {
+        return bitRate;
+    }
+
+    /**
+     * Returns the frame length in bytes.
+     * @return the frame length
+     */
+    public int getLength()
+    {
+        return length;
+    }
+
+    /**
+     * Returns the duration in milliseconds.
+     * @return the duration
+     */
+    public float getDuration()
+    {
+        return duration;
+    }
+
+    /**
+     * Does this appear to be a 4 byte audio frame header?
+     */
+    public static boolean isAudioHeader(int h1, int h2, int h3, int h4) {
+        if (h1 == -1 || h2 == -1 || h3 == -1 || h4 == -1) {
+            return false;
+        }
+        // Check for the magic 11 bits set at the start
+        // Note - doesn't do a CRC check
+        if (h1 == 0xff && (h2 & 0x60) == 0x60) {
+            return true;
+        }
+        return false;
+    }
+
+    /**
+     * @deprecated Use the constructor which is passed all values directly.
+     */
+    @Deprecated
+    public AudioFrame(InputStream stream, ContentHandler handler)
+            throws IOException, SAXException, TikaException {
+        this(-2, -2, -2, -2, stream);
+    }
+
+    /**
+     * @deprecated Use the constructor which is passed all values directly.
+     */
+    @Deprecated
+    public AudioFrame(int h1, int h2, int h3, int h4, InputStream in)
+            throws IOException {
+        if (h1 == -2 && h2 == -2 && h3 == -2 && h4 == -2) {
+            h1 = in.read();
+            h2 = in.read();
+            h3 = in.read();
+            h4 = in.read();
+        }
+
+        if (isAudioHeader(h1, h2, h3, h4)) {
+            layer = (h2 >> 1) & 0x03;
+            versionCode = (h2 >> 3) & 0x03;
+            version = generateVersionStr(versionCode, layer);
+
+            int rateCode = (h3 >> 2) & 0x03;
+            int rate;
+            switch (rateCode) {
+            case 0:
+                rate = 11025;
+                break;
+            case 1:
+                rate = 12000;
+                break;
+            default:
+                rate = 8000;
+            }
+            if (versionCode == MPEG_V2) {
+                rate *= 2;
+            } else if(versionCode == MPEG_V1) {
+                rate *= 4;
+            }
+            sampleRate = rate;
+
+            int chans = h4 & 0x192;
+            if (chans < 3) {
+                // Stereo, joint stereo, dual channel
+                channels = 2;
+            } else {
+                channels = 1;
+            }
+            bitRate = 0;
+            duration = 0;
+            length = 0;
+        } else {
+            throw new IllegalArgumentException("Magic Audio Frame Header not found");
+        }
+    }
+    
+    /**
+     * 
+     * Creates a new instance of {@code AudioFrame} and initializes all properties.
+     * @param mpegVersion the code for the MPEG version
+     * @param layer the code for the layer
+     * @param bitRate the bit rate (in bps)
+     * @param sampleRate the sample rate (in samples per second)
+     * @param channels the number of channels
+     * @param length the frame length (in bytes)
+     * @param duration the duration of this frame (in milliseconds)
+     */
+    public AudioFrame(int mpegVersion, int layer, int bitRate, int sampleRate,
+            int channels, int length, float duration) {
+        versionCode = mpegVersion;
+        this.layer = layer;
+        this.bitRate = bitRate;
+        this.sampleRate = sampleRate;
+        this.channels = channels;
+        this.length = length;
+        this.duration = duration;
+        version = generateVersionStr(mpegVersion, layer);
+    }
+
+    /**
+     * Generates a string for the version of this audio frame.
+     * @param version the code for the MPEG version
+     * @param layer the code for the layer
+     * @return a string for the version
+     */
+    private static String generateVersionStr(int version, int layer) {
+        StringBuilder buf = new StringBuilder(64);
+        buf.append("MPEG 3 Layer ");
+        if (layer == LAYER_3) {
+            buf.append("III");
+        } else if (layer == LAYER_2) {
+            buf.append("II");
+        } else if (layer == LAYER_1) {
+            buf.append("I");
+        } else {
+            buf.append("(reserved)");
+        }
+
+        buf.append(" Version ");
+        if (version == MPEG_V2_5) {
+            buf.append("2.5");
+        } else if(version == MPEG_V2) {
+            buf.append("2");
+        } else if(version == MPEG_V1) {
+            buf.append("1");
+        } else {
+            buf.append("(reseved)");
+        }
+        
+        return buf.toString();
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java
index 6f20c3c..b7d2d75 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java
@@ -1,142 +1,142 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mp3;
-
-import java.util.Collections;
-import java.util.List;
-
-/**
- * Takes an array of {@link ID3Tags} in preference order, and when asked for
- * a given tag, will return it from the first {@link ID3Tags} that has it.
- */
-public class CompositeTagHandler implements ID3Tags {
-
-    private ID3Tags[] tags;
-
-    public CompositeTagHandler(ID3Tags[] tags) {
-        this.tags = tags;
-    }
-
-    public boolean getTagsPresent() {
-        for (ID3Tags tag : tags) {
-            if (tag.getTagsPresent()) {
-                return true;
-            }
-        }
-        return false;
-    }
-
-    public String getTitle() {
-        for (ID3Tags tag : tags) {
-            if (tag.getTitle() != null) {
-                return tag.getTitle();
-            }
-        }
-        return null;
-    }
-
-    public String getArtist() {
-        for (ID3Tags tag : tags) {
-            if (tag.getArtist() != null) {
-                return tag.getArtist();
-            }
-        }
-        return null;
-    }
-
-    public String getAlbum() {
-        for (ID3Tags tag : tags) {
-            if (tag.getAlbum() != null) {
-                return tag.getAlbum();
-            }
-        }
-        return null;
-    }
-
-    public String getComposer() {
-        for (ID3Tags tag : tags) {
-            if (tag.getComposer() != null) {
-                return tag.getComposer();
-            }
-        }
-        return null;
-    }
-
-    public String getYear() {
-        for (ID3Tags tag : tags) {
-            if (tag.getYear() != null) {
-                return tag.getYear();
-            }
-        }
-        return null;
-    }
-
-    public List<ID3Comment> getComments() {
-        for (ID3Tags tag : tags) {
-            List<ID3Comment> comments = tag.getComments();
-            if (comments != null && comments.size() > 0) {
-                return comments;
-            }
-        }
-        return Collections.emptyList();
-    }
-
-    public String getGenre() {
-        for (ID3Tags tag : tags) {
-            if (tag.getGenre() != null) {
-                return tag.getGenre();
-            }
-        }
-        return null;
-    }
-
-    public String getTrackNumber() {
-        for (ID3Tags tag : tags) {
-            if (tag.getTrackNumber() != null) {
-                return tag.getTrackNumber();
-            }
-        }
-        return null;
-    }
-
-    public String getAlbumArtist() {
-        for (ID3Tags tag : tags) {
-            if (tag.getAlbumArtist() != null) {
-                return tag.getAlbumArtist();
-            }
-        }
-        return null;
-    }
-
-    public String getDisc() {
-        for (ID3Tags tag : tags) {
-            if (tag.getDisc() != null) {
-                return tag.getDisc();
-            }
-        }
-        return null;
-    }
-
-    public String getCompilation() {
-        for (ID3Tags tag : tags) {
-            if (tag.getCompilation() != null) {
-                return tag.getCompilation();
-            }
-        }
-        return null;
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Takes an array of {@link ID3Tags} in preference order, and when asked for
+ * a given tag, will return it from the first {@link ID3Tags} that has it.
+ */
+public class CompositeTagHandler implements ID3Tags {
+
+    private ID3Tags[] tags;
+
+    public CompositeTagHandler(ID3Tags[] tags) {
+        this.tags = tags;
+    }
+
+    public boolean getTagsPresent() {
+        for (ID3Tags tag : tags) {
+            if (tag.getTagsPresent()) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    public String getTitle() {
+        for (ID3Tags tag : tags) {
+            if (tag.getTitle() != null) {
+                return tag.getTitle();
+            }
+        }
+        return null;
+    }
+
+    public String getArtist() {
+        for (ID3Tags tag : tags) {
+            if (tag.getArtist() != null) {
+                return tag.getArtist();
+            }
+        }
+        return null;
+    }
+
+    public String getAlbum() {
+        for (ID3Tags tag : tags) {
+            if (tag.getAlbum() != null) {
+                return tag.getAlbum();
+            }
+        }
+        return null;
+    }
+
+    public String getComposer() {
+        for (ID3Tags tag : tags) {
+            if (tag.getComposer() != null) {
+                return tag.getComposer();
+            }
+        }
+        return null;
+    }
+
+    public String getYear() {
+        for (ID3Tags tag : tags) {
+            if (tag.getYear() != null) {
+                return tag.getYear();
+            }
+        }
+        return null;
+    }
+
+    public List<ID3Comment> getComments() {
+        for (ID3Tags tag : tags) {
+            List<ID3Comment> comments = tag.getComments();
+            if (comments != null && comments.size() > 0) {
+                return comments;
+            }
+        }
+        return Collections.emptyList();
+    }
+
+    public String getGenre() {
+        for (ID3Tags tag : tags) {
+            if (tag.getGenre() != null) {
+                return tag.getGenre();
+            }
+        }
+        return null;
+    }
+
+    public String getTrackNumber() {
+        for (ID3Tags tag : tags) {
+            if (tag.getTrackNumber() != null) {
+                return tag.getTrackNumber();
+            }
+        }
+        return null;
+    }
+
+    public String getAlbumArtist() {
+        for (ID3Tags tag : tags) {
+            if (tag.getAlbumArtist() != null) {
+                return tag.getAlbumArtist();
+            }
+        }
+        return null;
+    }
+
+    public String getDisc() {
+        for (ID3Tags tag : tags) {
+            if (tag.getDisc() != null) {
+                return tag.getDisc();
+            }
+        }
+        return null;
+    }
+
+    public String getCompilation() {
+        for (ID3Tags tag : tags) {
+            if (tag.getCompilation() != null) {
+                return tag.getCompilation();
+            }
+        }
+        return null;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java
index 6ee19db..98ef504 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java
@@ -1,254 +1,254 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mp3;
-
-import java.util.List;
-
-/**
- * Interface that defines the common interface for ID3 tag parsers,
- *  such as ID3v1 and ID3v2.3.
- * Implementations should return NULL if the file lacks a given
- *  tag, or if the tag isn't defined for the version.
- *  
- * Note that so far, only the ID3v1 core tags are listed here. In
- *  future, we may wish to add more to cover the extra tags that
- *  our ID3v2 handlers can produce.
- */
-public interface ID3Tags {
-    /**
-     * List of predefined genres.
-     *
-     * @see http://www.id3.org/id3v2-00
-     */
-    String[] GENRES = new String[] {
-        /*  0 */ "Blues",
-        /*  1 */ "Classic Rock",
-        /*  2 */ "Country",
-        /*  3 */ "Dance",
-        /*  4 */ "Disco",
-        /*  5 */ "Funk",
-        /*  6 */ "Grunge",
-        /*  7 */ "Hip-Hop",
-        /*  8 */ "Jazz",
-        /*  9 */ "Metal",
-        /* 10 */ "New Age",
-        /* 11 */ "Oldies",
-        /* 12 */ "Other",
-        /* 13 */ "Pop",
-        /* 14 */ "R&B",
-        /* 15 */ "Rap",
-        /* 16 */ "Reggae",
-        /* 17 */ "Rock",
-        /* 18 */ "Techno",
-        /* 19 */ "Industrial",
-        /* 20 */ "Alternative",
-        /* 21 */ "Ska",
-        /* 22 */ "Death Metal",
-        /* 23 */ "Pranks",
-        /* 24 */ "Soundtrack",
-        /* 25 */ "Euro-Techno",
-        /* 26 */ "Ambient",
-        /* 27 */ "Trip-Hop",
-        /* 28 */ "Vocal",
-        /* 29 */ "Jazz+Funk",
-        /* 30 */ "Fusion",
-        /* 31 */ "Trance",
-        /* 32 */ "Classical",
-        /* 33 */ "Instrumental",
-        /* 34 */ "Acid",
-        /* 35 */ "House",
-        /* 36 */ "Game",
-        /* 37 */ "Sound Clip",
-        /* 38 */ "Gospel",
-        /* 39 */ "Noise",
-        /* 40 */ "AlternRock",
-        /* 41 */ "Bass",
-        /* 42 */ "Soul",
-        /* 43 */ "Punk",
-        /* 44 */ "Space",
-        /* 45 */ "Meditative",
-        /* 46 */ "Instrumental Pop",
-        /* 47 */ "Instrumental Rock",
-        /* 48 */ "Ethnic",
-        /* 49 */ "Gothic",
-        /* 50 */ "Darkwave",
-        /* 51 */ "Techno-Industrial",
-        /* 52 */ "Electronic",
-        /* 53 */ "Pop-Folk",
-        /* 54 */ "Eurodance",
-        /* 55 */ "Dream",
-        /* 56 */ "Southern Rock",
-        /* 57 */ "Comedy",
-        /* 58 */ "Cult",
-        /* 59 */ "Gangsta",
-        /* 60 */ "Top 40",
-        /* 61 */ "Christian Rap",
-        /* 62 */ "Pop/Funk",
-        /* 63 */ "Jungle",
-        /* 64 */ "Native American",
-        /* 65 */ "Cabaret",
-        /* 66 */ "New Wave",
-        /* 67 */ "Psychadelic",
-        /* 68 */ "Rave",
-        /* 69 */ "Showtunes",
-        /* 70 */ "Trailer",
-        /* 71 */ "Lo-Fi",
-        /* 72 */ "Tribal",
-        /* 73 */ "Acid Punk",
-        /* 74 */ "Acid Jazz",
-        /* 75 */ "Polka",
-        /* 76 */ "Retro",
-        /* 77 */ "Musical",
-        /* 78 */ "Rock & Roll",
-        /* 79 */ "Hard Rock",
-        /* 80 */ "Folk",
-        /* 81 */ "Folk-Rock",
-        /* 82 */ "National Folk",
-        /* 83 */ "Swing",
-        /* 84 */ "Fast Fusion",
-        /* 85 */ "Bebob",
-        /* 86 */ "Latin",
-        /* 87 */ "Revival",
-        /* 88 */ "Celtic",
-        /* 89 */ "Bluegrass",
-        /* 90 */ "Avantgarde",
-        /* 91 */ "Gothic Rock",
-        /* 92 */ "Progressive Rock",
-        /* 93 */ "Psychedelic Rock",
-        /* 94 */ "Symphonic Rock",
-        /* 95 */ "Slow Rock",
-        /* 96 */ "Big Band",
-        /* 97 */ "Chorus",
-        /* 98 */ "Easy Listening",
-        /* 99 */ "Acoustic",
-        /* 100 */ "Humour",
-        /* 101 */ "Speech",
-        /* 102 */ "Chanson",
-        /* 103 */ "Opera",
-        /* 104 */ "Chamber Music",
-        /* 105 */ "Sonata",
-        /* 106 */ "Symphony",
-        /* 107 */ "Booty Bass",
-        /* 108 */ "Primus",
-        /* 109 */ "Porn Groove",
-        /* 110 */ "Satire",
-        /* 111 */ "Slow Jam",
-        /* 112 */ "Club",
-        /* 113 */ "Tango",
-        /* 114 */ "Samba",
-        /* 115 */ "Folklore",
-        /* 116 */ "Ballad",
-        /* 117 */ "Power Ballad",
-        /* 118 */ "Rhythmic Soul",
-        /* 119 */ "Freestyle",
-        /* 120 */ "Duet",
-        /* 121 */ "Punk Rock",
-        /* 122 */ "Drum Solo",
-        /* 123 */ "A capella",
-        /* 124 */ "Euro-House",
-        /* 125 */ "Dance Hall",
-        /* sentinel */ ""
-    };
-
-    /**
-     * Does the file contain this kind of tags?
-     */
-    boolean getTagsPresent();
-
-    String getTitle();
-
-    /**
-     * The Artist for the track
-     */
-    String getArtist();
-
-    /**
-     * The Artist for the overall album / compilation of albums
-     */
-    String getAlbumArtist();
-
-    String getAlbum();
-    
-    String getComposer();
-
-    String getCompilation();
-    
-    /**
-     * Retrieves the comments, if any.
-     * Files may have more than one comment, but normally only 
-     *  one with any language/description pair.
-     */
-    List<ID3Comment> getComments();
-
-    String getGenre();
-
-    String getYear();
-
-    /**
-     * The number of the track within the album / recording
-     */
-    String getTrackNumber();
-
-    /**
-     * The number of the disc this belongs to, within the set
-     */
-    String getDisc();
-
-    /**
-     * Represents a comments in ID3 (especially ID3 v2), where are 
-     *  made up of several parts
-     */
-    public static class ID3Comment {
-        private String language;
-        private String description;
-        private String text;
-        
-        /**
-         * Creates an ID3 v1 style comment tag
-         */
-        public ID3Comment(String id3v1Text) {
-           this.text = id3v1Text;
-        }
-        /**
-         * Creates an ID3 v2 style comment tag
-         */
-        public ID3Comment(String language, String description, String text) {
-            this.language = language;
-            this.description = description;
-            this.text = text;
-        }
-
-        /**
-         * Gets the language, if present
-         */
-        public String getLanguage() {
-           return language;
-        }
-        /**
-         * Gets the description, if present
-         */
-        public String getDescription() {
-           return description;
-        }
-        /**
-         * Gets the text, if present
-         */
-        public String getText() {
-           return text;
-        }
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.util.List;
+
+/**
+ * Interface that defines the common interface for ID3 tag parsers,
+ *  such as ID3v1 and ID3v2.3.
+ * Implementations should return NULL if the file lacks a given
+ *  tag, or if the tag isn't defined for the version.
+ *  
+ * Note that so far, only the ID3v1 core tags are listed here. In
+ *  future, we may wish to add more to cover the extra tags that
+ *  our ID3v2 handlers can produce.
+ */
+public interface ID3Tags {
+    /**
+     * List of predefined genres.
+     *
+     * @see http://www.id3.org/id3v2-00
+     */
+    String[] GENRES = new String[] {
+        /*  0 */ "Blues",
+        /*  1 */ "Classic Rock",
+        /*  2 */ "Country",
+        /*  3 */ "Dance",
+        /*  4 */ "Disco",
+        /*  5 */ "Funk",
+        /*  6 */ "Grunge",
+        /*  7 */ "Hip-Hop",
+        /*  8 */ "Jazz",
+        /*  9 */ "Metal",
+        /* 10 */ "New Age",
+        /* 11 */ "Oldies",
+        /* 12 */ "Other",
+        /* 13 */ "Pop",
+        /* 14 */ "R&B",
+        /* 15 */ "Rap",
+        /* 16 */ "Reggae",
+        /* 17 */ "Rock",
+        /* 18 */ "Techno",
+        /* 19 */ "Industrial",
+        /* 20 */ "Alternative",
+        /* 21 */ "Ska",
+        /* 22 */ "Death Metal",
+        /* 23 */ "Pranks",
+        /* 24 */ "Soundtrack",
+        /* 25 */ "Euro-Techno",
+        /* 26 */ "Ambient",
+        /* 27 */ "Trip-Hop",
+        /* 28 */ "Vocal",
+        /* 29 */ "Jazz+Funk",
+        /* 30 */ "Fusion",
+        /* 31 */ "Trance",
+        /* 32 */ "Classical",
+        /* 33 */ "Instrumental",
+        /* 34 */ "Acid",
+        /* 35 */ "House",
+        /* 36 */ "Game",
+        /* 37 */ "Sound Clip",
+        /* 38 */ "Gospel",
+        /* 39 */ "Noise",
+        /* 40 */ "AlternRock",
+        /* 41 */ "Bass",
+        /* 42 */ "Soul",
+        /* 43 */ "Punk",
+        /* 44 */ "Space",
+        /* 45 */ "Meditative",
+        /* 46 */ "Instrumental Pop",
+        /* 47 */ "Instrumental Rock",
+        /* 48 */ "Ethnic",
+        /* 49 */ "Gothic",
+        /* 50 */ "Darkwave",
+        /* 51 */ "Techno-Industrial",
+        /* 52 */ "Electronic",
+        /* 53 */ "Pop-Folk",
+        /* 54 */ "Eurodance",
+        /* 55 */ "Dream",
+        /* 56 */ "Southern Rock",
+        /* 57 */ "Comedy",
+        /* 58 */ "Cult",
+        /* 59 */ "Gangsta",
+        /* 60 */ "Top 40",
+        /* 61 */ "Christian Rap",
+        /* 62 */ "Pop/Funk",
+        /* 63 */ "Jungle",
+        /* 64 */ "Native American",
+        /* 65 */ "Cabaret",
+        /* 66 */ "New Wave",
+        /* 67 */ "Psychadelic",
+        /* 68 */ "Rave",
+        /* 69 */ "Showtunes",
+        /* 70 */ "Trailer",
+        /* 71 */ "Lo-Fi",
+        /* 72 */ "Tribal",
+        /* 73 */ "Acid Punk",
+        /* 74 */ "Acid Jazz",
+        /* 75 */ "Polka",
+        /* 76 */ "Retro",
+        /* 77 */ "Musical",
+        /* 78 */ "Rock & Roll",
+        /* 79 */ "Hard Rock",
+        /* 80 */ "Folk",
+        /* 81 */ "Folk-Rock",
+        /* 82 */ "National Folk",
+        /* 83 */ "Swing",
+        /* 84 */ "Fast Fusion",
+        /* 85 */ "Bebob",
+        /* 86 */ "Latin",
+        /* 87 */ "Revival",
+        /* 88 */ "Celtic",
+        /* 89 */ "Bluegrass",
+        /* 90 */ "Avantgarde",
+        /* 91 */ "Gothic Rock",
+        /* 92 */ "Progressive Rock",
+        /* 93 */ "Psychedelic Rock",
+        /* 94 */ "Symphonic Rock",
+        /* 95 */ "Slow Rock",
+        /* 96 */ "Big Band",
+        /* 97 */ "Chorus",
+        /* 98 */ "Easy Listening",
+        /* 99 */ "Acoustic",
+        /* 100 */ "Humour",
+        /* 101 */ "Speech",
+        /* 102 */ "Chanson",
+        /* 103 */ "Opera",
+        /* 104 */ "Chamber Music",
+        /* 105 */ "Sonata",
+        /* 106 */ "Symphony",
+        /* 107 */ "Booty Bass",
+        /* 108 */ "Primus",
+        /* 109 */ "Porn Groove",
+        /* 110 */ "Satire",
+        /* 111 */ "Slow Jam",
+        /* 112 */ "Club",
+        /* 113 */ "Tango",
+        /* 114 */ "Samba",
+        /* 115 */ "Folklore",
+        /* 116 */ "Ballad",
+        /* 117 */ "Power Ballad",
+        /* 118 */ "Rhythmic Soul",
+        /* 119 */ "Freestyle",
+        /* 120 */ "Duet",
+        /* 121 */ "Punk Rock",
+        /* 122 */ "Drum Solo",
+        /* 123 */ "A capella",
+        /* 124 */ "Euro-House",
+        /* 125 */ "Dance Hall",
+        /* sentinel */ ""
+    };
+
+    /**
+     * Does the file contain this kind of tags?
+     */
+    boolean getTagsPresent();
+
+    String getTitle();
+
+    /**
+     * The Artist for the track
+     */
+    String getArtist();
+
+    /**
+     * The Artist for the overall album / compilation of albums
+     */
+    String getAlbumArtist();
+
+    String getAlbum();
+    
+    String getComposer();
+
+    String getCompilation();
+    
+    /**
+     * Retrieves the comments, if any.
+     * Files may have more than one comment, but normally only 
+     *  one with any language/description pair.
+     */
+    List<ID3Comment> getComments();
+
+    String getGenre();
+
+    String getYear();
+
+    /**
+     * The number of the track within the album / recording
+     */
+    String getTrackNumber();
+
+    /**
+     * The number of the disc this belongs to, within the set
+     */
+    String getDisc();
+
+    /**
+     * Represents a comments in ID3 (especially ID3 v2), where are 
+     *  made up of several parts
+     */
+    public static class ID3Comment {
+        private String language;
+        private String description;
+        private String text;
+        
+        /**
+         * Creates an ID3 v1 style comment tag
+         */
+        public ID3Comment(String id3v1Text) {
+           this.text = id3v1Text;
+        }
+        /**
+         * Creates an ID3 v2 style comment tag
+         */
+        public ID3Comment(String language, String description, String text) {
+            this.language = language;
+            this.description = description;
+            this.text = text;
+        }
+
+        /**
+         * Gets the language, if present
+         */
+        public String getLanguage() {
+           return language;
+        }
+        /**
+         * Gets the description, if present
+         */
+        public String getDescription() {
+           return description;
+        }
+        /**
+         * Gets the text, if present
+         */
+        public String getText() {
+           return text;
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
index 4d41fa3..2111356 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
@@ -1,183 +1,183 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mp3;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.tika.exception.TikaException;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import static java.nio.charset.StandardCharsets.ISO_8859_1;
-
-/**
- * This is used to parse ID3 Version 1 Tag information from an MP3 file, 
- * if available.
- *
- * @see <a href="http://www.id3.org/ID3v1">MP3 ID3 Version 1 specification</a>
- */
-public class ID3v1Handler implements ID3Tags {
-    private String title;
-    private String artist;
-    private String album;
-    private String year;
-    private ID3Comment comment;
-    private String genre;
-    private String trackNumber;
-
-    boolean found = false;
-
-    public ID3v1Handler(InputStream stream, ContentHandler handler)
-            throws IOException, SAXException, TikaException {
-        this(LyricsHandler.getSuffix(stream, 128));
-    }
-
-    /**
-     * Creates from the last 128 bytes of a stream.
-     * @param tagData Must be the last 128 bytes 
-     */
-    protected ID3v1Handler(byte[] tagData)
-            throws IOException, SAXException, TikaException {
-        if (tagData.length == 128
-                && tagData[0] == 'T' && tagData[1] == 'A' && tagData[2] == 'G') {
-            found = true;
-
-            title = getString(tagData, 3, 33);
-            artist = getString(tagData, 33, 63);
-            album = getString(tagData, 63, 93);
-            year = getString(tagData, 93, 97);
-            
-            String commentStr = getString(tagData, 97, 127);
-            comment = new ID3Comment(commentStr);
-
-            int genreID = (int) tagData[127] & 0xff; // unsigned byte
-            genre = GENRES[Math.min(genreID, GENRES.length - 1)];
-
-            // ID3v1.1 Track addition
-            // If the last two bytes of the comment field are zero and
-            // non-zero, then the last byte is the track number
-            if (tagData[125] == 0 && tagData[126] != 0) {
-                int trackNum = (int) tagData[126] & 0xff;
-                trackNumber = Integer.toString(trackNum);
-            }
-        }
-    }
-
-
-    public boolean getTagsPresent() {
-        return found;
-    }
-
-    public String getTitle() {
-        return title;
-    }
-
-    public String getArtist() {
-        return artist;
-    }
-
-    public String getAlbum() {
-        return album;
-    }
-
-    public String getYear() {
-        return year;
-    }
-
-    public List<ID3Comment> getComments() {
-       return Arrays.asList(comment);
-    }
-
-    public String getGenre() {
-        return genre;
-    }
-
-    public String getTrackNumber() {
-        return trackNumber;
-    }
-    
-    /**
-     * ID3v1 doesn't have composers,
-     *  so returns null;
-     */
-    public String getComposer() {
-        return null;
-    }
-
-    /**
-     * ID3v1 doesn't have album-wide artists,
-     *  so returns null;
-     */
-    public String getAlbumArtist() {
-        return null;
-    }
-
-    /**
-     * ID3v1 doesn't have disc numbers,
-     *  so returns null;
-     */
-    public String getDisc() {
-        return null;
-    }
-
-    /**
-     * ID3v1 doesn't have compilations,
-     *  so returns null;
-     */
-    public String getCompilation() {
-        return null;
-    }
-
-    /**
-     * Returns the identified ISO-8859-1 substring from the given byte buffer.
-     * The return value is the zero-terminated substring retrieved from
-     * between the given start and end positions in the given byte buffer.
-     * Extra whitespace (and control characters) from the beginning and the
-     * end of the substring is removed.
-     *
-     * @param buffer byte buffer
-     * @param start start index of the substring
-     * @param end end index of the substring
-     * @return the identified substring
-     * @throws TikaException if the ISO-8859-1 encoding is not available
-     */
-    private static String getString(byte[] buffer, int start, int end)
-            throws TikaException {
-        // Find the zero byte that marks the end of the string
-        int zero = start;
-        while (zero < end && buffer[zero] != 0) {
-            zero++;
-        }
-
-        // Skip trailing whitespace
-        end = zero;
-        while (start < end && buffer[end - 1] <= ' ') {
-            end--;
-        }
-
-        // Skip leading whitespace
-        while (start < end && buffer[start] <= ' ') {
-            start++;
-        }
-
-        // Return the remaining substring
-        return new String(buffer, start, end - start, ISO_8859_1);
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+
+/**
+ * This is used to parse ID3 Version 1 Tag information from an MP3 file, 
+ * if available.
+ *
+ * @see <a href="http://www.id3.org/ID3v1">MP3 ID3 Version 1 specification</a>
+ */
+public class ID3v1Handler implements ID3Tags {
+    private String title;
+    private String artist;
+    private String album;
+    private String year;
+    private ID3Comment comment;
+    private String genre;
+    private String trackNumber;
+
+    boolean found = false;
+
+    public ID3v1Handler(InputStream stream, ContentHandler handler)
+            throws IOException, SAXException, TikaException {
+        this(LyricsHandler.getSuffix(stream, 128));
+    }
+
+    /**
+     * Creates from the last 128 bytes of a stream.
+     * @param tagData Must be the last 128 bytes 
+     */
+    protected ID3v1Handler(byte[] tagData)
+            throws IOException, SAXException, TikaException {
+        if (tagData.length == 128
+                && tagData[0] == 'T' && tagData[1] == 'A' && tagData[2] == 'G') {
+            found = true;
+
+            title = getString(tagData, 3, 33);
+            artist = getString(tagData, 33, 63);
+            album = getString(tagData, 63, 93);
+            year = getString(tagData, 93, 97);
+            
+            String commentStr = getString(tagData, 97, 127);
+            comment = new ID3Comment(commentStr);
+
+            int genreID = (int) tagData[127] & 0xff; // unsigned byte
+            genre = GENRES[Math.min(genreID, GENRES.length - 1)];
+
+            // ID3v1.1 Track addition
+            // If the last two bytes of the comment field are zero and
+            // non-zero, then the last byte is the track number
+            if (tagData[125] == 0 && tagData[126] != 0) {
+                int trackNum = (int) tagData[126] & 0xff;
+                trackNumber = Integer.toString(trackNum);
+            }
+        }
+    }
+
+
+    public boolean getTagsPresent() {
+        return found;
+    }
+
+    public String getTitle() {
+        return title;
+    }
+
+    public String getArtist() {
+        return artist;
+    }
+
+    public String getAlbum() {
+        return album;
+    }
+
+    public String getYear() {
+        return year;
+    }
+
+    public List<ID3Comment> getComments() {
+       return Arrays.asList(comment);
+    }
+
+    public String getGenre() {
+        return genre;
+    }
+
+    public String getTrackNumber() {
+        return trackNumber;
+    }
+    
+    /**
+     * ID3v1 doesn't have composers,
+     *  so returns null;
+     */
+    public String getComposer() {
+        return null;
+    }
+
+    /**
+     * ID3v1 doesn't have album-wide artists,
+     *  so returns null;
+     */
+    public String getAlbumArtist() {
+        return null;
+    }
+
+    /**
+     * ID3v1 doesn't have disc numbers,
+     *  so returns null;
+     */
+    public String getDisc() {
+        return null;
+    }
+
+    /**
+     * ID3v1 doesn't have compilations,
+     *  so returns null;
+     */
+    public String getCompilation() {
+        return null;
+    }
+
+    /**
+     * Returns the identified ISO-8859-1 substring from the given byte buffer.
+     * The return value is the zero-terminated substring retrieved from
+     * between the given start and end positions in the given byte buffer.
+     * Extra whitespace (and control characters) from the beginning and the
+     * end of the substring is removed.
+     *
+     * @param buffer byte buffer
+     * @param start start index of the substring
+     * @param end end index of the substring
+     * @return the identified substring
+     * @throws TikaException if the ISO-8859-1 encoding is not available
+     */
+    private static String getString(byte[] buffer, int start, int end)
+            throws TikaException {
+        // Find the zero byte that marks the end of the string
+        int zero = start;
+        while (zero < end && buffer[zero] != 0) {
+            zero++;
+        }
+
+        // Skip trailing whitespace
+        end = zero;
+        while (start < end && buffer[end - 1] <= ' ') {
+            end--;
+        }
+
+        // Skip leading whitespace
+        while (start < end && buffer[start] <= ' ') {
+            start++;
+        }
+
+        // Return the remaining substring
+        return new String(buffer, start, end - start, ISO_8859_1);
+    }
+}

[39/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

Convert new lines from windows to unix


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/c7a6bcac
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/c7a6bcac
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/c7a6bcac

Branch: refs/heads/2.x
Commit: c7a6bcac422f10d130399f6ff5446e24c3f50ac5
Parents: dd3c2a4
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 29 07:10:47 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 29 07:10:53 2016 -0400

----------------------------------------------------------------------
 .../ConfigurableThreadPoolExecutor.java         |   64 +-
 .../concurrent/SimpleThreadPoolExecutor.java    |   80 +-
 .../apache/tika/detect/AbstractDetector.java    |   86 +-
 .../org/apache/tika/detect/DetectorProxy.java   |  134 +-
 .../tika/detect/EncodingDetectorProxy.java      |   82 +-
 .../java/org/apache/tika/io/StringUtil.java     |  242 +-
 .../tika/osgi/TikaAbstractBundleActivator.java  |  142 +-
 .../java/org/apache/tika/osgi/TikaService.java  |   50 +-
 .../tika/osgi/internal/TikaServiceImpl.java     |  162 +-
 .../org/apache/tika/parser/ParserProxy.java     |  148 +-
 .../org/apache/tika/utils/ConcurrentUtils.java  |  114 +-
 .../org/apache/tika/config/DummyExecutor.java   |   60 +-
 .../apache/tika/detect/DetectorProxyTest.java   |  112 +-
 .../apache/tika/detect/DummyProxyDetector.java  |   62 +-
 .../apache/tika/parser/DummyProxyParser.java    |   88 +-
 .../org/apache/tika/parser/ParserProxyTest.java |  130 +-
 .../apache/tika/utils/ConcurrentUtilsTest.java  |  126 +-
 .../services/org.apache.tika.parser.Parser      |   34 +-
 .../apache/tika/config/TIKA-1762-executors.xml  |   56 +-
 tika-parser-bundles/pom.xml                     |  350 +--
 .../tika-parser-advanced-bundle/pom.xml         |  162 +-
 .../tika-parser-cad-bundle/pom.xml              |  144 +-
 .../tika-parser-code-bundle/pom.xml             |  148 +-
 .../tika-parser-crypto-bundle/pom.xml           |  156 +-
 .../tika-parser-database-bundle/pom.xml         |  134 +-
 .../tika-parser-ebook-bundle/pom.xml            |  142 +-
 .../tika-parser-journal-bundle/pom.xml          |  158 +-
 .../tika-parser-multimedia-bundle/pom.xml       |  168 +-
 .../tika-parser-office-bundle/pom.xml           |  280 +-
 .../tika-parser-package-bundle/pom.xml          |  158 +-
 .../tika-parser-pdf-bundle/pom.xml              |  216 +-
 .../tika-parser-scientific-bundle/pom.xml       |  402 +--
 .../tika-parser-text-bundle/pom.xml             |  156 +-
 .../tika-parser-web-bundle/pom.xml              |  184 +-
 tika-parser-modules/pom.xml                     |  410 +--
 .../tika-parser-advanced-module/pom.xml         |  136 +-
 .../module/advanced/internal/Activator.java     |   72 +-
 .../tika-parser-cad-module/pom.xml              |  110 +-
 .../tika/module/cad/internal/Activator.java     |   72 +-
 .../org/apache/tika/parser/dwg/DWGParser.java   |  712 ++---
 .../tika-parser-code-module/pom.xml             |  136 +-
 .../tika/module/code/internal/Activator.java    |   72 +-
 .../org/apache/tika/parser/asm/ClassParser.java |  108 +-
 .../tika/parser/asm/XHTMLClassVisitor.java      |  646 ++--
 .../tika/parser/code/SourceCodeParser.java      |  284 +-
 .../apache/tika/parser/asm/ClassParserTest.java |  118 +-
 .../tika/parser/code/SourceCodeParserTest.java  |  202 +-
 .../tika-parser-crypto-module/pom.xml           |  104 +-
 .../tika/module/crypto/internal/Activator.java  |   72 +-
 .../tika/parser/crypto/Pkcs7ParserTest.java     |   94 +-
 .../tika-parser-database-module/pom.xml         |  132 +-
 .../module/database/internal/Activator.java     |   72 +-
 .../tika-parser-ebook-module/pom.xml            |   94 +-
 .../tika/module/ebook/internal/Activator.java   |   72 +-
 .../tika/parser/epub/EpubContentParser.java     |  118 +-
 .../org/apache/tika/parser/epub/EpubParser.java |  238 +-
 .../apache/tika/parser/epub/EpubParserTest.java |  116 +-
 .../tika-parser-journal-module/pom.xml          |  134 +-
 .../tika/module/journal/internal/Activator.java |   72 +-
 .../tika-parser-multimedia-module/pom.xml       |  200 +-
 .../module/multimedia/internal/Activator.java   |   72 +-
 .../apache/tika/parser/audio/AudioParser.java   |  278 +-
 .../apache/tika/parser/audio/MidiParser.java    |  242 +-
 .../apache/tika/parser/font/TrueTypeParser.java |  222 +-
 .../parser/image/ImageMetadataExtractor.java    | 1124 +++----
 .../apache/tika/parser/image/ImageParser.java   |  406 +--
 .../tika/parser/image/MetadataFields.java       |  168 +-
 .../apache/tika/parser/image/TiffParser.java    |  136 +-
 .../org/apache/tika/parser/jpeg/JpegParser.java |  138 +-
 .../org/apache/tika/parser/mp3/AudioFrame.java  |  504 ++--
 .../tika/parser/mp3/CompositeTagHandler.java    |  284 +-
 .../org/apache/tika/parser/mp3/ID3Tags.java     |  508 ++--
 .../apache/tika/parser/mp3/ID3v1Handler.java    |  366 +--
 .../apache/tika/parser/mp3/ID3v22Handler.java   |  318 +-
 .../apache/tika/parser/mp3/ID3v23Handler.java   |  276 +-
 .../apache/tika/parser/mp3/ID3v24Handler.java   |  286 +-
 .../org/apache/tika/parser/mp3/ID3v2Frame.java  |  848 +++---
 .../apache/tika/parser/mp3/LyricsHandler.java   |  312 +-
 .../org/apache/tika/parser/mp3/MP3Frame.java    |   50 +-
 .../org/apache/tika/parser/mp3/Mp3Parser.java   |  492 +--
 .../org/apache/tika/parser/video/FLVParser.java |  536 ++--
 .../parser/ocr/TesseractOCRConfig.properties    |   40 +-
 .../tika/parser/audio/AudioParserTest.java      |  150 +-
 .../tika/parser/audio/MidiParserTest.java       |   84 +-
 .../image/ImageMetadataExtractorTest.java       |  278 +-
 .../tika/parser/image/ImageParserTest.java      |  324 +-
 .../tika/parser/image/MetadataFieldsTest.java   |   72 +-
 .../tika/parser/image/TiffParserTest.java       |  132 +-
 .../apache/tika/parser/jpeg/JpegParserTest.java |  568 ++--
 .../apache/tika/parser/mp3/Mp3ParserTest.java   |  828 ++---
 .../tika/parser/ocr/TesseractOCRConfigTest.java |  184 +-
 .../apache/tika/parser/video/FLVParserTest.java |   88 +-
 .../tika-parser-office-module/pom.xml           |  250 +-
 .../tika/module/office/internal/Activator.java  |   72 +-
 .../org/apache/tika/parser/chm/ChmParser.java   |  224 +-
 .../tika/parser/chm/accessor/ChmAccessor.java   |   78 +-
 .../chm/accessor/ChmDirectoryListingSet.java    |  796 ++---
 .../tika/parser/chm/accessor/ChmItsfHeader.java |  984 +++---
 .../tika/parser/chm/accessor/ChmItspHeader.java | 1096 +++----
 .../parser/chm/accessor/ChmLzxcControlData.java |  638 ++--
 .../parser/chm/accessor/ChmLzxcResetTable.java  |  682 ++---
 .../tika/parser/chm/accessor/ChmPmgiHeader.java |  352 +--
 .../tika/parser/chm/accessor/ChmPmglHeader.java |  412 +--
 .../chm/accessor/DirectoryListingEntry.java     |  302 +-
 .../tika/parser/chm/assertion/ChmAssert.java    |  338 +--
 .../apache/tika/parser/chm/core/ChmCommons.java |  722 ++---
 .../tika/parser/chm/core/ChmConstants.java      |  204 +-
 .../tika/parser/chm/core/ChmExtractor.java      |  784 ++---
 .../apache/tika/parser/chm/core/ChmWrapper.java |  294 +-
 .../chm/exception/ChmParsingException.java      |   54 +-
 .../tika/parser/chm/lzx/ChmBlockInfo.java       |  470 +--
 .../apache/tika/parser/chm/lzx/ChmLzxBlock.java | 1826 +++++------
 .../apache/tika/parser/chm/lzx/ChmLzxState.java |  654 ++--
 .../apache/tika/parser/chm/lzx/ChmSection.java  |  444 +--
 .../org/apache/tika/parser/mbox/MboxParser.java |  418 +--
 .../tika/parser/mbox/OutlookPSTParser.java      |  406 +--
 .../parser/odf/NSNormalizerContentHandler.java  |  198 +-
 .../parser/odf/OpenDocumentContentParser.java   |  992 +++---
 .../tika/parser/odf/OpenDocumentMetaParser.java |  398 +--
 .../tika/parser/odf/OpenDocumentParser.java     |  450 +--
 .../org/apache/tika/parser/opc/OPCDetector.java |  310 +-
 .../parser/opendocument/OpenOfficeParser.java   |   56 +-
 .../org/apache/tika/parser/rtf/GroupState.java  |  134 +-
 .../apache/tika/parser/rtf/ListDescriptor.java  |   70 +-
 .../org/apache/tika/parser/rtf/RTFParser.java   |  186 +-
 .../apache/tika/parser/rtf/TextExtractor.java   | 2846 +++++++++---------
 .../tika/parser/chm/TestChmBlockInfo.java       |  250 +-
 .../tika/parser/chm/TestChmExtraction.java      |  424 +--
 .../tika/parser/chm/TestChmExtractor.java       |  126 +-
 .../tika/parser/chm/TestChmItsfHeader.java      |  244 +-
 .../tika/parser/chm/TestChmItspHeader.java      |  320 +-
 .../apache/tika/parser/chm/TestChmLzxState.java |  202 +-
 .../tika/parser/chm/TestChmLzxcControlData.java |  288 +-
 .../tika/parser/chm/TestChmLzxcResetTable.java  |  312 +-
 .../parser/chm/TestDirectoryListingEntry.java   |  170 +-
 .../apache/tika/parser/chm/TestParameters.java  |  208 +-
 .../apache/tika/parser/chm/TestPmgiHeader.java  |   90 +-
 .../apache/tika/parser/chm/TestPmglHeader.java  |  152 +-
 .../apache/tika/parser/mbox/MboxParserTest.java |  312 +-
 .../tika/parser/mbox/OutlookPSTParserTest.java  |  220 +-
 .../AbstractPOIContainerExtractionTest.java     |  150 +-
 .../tika/parser/microsoft/ExcelParserTest.java  |  824 ++---
 .../tika/parser/microsoft/OfficeParserTest.java |   92 +-
 .../parser/microsoft/OutlookParserTest.java     |  478 +--
 .../microsoft/POIContainerExtractionTest.java   |  764 ++---
 .../parser/microsoft/PowerPointParserTest.java  |  502 +--
 .../parser/microsoft/PublisherParserTest.java   |  106 +-
 .../tika/parser/microsoft/TNEFParserTest.java   |  196 +-
 .../tika/parser/microsoft/VisioParserTest.java  |  102 +-
 .../tika/parser/microsoft/WordParserTest.java   | 1012 +++----
 .../apache/tika/parser/odf/ODFParserTest.java   |  680 ++---
 .../apache/tika/parser/rtf/RTFParserTest.java   | 1020 +++----
 .../tika-parser-package-module/pom.xml          |  150 +-
 .../tika/module/pkg/internal/Activator.java     |   72 +-
 .../tika/parser/iwork/AutoPageNumberUtils.java  |  224 +-
 .../tika/parser/iwork/IWorkPackageParser.java   |  438 +--
 .../parser/iwork/KeynoteContentHandler.java     |  348 +--
 .../parser/iwork/NumbersContentHandler.java     |  462 +--
 .../tika/parser/iwork/PagesContentHandler.java  |  896 +++---
 .../apache/tika/parser/pkg/PackageParser.java   |  574 ++--
 .../tika/parser/pkg/ZipContainerDetector.java   |  648 ++--
 .../parser/iwork/AutoPageNumberUtilsTest.java   |  156 +-
 .../tika/parser/iwork/IWorkParserTest.java      |  932 +++---
 .../apache/tika/parser/pkg/AbstractPkgTest.java |  186 +-
 .../apache/tika/parser/pkg/Bzip2ParserTest.java |  178 +-
 .../apache/tika/parser/pkg/GzipParserTest.java  |  204 +-
 .../apache/tika/parser/pkg/TarParserTest.java   |  210 +-
 .../apache/tika/parser/pkg/ZipParserTest.java   |  384 +--
 .../tika-parser-pdf-module/pom.xml              |  250 +-
 .../tika/module/pdf/internal/Activator.java     |   72 +-
 .../tika-parser-scientific-module/pom.xml       |  270 +-
 .../module/scientific/internal/Activator.java   |   72 +-
 .../org/apache/tika/parser/hdf/HDFParser.java   |  244 +-
 .../apache/tika/parser/hdf/HDFParserTest.java   |  144 +-
 .../tika/parser/netcdf/NetCDFParserTest.java    |  122 +-
 .../tika-parser-text-module/pom.xml             |  132 +-
 .../tika/module/text/internal/Activator.java    |   40 +-
 .../apache/tika/parser/txt/CharsetDetector.java | 1088 +++----
 .../apache/tika/parser/txt/CharsetMatch.java    |  572 ++--
 .../tika/parser/txt/CharsetRecog_2022.java      |  326 +-
 .../tika/parser/txt/CharsetRecog_UTF8.java      |  198 +-
 .../tika/parser/txt/CharsetRecog_Unicode.java   |  278 +-
 .../tika/parser/txt/CharsetRecog_mbcs.java      | 1064 +++----
 .../tika/parser/txt/CharsetRecog_sbcs.java      | 2706 ++++++++---------
 .../tika/parser/txt/CharsetRecognizer.java      |  108 +-
 .../org/apache/tika/parser/txt/TXTParser.java   |  196 +-
 .../parser/xml/AbstractMetadataHandler.java     |  186 +-
 .../xml/AttributeDependantMetadataHandler.java  |  164 +-
 .../parser/xml/AttributeMetadataHandler.java    |  122 +-
 .../org/apache/tika/parser/xml/DcXMLParser.java |  120 +-
 .../tika/parser/xml/ElementMetadataHandler.java |  510 ++--
 .../tika/parser/xml/FictionBookParser.java      |  234 +-
 .../apache/tika/parser/xml/MetadataHandler.java |  170 +-
 .../org/apache/tika/parser/xml/XMLParser.java   |  178 +-
 .../apache/tika/parser/txt/TXTParserTest.java   |  548 ++--
 .../apache/tika/parser/xml/DcXMLParserTest.java |  174 +-
 .../EmptyAndDuplicateElementsXMLParserTest.java |  232 +-
 .../tika/parser/xml/FictionBookParserTest.java  |  108 +-
 .../tika-parser-web-module/pom.xml              |  178 +-
 .../tika/module/web/internal/Activator.java     |   72 +-
 .../org/apache/tika/parser/feed/FeedParser.java |  254 +-
 .../parser/html/BoilerpipeContentHandler.java   |  694 ++---
 .../tika/parser/html/DefaultHtmlMapper.java     |  274 +-
 .../apache/tika/parser/html/HtmlHandler.java    |  618 ++--
 .../org/apache/tika/parser/html/HtmlMapper.java |  138 +-
 .../org/apache/tika/parser/html/HtmlParser.java |  388 +--
 .../tika/parser/html/IdentityHtmlMapper.java    |   86 +-
 .../tika/parser/html/XHTMLDowngradeHandler.java |  156 +-
 .../tika/parser/mail/MailContentHandler.java    |  752 ++---
 .../apache/tika/parser/mail/RFC822Parser.java   |  190 +-
 .../apache/tika/parser/feed/FeedParserTest.java |  150 +-
 .../apache/tika/parser/html/HtmlParserTest.java | 2262 +++++++-------
 .../tika/parser/mail/RFC822ParserTest.java      |  970 +++---
 213 files changed, 35548 insertions(+), 35548 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/main/java/org/apache/tika/concurrent/ConfigurableThreadPoolExecutor.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/concurrent/ConfigurableThreadPoolExecutor.java b/tika-core/src/main/java/org/apache/tika/concurrent/ConfigurableThreadPoolExecutor.java
index 86f74a7..1f7c4a0 100644
--- a/tika-core/src/main/java/org/apache/tika/concurrent/ConfigurableThreadPoolExecutor.java
+++ b/tika-core/src/main/java/org/apache/tika/concurrent/ConfigurableThreadPoolExecutor.java
@@ -1,32 +1,32 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.concurrent;
-
-import java.util.concurrent.ExecutorService;
-
-/**
- * Allows Thread Pool to be Configurable.
- *
- * @since Apache Tika 1.11
- */
-public interface ConfigurableThreadPoolExecutor extends ExecutorService {
-    
-    public void setMaximumPoolSize(int threads);
-    
-    public void setCorePoolSize(int threads);
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.concurrent;
+
+import java.util.concurrent.ExecutorService;
+
+/**
+ * Allows Thread Pool to be Configurable.
+ *
+ * @since Apache Tika 1.11
+ */
+public interface ConfigurableThreadPoolExecutor extends ExecutorService {
+    
+    public void setMaximumPoolSize(int threads);
+    
+    public void setCorePoolSize(int threads);
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/main/java/org/apache/tika/concurrent/SimpleThreadPoolExecutor.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/concurrent/SimpleThreadPoolExecutor.java b/tika-core/src/main/java/org/apache/tika/concurrent/SimpleThreadPoolExecutor.java
index a7e443f..0a18e94 100644
--- a/tika-core/src/main/java/org/apache/tika/concurrent/SimpleThreadPoolExecutor.java
+++ b/tika-core/src/main/java/org/apache/tika/concurrent/SimpleThreadPoolExecutor.java
@@ -1,40 +1,40 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.concurrent;
-
-import java.util.concurrent.LinkedBlockingQueue;
-import java.util.concurrent.ThreadFactory;
-import java.util.concurrent.ThreadPoolExecutor;
-import java.util.concurrent.TimeUnit;
-
-/**
- * Simple Thread Pool Executor
- *
- * @since Apache Tika 1.11
- */
-public class SimpleThreadPoolExecutor extends ThreadPoolExecutor implements ConfigurableThreadPoolExecutor {
-
-    public SimpleThreadPoolExecutor() {
-        super(1, 2, 0L, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>(), new ThreadFactory() {
-            
-            @Override
-            public Thread newThread(Runnable r) {
-                return new Thread(r, "Tika Executor Thread");
-            }
-        });
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.concurrent;
+
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.ThreadFactory;
+import java.util.concurrent.ThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Simple Thread Pool Executor
+ *
+ * @since Apache Tika 1.11
+ */
+public class SimpleThreadPoolExecutor extends ThreadPoolExecutor implements ConfigurableThreadPoolExecutor {
+
+    public SimpleThreadPoolExecutor() {
+        super(1, 2, 0L, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>(), new ThreadFactory() {
+            
+            @Override
+            public Thread newThread(Runnable r) {
+                return new Thread(r, "Tika Executor Thread");
+            }
+        });
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/main/java/org/apache/tika/detect/AbstractDetector.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/detect/AbstractDetector.java b/tika-core/src/main/java/org/apache/tika/detect/AbstractDetector.java
index f0d6129..952a089 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/AbstractDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/AbstractDetector.java
@@ -1,43 +1,43 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.detect;
-
-/**
- * Abstract base class for new detectors. This class has a convenience method for
- * creating a DetectorProxy
- *
- * @since Apache Tika 2.0
- */
-public abstract class AbstractDetector implements Detector {
-    
-    /**
-     * Serial version UID.
-     */
-    private static final long serialVersionUID = -5869078281784941763L;
-
-    /**
-     * Convenience method for creating DetectorProxy instances
-     * with the current class' ClassLoader
-     * 
-     * @param detectorClassName
-     * @return
-     */
-    public Detector createDetectorProxy(String detectorClassName){
-        return new DetectorProxy(detectorClassName, getClass().getClassLoader());
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+/**
+ * Abstract base class for new detectors. This class has a convenience method for
+ * creating a DetectorProxy
+ *
+ * @since Apache Tika 2.0
+ */
+public abstract class AbstractDetector implements Detector {
+    
+    /**
+     * Serial version UID.
+     */
+    private static final long serialVersionUID = -5869078281784941763L;
+
+    /**
+     * Convenience method for creating DetectorProxy instances
+     * with the current class' ClassLoader
+     * 
+     * @param detectorClassName
+     * @return
+     */
+    public Detector createDetectorProxy(String detectorClassName){
+        return new DetectorProxy(detectorClassName, getClass().getClassLoader());
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/main/java/org/apache/tika/detect/DetectorProxy.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/detect/DetectorProxy.java b/tika-core/src/main/java/org/apache/tika/detect/DetectorProxy.java
index ed5e638..404ec0a 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/DetectorProxy.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/DetectorProxy.java
@@ -1,68 +1,68 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.detect;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.tika.config.LoadErrorHandler;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-
-/**
- * This detector is a proxy for another detector 
- * this allows modules to use detectors from other modules
- * as optional dependencies since not including the classes
- * simply does nothing rather than throwing a ClassNotFoundException.
- *
- * @since Apache Tika 2.0
- */
-public class DetectorProxy implements Detector
-{
-    private static final long serialVersionUID = 4534101565629801667L;
-    
-    private Detector detector;
-    
-    public DetectorProxy(String detectorClassName, ClassLoader loader) 
-    {
-        this(detectorClassName, loader, Boolean.getBoolean("org.apache.tika.service.proxy.error.warn") 
-                ? LoadErrorHandler.WARN:LoadErrorHandler.IGNORE);
-    }
-    
-    public DetectorProxy(String detectorClassName, ClassLoader loader, LoadErrorHandler handler) 
-    {
-        try 
-        {
-            this.detector = (Detector)Class.forName(detectorClassName, true, loader).newInstance();
-        } 
-        catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) 
-        {
-            handler.handleLoadError(detectorClassName, e);
-        }
-    }
-
-    @Override
-    public MediaType detect(InputStream input, Metadata metadata) throws IOException 
-    {
-        if(detector != null)
-        {
-            return detector.detect(input, metadata);
-        }
-        return null;
-    }
-
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.config.LoadErrorHandler;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * This detector is a proxy for another detector 
+ * this allows modules to use detectors from other modules
+ * as optional dependencies since not including the classes
+ * simply does nothing rather than throwing a ClassNotFoundException.
+ *
+ * @since Apache Tika 2.0
+ */
+public class DetectorProxy implements Detector
+{
+    private static final long serialVersionUID = 4534101565629801667L;
+    
+    private Detector detector;
+    
+    public DetectorProxy(String detectorClassName, ClassLoader loader) 
+    {
+        this(detectorClassName, loader, Boolean.getBoolean("org.apache.tika.service.proxy.error.warn") 
+                ? LoadErrorHandler.WARN:LoadErrorHandler.IGNORE);
+    }
+    
+    public DetectorProxy(String detectorClassName, ClassLoader loader, LoadErrorHandler handler) 
+    {
+        try 
+        {
+            this.detector = (Detector)Class.forName(detectorClassName, true, loader).newInstance();
+        } 
+        catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) 
+        {
+            handler.handleLoadError(detectorClassName, e);
+        }
+    }
+
+    @Override
+    public MediaType detect(InputStream input, Metadata metadata) throws IOException 
+    {
+        if(detector != null)
+        {
+            return detector.detect(input, metadata);
+        }
+        return null;
+    }
+
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorProxy.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorProxy.java b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorProxy.java
index b927597..5e569bd 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorProxy.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorProxy.java
@@ -1,41 +1,41 @@
-package org.apache.tika.detect;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.Charset;
-
-import org.apache.tika.config.LoadErrorHandler;
-import org.apache.tika.metadata.Metadata;
-
-public class EncodingDetectorProxy implements EncodingDetector {
-
-private EncodingDetector detector;
-    
-    public EncodingDetectorProxy(String encodingDetectorClassName, ClassLoader loader) 
-    {
-        this(encodingDetectorClassName, loader, Boolean.getBoolean("org.apache.tika.service.proxy.error.warn") 
-                ? LoadErrorHandler.WARN:LoadErrorHandler.IGNORE);
-    }
-    
-    public EncodingDetectorProxy(String encodingDetectorClassName, ClassLoader loader, LoadErrorHandler handler) 
-    {
-        try 
-        {
-            this.detector = (EncodingDetector)Class.forName(encodingDetectorClassName, true, loader).newInstance();
-        } 
-        catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) 
-        {
-            handler.handleLoadError(encodingDetectorClassName, e);
-        }
-    }
-    
-    @Override
-    public Charset detect(InputStream input, Metadata metadata) throws IOException {
-        if(detector != null)
-        {
-            return detector.detect(input, metadata);
-        }
-        return null;
-    }
-
-}
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+
+import org.apache.tika.config.LoadErrorHandler;
+import org.apache.tika.metadata.Metadata;
+
+public class EncodingDetectorProxy implements EncodingDetector {
+
+private EncodingDetector detector;
+    
+    public EncodingDetectorProxy(String encodingDetectorClassName, ClassLoader loader) 
+    {
+        this(encodingDetectorClassName, loader, Boolean.getBoolean("org.apache.tika.service.proxy.error.warn") 
+                ? LoadErrorHandler.WARN:LoadErrorHandler.IGNORE);
+    }
+    
+    public EncodingDetectorProxy(String encodingDetectorClassName, ClassLoader loader, LoadErrorHandler handler) 
+    {
+        try 
+        {
+            this.detector = (EncodingDetector)Class.forName(encodingDetectorClassName, true, loader).newInstance();
+        } 
+        catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) 
+        {
+            handler.handleLoadError(encodingDetectorClassName, e);
+        }
+    }
+    
+    @Override
+    public Charset detect(InputStream input, Metadata metadata) throws IOException {
+        if(detector != null)
+        {
+            return detector.detect(input, metadata);
+        }
+        return null;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/main/java/org/apache/tika/io/StringUtil.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/io/StringUtil.java b/tika-core/src/main/java/org/apache/tika/io/StringUtil.java
index 164765a..8876a0d 100644
--- a/tika-core/src/main/java/org/apache/tika/io/StringUtil.java
+++ b/tika-core/src/main/java/org/apache/tika/io/StringUtil.java
@@ -1,121 +1,121 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.io;
-
-import java.nio.charset.Charset;
-
-/**
- * General String Related Utilities.
- * <p>
- * This class provides static utility methods for string operations
- * <p>
- * Origin of code: Based on the version in POI
- */
-public class StringUtil {
-    
-    protected static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
-    protected static final Charset UTF16LE = Charset.forName("UTF-16LE");
-    public static final Charset UTF8 = Charset.forName("UTF-8");
-    
-    private StringUtil() {
-        // no instances of this class
-    }
-
-    /**
-     *  Given a byte array of 16-bit unicode characters in Little Endian
-     *  format (most important byte last), return a Java String representation
-     *  of it.
-     *
-     * { 0x16, 0x00 } -0x16
-     *
-     * @param  string  the byte array to be converted
-     * @param  offset  the initial offset into the
-     *                 byte array. it is assumed that string[ offset ] and string[ offset +
-     *                 1 ] contain the first 16-bit unicode character
-     * @param len the length of the final string
-     * @return the converted string, never <code>null</code>.
-     * @exception  ArrayIndexOutOfBoundsException  if offset is out of bounds for
-     *      the byte array (i.e., is negative or is greater than or equal to
-     *      string.length)
-     * @exception  IllegalArgumentException        if len is too large (i.e.,
-     *      there is not enough data in string to create a String of that
-     *      length)
-     */
-    public static String getFromUnicodeLE(
-            final byte[] string,
-            final int offset,
-            final int len)
-            throws ArrayIndexOutOfBoundsException, IllegalArgumentException {
-        if ((offset < 0) || (offset >= string.length)) {
-            throw new ArrayIndexOutOfBoundsException("Illegal offset " + offset + " (String data is of length " + string.length + ")");
-        }
-        if ((len < 0) || (((string.length - offset) / 2) < len)) {
-            throw new IllegalArgumentException("Illegal length " + len);
-        }
-
-        return new String(string, offset, len * 2, UTF16LE);
-    }
-    
-    /**
-     *  Given a byte array of 16-bit unicode characters in little endian
-     *  format (most important byte last), return a Java String representation
-     *  of it.
-     *
-     * { 0x16, 0x00 } -0x16
-     *
-     * @param  string  the byte array to be converted
-     * @return the converted string, never <code>null</code>
-     */
-    public static String getFromUnicodeLE(byte[] string) {
-        if(string.length == 0) { return ""; }
-        return getFromUnicodeLE(string, 0, string.length / 2);
-    }
-    
-    /**
-     * Read 8 bit data (in ISO-8859-1 codepage) into a (unicode) Java
-     * String and return.
-     * (In Excel terms, read compressed 8 bit unicode as a string)
-     *
-     * @param string byte array to read
-     * @param offset offset to read byte array
-     * @param len    length to read byte array
-     * @return String generated String instance by reading byte array
-     */
-    public static String getFromCompressedUnicode(
-            final byte[] string,
-            final int offset,
-            final int len) {
-        int len_to_use = Math.min(len, string.length - offset);
-        return new String(string, offset, len_to_use, ISO_8859_1);
-    }
-    
-    /**
-     * Takes a unicode (java) string, and returns it as 8 bit data (in ISO-8859-1
-     * codepage).
-     * (In Excel terms, write compressed 8 bit unicode)
-     *
-     * @param  input   the String containing the data to be written
-     * @param  output  the byte array to which the data is to be written
-     * @param  offset  an offset into the byte arrat at which the data is start
-     *      when written
-     */
-    public static void putCompressedUnicode(String input, byte[] output, int offset) {
-        byte[] bytes = input.getBytes(ISO_8859_1);
-        System.arraycopy(bytes, 0, output, offset, bytes.length);
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.io;
+
+import java.nio.charset.Charset;
+
+/**
+ * General String Related Utilities.
+ * <p>
+ * This class provides static utility methods for string operations
+ * <p>
+ * Origin of code: Based on the version in POI
+ */
+public class StringUtil {
+    
+    protected static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
+    protected static final Charset UTF16LE = Charset.forName("UTF-16LE");
+    public static final Charset UTF8 = Charset.forName("UTF-8");
+    
+    private StringUtil() {
+        // no instances of this class
+    }
+
+    /**
+     *  Given a byte array of 16-bit unicode characters in Little Endian
+     *  format (most important byte last), return a Java String representation
+     *  of it.
+     *
+     * { 0x16, 0x00 } -0x16
+     *
+     * @param  string  the byte array to be converted
+     * @param  offset  the initial offset into the
+     *                 byte array. it is assumed that string[ offset ] and string[ offset +
+     *                 1 ] contain the first 16-bit unicode character
+     * @param len the length of the final string
+     * @return the converted string, never <code>null</code>.
+     * @exception  ArrayIndexOutOfBoundsException  if offset is out of bounds for
+     *      the byte array (i.e., is negative or is greater than or equal to
+     *      string.length)
+     * @exception  IllegalArgumentException        if len is too large (i.e.,
+     *      there is not enough data in string to create a String of that
+     *      length)
+     */
+    public static String getFromUnicodeLE(
+            final byte[] string,
+            final int offset,
+            final int len)
+            throws ArrayIndexOutOfBoundsException, IllegalArgumentException {
+        if ((offset < 0) || (offset >= string.length)) {
+            throw new ArrayIndexOutOfBoundsException("Illegal offset " + offset + " (String data is of length " + string.length + ")");
+        }
+        if ((len < 0) || (((string.length - offset) / 2) < len)) {
+            throw new IllegalArgumentException("Illegal length " + len);
+        }
+
+        return new String(string, offset, len * 2, UTF16LE);
+    }
+    
+    /**
+     *  Given a byte array of 16-bit unicode characters in little endian
+     *  format (most important byte last), return a Java String representation
+     *  of it.
+     *
+     * { 0x16, 0x00 } -0x16
+     *
+     * @param  string  the byte array to be converted
+     * @return the converted string, never <code>null</code>
+     */
+    public static String getFromUnicodeLE(byte[] string) {
+        if(string.length == 0) { return ""; }
+        return getFromUnicodeLE(string, 0, string.length / 2);
+    }
+    
+    /**
+     * Read 8 bit data (in ISO-8859-1 codepage) into a (unicode) Java
+     * String and return.
+     * (In Excel terms, read compressed 8 bit unicode as a string)
+     *
+     * @param string byte array to read
+     * @param offset offset to read byte array
+     * @param len    length to read byte array
+     * @return String generated String instance by reading byte array
+     */
+    public static String getFromCompressedUnicode(
+            final byte[] string,
+            final int offset,
+            final int len) {
+        int len_to_use = Math.min(len, string.length - offset);
+        return new String(string, offset, len_to_use, ISO_8859_1);
+    }
+    
+    /**
+     * Takes a unicode (java) string, and returns it as 8 bit data (in ISO-8859-1
+     * codepage).
+     * (In Excel terms, write compressed 8 bit unicode)
+     *
+     * @param  input   the String containing the data to be written
+     * @param  output  the byte array to which the data is to be written
+     * @param  offset  an offset into the byte arrat at which the data is start
+     *      when written
+     */
+    public static void putCompressedUnicode(String input, byte[] output, int offset) {
+        byte[] bytes = input.getBytes(ISO_8859_1);
+        System.arraycopy(bytes, 0, output, offset, bytes.length);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/main/java/org/apache/tika/osgi/TikaAbstractBundleActivator.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/osgi/TikaAbstractBundleActivator.java b/tika-core/src/main/java/org/apache/tika/osgi/TikaAbstractBundleActivator.java
index b959147..52a43dc 100644
--- a/tika-core/src/main/java/org/apache/tika/osgi/TikaAbstractBundleActivator.java
+++ b/tika-core/src/main/java/org/apache/tika/osgi/TikaAbstractBundleActivator.java
@@ -1,71 +1,71 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.osgi;
-
-import java.util.Dictionary;
-import java.util.Enumeration;
-import java.util.Locale;
-import java.util.Properties;
-import java.util.ServiceLoader;
-
-import org.apache.tika.parser.Parser;
-import org.osgi.framework.BundleActivator;
-import org.osgi.framework.BundleContext;
-import org.osgi.framework.Constants;
-
-public abstract class TikaAbstractBundleActivator implements BundleActivator {
-
-    Dictionary createServiceRankProperties(String configName, BundleContext context) {
-        Dictionary serviceProps = new Properties();
-        String serviceRank = context.getProperty(configName);
-        if (serviceRank != null) {
-            serviceProps.put(Constants.SERVICE_RANKING, Integer.parseInt(serviceRank));
-        }
-        return serviceProps;
-
-    }
-    
-    public void registerTikaParserServiceLoader(BundleContext context, ClassLoader loader)
-    {
-        ServiceLoader<Parser> serviceLoader = ServiceLoader.load(Parser.class, loader);
-        for(Parser currentParser: serviceLoader)
-        {
-            registerTikaService(context, currentParser, null);
-        }
-    }
-
-    void registerTikaService(BundleContext context, Parser parserService,
-            Dictionary additionalServiceProperties) {
-        String parserFullyClassifiedName = parserService.getClass().getCanonicalName().toLowerCase(Locale.US);
-
-        String serviceRankingPropName = parserFullyClassifiedName + ".serviceRanking";
-
-        Dictionary serviceProperties = createServiceRankProperties(serviceRankingPropName, context);
-
-        if (additionalServiceProperties != null) {
-            Enumeration keys = additionalServiceProperties.keys();
-            while (keys.hasMoreElements()) {
-                String currentKey = (String) keys.nextElement();
-                serviceProperties.put(currentKey, additionalServiceProperties.get(currentKey));
-            }
-
-        }
-
-        context.registerService(Parser.class, parserService, serviceProperties);
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.osgi;
+
+import java.util.Dictionary;
+import java.util.Enumeration;
+import java.util.Locale;
+import java.util.Properties;
+import java.util.ServiceLoader;
+
+import org.apache.tika.parser.Parser;
+import org.osgi.framework.BundleActivator;
+import org.osgi.framework.BundleContext;
+import org.osgi.framework.Constants;
+
+public abstract class TikaAbstractBundleActivator implements BundleActivator {
+
+    Dictionary createServiceRankProperties(String configName, BundleContext context) {
+        Dictionary serviceProps = new Properties();
+        String serviceRank = context.getProperty(configName);
+        if (serviceRank != null) {
+            serviceProps.put(Constants.SERVICE_RANKING, Integer.parseInt(serviceRank));
+        }
+        return serviceProps;
+
+    }
+    
+    public void registerTikaParserServiceLoader(BundleContext context, ClassLoader loader)
+    {
+        ServiceLoader<Parser> serviceLoader = ServiceLoader.load(Parser.class, loader);
+        for(Parser currentParser: serviceLoader)
+        {
+            registerTikaService(context, currentParser, null);
+        }
+    }
+
+    void registerTikaService(BundleContext context, Parser parserService,
+            Dictionary additionalServiceProperties) {
+        String parserFullyClassifiedName = parserService.getClass().getCanonicalName().toLowerCase(Locale.US);
+
+        String serviceRankingPropName = parserFullyClassifiedName + ".serviceRanking";
+
+        Dictionary serviceProperties = createServiceRankProperties(serviceRankingPropName, context);
+
+        if (additionalServiceProperties != null) {
+            Enumeration keys = additionalServiceProperties.keys();
+            while (keys.hasMoreElements()) {
+                String currentKey = (String) keys.nextElement();
+                serviceProperties.put(currentKey, additionalServiceProperties.get(currentKey));
+            }
+
+        }
+
+        context.registerService(Parser.class, parserService, serviceProperties);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/main/java/org/apache/tika/osgi/TikaService.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/osgi/TikaService.java b/tika-core/src/main/java/org/apache/tika/osgi/TikaService.java
index 4ada094..283ae1c 100644
--- a/tika-core/src/main/java/org/apache/tika/osgi/TikaService.java
+++ b/tika-core/src/main/java/org/apache/tika/osgi/TikaService.java
@@ -1,25 +1,25 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.osgi;
-
-import org.apache.tika.detect.Detector;
-import org.apache.tika.language.translate.Translator;
-import org.apache.tika.parser.Parser;
-
-public interface TikaService extends Parser, Detector, Translator {
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.osgi;
+
+import org.apache.tika.detect.Detector;
+import org.apache.tika.language.translate.Translator;
+import org.apache.tika.parser.Parser;
+
+public interface TikaService extends Parser, Detector, Translator {
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/main/java/org/apache/tika/osgi/internal/TikaServiceImpl.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/osgi/internal/TikaServiceImpl.java b/tika-core/src/main/java/org/apache/tika/osgi/internal/TikaServiceImpl.java
index f3b6171..fefa1af 100644
--- a/tika-core/src/main/java/org/apache/tika/osgi/internal/TikaServiceImpl.java
+++ b/tika-core/src/main/java/org/apache/tika/osgi/internal/TikaServiceImpl.java
@@ -1,81 +1,81 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.osgi.internal;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Set;
-
-import org.apache.tika.Tika;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.osgi.TikaService;
-import org.apache.tika.parser.ParseContext;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-public class TikaServiceImpl implements TikaService {
-
-    private static final long serialVersionUID = 1L;
-    
-    private final Tika tika;
-
-    public TikaServiceImpl() {
-        this.tika = new Tika();
-    }
-    
-    public TikaServiceImpl(TikaConfig config)
-    {
-        this.tika = new Tika(config);
-    }
-
-    @Override
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return this.tika.getParser().getSupportedTypes(context);
-    }
-
-    @Override
-    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-        tika.getParser().parse(stream, handler, metadata, context);
-
-    }
-
-    @Override
-    public MediaType detect(InputStream input, Metadata metadata) throws IOException {
-        return tika.getDetector().detect(input, metadata);
-    }
-    
-    @Override
-    public String translate(String text, String sourceLanguage, String targetLanguage)
-            throws TikaException, IOException {
-        return tika.getTranslator().translate(text, sourceLanguage, targetLanguage);
-    }
-    
-    @Override
-    public String translate(String text, String targetLanguage) throws TikaException, IOException {
-        return tika.getTranslator().translate(text, targetLanguage);
-    }
-    
-    @Override
-    public boolean isAvailable() {
-        return tika.getTranslator().isAvailable();
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.osgi.internal;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Set;
+
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.osgi.TikaService;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class TikaServiceImpl implements TikaService {
+
+    private static final long serialVersionUID = 1L;
+    
+    private final Tika tika;
+
+    public TikaServiceImpl() {
+        this.tika = new Tika();
+    }
+    
+    public TikaServiceImpl(TikaConfig config)
+    {
+        this.tika = new Tika(config);
+    }
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return this.tika.getParser().getSupportedTypes(context);
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        tika.getParser().parse(stream, handler, metadata, context);
+
+    }
+
+    @Override
+    public MediaType detect(InputStream input, Metadata metadata) throws IOException {
+        return tika.getDetector().detect(input, metadata);
+    }
+    
+    @Override
+    public String translate(String text, String sourceLanguage, String targetLanguage)
+            throws TikaException, IOException {
+        return tika.getTranslator().translate(text, sourceLanguage, targetLanguage);
+    }
+    
+    @Override
+    public String translate(String text, String targetLanguage) throws TikaException, IOException {
+        return tika.getTranslator().translate(text, targetLanguage);
+    }
+    
+    @Override
+    public boolean isAvailable() {
+        return tika.getTranslator().isAvailable();
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/main/java/org/apache/tika/parser/ParserProxy.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParserProxy.java b/tika-core/src/main/java/org/apache/tika/parser/ParserProxy.java
index 9f363f6..8c99d17 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/ParserProxy.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/ParserProxy.java
@@ -1,74 +1,74 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.tika.config.LoadErrorHandler;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * This parser is a proxy for another detector this allows modules to use
- * parsers from other modules as optional dependencies since not including the
- * classes simply does nothing rather than throwing a ClassNotFoundException.
- *
- * @since Apache Tika 2.0
- */
-public class ParserProxy extends AbstractParser {
-
-    private static final long serialVersionUID = -4838436708916910179L;
-    private Parser parser;
-
-    public ParserProxy(String parserClassName, ClassLoader loader) {
-        
-        this(parserClassName, loader, Boolean.getBoolean("org.apache.tika.service.proxy.error.warn") 
-                ? LoadErrorHandler.WARN:LoadErrorHandler.IGNORE);
-    }
-
-    public ParserProxy(String parserClassName, ClassLoader loader, LoadErrorHandler handler) {
-        try {
-            this.parser = (Parser) Class.forName(parserClassName, true, loader).newInstance();
-        } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
-            handler.handleLoadError(parserClassName, e);
-        }
-
-    }
-
-    @Override
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        if (parser == null) {
-            return Collections.emptySet();
-        }
-        return parser.getSupportedTypes(context);
-    }
-
-    @Override
-    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-        if (parser != null) {
-            parser.parse(stream, handler, metadata, context);
-        }
-        // Otherwise do nothing
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.config.LoadErrorHandler;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * This parser is a proxy for another detector this allows modules to use
+ * parsers from other modules as optional dependencies since not including the
+ * classes simply does nothing rather than throwing a ClassNotFoundException.
+ *
+ * @since Apache Tika 2.0
+ */
+public class ParserProxy extends AbstractParser {
+
+    private static final long serialVersionUID = -4838436708916910179L;
+    private Parser parser;
+
+    public ParserProxy(String parserClassName, ClassLoader loader) {
+        
+        this(parserClassName, loader, Boolean.getBoolean("org.apache.tika.service.proxy.error.warn") 
+                ? LoadErrorHandler.WARN:LoadErrorHandler.IGNORE);
+    }
+
+    public ParserProxy(String parserClassName, ClassLoader loader, LoadErrorHandler handler) {
+        try {
+            this.parser = (Parser) Class.forName(parserClassName, true, loader).newInstance();
+        } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
+            handler.handleLoadError(parserClassName, e);
+        }
+
+    }
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        if (parser == null) {
+            return Collections.emptySet();
+        }
+        return parser.getSupportedTypes(context);
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        if (parser != null) {
+            parser.parse(stream, handler, metadata, context);
+        }
+        // Otherwise do nothing
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/main/java/org/apache/tika/utils/ConcurrentUtils.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ConcurrentUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ConcurrentUtils.java
index 5f4cd13..a47f747 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ConcurrentUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ConcurrentUtils.java
@@ -1,57 +1,57 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.utils;
-
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Future;
-import java.util.concurrent.FutureTask;
-
-import org.apache.tika.parser.ParseContext;
-
-/**
- * Utility Class for Concurrency in Tika
- *
- * @since Apache Tika 1.11
- */
-public class ConcurrentUtils {
-    
-    /**
-     * 
-     * Execute a runnable using an ExecutorService from the ParseContext if possible.
-     * Otherwise fallback to individual threads.
-     * 
-     * @param context
-     * @param runnable
-     * @return
-     */
-    public static Future execute(ParseContext context, Runnable runnable) {
-        
-        Future future = null;
-        ExecutorService executorService = context.get(ExecutorService.class);
-        if(executorService == null) {
-            FutureTask task = new FutureTask<>(runnable, null);
-            Thread thread = new Thread(task, "Tika Thread");
-            thread.start();
-            future = task;
-        }
-        else {
-            future = executorService.submit(runnable);
-        }
-        
-        return future;
-    }
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.utils;
+
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+import java.util.concurrent.FutureTask;
+
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * Utility Class for Concurrency in Tika
+ *
+ * @since Apache Tika 1.11
+ */
+public class ConcurrentUtils {
+    
+    /**
+     * 
+     * Execute a runnable using an ExecutorService from the ParseContext if possible.
+     * Otherwise fallback to individual threads.
+     * 
+     * @param context
+     * @param runnable
+     * @return
+     */
+    public static Future execute(ParseContext context, Runnable runnable) {
+        
+        Future future = null;
+        ExecutorService executorService = context.get(ExecutorService.class);
+        if(executorService == null) {
+            FutureTask task = new FutureTask<>(runnable, null);
+            Thread thread = new Thread(task, "Tika Thread");
+            thread.start();
+            future = task;
+        }
+        else {
+            future = executorService.submit(runnable);
+        }
+        
+        return future;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/test/java/org/apache/tika/config/DummyExecutor.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/config/DummyExecutor.java b/tika-core/src/test/java/org/apache/tika/config/DummyExecutor.java
index 849eda3..5da9d0d 100644
--- a/tika-core/src/test/java/org/apache/tika/config/DummyExecutor.java
+++ b/tika-core/src/test/java/org/apache/tika/config/DummyExecutor.java
@@ -1,30 +1,30 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.config;
-
-import java.util.concurrent.LinkedBlockingQueue;
-import java.util.concurrent.ThreadPoolExecutor;
-import java.util.concurrent.TimeUnit;
-
-import org.apache.tika.concurrent.ConfigurableThreadPoolExecutor;
-
-class DummyExecutor extends ThreadPoolExecutor implements ConfigurableThreadPoolExecutor {
-    public DummyExecutor() 
-    {
-        super(1,1, 0L, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>());
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config;
+
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.ThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.tika.concurrent.ConfigurableThreadPoolExecutor;
+
+class DummyExecutor extends ThreadPoolExecutor implements ConfigurableThreadPoolExecutor {
+    public DummyExecutor() 
+    {
+        super(1,1, 0L, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>());
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/test/java/org/apache/tika/detect/DetectorProxyTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/detect/DetectorProxyTest.java b/tika-core/src/test/java/org/apache/tika/detect/DetectorProxyTest.java
index 060f3d9..33683a9 100644
--- a/tika-core/src/test/java/org/apache/tika/detect/DetectorProxyTest.java
+++ b/tika-core/src/test/java/org/apache/tika/detect/DetectorProxyTest.java
@@ -1,56 +1,56 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.detect;
-
-import static org.junit.Assert.*;
-
-import java.io.IOException;
-
-import org.apache.tika.config.LoadErrorHandler;
-import org.apache.tika.mime.MediaType;
-import org.junit.Test;
-
-public class DetectorProxyTest 
-{
-    @Test
-    public void testDetectorProxyExists() throws IOException 
-    {
-        Detector dummyDetector = new DetectorProxy("org.apache.tika.detect.DummyProxyDetector", 
-                getClass().getClassLoader(),
-                LoadErrorHandler.IGNORE);
-        
-        MediaType result = dummyDetector.detect(null, null);
-        
-        assertEquals("Detector being proxied exists so result should not be null", 
-                MediaType.TEXT_PLAIN, result );
-        
-    }
-    
-    @Test
-    public void testParserProxyNotExists() throws IOException 
-    {
-        Detector dummyDetector = new DetectorProxy("org.apache.tika.detect.DoesNotExist",
-                getClass().getClassLoader(),
-                LoadErrorHandler.IGNORE);
-        
-        MediaType result = dummyDetector.detect(null, null);
-        
-        assertNull("Detector being proxied does not exists so result should be null", result );
-        
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+
+import org.apache.tika.config.LoadErrorHandler;
+import org.apache.tika.mime.MediaType;
+import org.junit.Test;
+
+public class DetectorProxyTest 
+{
+    @Test
+    public void testDetectorProxyExists() throws IOException 
+    {
+        Detector dummyDetector = new DetectorProxy("org.apache.tika.detect.DummyProxyDetector", 
+                getClass().getClassLoader(),
+                LoadErrorHandler.IGNORE);
+        
+        MediaType result = dummyDetector.detect(null, null);
+        
+        assertEquals("Detector being proxied exists so result should not be null", 
+                MediaType.TEXT_PLAIN, result );
+        
+    }
+    
+    @Test
+    public void testParserProxyNotExists() throws IOException 
+    {
+        Detector dummyDetector = new DetectorProxy("org.apache.tika.detect.DoesNotExist",
+                getClass().getClassLoader(),
+                LoadErrorHandler.IGNORE);
+        
+        MediaType result = dummyDetector.detect(null, null);
+        
+        assertNull("Detector being proxied does not exists so result should be null", result );
+        
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/test/java/org/apache/tika/detect/DummyProxyDetector.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/detect/DummyProxyDetector.java b/tika-core/src/test/java/org/apache/tika/detect/DummyProxyDetector.java
index a11b584..ce1207a 100644
--- a/tika-core/src/test/java/org/apache/tika/detect/DummyProxyDetector.java
+++ b/tika-core/src/test/java/org/apache/tika/detect/DummyProxyDetector.java
@@ -1,31 +1,31 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.detect;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-
-public class DummyProxyDetector implements Detector
-{
-    @Override
-    public MediaType detect(InputStream input, Metadata metadata) throws IOException {
-        return MediaType.TEXT_PLAIN;
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+public class DummyProxyDetector implements Detector
+{
+    @Override
+    public MediaType detect(InputStream input, Metadata metadata) throws IOException {
+        return MediaType.TEXT_PLAIN;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/test/java/org/apache/tika/parser/DummyProxyParser.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/parser/DummyProxyParser.java b/tika-core/src/test/java/org/apache/tika/parser/DummyProxyParser.java
index ca766c9..4ae7898 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/DummyProxyParser.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/DummyProxyParser.java
@@ -1,44 +1,44 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Set;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-public class DummyProxyParser extends AbstractParser 
-{
-    @Override
-    public Set<MediaType> getSupportedTypes(ParseContext context) 
-    {
-        return null;
-    }
-    
-    @Override
-    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException 
-    {
-        metadata.add("Test", "value");
-        
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class DummyProxyParser extends AbstractParser 
+{
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) 
+    {
+        return null;
+    }
+    
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException 
+    {
+        metadata.add("Test", "value");
+        
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/test/java/org/apache/tika/parser/ParserProxyTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/parser/ParserProxyTest.java b/tika-core/src/test/java/org/apache/tika/parser/ParserProxyTest.java
index 20c6247..9f57965 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/ParserProxyTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/ParserProxyTest.java
@@ -1,65 +1,65 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser;
-
-import static org.junit.Assert.*;
-
-import java.io.IOException;
-
-import org.apache.tika.config.LoadErrorHandler;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.junit.Test;
-import org.xml.sax.SAXException;
-
-public class ParserProxyTest 
-{
-
-    @Test
-    public void testParserProxyExists() throws IOException, SAXException, TikaException 
-    {
-        Parser dummyParser = new ParserProxy("org.apache.tika.parser.DummyProxyParser",
-                getClass().getClassLoader(),
-                LoadErrorHandler.IGNORE);
-        
-        Metadata metadata = new Metadata();
-        
-        dummyParser.parse(null, null, metadata, null);
-        
-        assertEquals("Parser being proxied exists so metadata should be added", 
-                1, metadata.size());
-        
-    }
-    
-    @Test
-    public void testParserProxyNotExists() throws IOException, SAXException, TikaException 
-    {
-        Parser dummyParser = new ParserProxy("org.apache.tika.parser.NotExists",
-                getClass().getClassLoader(),
-                LoadErrorHandler.IGNORE);
-        
-        Metadata metadata = new Metadata();
-        
-        dummyParser.parse(null, null, metadata, null);
-        
-        assertEquals("Parser being proxied doesn't exist so metadata not change", 
-                0, metadata.size());
-        
-    }
-    
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+
+import org.apache.tika.config.LoadErrorHandler;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Test;
+import org.xml.sax.SAXException;
+
+public class ParserProxyTest 
+{
+    // Exercises ParserProxy.parse() with a class name that resolves and with one that does not.
+    @Test
+    public void testParserProxyExists() throws IOException, SAXException, TikaException 
+    {
+        Parser dummyParser = new ParserProxy("org.apache.tika.parser.DummyProxyParser",
+                getClass().getClassLoader(),
+                LoadErrorHandler.IGNORE);
+        
+        Metadata metadata = new Metadata();
+        // Only the metadata argument is real; stream, handler and context are passed as null.
+        dummyParser.parse(null, null, metadata, null);
+        // DummyProxyParser adds exactly one entry ("Test"), hence the expected size of 1.
+        assertEquals("Parser being proxied exists so metadata should be added", 
+                1, metadata.size());
+        
+    }
+    
+    @Test
+    public void testParserProxyNotExists() throws IOException, SAXException, TikaException 
+    {
+        Parser dummyParser = new ParserProxy("org.apache.tika.parser.NotExists",
+                getClass().getClassLoader(),
+                LoadErrorHandler.IGNORE);
+        
+        Metadata metadata = new Metadata();
+        // The test expects this call to return normally even though the named class does not resolve.
+        dummyParser.parse(null, null, metadata, null);
+        // ...and expects the metadata to stay empty.
+        assertEquals("Parser being proxied doesn't exist so metadata not change", 
+                0, metadata.size());
+        
+    }
+    
+
+}

[08/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
index 869facf..be6455f 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
@@ -1,139 +1,139 @@
-/*
- *******************************************************************************
- * Copyright (C) 1996-2007, International Business Machines Corporation and    *
- * others. All Rights Reserved.                                                *
- *******************************************************************************
- *
- */
-package org.apache.tika.parser.txt;
-
-/**
- * This class matches UTF-16 and UTF-32, both big- and little-endian. The
- * BOM will be used if it is present.
- *
- * @internal
- */
-abstract class CharsetRecog_Unicode extends CharsetRecognizer {
-
-    /* (non-Javadoc)
-     * @see com.ibm.icu.text.CharsetRecognizer#getName()
-     */
-    abstract String getName();
-
-    /* (non-Javadoc)
-     * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
-     */
-    abstract int match(CharsetDetector det);
-
-    static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode {
-        String getName() {
-            return "UTF-16BE";
-        }
-
-        int match(CharsetDetector det) {
-            byte[] input = det.fRawInput;
-
-            if (input.length >= 2 && ((input[0] & 0xFF) == 0xFE && (input[1] & 0xFF) == 0xFF)) {
-                return 100;
-            }
-
-            // TODO: Do some statistics to check for unsigned UTF-16BE
-            return 0;
-        }
-    }
-
-    static class CharsetRecog_UTF_16_LE extends CharsetRecog_Unicode {
-        String getName() {
-            return "UTF-16LE";
-        }
-
-        int match(CharsetDetector det) {
-            byte[] input = det.fRawInput;
-
-            if (input.length >= 2 && ((input[0] & 0xFF) == 0xFF && (input[1] & 0xFF) == 0xFE)) {
-                // An LE BOM is present.
-                if (input.length >= 4 && input[2] == 0x00 && input[3] == 0x00) {
-                    // It is probably UTF-32 LE, not UTF-16
-                    return 0;
-                }
-                return 100;
-            }
-
-            // TODO: Do some statistics to check for unsigned UTF-16LE
-            return 0;
-        }
-    }
-
-    static abstract class CharsetRecog_UTF_32 extends CharsetRecog_Unicode {
-        abstract int getChar(byte[] input, int index);
-
-        abstract String getName();
-
-        int match(CharsetDetector det) {
-            byte[] input = det.fRawInput;
-            int limit = (det.fRawLength / 4) * 4;
-            int numValid = 0;
-            int numInvalid = 0;
-            boolean hasBOM = false;
-            int confidence = 0;
-
-            if (limit == 0) {
-                return 0;
-            }
-            if (getChar(input, 0) == 0x0000FEFF) {
-                hasBOM = true;
-            }
-
-            for (int i = 0; i < limit; i += 4) {
-                int ch = getChar(input, i);
-
-                if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) {
-                    numInvalid += 1;
-                } else {
-                    numValid += 1;
-                }
-            }
-
-
-            // Cook up some sort of confidence score, based on presence of a BOM
-            //    and the existence of valid and/or invalid multi-byte sequences.
-            if (hasBOM && numInvalid == 0) {
-                confidence = 100;
-            } else if (hasBOM && numValid > numInvalid * 10) {
-                confidence = 80;
-            } else if (numValid > 3 && numInvalid == 0) {
-                confidence = 100;
-            } else if (numValid > 0 && numInvalid == 0) {
-                confidence = 80;
-            } else if (numValid > numInvalid * 10) {
-                // Probably corrupt UTF-32BE data.  Valid sequences aren't likely by chance.
-                confidence = 25;
-            }
-
-            return confidence;
-        }
-    }
-
-    static class CharsetRecog_UTF_32_BE extends CharsetRecog_UTF_32 {
-        int getChar(byte[] input, int index) {
-            return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 |
-                    (input[index + 2] & 0xFF) << 8 | (input[index + 3] & 0xFF);
-        }
-
-        String getName() {
-            return "UTF-32BE";
-        }
-    }
-
-
-    static class CharsetRecog_UTF_32_LE extends CharsetRecog_UTF_32 {
-        int getChar(byte[] input, int index) {
-            return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 |
-                    (input[index + 1] & 0xFF) << 8 | (input[index + 0] & 0xFF);
-        }
-
-        String getName() {
-            return "UTF-32LE";
-        }
-    }
-}
+/*
+ *******************************************************************************
+ * Copyright (C) 1996-2007, International Business Machines Corporation and    *
+ * others. All Rights Reserved.                                                *
+ *******************************************************************************
+ *
+ */
+package org.apache.tika.parser.txt;
+
+/**
+ * Matches UTF-16 and UTF-32, both big- and little-endian. UTF-16 is reported
+ * only when its BOM is present; UTF-32 input is additionally scored by counting
+ * valid and invalid 4-byte units, so it can match without a BOM.
+ * @internal
+ */
+abstract class CharsetRecog_Unicode extends CharsetRecognizer {
+
+    /* (non-Javadoc)
+     * @see com.ibm.icu.text.CharsetRecognizer#getName()
+     */
+    abstract String getName(); // the subclasses below return "UTF-16BE", "UTF-16LE", "UTF-32BE" or "UTF-32LE"
+
+    /* (non-Javadoc)
+     * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
+     */
+    abstract int match(CharsetDetector det); // confidence score; the implementations below return 0, 25, 80 or 100
+
+    static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode {
+        String getName() {
+            return "UTF-16BE";
+        }
+
+        int match(CharsetDetector det) {
+            byte[] input = det.fRawInput;
+            // NOTE(review): bounds come from input.length here, while the UTF-32 matcher uses det.fRawLength - confirm the two always agree.
+            if (input.length >= 2 && ((input[0] & 0xFF) == 0xFE && (input[1] & 0xFF) == 0xFF)) {
+                return 100; // leading bytes FE FF: big-endian BOM
+            }
+
+            // TODO: Do some statistics to check for unsigned (presumably: BOM-less) UTF-16BE
+            return 0;
+        }
+    }
+
+    static class CharsetRecog_UTF_16_LE extends CharsetRecog_Unicode {
+        String getName() {
+            return "UTF-16LE";
+        }
+
+        int match(CharsetDetector det) {
+            byte[] input = det.fRawInput;
+            // NOTE(review): same input.length vs det.fRawLength question as in the UTF-16BE matcher - confirm.
+            if (input.length >= 2 && ((input[0] & 0xFF) == 0xFF && (input[1] & 0xFF) == 0xFE)) {
+                // An LE BOM is present.
+                if (input.length >= 4 && input[2] == 0x00 && input[3] == 0x00) {
+                    // It is probably UTF-32 LE, not UTF-16: FF FE 00 00 is what the UTF-32 LE getChar() reads as 0x0000FEFF
+                    return 0;
+                }
+                return 100;
+            }
+
+            // TODO: Do some statistics to check for unsigned (presumably: BOM-less) UTF-16LE
+            return 0;
+        }
+    }
+
+    static abstract class CharsetRecog_UTF_32 extends CharsetRecog_Unicode {
+        abstract int getChar(byte[] input, int index); // combines the 4 bytes starting at index into one int
+
+        abstract String getName();
+
+        int match(CharsetDetector det) {
+            byte[] input = det.fRawInput;
+            int limit = (det.fRawLength / 4) * 4; // round down to a whole number of 4-byte units
+            int numValid = 0;
+            int numInvalid = 0;
+            boolean hasBOM = false;
+            int confidence = 0;
+
+            if (limit == 0) {
+                return 0; // det.fRawLength is below 4: not even one unit to decode
+            }
+            if (getChar(input, 0) == 0x0000FEFF) {
+                hasBOM = true;
+            }
+            // Classify every 4-byte unit; a BOM at index 0 is included and counts as valid.
+            for (int i = 0; i < limit; i += 4) {
+                int ch = getChar(input, i);
+                // Invalid: negative, in the surrogate range, or >= 0x10FFFF. NOTE(review): '>=' also rejects U+10FFFF itself - confirm intended.
+                if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) {
+                    numInvalid += 1;
+                } else {
+                    numValid += 1;
+                }
+            }
+
+
+            // Cook up some sort of confidence score, based on presence of a BOM
+            //    and the existence of valid and/or invalid multi-byte sequences.
+            if (hasBOM && numInvalid == 0) {
+                confidence = 100;
+            } else if (hasBOM && numValid > numInvalid * 10) {
+                confidence = 80;
+            } else if (numValid > 3 && numInvalid == 0) {
+                confidence = 100;
+            } else if (numValid > 0 && numInvalid == 0) {
+                confidence = 80;
+            } else if (numValid > numInvalid * 10) {
+                // Probably corrupt UTF-32 data (this code serves both the BE and LE subclasses).  Valid sequences aren't likely by chance.
+                confidence = 25;
+            }
+
+            return confidence;
+        }
+    }
+
+    static class CharsetRecog_UTF_32_BE extends CharsetRecog_UTF_32 {
+        int getChar(byte[] input, int index) { // the byte at index becomes the most significant byte
+            return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 |
+                    (input[index + 2] & 0xFF) << 8 | (input[index + 3] & 0xFF);
+        }
+
+        String getName() {
+            return "UTF-32BE";
+        }
+    }
+
+
+    static class CharsetRecog_UTF_32_LE extends CharsetRecog_UTF_32 {
+        int getChar(byte[] input, int index) { // the byte at index becomes the least significant byte
+            return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 |
+                    (input[index + 1] & 0xFF) << 8 | (input[index + 0] & 0xFF);
+        }
+
+        String getName() {
+            return "UTF-32LE";
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
index 1c63f9e..35d2b4f 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
@@ -1,532 +1,532 @@
-/*
- ****************************************************************************
- * Copyright (C) 2005-2008, International Business Machines Corporation and *
- * others. All Rights Reserved.                                             *
- ****************************************************************************
- *
- */
-package org.apache.tika.parser.txt;
-
-import java.util.Arrays;
-
-/**
- * CharsetRecognizer implemenation for Asian  - double or multi-byte - charsets.
- * Match is determined mostly by the input data adhering to the
- * encoding scheme for the charset, and, optionally,
- * frequency-of-occurence of characters.
- * <p/>
- * Instances of this class are singletons, one per encoding
- * being recognized.  They are created in the main
- * CharsetDetector class and kept in the global list of available
- * encodings to be checked.  The specific encoding being recognized
- * is determined by subclass.
- *
- * @internal
- */
-abstract class CharsetRecog_mbcs extends CharsetRecognizer {
-
-    /**
-     * Get the IANA name of this charset.
-     *
-     * @return the charset name.
-     */
-    abstract String getName();
-
-
-    /**
-     * Test the match of this charset with the input text data
-     * which is obtained via the CharsetDetector object.
-     *
-     * @param det The CharsetDetector, which contains the input text
-     *            to be checked for being in this charset.
-     * @return Two values packed into one int  (Damn java, anyhow)
-     * <br/>
-     * bits 0-7:  the match confidence, ranging from 0-100
-     * <br/>
-     * bits 8-15: The match reason, an enum-like value.
-     */
-    int match(CharsetDetector det, int[] commonChars) {
-        int singleByteCharCount = 0;
-        int doubleByteCharCount = 0;
-        int commonCharCount = 0;
-        int badCharCount = 0;
-        int totalCharCount = 0;
-        int confidence = 0;
-        iteratedChar iter = new iteratedChar();
-
-        detectBlock:
-        {
-            for (iter.reset(); nextChar(iter, det); ) {
-                totalCharCount++;
-                if (iter.error) {
-                    badCharCount++;
-                } else {
-                    long cv = iter.charValue & 0xFFFFFFFFL;
-
-                    if (cv <= 0xff) {
-                        singleByteCharCount++;
-                    } else {
-                        doubleByteCharCount++;
-                        if (commonChars != null) {
-                            // NOTE: This assumes that there are no 4-byte common chars.
-                            if (Arrays.binarySearch(commonChars, (int) cv) >= 0) {
-                                commonCharCount++;
-                            }
-                        }
-                    }
-                }
-                if (badCharCount >= 2 && badCharCount * 5 >= doubleByteCharCount) {
-                    // Bail out early if the byte data is not matching the encoding scheme.
-                    break detectBlock;
-                }
-            }
-
-            if (doubleByteCharCount <= 10 && badCharCount == 0) {
-                // Not many multi-byte chars.
-                if (doubleByteCharCount == 0 && totalCharCount < 10) {
-                    // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
-                    // We don't have enough data to have any confidence.
-                    // Statistical analysis of single byte non-ASCII charcters would probably help here.
-                    confidence = 0;
-                } else {
-                    //   ASCII or ISO file?  It's probably not our encoding,
-                    //   but is not incompatible with our encoding, so don't give it a zero.
-                    confidence = 10;
-                }
-
-                break detectBlock;
-            }
-
-            //
-            //  No match if there are too many characters that don't fit the encoding scheme.
-            //    (should we have zero tolerance for these?)
-            //
-            if (doubleByteCharCount < 20 * badCharCount) {
-                confidence = 0;
-                break detectBlock;
-            }
-
-            if (commonChars == null) {
-                // We have no statistics on frequently occuring characters.
-                //  Assess confidence purely on having a reasonable number of
-                //  multi-byte characters (the more the better
-                confidence = 30 + doubleByteCharCount - 20 * badCharCount;
-                if (confidence > 100) {
-                    confidence = 100;
-                }
-            } else {
-                //
-                // Frequency of occurence statistics exist.
-                //
-                double maxVal = Math.log((float) doubleByteCharCount / 4);
-                double scaleFactor = 90.0 / maxVal;
-                confidence = (int) (Math.log(commonCharCount + 1) * scaleFactor + 10);
-                confidence = Math.min(confidence, 100);
-            }
-        }   // end of detectBlock:
-
-        return confidence;
-    }
-
-    /**
-     * Get the next character (however many bytes it is) from the input data
-     * Subclasses for specific charset encodings must implement this function
-     * to get characters according to the rules of their encoding scheme.
-     * <p/>
-     * This function is not a method of class iteratedChar only because
-     * that would require a lot of extra derived classes, which is awkward.
-     *
-     * @param it  The iteratedChar "struct" into which the returned char is placed.
-     * @param det The charset detector, which is needed to get at the input byte data
-     *            being iterated over.
-     * @return True if a character was returned, false at end of input.
-     */
-    abstract boolean nextChar(iteratedChar it, CharsetDetector det);
-
-    // "Character"  iterated character class.
-    //    Recognizers for specific mbcs encodings make their "characters" available
-    //    by providing a nextChar() function that fills in an instance of iteratedChar
-    //    with the next char from the input.
-    //    The returned characters are not converted to Unicode, but remain as the raw
-    //    bytes (concatenated into an int) from the codepage data.
-    //
-    //  For Asian charsets, use the raw input rather than the input that has been
-    //   stripped of markup.  Detection only considers multi-byte chars, effectively
-    //   stripping markup anyway, and double byte chars do occur in markup too.
-    //
-    static class iteratedChar {
-        int charValue = 0;             // 1-4 bytes from the raw input data
-        int index = 0;
-        int nextIndex = 0;
-        boolean error = false;
-        boolean done = false;
-
-        void reset() {
-            charValue = 0;
-            index = -1;
-            nextIndex = 0;
-            error = false;
-            done = false;
-        }
-
-        int nextByte(CharsetDetector det) {
-            if (nextIndex >= det.fRawLength) {
-                done = true;
-                return -1;
-            }
-            int byteValue = (int) det.fRawInput[nextIndex++] & 0x00ff;
-            return byteValue;
-        }
-    }
-
-    /**
-     * Shift-JIS charset recognizer.
-     */
-    static class CharsetRecog_sjis extends CharsetRecog_mbcs {
-        static int[] commonChars =
-                // TODO:  This set of data comes from the character frequency-
-                //        of-occurence analysis tool.  The data needs to be moved
-                //        into a resource and loaded from there.
-                {0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
-                        0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
-                        0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
-                        0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
-                        0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
-                        0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
-
-        boolean nextChar(iteratedChar it, CharsetDetector det) {
-            it.index = it.nextIndex;
-            it.error = false;
-            int firstByte;
-            firstByte = it.charValue = it.nextByte(det);
-            if (firstByte < 0) {
-                return false;
-            }
-
-            if (firstByte <= 0x7f || (firstByte > 0xa0 && firstByte <= 0xdf)) {
-                return true;
-            }
-
-            int secondByte = it.nextByte(det);
-            if (secondByte < 0) {
-                return false;
-            }
-            it.charValue = (firstByte << 8) | secondByte;
-            if (!((secondByte >= 0x40 && secondByte <= 0x7f) || (secondByte >= 0x80 && secondByte <= 0xff))) {
-                // Illegal second byte value.
-                it.error = true;
-            }
-            return true;
-        }
-
-        int match(CharsetDetector det) {
-            return match(det, commonChars);
-        }
-
-        String getName() {
-            return "Shift_JIS";
-        }
-
-        public String getLanguage() {
-            return "ja";
-        }
-
-
-    }
-
-
-    /**
-     * Big5 charset recognizer.
-     */
-    static class CharsetRecog_big5 extends CharsetRecog_mbcs {
-        static int[] commonChars =
-                // TODO:  This set of data comes from the character frequency-
-                //        of-occurence analysis tool.  The data needs to be moved
-                //        into a resource and loaded from there.
-                {0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
-                        0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
-                        0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
-                        0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
-                        0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
-                        0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
-                        0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
-                        0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
-                        0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
-                        0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
-
-        boolean nextChar(iteratedChar it, CharsetDetector det) {
-            it.index = it.nextIndex;
-            it.error = false;
-            int firstByte;
-            firstByte = it.charValue = it.nextByte(det);
-            if (firstByte < 0) {
-                return false;
-            }
-
-            if (firstByte <= 0x7f || firstByte == 0xff) {
-                // single byte character.
-                return true;
-            }
-
-            int secondByte = it.nextByte(det);
-            if (secondByte < 0) {
-                return false;
-            }
-            it.charValue = (it.charValue << 8) | secondByte;
-
-            if (secondByte < 0x40 ||
-                    secondByte == 0x7f ||
-                    secondByte == 0xff) {
-                it.error = true;
-            }
-            return true;
-        }
-
-        int match(CharsetDetector det) {
-            return match(det, commonChars);
-        }
-
-        String getName() {
-            return "Big5";
-        }
-
-
-        public String getLanguage() {
-            return "zh";
-        }
-    }
-
-
-    /**
-     * EUC charset recognizers.  One abstract class that provides the common function
-     * for getting the next character according to the EUC encoding scheme,
-     * and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
-     */
-    abstract static class CharsetRecog_euc extends CharsetRecog_mbcs {
-
-        /*
-         *  (non-Javadoc)
-         *  Get the next character value for EUC based encodings.
-         *  Character "value" is simply the raw bytes that make up the character
-         *     packed into an int.
-         */
-        boolean nextChar(iteratedChar it, CharsetDetector det) {
-            it.index = it.nextIndex;
-            it.error = false;
-            int firstByte = 0;
-            int secondByte = 0;
-            int thirdByte = 0;
-            //int fourthByte = 0;
-
-            buildChar:
-            {
-                firstByte = it.charValue = it.nextByte(det);
-                if (firstByte < 0) {
-                    // Ran off the end of the input data
-                    it.done = true;
-                    break buildChar;
-                }
-                if (firstByte <= 0x8d) {
-                    // single byte char
-                    break buildChar;
-                }
-
-                secondByte = it.nextByte(det);
-                it.charValue = (it.charValue << 8) | secondByte;
-
-                if (firstByte >= 0xA1 && firstByte <= 0xfe) {
-                    // Two byte Char
-                    if (secondByte < 0xa1) {
-                        it.error = true;
-                    }
-                    break buildChar;
-                }
-                if (firstByte == 0x8e) {
-                    // Code Set 2.
-                    //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
-                    //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
-                    // We don't know which we've got.
-                    // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
-                    //   bytes will look like a well formed 2 byte char.
-                    if (secondByte < 0xa1) {
-                        it.error = true;
-                    }
-                    break buildChar;
-                }
-
-                if (firstByte == 0x8f) {
-                    // Code set 3.
-                    // Three byte total char size, two bytes of actual char value.
-                    thirdByte = it.nextByte(det);
-                    it.charValue = (it.charValue << 8) | thirdByte;
-                    if (thirdByte < 0xa1) {
-                        it.error = true;
-                    }
-                }
-            }
-
-            return (it.done == false);
-        }
-
-        /**
-         * The charset recognize for EUC-JP.  A singleton instance of this class
-         * is created and kept by the public CharsetDetector class
-         */
-        static class CharsetRecog_euc_jp extends CharsetRecog_euc {
-            static int[] commonChars =
-                    // TODO:  This set of data comes from the character frequency-
-                    //        of-occurence analysis tool.  The data needs to be moved
-                    //        into a resource and loaded from there.
-                    {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
-                            0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
-                            0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
-                            0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
-                            0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
-                            0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
-                            0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
-                            0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
-                            0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
-                            0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
-
-            String getName() {
-                return "EUC-JP";
-            }
-
-            int match(CharsetDetector det) {
-                return match(det, commonChars);
-            }
-
-            public String getLanguage() {
-                return "ja";
-            }
-        }
-
-        /**
-         * The charset recognize for EUC-KR.  A singleton instance of this class
-         * is created and kept by the public CharsetDetector class
-         */
-        static class CharsetRecog_euc_kr extends CharsetRecog_euc {
-            static int[] commonChars =
-                    // TODO:  This set of data comes from the character frequency-
-                    //        of-occurence analysis tool.  The data needs to be moved
-                    //        into a resource and loaded from there.
-                    {0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
-                            0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
-                            0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
-                            0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
-                            0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
-                            0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
-                            0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
-                            0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
-                            0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
-                            0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
-
-            String getName() {
-                return "EUC-KR";
-            }
-
-            int match(CharsetDetector det) {
-                return match(det, commonChars);
-            }
-
-            public String getLanguage() {
-                return "ko";
-            }
-        }
-    }
-
-    /**
-     * GB-18030 recognizer. Uses simplified Chinese statistics.
-     */
-    static class CharsetRecog_gb_18030 extends CharsetRecog_mbcs {
-
-        static int[] commonChars =
-                // TODO:  This set of data comes from the character frequency-
-                //        of-occurence analysis tool.  The data needs to be moved
-                //        into a resource and loaded from there.
-                {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
-                        0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
-                        0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
-                        0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
-                        0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
-                        0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
-                        0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
-                        0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
-                        0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
-                        0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
-
-        /*
-         *  (non-Javadoc)
-         *  Get the next character value for EUC based encodings.
-         *  Character "value" is simply the raw bytes that make up the character
-         *     packed into an int.
-         */
-        boolean nextChar(iteratedChar it, CharsetDetector det) {
-            it.index = it.nextIndex;
-            it.error = false;
-            int firstByte = 0;
-            int secondByte = 0;
-            int thirdByte = 0;
-            int fourthByte = 0;
-
-            buildChar:
-            {
-                firstByte = it.charValue = it.nextByte(det);
-
-                if (firstByte < 0) {
-                    // Ran off the end of the input data
-                    it.done = true;
-                    break buildChar;
-                }
-
-                if (firstByte <= 0x80) {
-                    // single byte char
-                    break buildChar;
-                }
-
-                secondByte = it.nextByte(det);
-                it.charValue = (it.charValue << 8) | secondByte;
-
-                if (firstByte >= 0x81 && firstByte <= 0xFE) {
-                    // Two byte Char
-                    if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >= 80 && secondByte <= 0xFE)) {
-                        break buildChar;
-                    }
-
-                    // Four byte char
-                    if (secondByte >= 0x30 && secondByte <= 0x39) {
-                        thirdByte = it.nextByte(det);
-
-                        if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
-                            fourthByte = it.nextByte(det);
-
-                            if (fourthByte >= 0x30 && fourthByte <= 0x39) {
-                                it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte;
-                                break buildChar;
-                            }
-                        }
-                    }
-
-                    it.error = true;
-                    break buildChar;
-                }
-            }
-
-            return (it.done == false);
-        }
-
-        String getName() {
-            return "GB18030";
-        }
-
-        int match(CharsetDetector det) {
-            return match(det, commonChars);
-        }
-
-        public String getLanguage() {
-            return "zh";
-        }
-    }
-
-
-}
+/*
+ ****************************************************************************
+ * Copyright (C) 2005-2008, International Business Machines Corporation and *
+ * others. All Rights Reserved.                                             *
+ ****************************************************************************
+ *
+ */
+package org.apache.tika.parser.txt;
+
+import java.util.Arrays;
+
+/**
+ * CharsetRecognizer implementation for Asian - double or multi-byte - charsets.
+ * Match is determined mostly by the input data adhering to the
+ * encoding scheme for the charset, and, optionally,
+ * frequency-of-occurrence of characters.
+ * <p/>
+ * Instances of this class are singletons, one per encoding
+ * being recognized.  They are created in the main
+ * CharsetDetector class and kept in the global list of available
+ * encodings to be checked.  The specific encoding being recognized
+ * is determined by subclass.
+ *
+ * @internal
+ */
+abstract class CharsetRecog_mbcs extends CharsetRecognizer {
+
+    /**
+     * Get the IANA name of this charset.
+     *
+     * @return the charset name.
+     */
+    abstract String getName();
+
+
+    /**
+     * Test the match of this charset with the input text data
+     * which is obtained via the CharsetDetector object.
+     *
+     * @param det         The CharsetDetector, which contains the input text
+     *                    to be checked for being in this charset.
+     * @param commonChars Sorted raw values of frequently occurring characters
+     *                    (looked up by binary search), or null if none.
+     * @return the match confidence, ranging from 0-100.  (Earlier text here
+     * described a packed int with a match reason in bits 8-15; the method
+     * as written returns only the confidence.)
+     */
+    int match(CharsetDetector det, int[] commonChars) {
+        int singleByteCharCount = 0;
+        int doubleByteCharCount = 0;
+        int commonCharCount = 0;
+        int badCharCount = 0;
+        int totalCharCount = 0;
+        int confidence = 0;
+        iteratedChar iter = new iteratedChar();
+
+        detectBlock:
+        {
+            for (iter.reset(); nextChar(iter, det); ) {
+                totalCharCount++;
+                if (iter.error) {
+                    badCharCount++;
+                } else {
+                    long cv = iter.charValue & 0xFFFFFFFFL;
+
+                    if (cv <= 0xff) {
+                        singleByteCharCount++;
+                    } else {
+                        doubleByteCharCount++;
+                        if (commonChars != null) {
+                            // NOTE: This assumes that there are no 4-byte common chars.
+                            if (Arrays.binarySearch(commonChars, (int) cv) >= 0) {
+                                commonCharCount++;
+                            }
+                        }
+                    }
+                }
+                if (badCharCount >= 2 && badCharCount * 5 >= doubleByteCharCount) {
+                    // Bail out early if the byte data is not matching the encoding scheme.
+                    break detectBlock;
+                }
+            }
+
+            if (doubleByteCharCount <= 10 && badCharCount == 0) {
+                // Not many multi-byte chars.
+                if (doubleByteCharCount == 0 && totalCharCount < 10) {
+                    // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
+                    // We don't have enough data to have any confidence.
+                    // Statistical analysis of single byte non-ASCII characters would probably help here.
+                    confidence = 0;
+                } else {
+                    //   ASCII or ISO file?  It's probably not our encoding,
+                    //   but is not incompatible with our encoding, so don't give it a zero.
+                    confidence = 10;
+                }
+
+                break detectBlock;
+            }
+
+            //
+            //  No match if there are too many characters that don't fit the encoding scheme.
+            //    (should we have zero tolerance for these?)
+            //
+            if (doubleByteCharCount < 20 * badCharCount) {
+                confidence = 0;
+                break detectBlock;
+            }
+
+            if (commonChars == null) {
+                // We have no statistics on frequently occurring characters.
+                //  Assess confidence purely on having a reasonable number of
+                //  multi-byte characters (the more the better).
+                confidence = 30 + doubleByteCharCount - 20 * badCharCount;
+                if (confidence > 100) {
+                    confidence = 100;
+                }
+            } else {
+                //
+                // Frequency of occurrence statistics exist.
+                //
+                double maxVal = Math.log((float) doubleByteCharCount / 4);
+                double scaleFactor = 90.0 / maxVal;
+                confidence = (int) (Math.log(commonCharCount + 1) * scaleFactor + 10);
+                confidence = Math.min(confidence, 100);
+            }
+        }   // end of detectBlock:
+
+        return confidence;
+    }
+
+    /**
+     * Get the next character (however many bytes it is) from the input data
+     * Subclasses for specific charset encodings must implement this function
+     * to get characters according to the rules of their encoding scheme.
+     * <p/>
+     * This function is not a method of class iteratedChar only because
+     * that would require a lot of extra derived classes, which is awkward.
+     *
+     * @param it  The iteratedChar "struct" into which the returned char is placed.
+     * @param det The charset detector, which is needed to get at the input byte data
+     *            being iterated over.
+     * @return True if a character was returned, false at end of input.
+     */
+    abstract boolean nextChar(iteratedChar it, CharsetDetector det);
+
+    // "Character"  iterated character class.
+    //    Recognizers for specific mbcs encodings make their "characters" available
+    //    by providing a nextChar() function that fills in an instance of iteratedChar
+    //    with the next char from the input.
+    //    The returned characters are not converted to Unicode, but remain as the raw
+    //    bytes (concatenated into an int) from the codepage data.
+    //
+    //  For Asian charsets, use the raw input rather than the input that has been
+    //   stripped of markup.  Detection only considers multi-byte chars, effectively
+    //   stripping markup anyway, and double byte chars do occur in markup too.
+    //
+    static class iteratedChar {
+        int charValue = 0;             // 1-4 bytes from the raw input data
+        int index = 0;
+        int nextIndex = 0;
+        boolean error = false;
+        boolean done = false;
+
+        void reset() {
+            charValue = 0;
+            index = -1;
+            nextIndex = 0;
+            error = false;
+            done = false;
+        }
+
+        int nextByte(CharsetDetector det) {
+            if (nextIndex >= det.fRawLength) {
+                done = true;
+                return -1;
+            }
+            int byteValue = (int) det.fRawInput[nextIndex++] & 0x00ff;
+            return byteValue;
+        }
+    }
+
+    /**
+     * Shift-JIS charset recognizer.
+     */
+    static class CharsetRecog_sjis extends CharsetRecog_mbcs {
+        static int[] commonChars =
+                // TODO:  This set of data comes from the character frequency-
+                //        of-occurrence analysis tool.  The data needs to be moved
+                //        into a resource and loaded from there.
+                {0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
+                        0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
+                        0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
+                        0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
+                        0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
+                        0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
+
+        boolean nextChar(iteratedChar it, CharsetDetector det) {
+            it.index = it.nextIndex;
+            it.error = false;
+            int firstByte;
+            firstByte = it.charValue = it.nextByte(det);
+            if (firstByte < 0) {
+                return false;
+            }
+
+            if (firstByte <= 0x7f || (firstByte > 0xa0 && firstByte <= 0xdf)) {
+                return true;
+            }
+
+            int secondByte = it.nextByte(det);
+            if (secondByte < 0) {
+                return false;
+            }
+            it.charValue = (firstByte << 8) | secondByte;
+            if (!((secondByte >= 0x40 && secondByte <= 0x7f) || (secondByte >= 0x80 && secondByte <= 0xff))) {
+                // Illegal second byte value.
+                it.error = true;
+            }
+            return true;
+        }
+
+        int match(CharsetDetector det) {
+            return match(det, commonChars);
+        }
+
+        String getName() {
+            return "Shift_JIS";
+        }
+
+        public String getLanguage() {
+            return "ja";
+        }
+
+
+    }
+
+
+    /**
+     * Big5 charset recognizer.
+     */
+    static class CharsetRecog_big5 extends CharsetRecog_mbcs {
+        static int[] commonChars =
+                // TODO:  This set of data comes from the character frequency-
+                //        of-occurrence analysis tool.  The data needs to be moved
+                //        into a resource and loaded from there.
+                {0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
+                        0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
+                        0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
+                        0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
+                        0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
+                        0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
+                        0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
+                        0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
+                        0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
+                        0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
+
+        boolean nextChar(iteratedChar it, CharsetDetector det) {
+            it.index = it.nextIndex;
+            it.error = false;
+            int firstByte;
+            firstByte = it.charValue = it.nextByte(det);
+            if (firstByte < 0) {
+                return false;
+            }
+
+            if (firstByte <= 0x7f || firstByte == 0xff) {
+                // single byte character.
+                return true;
+            }
+
+            int secondByte = it.nextByte(det);
+            if (secondByte < 0) {
+                return false;
+            }
+            it.charValue = (it.charValue << 8) | secondByte;
+
+            if (secondByte < 0x40 ||
+                    secondByte == 0x7f ||
+                    secondByte == 0xff) {
+                it.error = true;
+            }
+            return true;
+        }
+
+        int match(CharsetDetector det) {
+            return match(det, commonChars);
+        }
+
+        String getName() {
+            return "Big5";
+        }
+
+
+        public String getLanguage() {
+            return "zh";
+        }
+    }
+
+
+    /**
+     * EUC charset recognizers.  One abstract class that provides the common function
+     * for getting the next character according to the EUC encoding scheme,
+     * and nested derived classes for EUC_KR and EUC_JP (no EUC_CN recognizer is defined here).
+     */
+    abstract static class CharsetRecog_euc extends CharsetRecog_mbcs {
+
+        /*
+         *  (non-Javadoc)
+         *  Get the next character value for EUC based encodings.
+         *  Character "value" is simply the raw bytes that make up the character
+         *     packed into an int.
+         */
+        boolean nextChar(iteratedChar it, CharsetDetector det) {
+            it.index = it.nextIndex;
+            it.error = false;
+            int firstByte = 0;
+            int secondByte = 0;
+            int thirdByte = 0;
+            //int fourthByte = 0;
+
+            buildChar:
+            {
+                firstByte = it.charValue = it.nextByte(det);
+                if (firstByte < 0) {
+                    // Ran off the end of the input data
+                    it.done = true;
+                    break buildChar;
+                }
+                if (firstByte <= 0x8d) {
+                    // single byte char
+                    break buildChar;
+                }
+
+                secondByte = it.nextByte(det);
+                it.charValue = (it.charValue << 8) | secondByte;
+
+                if (firstByte >= 0xA1 && firstByte <= 0xfe) {
+                    // Two byte Char
+                    if (secondByte < 0xa1) {
+                        it.error = true;
+                    }
+                    break buildChar;
+                }
+                if (firstByte == 0x8e) {
+                    // Code Set 2.
+                    //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
+                    //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
+                    // We don't know which we've got.
+                    // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
+                    //   bytes will look like a well formed 2 byte char.
+                    if (secondByte < 0xa1) {
+                        it.error = true;
+                    }
+                    break buildChar;
+                }
+
+                if (firstByte == 0x8f) {
+                    // Code set 3.
+                    // Three byte total char size, two bytes of actual char value.
+                    thirdByte = it.nextByte(det);
+                    it.charValue = (it.charValue << 8) | thirdByte;
+                    if (thirdByte < 0xa1) {
+                        it.error = true;
+                    }
+                }
+            }
+
+            return (it.done == false);
+        }
+
+        /**
+         * The charset recognizer for EUC-JP.  A singleton instance of this class
+         * is created and kept by the public CharsetDetector class.
+         */
+        static class CharsetRecog_euc_jp extends CharsetRecog_euc {
+            static int[] commonChars =
+                    // TODO:  This set of data comes from the character frequency-
+                    //        of-occurrence analysis tool.  The data needs to be moved
+                    //        into a resource and loaded from there.
+                    {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
+                            0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
+                            0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
+                            0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
+                            0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
+                            0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
+                            0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
+                            0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
+                            0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
+                            0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
+
+            String getName() {
+                return "EUC-JP";
+            }
+
+            int match(CharsetDetector det) {
+                return match(det, commonChars);
+            }
+
+            public String getLanguage() {
+                return "ja";
+            }
+        }
+
+        /**
+         * The charset recognizer for EUC-KR.  A singleton instance of this class
+         * is created and kept by the public CharsetDetector class.
+         */
+        static class CharsetRecog_euc_kr extends CharsetRecog_euc {
+            static int[] commonChars =
+                    // TODO:  This set of data comes from the character frequency-
+                    //        of-occurrence analysis tool.  The data needs to be moved
+                    //        into a resource and loaded from there.
+                    {0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
+                            0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
+                            0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
+                            0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
+                            0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
+                            0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
+                            0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
+                            0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
+                            0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
+                            0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
+
+            String getName() {
+                return "EUC-KR";
+            }
+
+            int match(CharsetDetector det) {
+                return match(det, commonChars);
+            }
+
+            public String getLanguage() {
+                return "ko";
+            }
+        }
+    }
+
+    /**
+     * GB-18030 recognizer. Uses simplified Chinese statistics.
+     */
+    static class CharsetRecog_gb_18030 extends CharsetRecog_mbcs {
+
+        static int[] commonChars =
+                // TODO:  This set of data comes from the character frequency-
+                //        of-occurrence analysis tool.  The data needs to be moved
+                //        into a resource and loaded from there.
+                {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
+                        0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
+                        0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
+                        0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
+                        0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
+                        0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
+                        0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
+                        0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
+                        0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
+                        0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
+
+        /*
+         *  (non-Javadoc)
+         *  Get the next character value for the GB18030 encoding.
+         *  Character "value" is simply the raw bytes that make up the character
+         *     packed into an int.
+         */
+        boolean nextChar(iteratedChar it, CharsetDetector det) {
+            it.index = it.nextIndex;
+            it.error = false;
+            int firstByte = 0;
+            int secondByte = 0;
+            int thirdByte = 0;
+            int fourthByte = 0;
+
+            buildChar:
+            {
+                firstByte = it.charValue = it.nextByte(det);
+
+                if (firstByte < 0) {
+                    // Ran off the end of the input data
+                    it.done = true;
+                    break buildChar;
+                }
+
+                if (firstByte <= 0x80) {
+                    // single byte char
+                    break buildChar;
+                }
+
+                secondByte = it.nextByte(det);
+                it.charValue = (it.charValue << 8) | secondByte;
+
+                if (firstByte >= 0x81 && firstByte <= 0xFE) {
+                    // Two byte char.  NOTE(review): "secondByte >= 80" below is decimal; 0x80 was probably intended - confirm.
+                    if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >= 80 && secondByte <= 0xFE)) {
+                        break buildChar;
+                    }
+
+                    // Four byte char
+                    if (secondByte >= 0x30 && secondByte <= 0x39) {
+                        thirdByte = it.nextByte(det);
+
+                        if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
+                            fourthByte = it.nextByte(det);
+
+                            if (fourthByte >= 0x30 && fourthByte <= 0x39) {
+                                it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte;
+                                break buildChar;
+                            }
+                        }
+                    }
+
+                    it.error = true;
+                    break buildChar;
+                }
+            }
+
+            return (it.done == false);
+        }
+
+        String getName() {
+            return "GB18030";
+        }
+
+        int match(CharsetDetector det) {
+            return match(det, commonChars);
+        }
+
+        public String getLanguage() {
+            return "zh";
+        }
+    }
+
+
+}

[21/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opc/OPCDetector.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opc/OPCDetector.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opc/OPCDetector.java
index cc17459..a8fe200 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opc/OPCDetector.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opc/OPCDetector.java
@@ -1,155 +1,155 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.opc;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Locale;
-import java.util.regex.Pattern;
-
-import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
-import org.apache.poi.openxml4j.opc.OPCPackage;
-import org.apache.poi.openxml4j.opc.PackageAccess;
-import org.apache.poi.openxml4j.opc.PackagePart;
-import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
-import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-
-/**
- * Detector that detects OPC Packages
- *
- */
-public class OPCDetector implements Detector {
-
-    /**
-     * 
-     */
-    private static final long serialVersionUID = -3569622763024617244L;
-    
-    private static final Pattern MACRO_TEMPLATE_PATTERN = Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE);
-
-    // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes 
-    private static final String VISIO_DOCUMENT =
-            "http://schemas.microsoft.com/visio/2010/relationships/document";
-    
-    // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes 
-    private static final String STRICT_CORE_DOCUMENT = 
-            "http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument";
-    
-    @Override
-    public MediaType detect(InputStream input, Metadata metadata) throws IOException {
-        TemporaryResources tmp = new TemporaryResources();
-        try {
-            TikaInputStream stream = TikaInputStream.get(input, tmp);
-            // Use POI to open and investigate it for us
-            OPCPackage pkg = OPCPackage.open(stream.getFile().getPath(), PackageAccess.READ);
-            stream.setOpenContainer(pkg);
-    
-            // Is at an OOXML format?
-            MediaType type = detectOfficeOpenXML(pkg);
-            if (type != null) return type;
-            
-            // Is it XPS format?
-            type = detectXPSOPC(pkg);
-            if (type != null) return type;
-            
-            // Is it an AutoCAD format?
-            type = detectAutoCADOPC(pkg);
-            
-            return type;
-        } catch (InvalidFormatException e) {
-            // TODO Auto-generated catch block
-            e.printStackTrace();
-        }finally {
-            tmp.close();
-        }
-        return null;
-    }
-    
-    /**
-     * Detects the type of an OfficeOpenXML (OOXML) file from
-     *  opened Package 
-     */
-    public static MediaType detectOfficeOpenXML(OPCPackage pkg) {
-        // Check for the normal Office core document
-        PackageRelationshipCollection core = 
-               pkg.getRelationshipsByType(PackageRelationshipTypes.CORE_DOCUMENT);
-        // Otherwise check for some other Office core document types
-        if (core.size() == 0) {
-            core = pkg.getRelationshipsByType(STRICT_CORE_DOCUMENT);
-        }
-        if (core.size() == 0) {
-            core = pkg.getRelationshipsByType(VISIO_DOCUMENT);
-        }
-        
-        // If we didn't find a single core document of any type, skip detection
-        if (core.size() != 1) {
-            // Invalid OOXML Package received
-            return null;
-        }
-
-        // Get the type of the core document part
-        PackagePart corePart = pkg.getPart(core.getRelationship(0));
-        String coreType = corePart.getContentType();
-
-        // Turn that into the type of the overall document
-        String docType = coreType.substring(0, coreType.lastIndexOf('.'));
-
-        // The Macro Enabled formats are a little special
-        if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabled")) {
-            docType = docType.toLowerCase(Locale.ROOT) + ".12";
-        }
-
-        if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabledtemplate")) {
-            docType = MACRO_TEMPLATE_PATTERN.matcher(docType).replaceAll("macroenabled.12");
-        }
-
-        // Build the MediaType object and return
-        return MediaType.parse(docType);
-    }
-    /**
-     * Detects Open XML Paper Specification (XPS)
-     */
-    private static MediaType detectXPSOPC(OPCPackage pkg) {
-        PackageRelationshipCollection xps = 
-                pkg.getRelationshipsByType("http://schemas.microsoft.com/xps/2005/06/fixedrepresentation");
-        if (xps.size() == 1) {
-            return MediaType.application("vnd.ms-xpsdocument");
-        } else {
-            // Non-XPS Package received
-            return null;
-        }
-    }
-    /**
-     * Detects AutoCAD formats that live in OPC packaging
-     */
-    private static MediaType detectAutoCADOPC(OPCPackage pkg) {
-        PackageRelationshipCollection dwfxSeq = 
-                pkg.getRelationshipsByType("http://schemas.autodesk.com/dwfx/2007/relationships/documentsequence");
-        if (dwfxSeq.size() == 1) {
-            return MediaType.parse("model/vnd.dwfx+xps");
-        } else {
-            // Non-AutoCAD Package received
-            return null;
-        }
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.opc;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Locale;
+import java.util.regex.Pattern;
+
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackageAccess;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Detector that detects OPC Packages
+ *
+ */
+public class OPCDetector implements Detector {
+
+    /**
+     * 
+     */
+    private static final long serialVersionUID = -3569622763024617244L;
+    
+    private static final Pattern MACRO_TEMPLATE_PATTERN = Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE);
+
+    // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes 
+    private static final String VISIO_DOCUMENT =
+            "http://schemas.microsoft.com/visio/2010/relationships/document";
+    
+    // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes 
+    private static final String STRICT_CORE_DOCUMENT = 
+            "http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument";
+    
+    @Override
+    public MediaType detect(InputStream input, Metadata metadata) throws IOException {
+        TemporaryResources tmp = new TemporaryResources();
+        try {
+            TikaInputStream stream = TikaInputStream.get(input, tmp);
+            // Use POI to open and investigate it for us
+            OPCPackage pkg = OPCPackage.open(stream.getFile().getPath(), PackageAccess.READ);
+            stream.setOpenContainer(pkg);
+    
+            // Is it an OOXML format?
+            MediaType type = detectOfficeOpenXML(pkg);
+            if (type != null) return type;
+            
+            // Is it XPS format?
+            type = detectXPSOPC(pkg);
+            if (type != null) return type;
+            
+            // Is it an AutoCAD format?
+            type = detectAutoCADOPC(pkg);
+            
+            return type;
+        } catch (InvalidFormatException e) {
+            // TODO Auto-generated catch block
+            e.printStackTrace();
+        }finally {
+            tmp.close();
+        }
+        return null;
+    }
+    
+    /**
+     * Detects the type of an OfficeOpenXML (OOXML) file from an
+     *  opened Package
+     */
+    public static MediaType detectOfficeOpenXML(OPCPackage pkg) {
+        // Check for the normal Office core document
+        PackageRelationshipCollection core = 
+               pkg.getRelationshipsByType(PackageRelationshipTypes.CORE_DOCUMENT);
+        // Otherwise check for some other Office core document types
+        if (core.size() == 0) {
+            core = pkg.getRelationshipsByType(STRICT_CORE_DOCUMENT);
+        }
+        if (core.size() == 0) {
+            core = pkg.getRelationshipsByType(VISIO_DOCUMENT);
+        }
+        
+        // If we didn't find a single core document of any type, skip detection
+        if (core.size() != 1) {
+            // Invalid OOXML Package received
+            return null;
+        }
+
+        // Get the type of the core document part
+        PackagePart corePart = pkg.getPart(core.getRelationship(0));
+        String coreType = corePart.getContentType();
+
+        // Turn that into the type of the overall document
+        String docType = coreType.substring(0, coreType.lastIndexOf('.'));
+
+        // The Macro Enabled formats are a little special
+        if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabled")) {
+            docType = docType.toLowerCase(Locale.ROOT) + ".12";
+        }
+
+        if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabledtemplate")) {
+            docType = MACRO_TEMPLATE_PATTERN.matcher(docType).replaceAll("macroenabled.12");
+        }
+
+        // Build the MediaType object and return
+        return MediaType.parse(docType);
+    }
+    /**
+     * Detects Open XML Paper Specification (XPS)
+     */
+    private static MediaType detectXPSOPC(OPCPackage pkg) {
+        PackageRelationshipCollection xps = 
+                pkg.getRelationshipsByType("http://schemas.microsoft.com/xps/2005/06/fixedrepresentation");
+        if (xps.size() == 1) {
+            return MediaType.application("vnd.ms-xpsdocument");
+        } else {
+            // Non-XPS Package received
+            return null;
+        }
+    }
+    /**
+     * Detects AutoCAD formats that live in OPC packaging
+     */
+    private static MediaType detectAutoCADOPC(OPCPackage pkg) {
+        PackageRelationshipCollection dwfxSeq = 
+                pkg.getRelationshipsByType("http://schemas.autodesk.com/dwfx/2007/relationships/documentsequence");
+        if (dwfxSeq.size() == 1) {
+            return MediaType.parse("model/vnd.dwfx+xps");
+        } else {
+            // Non-AutoCAD Package received
+            return null;
+        }
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
index e5beb4b..90f2d2e 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
@@ -1,28 +1,28 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.opendocument;
-
-import org.apache.tika.parser.odf.OpenDocumentParser;
-
-/**
- * OpenOffice parser
- *
- * @deprecated Use the {@link OpenDocumentParser} class instead.
- *             This class will be removed in Apache Tika 1.0.
- */
-public class OpenOfficeParser extends OpenDocumentParser {
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.opendocument;
+
+import org.apache.tika.parser.odf.OpenDocumentParser;
+
+/**
+ * OpenOffice parser
+ *
+ * @deprecated Use the {@link OpenDocumentParser} class instead.
+ *             This class will be removed in Apache Tika 1.0.
+ */
+public class OpenOfficeParser extends OpenDocumentParser {
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/GroupState.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/GroupState.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/GroupState.java
index eba9d8c..4a9a1d1 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/GroupState.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/GroupState.java
@@ -1,67 +1,67 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.rtf;
-
-import java.nio.charset.Charset;
-
-/* Holds all state associated with current RTF group, ie {
- * ... }. */
-
-class GroupState {
-    public int depth;
-    public boolean bold;
-    public boolean italic;
-    // True if we are skipping all text in current group,
-    // eg if group leads with a \*:
-    public boolean ignore;
-    // Default is 1 if no uc control has been seen yet:
-    public int ucSkip = 1;
-    public int list;
-    public int listLevel;
-    public Charset fontCharset;
-    //in objdata
-    public boolean objdata;
-    //depth in pict, 1 = at pict level
-    public int pictDepth;
-    //in picprop key/value pair
-    public boolean sp;
-    //in picprop's name 
-    public boolean sn;
-    //in picprop's value
-    public boolean sv;
-    //in embedded object or not
-    public boolean object;
-
-    // Create default (root) GroupState
-    public GroupState() {
-    }
-
-    // Create new GroupState, inheriting all properties from current one, adding 1 to the depth
-    public GroupState(GroupState other) {
-        bold = other.bold;
-        italic = other.italic;
-        ignore = other.ignore;
-        ucSkip = other.ucSkip;
-        list = other.list;
-        listLevel = other.listLevel;
-        fontCharset = other.fontCharset;
-        depth = 1 + other.depth;
-        pictDepth = other.pictDepth > 0 ? other.pictDepth + 1 : 0;
-        //do not inherit object, sn, sv or sp
-
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.rtf;
+
+import java.nio.charset.Charset;
+
+/* Holds all state associated with current RTF group, ie {
+ * ... }. */
+
+class GroupState {
+    public int depth;
+    public boolean bold;
+    public boolean italic;
+    // True if we are skipping all text in current group,
+    // eg if group leads with a \*:
+    public boolean ignore;
+    // Default is 1 if no uc control has been seen yet:
+    public int ucSkip = 1;
+    public int list;
+    public int listLevel;
+    public Charset fontCharset;
+    //in objdata
+    public boolean objdata;
+    //depth in pict, 1 = at pict level
+    public int pictDepth;
+    //in picprop key/value pair
+    public boolean sp;
+    //in picprop's name 
+    public boolean sn;
+    //in picprop's value
+    public boolean sv;
+    //in embedded object or not
+    public boolean object;
+
+    // Create default (root) GroupState
+    public GroupState() {
+    }
+
+    // Create new GroupState, inheriting all properties from current one, adding 1 to the depth
+    public GroupState(GroupState other) {
+        bold = other.bold;
+        italic = other.italic;
+        ignore = other.ignore;
+        ucSkip = other.ucSkip;
+        list = other.list;
+        listLevel = other.listLevel;
+        fontCharset = other.fontCharset;
+        depth = 1 + other.depth;
+        pictDepth = other.pictDepth > 0 ? other.pictDepth + 1 : 0;
+        //do not inherit object, sn, sv or sp
+
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java
index 1931232..e7142bd 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java
@@ -1,35 +1,35 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.rtf;
-
-/**
- * Contains the information for a single list in the list or list override tables.
- */
-public class ListDescriptor {
-    public final static int NUMBER_TYPE_BULLET = 23;
-
-    public int id;
-    // We record this but don't make use if it today:
-    public int templateID;
-    // We record this but don't make use if it today:
-    public boolean isStyle;
-    public int[] numberType = new int[9];
-
-    public boolean isUnordered(int level) {
-        return numberType[level] == NUMBER_TYPE_BULLET;
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.rtf;
+
+/**
+ * Contains the information for a single list in the list or list override tables.
+ */
+public class ListDescriptor {
+    public final static int NUMBER_TYPE_BULLET = 23;
+
+    public int id;
+    // We record this but don't make use of it today:
+    public int templateID;
+    // We record this but don't make use of it today:
+    public boolean isStyle;
+    public int[] numberType = new int[9];
+
+    public boolean isUnordered(int level) {
+        return numberType[level] == NUMBER_TYPE_BULLET;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
index ccd7e7f..d2c448b 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
@@ -1,93 +1,93 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.rtf;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.commons.io.input.TaggedInputStream;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * RTF parser
- */
-public class RTFParser extends AbstractParser {
-
-    /**
-     * Serial version UID
-     */
-    private static final long serialVersionUID = -4165069489372320313L;
-
-    private static final Set<MediaType> SUPPORTED_TYPES =
-            Collections.singleton(MediaType.application("rtf"));
-    /**
-     * maximum number of bytes per embedded object/pict (default: 20MB)
-     */
-    private static int EMB_OBJ_MAX_BYTES = 20 * 1024 * 1024; //20MB
-
-    /**
-     * See {@link #setMaxBytesForEmbeddedObject(int)}.
-     *
-     * @return maximum number of bytes allowed for an embedded object.
-     */
-    public static int getMaxBytesForEmbeddedObject() {
-        return EMB_OBJ_MAX_BYTES;
-    }
-
-    /**
-     * Bytes for embedded objects are currently cached in memory.
-     * If something goes wrong during the parsing of an embedded object,
-     * it is possible that a read length may be crazily too long
-     * and cause a heap crash.
-     *
-     * @param max maximum number of bytes to allow for embedded objects.  If
-     *            the embedded object has more than this number of bytes, skip it.
-     */
-    public static void setMaxBytesForEmbeddedObject(int max) {
-        EMB_OBJ_MAX_BYTES = max;
-    }
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-    public void parse(
-            InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-        metadata.set(Metadata.CONTENT_TYPE, "application/rtf");
-        TaggedInputStream tagged = new TaggedInputStream(stream);
-        try {
-            XHTMLContentHandler xhtmlHandler = new XHTMLContentHandler(handler, metadata);
-            RTFEmbObjHandler embObjHandler = new RTFEmbObjHandler(xhtmlHandler, metadata, context);
-            final TextExtractor ert = new TextExtractor(xhtmlHandler, metadata, embObjHandler);
-            ert.extract(stream);
-        } catch (IOException e) {
-            tagged.throwIfCauseOf(e);
-            throw new TikaException("Error parsing an RTF document", e);
-        }
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.rtf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.io.input.TaggedInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * RTF parser
+ */
+public class RTFParser extends AbstractParser {
+
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -4165069489372320313L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.application("rtf"));
+    /**
+     * maximum number of bytes per embedded object/pict (default: 20MB)
+     */
+    private static int EMB_OBJ_MAX_BYTES = 20 * 1024 * 1024; //20MB
+
+    /**
+     * See {@link #setMaxBytesForEmbeddedObject(int)}.
+     *
+     * @return maximum number of bytes allowed for an embedded object.
+     */
+    public static int getMaxBytesForEmbeddedObject() {
+        return EMB_OBJ_MAX_BYTES;
+    }
+
+    /**
+     * Bytes for embedded objects are currently cached in memory.
+     * If something goes wrong during the parsing of an embedded object,
+     * it is possible that a read length may be unreasonably large
+     * and exhaust the heap.
+     *
+     * @param max maximum number of bytes to allow for embedded objects.  If
+     *            the embedded object has more than this number of bytes, skip it.
+     */
+    public static void setMaxBytesForEmbeddedObject(int max) {
+        EMB_OBJ_MAX_BYTES = max;
+    }
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        metadata.set(Metadata.CONTENT_TYPE, "application/rtf");
+        TaggedInputStream tagged = new TaggedInputStream(stream);
+        try {
+            XHTMLContentHandler xhtmlHandler = new XHTMLContentHandler(handler, metadata);
+            RTFEmbObjHandler embObjHandler = new RTFEmbObjHandler(xhtmlHandler, metadata, context);
+            final TextExtractor ert = new TextExtractor(xhtmlHandler, metadata, embObjHandler);
+            ert.extract(stream);
+        } catch (IOException e) {
+            tagged.throwIfCauseOf(e);
+            throw new TikaException("Error parsing an RTF document", e);
+        }
+    }
+}

[35/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java b/tika-parser-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java
index 74c3360..3a8a66c 100644
--- a/tika-parser-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java
+++ b/tika-parser-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java
@@ -1,59 +1,59 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.asm;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-import org.apache.tika.Tika;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.junit.Test;
-
-/**
- * Test case for parsing Java class files.
- */
-public class ClassParserTest {
-
-    @Test
-    public void testClassParsing() throws Exception {
-        String path = "/test-documents/AutoDetectParser.class";
-        Metadata metadata = new Metadata();
-        String content = new Tika().parseToString(
-                ClassParserTest.class.getResourceAsStream(path), metadata);
-
-        assertEquals("AutoDetectParser", metadata.get(TikaCoreProperties.TITLE));
-        assertEquals(
-                "AutoDetectParser.class",
-                metadata.get(Metadata.RESOURCE_NAME_KEY));
-
-        assertTrue(content.contains("package org.apache.tika.parser;"));
-        assertTrue(content.contains(
-                "class AutoDetectParser extends CompositeParser"));
-        assertTrue(content.contains(
-                "private org.apache.tika.mime.MimeTypes types"));
-        assertTrue(content.contains(
-                "public void parse("
-                + "java.io.InputStream, org.xml.sax.ContentHandler,"
-                + " org.apache.tika.metadata.Metadata) throws"
-                + " java.io.IOException, org.xml.sax.SAXException,"
-                + " org.apache.tika.exception.TikaException;"));
-        assertTrue(content.contains(
-                "private byte[] getPrefix(java.io.InputStream, int)"
-                + " throws java.io.IOException;"));
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.asm;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import org.apache.tika.Tika;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Test;
+
+/**
+ * Test case for parsing Java class files.
+ */
+public class ClassParserTest {
+
+    @Test
+    public void testClassParsing() throws Exception {
+        String path = "/test-documents/AutoDetectParser.class";
+        Metadata metadata = new Metadata();
+        String content = new Tika().parseToString(
+                ClassParserTest.class.getResourceAsStream(path), metadata);
+
+        assertEquals("AutoDetectParser", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals(
+                "AutoDetectParser.class",
+                metadata.get(Metadata.RESOURCE_NAME_KEY));
+
+        assertTrue(content.contains("package org.apache.tika.parser;"));
+        assertTrue(content.contains(
+                "class AutoDetectParser extends CompositeParser"));
+        assertTrue(content.contains(
+                "private org.apache.tika.mime.MimeTypes types"));
+        assertTrue(content.contains(
+                "public void parse("
+                + "java.io.InputStream, org.xml.sax.ContentHandler,"
+                + " org.apache.tika.metadata.Metadata) throws"
+                + " java.io.IOException, org.xml.sax.SAXException,"
+                + " org.apache.tika.exception.TikaException;"));
+        assertTrue(content.contains(
+                "private byte[] getPrefix(java.io.InputStream, int)"
+                + " throws java.io.IOException;"));
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java b/tika-parser-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
index ae762dc..17aca8b 100644
--- a/tika-parser-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
+++ b/tika-parser-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
@@ -1,101 +1,101 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.code;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
-import java.io.ByteArrayInputStream;
-import java.util.Set;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.junit.Test;
-
-public class SourceCodeParserTest extends TikaTest {
-
-  private SourceCodeParser sourceCodeParser = new SourceCodeParser();
-
-  @Test
-  public void testSupportTypes() throws Exception {
-    Set<MediaType> supportedTypes = sourceCodeParser.getSupportedTypes(new ParseContext());
-    assertTrue(supportedTypes.contains(new MediaType("text", "x-java-source")));
-    assertTrue(supportedTypes.contains(new MediaType("text", "x-groovy")));
-    assertTrue(supportedTypes.contains(new MediaType("text", "x-c++src")));
-
-    assertFalse(sourceCodeParser.getSupportedTypes(new ParseContext()).contains(new MediaType("text", "html")));
-  }
-
-  @Test
-  public void testHTMLRenderWithReturnLine() throws Exception {
-    String htmlContent = getXML(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser, createMetadata("text/x-java-source")).xml;
-    
-    assertTrue(htmlContent.indexOf("<html:html lang=\"en\" xml:lang=\"en\"") == 0);
-    assertTrue(htmlContent.indexOf("<html:span class=\"java_keyword\">public</span><html:span class=\"java_plain\">") > 0);
-    assertTrue(htmlContent.indexOf("<html:span class=\"java_keyword\">static</span>") > 0);
-    assertTrue(htmlContent.indexOf("<html:br clear=\"none\" />") > 0);
-  }
-  
-  @Test
-  public void testTextRender() throws Exception {
-    String textContent = getText(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser, createMetadata("text/x-java-source"));
-    
-    assertTrue(textContent.length() > 0);
-    assertTrue(textContent.indexOf("html") < 0);
-    
-    textContent = getText(new ByteArrayInputStream("public class HelloWorld {}".getBytes(UTF_8)), sourceCodeParser, createMetadata("text/x-java-source"));
-    assertTrue(textContent.length() > 0);
-    assertTrue(textContent.indexOf("html") < 0);
-  }
-
-  @Test
-  public void testLoC() throws Exception {
-    Metadata metadata = createMetadata("text/x-groovy");
-    getText(getResourceAsStream("/test-documents/testGROOVY.groovy"), sourceCodeParser, metadata);
-
-    assertEquals(metadata.get("LoC"), "9");
-  }
-
-  @Test
-  public void testAuthor() throws Exception {
-    Metadata metadata = createMetadata("text/x-c++src");
-    getText(getResourceAsStream("/test-documents/testCPP.cpp"), sourceCodeParser, metadata);
-
-    assertEquals("Hong-Thai Nguyen", metadata.get(TikaCoreProperties.CREATOR));
-  }
-
-  @Test
-  public void testReturnContentAsIsForTextHandler() throws Exception {
-    String strContent = getXML(getResourceAsStream("/test-documents/testJAVA.java"), new AutoDetectParser(), createMetadata("text/plain")).xml;
-
-    assertTrue(strContent.indexOf("public class HelloWorld {") > 0);
-  }
-
-  private Metadata createMetadata(String mimeType) {
-    Metadata metadata = new Metadata();
-    metadata.add(Metadata.RESOURCE_NAME_KEY, "testFile");
-    metadata.add(Metadata.CONTENT_TYPE, mimeType);
-    return metadata;
-  }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.code;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.util.Set;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.junit.Test;
+
+public class SourceCodeParserTest extends TikaTest {
+
+  private SourceCodeParser sourceCodeParser = new SourceCodeParser();
+
+  @Test
+  public void testSupportTypes() throws Exception {
+    Set<MediaType> supportedTypes = sourceCodeParser.getSupportedTypes(new ParseContext());
+    assertTrue(supportedTypes.contains(new MediaType("text", "x-java-source")));
+    assertTrue(supportedTypes.contains(new MediaType("text", "x-groovy")));
+    assertTrue(supportedTypes.contains(new MediaType("text", "x-c++src")));
+
+    assertFalse(sourceCodeParser.getSupportedTypes(new ParseContext()).contains(new MediaType("text", "html")));
+  }
+
+  @Test
+  public void testHTMLRenderWithReturnLine() throws Exception {
+    String htmlContent = getXML(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser, createMetadata("text/x-java-source")).xml;
+    
+    assertTrue(htmlContent.indexOf("<html:html lang=\"en\" xml:lang=\"en\"") == 0);
+    assertTrue(htmlContent.indexOf("<html:span class=\"java_keyword\">public</span><html:span class=\"java_plain\">") > 0);
+    assertTrue(htmlContent.indexOf("<html:span class=\"java_keyword\">static</span>") > 0);
+    assertTrue(htmlContent.indexOf("<html:br clear=\"none\" />") > 0);
+  }
+  
+  @Test
+  public void testTextRender() throws Exception {
+    String textContent = getText(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser, createMetadata("text/x-java-source"));
+    
+    assertTrue(textContent.length() > 0);
+    assertTrue(textContent.indexOf("html") < 0);
+    
+    textContent = getText(new ByteArrayInputStream("public class HelloWorld {}".getBytes(UTF_8)), sourceCodeParser, createMetadata("text/x-java-source"));
+    assertTrue(textContent.length() > 0);
+    assertTrue(textContent.indexOf("html") < 0);
+  }
+
+  @Test
+  public void testLoC() throws Exception {
+    Metadata metadata = createMetadata("text/x-groovy");
+    getText(getResourceAsStream("/test-documents/testGROOVY.groovy"), sourceCodeParser, metadata);
+
+    assertEquals(metadata.get("LoC"), "9");
+  }
+
+  @Test
+  public void testAuthor() throws Exception {
+    Metadata metadata = createMetadata("text/x-c++src");
+    getText(getResourceAsStream("/test-documents/testCPP.cpp"), sourceCodeParser, metadata);
+
+    assertEquals("Hong-Thai Nguyen", metadata.get(TikaCoreProperties.CREATOR));
+  }
+
+  @Test
+  public void testReturnContentAsIsForTextHandler() throws Exception {
+    String strContent = getXML(getResourceAsStream("/test-documents/testJAVA.java"), new AutoDetectParser(), createMetadata("text/plain")).xml;
+
+    assertTrue(strContent.indexOf("public class HelloWorld {") > 0);
+  }
+
+  private Metadata createMetadata(String mimeType) {
+    Metadata metadata = new Metadata();
+    metadata.add(Metadata.RESOURCE_NAME_KEY, "testFile");
+    metadata.add(Metadata.CONTENT_TYPE, mimeType);
+    return metadata;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-crypto-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-crypto-module/pom.xml b/tika-parser-modules/tika-parser-crypto-module/pom.xml
index 69dd7a9..23a5417 100644
--- a/tika-parser-modules/tika-parser-crypto-module/pom.xml
+++ b/tika-parser-modules/tika-parser-crypto-module/pom.xml
@@ -1,53 +1,53 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-modules</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-crypto-module</artifactId>
-  <name>Apache Tika parser crypto module</name>
-  <url>http://tika.apache.org/</url>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-core</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.bouncycastle</groupId>
-      <artifactId>bcmail-jdk15on</artifactId>
-      <version>${bouncycastle.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>commons-io</groupId>
-      <artifactId>commons-io</artifactId>
-      <version>${commons.io.version}</version>
-    </dependency>
-  </dependencies>
-  
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-modules</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-crypto-module</artifactId>
+  <name>Apache Tika parser crypto module</name>
+  <url>http://tika.apache.org/</url>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.bouncycastle</groupId>
+      <artifactId>bcmail-jdk15on</artifactId>
+      <version>${bouncycastle.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <version>${commons.io.version}</version>
+    </dependency>
+  </dependencies>
+  
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/module/crypto/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/module/crypto/internal/Activator.java b/tika-parser-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/module/crypto/internal/Activator.java
index f71fb51..e63c276 100644
--- a/tika-parser-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/module/crypto/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/module/crypto/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.crypto.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
-    @Override
-    public void start(BundleContext context) throws Exception {
-
-        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
-    }
-
-    @Override
-    public void stop(BundleContext context) throws Exception {
-
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.crypto.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+    @Override
+    public void start(BundleContext context) throws Exception {
+
+        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+    }
+
+    @Override
+    public void stop(BundleContext context) throws Exception {
+
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-crypto-module/src/test/java/org/apache/tika/parser/crypto/Pkcs7ParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-crypto-module/src/test/java/org/apache/tika/parser/crypto/Pkcs7ParserTest.java b/tika-parser-modules/tika-parser-crypto-module/src/test/java/org/apache/tika/parser/crypto/Pkcs7ParserTest.java
index 02ff3dd..bc39042 100644
--- a/tika-parser-modules/tika-parser-crypto-module/src/test/java/org/apache/tika/parser/crypto/Pkcs7ParserTest.java
+++ b/tika-parser-modules/tika-parser-crypto-module/src/test/java/org/apache/tika/parser/crypto/Pkcs7ParserTest.java
@@ -1,47 +1,47 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.crypto;
-
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
-import java.io.InputStream;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class Pkcs7ParserTest extends TikaTest {
-    @Test
-    public void testDetachedSignature() throws Exception {
-        try (InputStream input = Pkcs7ParserTest.class.getResourceAsStream(
-                "/test-documents/testDetached.p7s")) {
-            ContentHandler handler = new BodyContentHandler();
-            Metadata metadata = new Metadata();
-            new Pkcs7Parser().parse(input, handler, metadata, new ParseContext());
-        } catch (NullPointerException npe) {
-            fail("should not get NPE");
-        } catch (TikaException te) {
-            assertTrue(te.toString().contains("cannot parse detached pkcs7 signature"));
-        }
-    }
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.crypto;
+
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class Pkcs7ParserTest extends TikaTest {
+    @Test
+    public void testDetachedSignature() throws Exception {
+        try (InputStream input = Pkcs7ParserTest.class.getResourceAsStream(
+                "/test-documents/testDetached.p7s")) {
+            ContentHandler handler = new BodyContentHandler();
+            Metadata metadata = new Metadata();
+            new Pkcs7Parser().parse(input, handler, metadata, new ParseContext());
+        } catch (NullPointerException npe) {
+            fail("should not get NPE");
+        } catch (TikaException te) {
+            assertTrue(te.toString().contains("cannot parse detached pkcs7 signature"));
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-database-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-database-module/pom.xml b/tika-parser-modules/tika-parser-database-module/pom.xml
index a60dae3..cdbbaad 100644
--- a/tika-parser-modules/tika-parser-database-module/pom.xml
+++ b/tika-parser-modules/tika-parser-database-module/pom.xml
@@ -1,67 +1,67 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-modules</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-database-module</artifactId>
-  <name>Apache Tika parser database module</name>
-  <url>http://tika.apache.org/</url>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-core</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <!-- Provided dependencies -->
-    <dependency>
-      <groupId>org.xerial</groupId>
-      <artifactId>sqlite-jdbc</artifactId>
-      <version>3.8.11.2</version>
-      <scope>provided</scope>
-    </dependency>
-    <dependency>
-      <groupId>commons-io</groupId>
-      <artifactId>commons-io</artifactId>
-      <version>${commons.io.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-office-module</artifactId>
-      <version>${project.version}</version>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-package-module</artifactId>
-      <version>${project.version}</version>
-      <scope>test</scope>
-    </dependency>
-  </dependencies>
-  
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-modules</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-database-module</artifactId>
+  <name>Apache Tika parser database module</name>
+  <url>http://tika.apache.org/</url>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <!-- Provided dependencies -->
+    <dependency>
+      <groupId>org.xerial</groupId>
+      <artifactId>sqlite-jdbc</artifactId>
+      <version>3.8.11.2</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <version>${commons.io.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-office-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-package-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+  
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/module/database/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/module/database/internal/Activator.java b/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/module/database/internal/Activator.java
index 4b798fa..e66cab3 100644
--- a/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/module/database/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/module/database/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.database.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
-    @Override
-    public void start(BundleContext context) throws Exception {
-
-        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
-    }
-
-    @Override
-    public void stop(BundleContext context) throws Exception {
-
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.database.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+    @Override
+    public void start(BundleContext context) throws Exception {
+
+        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+    }
+
+    @Override
+    public void stop(BundleContext context) throws Exception {
+
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-ebook-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-ebook-module/pom.xml b/tika-parser-modules/tika-parser-ebook-module/pom.xml
index 89bab53..0c21ee9 100644
--- a/tika-parser-modules/tika-parser-ebook-module/pom.xml
+++ b/tika-parser-modules/tika-parser-ebook-module/pom.xml
@@ -1,48 +1,48 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-modules</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-ebook-module</artifactId>
-  <name>Apache Tika parser e-Book module</name>
-  <url>http://tika.apache.org/</url>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-core</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-text-module</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-  </dependencies>
-  
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-modules</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-ebook-module</artifactId>
+  <name>Apache Tika parser e-Book module</name>
+  <url>http://tika.apache.org/</url>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-text-module</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+  </dependencies>
+  
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/module/ebook/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/module/ebook/internal/Activator.java b/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/module/ebook/internal/Activator.java
index 62e1582..313de08 100644
--- a/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/module/ebook/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/module/ebook/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.ebook.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
-    @Override
-    public void start(BundleContext context) throws Exception {
-
-        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
-    }
-
-    @Override
-    public void stop(BundleContext context) throws Exception {
-
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.ebook.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+    @Override
+    public void start(BundleContext context) throws Exception {
+
+        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+    }
+
+    @Override
+    public void stop(BundleContext context) throws Exception {
+
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubContentParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubContentParser.java b/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubContentParser.java
index ab55e5e..94b5caa 100644
--- a/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubContentParser.java
+++ b/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubContentParser.java
@@ -1,59 +1,59 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.epub;
-
-import javax.xml.parsers.SAXParser;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.OfflineContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Parser for EPUB OPS <code>*.html</code> files.
- *
- * For the time being, assume XHTML (TODO: DTBook)
- */
-public class EpubContentParser extends AbstractParser {
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return Collections.emptySet(); // not a top-level parser
-    }
-
-    public void parse(
-            InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-        final XHTMLContentHandler xhtml =
-                new XHTMLContentHandler(handler, metadata);
-        SAXParser parser = context.getSAXParser();
-        parser.parse(
-                new CloseShieldInputStream(stream),
-                new OfflineContentHandler(xhtml));
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.epub;
+
+import javax.xml.parsers.SAXParser;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for EPUB OPS <code>*.html</code> files.
+ *
+ * For the time being, assume XHTML (TODO: DTBook)
+ */
+public class EpubContentParser extends AbstractParser {
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return Collections.emptySet(); // not a top-level parser
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        final XHTMLContentHandler xhtml =
+                new XHTMLContentHandler(handler, metadata);
+        SAXParser parser = context.getSAXParser();
+        parser.parse(
+                new CloseShieldInputStream(stream),
+                new OfflineContentHandler(xhtml));
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java b/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
index 14e6cf8..c4f72de 100644
--- a/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
+++ b/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
@@ -1,119 +1,119 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.epub;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipInputStream;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.xml.DcXMLParser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.EmbeddedContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-/**
- * Epub parser
- */
-public class EpubParser extends AbstractParser {
-
-    /** Serial version UID */
-    private static final long serialVersionUID = 215176772484050550L;
-
-    private static final Set<MediaType> SUPPORTED_TYPES =
-            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
-            		MediaType.application("epub+zip"),
-                  MediaType.application("x-ibooks+zip")
-            )));
-
-    private Parser meta = new DcXMLParser();
-
-    private Parser content = new EpubContentParser();
-
-    public Parser getMetaParser() {
-        return meta;
-    }
-
-    public void setMetaParser(Parser meta) {
-        this.meta = meta;
-    }
-
-    public Parser getContentParser() {
-        return content;
-    }
-
-    public void setContentParser(Parser content) {
-        this.content = content;
-    }
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-    public void parse(
-            InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-        // Because an EPub file is often made up of multiple XHTML files,
-        //  we need explicit control over the start and end of the document
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-        xhtml.startDocument();
-        ContentHandler childHandler = new EmbeddedContentHandler(
-              new BodyContentHandler(xhtml));
-       
-        ZipInputStream zip = new ZipInputStream(stream);
-        ZipEntry entry = zip.getNextEntry();
-        while (entry != null) {
-            if (entry.getName().equals("mimetype")) {
-                String type = IOUtils.toString(zip, UTF_8);
-                //often has trailing new lines
-                if (type != null) {
-                    type = type.trim();
-                }
-                metadata.set(Metadata.CONTENT_TYPE, type);
-            } else if (entry.getName().equals("metadata.xml")) {
-                meta.parse(zip, new DefaultHandler(), metadata, context);
-            } else if (entry.getName().endsWith(".opf")) {
-                meta.parse(zip, new DefaultHandler(), metadata, context);
-            } else if (entry.getName().endsWith(".html") || 
-            		   entry.getName().endsWith(".xhtml")) {
-                content.parse(zip, childHandler, metadata, context);
-            }
-            entry = zip.getNextEntry();
-        }
-        
-        // Finish everything
-        xhtml.endDocument();
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.epub;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.xml.DcXMLParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Epub parser
+ */
+public class EpubParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 215176772484050550L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+            		MediaType.application("epub+zip"),
+                  MediaType.application("x-ibooks+zip")
+            )));
+
+    private Parser meta = new DcXMLParser();
+
+    private Parser content = new EpubContentParser();
+
+    public Parser getMetaParser() {
+        return meta;
+    }
+
+    public void setMetaParser(Parser meta) {
+        this.meta = meta;
+    }
+
+    public Parser getContentParser() {
+        return content;
+    }
+
+    public void setContentParser(Parser content) {
+        this.content = content;
+    }
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // Because an EPub file is often made up of multiple XHTML files,
+        //  we need explicit control over the start and end of the document
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        ContentHandler childHandler = new EmbeddedContentHandler(
+              new BodyContentHandler(xhtml));
+       
+        ZipInputStream zip = new ZipInputStream(stream);
+        ZipEntry entry = zip.getNextEntry();
+        while (entry != null) {
+            if (entry.getName().equals("mimetype")) {
+                String type = IOUtils.toString(zip, UTF_8);
+                //often has trailing new lines
+                if (type != null) {
+                    type = type.trim();
+                }
+                metadata.set(Metadata.CONTENT_TYPE, type);
+            } else if (entry.getName().equals("metadata.xml")) {
+                meta.parse(zip, new DefaultHandler(), metadata, context);
+            } else if (entry.getName().endsWith(".opf")) {
+                meta.parse(zip, new DefaultHandler(), metadata, context);
+            } else if (entry.getName().endsWith(".html") || 
+            		   entry.getName().endsWith(".xhtml")) {
+                content.parse(zip, childHandler, metadata, context);
+            }
+            entry = zip.getNextEntry();
+        }
+        
+        // Finish everything
+        xhtml.endDocument();
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-ebook-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-ebook-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java b/tika-parser-modules/tika-parser-ebook-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
index dcc705e..c9acbeb 100644
--- a/tika-parser-modules/tika-parser-ebook-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
+++ b/tika-parser-modules/tika-parser-ebook-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
@@ -1,58 +1,58 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.epub;
-
-import static org.junit.Assert.assertEquals;
-import static org.apache.tika.TikaTest.assertContains;
-
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class EpubParserTest {
-
-    @Test
-    public void testXMLParser() throws Exception {
-        try (InputStream input = EpubParserTest.class.getResourceAsStream(
-                "/test-documents/testEPUB.epub")) {
-            Metadata metadata = new Metadata();
-            ContentHandler handler = new BodyContentHandler();
-            new EpubParser().parse(input, handler, metadata, new ParseContext());
-
-            assertEquals("application/epub+zip",
-                    metadata.get(Metadata.CONTENT_TYPE));
-            assertEquals("en",
-                    metadata.get(TikaCoreProperties.LANGUAGE));
-            assertEquals("This is an ePub test publication for Tika.",
-                    metadata.get(TikaCoreProperties.DESCRIPTION));
-            assertEquals("Apache",
-                    metadata.get(TikaCoreProperties.PUBLISHER));
-
-            String content = handler.toString();
-            assertContains("Plus a simple div", content);
-            assertContains("First item", content);
-            assertContains("The previous headings were subchapters", content);
-            assertContains("Table data", content);
-        }
-    }
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.epub;
+
+import static org.junit.Assert.assertEquals;
+import static org.apache.tika.TikaTest.assertContains;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class EpubParserTest {
+
+    @Test
+    public void testXMLParser() throws Exception {
+        try (InputStream input = EpubParserTest.class.getResourceAsStream(
+                "/test-documents/testEPUB.epub")) {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new EpubParser().parse(input, handler, metadata, new ParseContext());
+
+            assertEquals("application/epub+zip",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals("en",
+                    metadata.get(TikaCoreProperties.LANGUAGE));
+            assertEquals("This is an ePub test publication for Tika.",
+                    metadata.get(TikaCoreProperties.DESCRIPTION));
+            assertEquals("Apache",
+                    metadata.get(TikaCoreProperties.PUBLISHER));
+
+            String content = handler.toString();
+            assertContains("Plus a simple div", content);
+            assertContains("First item", content);
+            assertContains("The previous headings were subchapters", content);
+            assertContains("Table data", content);
+        }
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-journal-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-journal-module/pom.xml b/tika-parser-modules/tika-parser-journal-module/pom.xml
index 1a29605..c45c2a9 100644
--- a/tika-parser-modules/tika-parser-journal-module/pom.xml
+++ b/tika-parser-modules/tika-parser-journal-module/pom.xml
@@ -1,68 +1,68 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-modules</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-journal-module</artifactId>
-  <name>Apache Tika parser journal module</name>
-  <url>http://tika.apache.org/</url>
-  
-  <properties>
-    <cxf.version>3.0.3</cxf.version>
-  </properties>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-core</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.cxf</groupId>
-      <artifactId>cxf-rt-rs-client</artifactId>
-      <version>${cxf.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.json</groupId>
-      <artifactId>json</artifactId>
-      <version>20140107</version>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-pdf-module</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-text-module</artifactId>
-      <version>${project.version}</version>
-      <scope>test</scope>
-    </dependency>
-  </dependencies>
-  
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-modules</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-journal-module</artifactId>
+  <name>Apache Tika parser journal module</name>
+  <url>http://tika.apache.org/</url>
+  
+  <properties>
+    <cxf.version>3.0.3</cxf.version>
+  </properties>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.cxf</groupId>
+      <artifactId>cxf-rt-rs-client</artifactId>
+      <version>${cxf.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.json</groupId>
+      <artifactId>json</artifactId>
+      <version>20140107</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-pdf-module</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-text-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+  
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-journal-module/src/main/java/org/apache/tika/module/journal/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-journal-module/src/main/java/org/apache/tika/module/journal/internal/Activator.java b/tika-parser-modules/tika-parser-journal-module/src/main/java/org/apache/tika/module/journal/internal/Activator.java
index 2f9c36a..dda3e3a 100644
--- a/tika-parser-modules/tika-parser-journal-module/src/main/java/org/apache/tika/module/journal/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-journal-module/src/main/java/org/apache/tika/module/journal/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.journal.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
-    @Override
-    public void start(BundleContext context) throws Exception {
-
-        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
-    }
-
-    @Override
-    public void stop(BundleContext context) throws Exception {
-
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.journal.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+    @Override
+    public void start(BundleContext context) throws Exception {
+
+        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+    }
+
+    @Override
+    public void stop(BundleContext context) throws Exception {
+
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/pom.xml b/tika-parser-modules/tika-parser-multimedia-module/pom.xml
index 7a3a704..74cb504 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/pom.xml
+++ b/tika-parser-modules/tika-parser-multimedia-module/pom.xml
@@ -1,101 +1,101 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-modules</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-multimedia-module</artifactId>
-  <name>Apache Tika parser multimedia module</name>
-  <url>http://tika.apache.org/</url>
-  
-  <properties>
-    <metadata.extractor.version>2.8.1</metadata.extractor.version>
-    <isoparser.version>1.1.18</isoparser.version>
-    <commons.logging.version>1.1.3</commons.logging.version>
-  </properties>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-core</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-xmp-commons</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>com.drewnoakes</groupId>
-      <artifactId>metadata-extractor</artifactId>
-      <version>${metadata.extractor.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>commons-codec</groupId>
-      <artifactId>commons-codec</artifactId>
-      <version>${codec.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>commons-io</groupId>
-      <artifactId>commons-io</artifactId>
-      <version>${commons.io.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.commons</groupId>
-      <artifactId>commons-exec</artifactId>
-      <version>${commons.exec}</version>
-    </dependency>
-    <dependency>
-      <groupId>com.googlecode.mp4parser</groupId>
-      <artifactId>isoparser</artifactId>
-      <version>${isoparser.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.pdfbox</groupId>
-      <artifactId>fontbox</artifactId>
-      <version>${pdfbox.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>commons-logging</groupId>
-      <artifactId>commons-logging</artifactId>
-      <version>${commons.logging.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-web-module</artifactId>
-      <version>${project.version}</version>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-office-module</artifactId>
-      <version>${project.version}</version>
-      <scope>test</scope>
-    </dependency>
-  </dependencies>
-  
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-modules</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-multimedia-module</artifactId>
+  <name>Apache Tika parser multimedia module</name>
+  <url>http://tika.apache.org/</url>
+  
+  <properties>
+    <metadata.extractor.version>2.8.1</metadata.extractor.version>
+    <isoparser.version>1.1.18</isoparser.version>
+    <commons.logging.version>1.1.3</commons.logging.version>
+  </properties>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-xmp-commons</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>com.drewnoakes</groupId>
+      <artifactId>metadata-extractor</artifactId>
+      <version>${metadata.extractor.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>commons-codec</groupId>
+      <artifactId>commons-codec</artifactId>
+      <version>${codec.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <version>${commons.io.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-exec</artifactId>
+      <version>${commons.exec}</version>
+    </dependency>
+    <dependency>
+      <groupId>com.googlecode.mp4parser</groupId>
+      <artifactId>isoparser</artifactId>
+      <version>${isoparser.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.pdfbox</groupId>
+      <artifactId>fontbox</artifactId>
+      <version>${pdfbox.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>commons-logging</groupId>
+      <artifactId>commons-logging</artifactId>
+      <version>${commons.logging.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-web-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-office-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+  
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/module/multimedia/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/module/multimedia/internal/Activator.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/module/multimedia/internal/Activator.java
index 7f53312..de4ae01 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/module/multimedia/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/module/multimedia/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.multimedia.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
-    @Override
-    public void start(BundleContext context) throws Exception {
-
-        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
-    }
-
-    @Override
-    public void stop(BundleContext context) throws Exception {
-
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.multimedia.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+    @Override
+    public void start(BundleContext context) throws Exception {
+
+        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+    }
+
+    @Override
+    public void stop(BundleContext context) throws Exception {
+
+    }
+
+}

[17/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 196ffa9..4ea3fa1 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -1,412 +1,412 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import java.io.InputStream;
-import java.util.List;
-import java.util.Locale;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.detect.DefaultDetector;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.exception.EncryptedDocumentException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLExtended;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.PasswordProvider;
-import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class ExcelParserTest extends TikaTest {
-    @Test
-    @SuppressWarnings("deprecation") // Checks legacy Tika-1.0 style metadata keys
-    public void testExcelParser() throws Exception {
-
-        ParseContext context = new ParseContext();
-        context.set(Locale.class, Locale.US);
-        XMLResult r = getXML("testEXCEL.xls", new OfficeParser(), new Metadata(), context);
-
-        assertEquals(
-                "application/vnd.ms-excel",
-                r.metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("Simple Excel document", r.metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("Keith Bennett", r.metadata.get(TikaCoreProperties.CREATOR));
-        assertEquals("Keith Bennett", r.metadata.get(Metadata.AUTHOR));
-
-        // Mon Oct 01 17:13:56 BST 2007
-        assertEquals("2007-10-01T16:13:56Z", r.metadata.get(TikaCoreProperties.CREATED));
-        assertEquals("2007-10-01T16:13:56Z", r.metadata.get(Metadata.CREATION_DATE));
-
-        // Mon Oct 01 17:31:43 BST 2007
-        assertEquals("2007-10-01T16:31:43Z", r.metadata.get(TikaCoreProperties.MODIFIED));
-        assertEquals("2007-10-01T16:31:43Z", r.metadata.get(Metadata.DATE));
-
-        String content = r.xml;
-        assertContains("Sample Excel Worksheet", content);
-        assertContains("Numbers and their Squares", content);
-        assertContains("<tr>\t<td />\t<td>Number</td>\t<td>Square", content);
-        assertContains("9", content);
-        assertNotContained("9.0", content);
-        assertContains("196", content);
-        assertNotContained("196.0", content);
-
-    }
-
-    @Test
-    public void testExcelParserFormatting() throws Exception {
-        ParseContext context = new ParseContext();
-        context.set(Locale.class, Locale.US);
-        XMLResult r = getXML("testEXCEL-formats.xls", new OfficeParser(), new Metadata(), context);
-
-        assertEquals(
-                "application/vnd.ms-excel",
-                r.metadata.get(Metadata.CONTENT_TYPE));
-
-        String content = r.xml;
-
-        // Number #,##0.00
-        assertContains("1,599.99", content);
-        assertContains("-1,599.99", content);
-
-        // Currency $#,##0.00;[Red]($#,##0.00)
-        assertContains("$1,599.99", content);
-        assertContains("($1,599.99)", content);
-
-        // Scientific 0.00E+00
-        // poi <=3.8beta1 returns 1.98E08, newer versions return 1.98+E08
-        assertTrue(content.contains("1.98E08") || content.contains("1.98E+08"));
-        assertTrue(content.contains("-1.98E08") || content.contains("-1.98E+08"));
-
-        // Percentage.
-        assertContains("2.50%", content);
-        // Excel rounds up to 3%, but that requires Java 1.6 or later
-        if (System.getProperty("java.version").startsWith("1.5")) {
-            assertContains("2%", content);
-        } else {
-            assertContains("3%", content);
-        }
-
-        // Time Format: h:mm
-        assertContains("6:15", content);
-        assertContains("18:15", content);
-
-        // Date Format: d-mmm-yy
-        assertContains("17-May-07", content);
-
-        // Date Format: m/d/yy
-        assertContains("10/3/09", content);
-
-        // Date/Time Format: m/d/yy h:mm
-        assertContains("1/19/08 4:35", content);
-
-        // Fraction (2.5): # ?/?
-        assertContains("2 1/2", content);
-
-
-        // Below assertions represent outstanding formatting issues to be addressed
-        // they are included to allow the issues to be progressed with the Apache POI
-        // team - See TIKA-103.
-
-        /*************************************************************************
-         // Custom Number (0 "dollars and" .00 "cents")
-         assertContains("19 dollars and .99 cents", content);
-
-         // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy)
-         assertContains("At 4:20 AM on Thursday May 17, 2007", content);
-         **************************************************************************/
-
-
-    }
-
-    @Test
-    public void testExcelParserPassword() throws Exception {
-        try {
-            XMLResult r = getXML("testEXCEL_protected_passtika.xls");
-            fail("Document is encrypted, shouldn't parse");
-        } catch (EncryptedDocumentException e) {
-            // Good
-        }
-
-        // Try again, this time with the password
-        ParseContext context = new ParseContext();
-        context.set(Locale.class, Locale.US);
-        context.set(PasswordProvider.class, new PasswordProvider() {
-            @Override
-            public String getPassword(Metadata metadata) {
-                return "tika";
-            }
-        });
-        XMLResult r = getXML("testEXCEL_protected_passtika.xls", new OfficeParser(), new Metadata(), context);
-
-        assertEquals(
-                "application/vnd.ms-excel",
-                r.metadata.get(Metadata.CONTENT_TYPE));
-
-        assertEquals(null, r.metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("Antoni", r.metadata.get(TikaCoreProperties.CREATOR));
-        assertEquals("2011-11-25T09:52:48Z", r.metadata.get(TikaCoreProperties.CREATED));
-
-        String content = r.xml;
-        assertContains("This is an Encrypted Excel spreadsheet", content);
-        assertNotContained("9.0", content);
-
-    }
-
-    /**
-     * TIKA-214 - Ensure we extract labels etc from Charts
-     */
-    @Test
-    public void testExcelParserCharts() throws Exception {
-
-        XMLResult r = getXML("testEXCEL-charts.xls", new OfficeParser());
-        assertEquals(
-                "application/vnd.ms-excel",
-                r.metadata.get(Metadata.CONTENT_TYPE));
-
-        String content = r.xml;
-
-        // The first sheet has a pie chart
-        assertContains("charttabyodawg", content);
-        assertContains("WhamPuff", content);
-
-        // The second sheet has a bar chart and some text
-        assertContains("Sheet1", content);
-        assertContains("Test Excel Spreasheet", content);
-        assertContains("foo", content);
-        assertContains("bar", content);
-        assertContains("fizzlepuff", content);
-        assertContains("whyaxis", content);
-        assertContains("eksaxis", content);
-
-        // The third sheet has some text
-        assertContains("Sheet2", content);
-        assertContains("dingdong", content);
-
-    }
-
-    @Test
-    public void testJXL() throws Exception {
-
-        XMLResult r = getXML("jxl.xls", new OfficeParser());
-        assertEquals(
-                "application/vnd.ms-excel",
-                r.metadata.get(Metadata.CONTENT_TYPE));
-        assertContains("Number Formats", r.xml);
-
-    }
-
-    @Test
-    public void testWorksSpreadsheet70() throws Exception {
-        assertContains("Microsoft Works",
-                getXML("testWORKSSpreadsheet7.0.xlr", new OfficeParser()).xml);
-    }
-
-    /**
-     * We don't currently support the .xlsb file format 
-     *  (an OOXML container with binary blobs), but we 
-     *  shouldn't break on these files either (TIKA-826)  
-     */
-    @Test
-    public void testExcelXLSB() throws Exception {
-        Detector detector = new DefaultDetector();
-        AutoDetectParser parser = new AutoDetectParser();
-
-        Metadata m = new Metadata();
-        m.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");
-
-        // Should be detected correctly
-        MediaType type;
-        try (InputStream input = getTestDocumentAsStream("testEXCEL.xlsb")) {
-            type = detector.detect(input, m);
-            assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString());
-        }
-
-        // OfficeParser won't handle it
-        assertEquals(false, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
-
-        // OOXMLParser won't handle it
-        assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
-
-        // AutoDetectParser doesn't break on it
-        assertContains("<body />", getXML("testEXCEL.xlsb").xml);
-
-    }
-
-    /**
-     * Excel 5 and 95 are older formats, and only get basic support
-     */
-    @Test
-    public void testExcel95() throws Exception {
-        Detector detector = new DefaultDetector();
-        AutoDetectParser parser = new AutoDetectParser();
-        MediaType type;
-        Metadata m;
-
-        // First try detection of Excel 5
-        m = new Metadata();
-        m.add(Metadata.RESOURCE_NAME_KEY, "excel_5.xls");
-        try (InputStream input = getTestDocumentAsStream("testEXCEL_5.xls")) {
-            type = detector.detect(input, m);
-            assertEquals("application/vnd.ms-excel", type.toString());
-        }
-
-        // Now Excel 95
-        m = new Metadata();
-        m.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");
-        try (InputStream input = getTestDocumentAsStream("testEXCEL_95.xls")) {
-            type = detector.detect(input, m);
-            assertEquals("application/vnd.ms-excel", type.toString());
-        }
-
-        // OfficeParser can handle it
-        assertEquals(true, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
-
-        // OOXMLParser won't handle it
-        assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
-
-
-        // Parse the Excel 5 file
-        m = new Metadata();
-        try (InputStream input = getTestDocumentAsStream("testEXCEL_5.xls")) {
-            ContentHandler handler = new BodyContentHandler(-1);
-            ParseContext context = new ParseContext();
-            context.set(Locale.class, Locale.US);
-            parser.parse(input, handler, m, context);
-
-            String content = handler.toString();
-
-            // Sheet names
-            assertContains("Feuil1", content);
-            assertContains("Feuil3", content);
-
-            // Text
-            assertContains("Sample Excel", content);
-            assertContains("Number", content);
-
-            // Numbers
-            assertContains("15", content);
-            assertContains("225", content);
-
-            // Metadata was also fetched
-            assertEquals("Simple Excel document", m.get(TikaCoreProperties.TITLE));
-            assertEquals("Keith Bennett", m.get(TikaCoreProperties.CREATOR));
-        }
-
-        // Parse the Excel 95 file
-        m = new Metadata();
-        try (InputStream input = getTestDocumentAsStream("testEXCEL_95.xls")) {
-            ContentHandler handler = new BodyContentHandler(-1);
-            ParseContext context = new ParseContext();
-            context.set(Locale.class, Locale.US);
-            parser.parse(input, handler, m, context);
-
-            String content = handler.toString();
-
-            // Sheet name
-            assertContains("Foglio1", content);
-
-            // Very boring file, no actual text or numbers!
-
-            // Metadata was also fetched
-            assertEquals(null, m.get(TikaCoreProperties.TITLE));
-            assertEquals("Marco Quaranta", m.get(Office.LAST_AUTHOR));
-        }
-    }
-
-    /**
-     * Ensures that custom OLE2 (HPSF) properties are extracted
-     */
-    @Test
-    public void testCustomProperties() throws Exception {
-        ParseContext context = new ParseContext();
-        context.set(Locale.class, Locale.US);
-
-        XMLResult r = getXML("testEXCEL_custom_props.xls", new OfficeParser(), new Metadata(), context);
-        Metadata metadata = r.metadata;
-        assertEquals("application/vnd.ms-excel", metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("", metadata.get(TikaCoreProperties.CREATOR));
-        assertEquals("", metadata.get(TikaCoreProperties.MODIFIER));
-        assertEquals("2011-08-22T13:45:54Z", metadata.get(TikaCoreProperties.MODIFIED));
-        assertEquals("2006-09-12T15:06:44Z", metadata.get(TikaCoreProperties.CREATED));
-        assertEquals("Microsoft Excel", metadata.get(OfficeOpenXMLExtended.APPLICATION));
-        assertEquals("true", metadata.get("custom:myCustomBoolean"));
-        assertEquals("3", metadata.get("custom:myCustomNumber"));
-        assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
-        assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
-        assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
-    }
-
-	@Test
-    public void testHeaderAndFooterExtraction() throws Exception {
-        ParseContext context = new ParseContext();
-        context.set(Locale.class, Locale.UK);
-
-        XMLResult r = getXML("testEXCEL_headers_footers.xls", new OfficeParser(),
-                new Metadata(), context);
-
-        Metadata metadata = r.metadata;
-        assertEquals(
-                "application/vnd.ms-excel",
-                metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("Internal spreadsheet", metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("Aeham Abushwashi", metadata.get(TikaCoreProperties.CREATOR));
-        assertEquals("Aeham Abushwashi", metadata.get(Metadata.AUTHOR));
-
-        String content = r.xml;
-        assertContains("John Smith1", content);
-        assertContains("John Smith50", content);
-        assertContains("1 Corporate HQ", content);
-        assertContains("Header - Corporate Spreadsheet", content);
-        assertContains("Header - For Internal Use Only", content);
-        assertContains("Header - Author: John Smith", content);
-        assertContains("Footer - Corporate Spreadsheet", content);
-        assertContains("Footer - For Internal Use Only", content);
-        assertContains("Footer - Author: John Smith", content);
-
-    }
-
-    @Test
-    public void testHyperlinksInXLS() throws Exception {
-        String xml = getXML("testEXCEL_hyperlinks.xls").xml;
-        //external url
-        assertContains("<a href=\"http://tika.apache.org/\">", xml);
-        //mail url
-        assertContains("<a href=\"mailto:user@tika.apache.org?subject=help\">", xml);
-        //external linked file
-        assertContains("<a href=\"linked_file.txt.htm\">", xml);
-
-        //TODO: not extracting these yet
-        //link on textbox
-//        assertContains("<a href=\"http://tika.apache.org/1.12/gettingstarted.html\">", xml);
-    }
-
-    @Test
-    public void testEmbeddedPDF() throws Exception {
-        List<Metadata> metadataList = getRecursiveJson("testEXCEL_embeddedPDF.xls");
-        assertEquals("application/pdf", metadataList.get(2).get(Metadata.CONTENT_TYPE));
-    }
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.InputStream;
+import java.util.List;
+import java.util.Locale;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class ExcelParserTest extends TikaTest {
+    @Test
+    @SuppressWarnings("deprecation") // Checks legacy Tika-1.0 style metadata keys
+    public void testExcelParser() throws Exception {
+
+        ParseContext context = new ParseContext();
+        context.set(Locale.class, Locale.US);
+        XMLResult r = getXML("testEXCEL.xls", new OfficeParser(), new Metadata(), context);
+
+        assertEquals(
+                "application/vnd.ms-excel",
+                r.metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Simple Excel document", r.metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Keith Bennett", r.metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Keith Bennett", r.metadata.get(Metadata.AUTHOR));
+
+        // Mon Oct 01 17:13:56 BST 2007
+        assertEquals("2007-10-01T16:13:56Z", r.metadata.get(TikaCoreProperties.CREATED));
+        assertEquals("2007-10-01T16:13:56Z", r.metadata.get(Metadata.CREATION_DATE));
+
+        // Mon Oct 01 17:31:43 BST 2007
+        assertEquals("2007-10-01T16:31:43Z", r.metadata.get(TikaCoreProperties.MODIFIED));
+        assertEquals("2007-10-01T16:31:43Z", r.metadata.get(Metadata.DATE));
+
+        String content = r.xml;
+        assertContains("Sample Excel Worksheet", content);
+        assertContains("Numbers and their Squares", content);
+        assertContains("<tr>\t<td />\t<td>Number</td>\t<td>Square", content);
+        assertContains("9", content);
+        assertNotContained("9.0", content);
+        assertContains("196", content);
+        assertNotContained("196.0", content);
+
+    }
+
+    @Test
+    public void testExcelParserFormatting() throws Exception {
+        ParseContext context = new ParseContext();
+        context.set(Locale.class, Locale.US);
+        XMLResult r = getXML("testEXCEL-formats.xls", new OfficeParser(), new Metadata(), context);
+
+        assertEquals(
+                "application/vnd.ms-excel",
+                r.metadata.get(Metadata.CONTENT_TYPE));
+
+        String content = r.xml;
+
+        // Number #,##0.00
+        assertContains("1,599.99", content);
+        assertContains("-1,599.99", content);
+
+        // Currency $#,##0.00;[Red]($#,##0.00)
+        assertContains("$1,599.99", content);
+        assertContains("($1,599.99)", content);
+
+        // Scientific 0.00E+00
+        // poi <=3.8beta1 returns 1.98E08, newer versions return 1.98+E08
+        assertTrue(content.contains("1.98E08") || content.contains("1.98E+08"));
+        assertTrue(content.contains("-1.98E08") || content.contains("-1.98E+08"));
+
+        // Percentage.
+        assertContains("2.50%", content);
+        // Excel rounds up to 3%, but that requires Java 1.6 or later
+        if (System.getProperty("java.version").startsWith("1.5")) {
+            assertContains("2%", content);
+        } else {
+            assertContains("3%", content);
+        }
+
+        // Time Format: h:mm
+        assertContains("6:15", content);
+        assertContains("18:15", content);
+
+        // Date Format: d-mmm-yy
+        assertContains("17-May-07", content);
+
+        // Date Format: m/d/yy
+        assertContains("10/3/09", content);
+
+        // Date/Time Format: m/d/yy h:mm
+        assertContains("1/19/08 4:35", content);
+
+        // Fraction (2.5): # ?/?
+        assertContains("2 1/2", content);
+
+
+        // Below assertions represent outstanding formatting issues to be addressed
+        // they are included to allow the issues to be progressed with the Apache POI
+        // team - See TIKA-103.
+
+        /*************************************************************************
+         // Custom Number (0 "dollars and" .00 "cents")
+         assertContains("19 dollars and .99 cents", content);
+
+         // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy)
+         assertContains("At 4:20 AM on Thursday May 17, 2007", content);
+         **************************************************************************/
+
+
+    }
+
+    @Test
+    public void testExcelParserPassword() throws Exception {
+        try {
+            XMLResult r = getXML("testEXCEL_protected_passtika.xls");
+            fail("Document is encrypted, shouldn't parse");
+        } catch (EncryptedDocumentException e) {
+            // Good
+        }
+
+        // Try again, this time with the password
+        ParseContext context = new ParseContext();
+        context.set(Locale.class, Locale.US);
+        context.set(PasswordProvider.class, new PasswordProvider() {
+            @Override
+            public String getPassword(Metadata metadata) {
+                return "tika";
+            }
+        });
+        XMLResult r = getXML("testEXCEL_protected_passtika.xls", new OfficeParser(), new Metadata(), context);
+
+        assertEquals(
+                "application/vnd.ms-excel",
+                r.metadata.get(Metadata.CONTENT_TYPE));
+
+        assertEquals(null, r.metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Antoni", r.metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("2011-11-25T09:52:48Z", r.metadata.get(TikaCoreProperties.CREATED));
+
+        String content = r.xml;
+        assertContains("This is an Encrypted Excel spreadsheet", content);
+        assertNotContained("9.0", content);
+
+    }
+
+    /**
+     * TIKA-214 - Ensure we extract labels etc from Charts
+     */
+    @Test
+    public void testExcelParserCharts() throws Exception {
+
+        XMLResult r = getXML("testEXCEL-charts.xls", new OfficeParser());
+        assertEquals(
+                "application/vnd.ms-excel",
+                r.metadata.get(Metadata.CONTENT_TYPE));
+
+        String content = r.xml;
+
+        // The first sheet has a pie chart
+        assertContains("charttabyodawg", content);
+        assertContains("WhamPuff", content);
+
+        // The second sheet has a bar chart and some text
+        assertContains("Sheet1", content);
+        assertContains("Test Excel Spreasheet", content);
+        assertContains("foo", content);
+        assertContains("bar", content);
+        assertContains("fizzlepuff", content);
+        assertContains("whyaxis", content);
+        assertContains("eksaxis", content);
+
+        // The third sheet has some text
+        assertContains("Sheet2", content);
+        assertContains("dingdong", content);
+
+    }
+
+    @Test
+    public void testJXL() throws Exception {
+
+        XMLResult r = getXML("jxl.xls", new OfficeParser());
+        assertEquals(
+                "application/vnd.ms-excel",
+                r.metadata.get(Metadata.CONTENT_TYPE));
+        assertContains("Number Formats", r.xml);
+
+    }
+
+    @Test
+    public void testWorksSpreadsheet70() throws Exception {
+        assertContains("Microsoft Works",
+                getXML("testWORKSSpreadsheet7.0.xlr", new OfficeParser()).xml);
+    }
+
+    /**
+     * We don't currently support the .xlsb file format
+     * (an OOXML container with binary blobs), but we
+     * shouldn't break on these files either (TIKA-826)
+     */
+    @Test
+    public void testExcelXLSB() throws Exception {
+        Detector detector = new DefaultDetector();
+        AutoDetectParser parser = new AutoDetectParser();
+
+        Metadata m = new Metadata();
+        m.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");
+
+        // Should be detected correctly
+        MediaType type;
+        try (InputStream input = getTestDocumentAsStream("testEXCEL.xlsb")) {
+            type = detector.detect(input, m);
+            assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString());
+        }
+
+        // OfficeParser won't handle it
+        assertEquals(false, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
+
+        // OOXMLParser won't handle it
+        assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
+
+        // AutoDetectParser doesn't break on it
+        assertContains("<body />", getXML("testEXCEL.xlsb").xml);
+
+    }
+
+    /**
+     * Excel 5 and 95 are older formats, and only get basic support
+     */
+    @Test
+    public void testExcel95() throws Exception {
+        Detector detector = new DefaultDetector();
+        AutoDetectParser parser = new AutoDetectParser();
+        MediaType type;
+        Metadata m;
+
+        // First try detection of Excel 5
+        m = new Metadata();
+        m.add(Metadata.RESOURCE_NAME_KEY, "excel_5.xls");
+        try (InputStream input = getTestDocumentAsStream("testEXCEL_5.xls")) {
+            type = detector.detect(input, m);
+            assertEquals("application/vnd.ms-excel", type.toString());
+        }
+
+        // Now Excel 95
+        m = new Metadata();
+        m.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");
+        try (InputStream input = getTestDocumentAsStream("testEXCEL_95.xls")) {
+            type = detector.detect(input, m);
+            assertEquals("application/vnd.ms-excel", type.toString());
+        }
+
+        // OfficeParser can handle it
+        assertEquals(true, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
+
+        // OOXMLParser won't handle it
+        assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
+
+
+        // Parse the Excel 5 file
+        m = new Metadata();
+        try (InputStream input = getTestDocumentAsStream("testEXCEL_5.xls")) {
+            ContentHandler handler = new BodyContentHandler(-1);
+            ParseContext context = new ParseContext();
+            context.set(Locale.class, Locale.US);
+            parser.parse(input, handler, m, context);
+
+            String content = handler.toString();
+
+            // Sheet names
+            assertContains("Feuil1", content);
+            assertContains("Feuil3", content);
+
+            // Text
+            assertContains("Sample Excel", content);
+            assertContains("Number", content);
+
+            // Numbers
+            assertContains("15", content);
+            assertContains("225", content);
+
+            // Metadata was also fetched
+            assertEquals("Simple Excel document", m.get(TikaCoreProperties.TITLE));
+            assertEquals("Keith Bennett", m.get(TikaCoreProperties.CREATOR));
+        }
+
+        // Parse the Excel 95 file
+        m = new Metadata();
+        try (InputStream input = getTestDocumentAsStream("testEXCEL_95.xls")) {
+            ContentHandler handler = new BodyContentHandler(-1);
+            ParseContext context = new ParseContext();
+            context.set(Locale.class, Locale.US);
+            parser.parse(input, handler, m, context);
+
+            String content = handler.toString();
+
+            // Sheet name
+            assertContains("Foglio1", content);
+
+            // Very boring file, no actual text or numbers!
+
+            // Metadata was also fetched
+            assertEquals(null, m.get(TikaCoreProperties.TITLE));
+            assertEquals("Marco Quaranta", m.get(Office.LAST_AUTHOR));
+        }
+    }
+
+    /**
+     * Ensures that custom OLE2 (HPSF) properties are extracted
+     */
+    @Test
+    public void testCustomProperties() throws Exception {
+        ParseContext context = new ParseContext();
+        context.set(Locale.class, Locale.US);
+
+        XMLResult r = getXML("testEXCEL_custom_props.xls", new OfficeParser(), new Metadata(), context);
+        Metadata metadata = r.metadata;
+        assertEquals("application/vnd.ms-excel", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("", metadata.get(TikaCoreProperties.MODIFIER));
+        assertEquals("2011-08-22T13:45:54Z", metadata.get(TikaCoreProperties.MODIFIED));
+        assertEquals("2006-09-12T15:06:44Z", metadata.get(TikaCoreProperties.CREATED));
+        assertEquals("Microsoft Excel", metadata.get(OfficeOpenXMLExtended.APPLICATION));
+        assertEquals("true", metadata.get("custom:myCustomBoolean"));
+        assertEquals("3", metadata.get("custom:myCustomNumber"));
+        assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
+        assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
+        assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
+    }
+
+    @Test
+    public void testHeaderAndFooterExtraction() throws Exception {
+        ParseContext context = new ParseContext();
+        context.set(Locale.class, Locale.UK);
+
+        XMLResult r = getXML("testEXCEL_headers_footers.xls", new OfficeParser(),
+                new Metadata(), context);
+
+        Metadata metadata = r.metadata;
+        assertEquals(
+                "application/vnd.ms-excel",
+                metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Internal spreadsheet", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Aeham Abushwashi", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Aeham Abushwashi", metadata.get(Metadata.AUTHOR));
+
+        String content = r.xml;
+        assertContains("John Smith1", content);
+        assertContains("John Smith50", content);
+        assertContains("1 Corporate HQ", content);
+        assertContains("Header - Corporate Spreadsheet", content);
+        assertContains("Header - For Internal Use Only", content);
+        assertContains("Header - Author: John Smith", content);
+        assertContains("Footer - Corporate Spreadsheet", content);
+        assertContains("Footer - For Internal Use Only", content);
+        assertContains("Footer - Author: John Smith", content);
+
+    }
+
+    @Test
+    public void testHyperlinksInXLS() throws Exception {
+        String xml = getXML("testEXCEL_hyperlinks.xls").xml;
+        //external url
+        assertContains("<a href=\"http://tika.apache.org/\">", xml);
+        //mail url
+        assertContains("<a href=\"mailto:user@tika.apache.org?subject=help\">", xml);
+        //external linked file
+        assertContains("<a href=\"linked_file.txt.htm\">", xml);
+
+        //TODO: not extracting these yet
+        //link on textbox
+//        assertContains("<a href=\"http://tika.apache.org/1.12/gettingstarted.html\">", xml);
+    }
+
+    @Test
+    public void testEmbeddedPDF() throws Exception {
+        List<Metadata> metadataList = getRecursiveJson("testEXCEL_embeddedPDF.xls");
+        assertEquals("application/pdf", metadataList.get(2).get(Metadata.CONTENT_TYPE));
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java
index 07644dd..beffee6 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java
@@ -1,46 +1,46 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import static org.junit.Assert.assertTrue;
-
-import java.io.InputStream;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.microsoft.ooxml.OOXMLParserTest;
-import org.junit.Test;
-
-
-public class OfficeParserTest extends TikaTest {
-
-    @Test
-    public void parseOfficeWord() throws Exception {
-        Metadata metadata = new Metadata();
-        Parser parser = new OfficeParser();
-
-        String xml = getXML(getTestDocument("test.doc"), parser, metadata).xml;
-
-        assertTrue(xml.contains("test"));
-    }
-
-    private InputStream getTestDocument(String name) {
-        return TikaInputStream.get(OOXMLParserTest.class.getResourceAsStream("/test-documents/" + name));
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLParserTest;
+import org.junit.Test;
+
+
+public class OfficeParserTest extends TikaTest {
+
+    @Test
+    public void parseOfficeWord() throws Exception {
+        Metadata metadata = new Metadata();
+        Parser parser = new OfficeParser();
+
+        String xml = getXML(getTestDocument("test.doc"), parser, metadata).xml;
+
+        assertTrue(xml.contains("test"));
+    }
+
+    private InputStream getTestDocument(String name) {
+        return TikaInputStream.get(OOXMLParserTest.class.getResourceAsStream("/test-documents/" + name));
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index fbf8114..8662e65 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -1,239 +1,239 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
-import javax.xml.transform.OutputKeys;
-import javax.xml.transform.sax.SAXTransformerFactory;
-import javax.xml.transform.sax.TransformerHandler;
-import javax.xml.transform.stream.StreamResult;
-import java.io.InputStream;
-import java.io.StringWriter;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-/**
- * Test case for parsing Outlook files.
- */
-public class OutlookParserTest extends TikaTest {
-
-    @Test
-    public void testOutlookParsing() throws Exception {
-        Parser parser = new AutoDetectParser(); // Should auto-detect!
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
-                "/test-documents/test-outlook.msg")) {
-            parser.parse(stream, handler, metadata, new ParseContext());
-        }
-
-        assertEquals(
-                "application/vnd.ms-outlook",
-                metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals(
-                "Microsoft Outlook Express 6",
-                metadata.get(TikaCoreProperties.TITLE));
-        assertEquals(
-                "Nouvel utilisateur de Outlook Express",
-                metadata.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
-        assertEquals(
-                "L'\u00C9quipe Microsoft Outlook Express",
-                metadata.get(TikaCoreProperties.CREATOR));
-        assertEquals(
-                "L'\u00C9quipe Microsoft Outlook Express",
-                metadata.get(Metadata.AUTHOR));
-
-        // Stored as Thu, 5 Apr 2007 09:26:06 -0700
-        assertEquals(
-                "2007-04-05T16:26:06Z",
-                metadata.get(TikaCoreProperties.CREATED));
-
-        String content = handler.toString();
-        assertContains("Microsoft Outlook Express 6", content);
-        assertContains("L'\u00C9quipe Microsoft Outlook Express", content);
-        assertContains("Nouvel utilisateur de Outlook Express", content);
-        assertContains("Messagerie et groupes de discussion", content);
-    }
-
-    /**
-     * Test case for TIKA-197
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-197">TIKA-197</a>
-     */
-    @Test
-    public void testMultipleCopies() throws Exception {
-        Parser parser = new AutoDetectParser();
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
-                "/test-documents/testMSG.msg")) {
-            parser.parse(stream, handler, metadata, new ParseContext());
-        }
-
-        assertEquals(
-                "application/vnd.ms-outlook",
-                metadata.get(Metadata.CONTENT_TYPE));
-
-        String content = handler.toString();
-        Pattern pattern = Pattern.compile("From");
-        Matcher matcher = pattern.matcher(content);
-        assertTrue(matcher.find());
-        assertFalse(matcher.find());
-    }
-
-    /**
-     * Test case for TIKA-395, to ensure parser works for new Outlook formats.
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-395">TIKA-395</a>
-     */
-    @Test
-    public void testOutlookNew() throws Exception {
-        Parser parser = new AutoDetectParser();
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
-                "/test-documents/test-outlook2003.msg")) {
-            parser.parse(stream, handler, metadata, new ParseContext());
-        }
-
-        assertEquals(
-                "application/vnd.ms-outlook",
-                metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals(
-                "Welcome to Microsoft Office Outlook 2003",
-                metadata.get(TikaCoreProperties.TITLE));
-
-        String content = handler.toString();
-        assertContains("Outlook 2003", content);
-        assertContains("Streamlined Mail Experience", content);
-        assertContains("Navigation Pane", content);
-    }
-
-    @Test
-    public void testOutlookHTMLVersion() throws Exception {
-        Parser parser = new AutoDetectParser();
-        Metadata metadata = new Metadata();
-
-        // Check the HTML version
-        StringWriter sw = new StringWriter();
-        SAXTransformerFactory factory = (SAXTransformerFactory)
-                SAXTransformerFactory.newInstance();
-        TransformerHandler handler = factory.newTransformerHandler();
-        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
-        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
-        handler.setResult(new StreamResult(sw));
-
-        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
-                "/test-documents/testMSG_chinese.msg")) {
-            parser.parse(stream, handler, metadata, new ParseContext());
-        }
-
-        // As the HTML version should have been processed, ensure
-        //  we got some of the links
-        String content = sw.toString();
-        assertContains("<dd>tests.chang@fengttt.com</dd>", content);
-        assertContains("<p>Alfresco MSG format testing", content);
-        assertContains("<li>1", content);
-        assertContains("<li>2", content);
-
-        // Make sure we don't have nested html docs
-        assertEquals(2, content.split("<body>").length);
-        assertEquals(2, content.split("<\\/body>").length);
-
-        // Make sure that the Chinese actually came through
-        assertContains("\u5F35\u6BD3\u502B", metadata.get(TikaCoreProperties.CREATOR));
-        assertContains("\u9673\u60E0\u73CD", content);
-    }
-
-    @Test
-    public void testOutlookForwarded() throws Exception {
-        Parser parser = new AutoDetectParser();
-        Metadata metadata = new Metadata();
-
-        // Check the HTML version
-        StringWriter sw = new StringWriter();
-        SAXTransformerFactory factory = (SAXTransformerFactory)
-                SAXTransformerFactory.newInstance();
-        TransformerHandler handler = factory.newTransformerHandler();
-        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
-        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
-        handler.setResult(new StreamResult(sw));
-
-        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
-                "/test-documents/testMSG_forwarded.msg")) {
-            parser.parse(stream, handler, metadata, new ParseContext());
-        }
-
-        // Make sure we don't have nested docs
-        String content = sw.toString();
-        assertEquals(2, content.split("<body>").length);
-        assertEquals(2, content.split("<\\/body>").length);
-    }
-
-    @Test
-    public void testOutlookHTMLfromRTF() throws Exception {
-        Parser parser = new AutoDetectParser();
-        Metadata metadata = new Metadata();
-
-        // Check the HTML version
-        StringWriter sw = new StringWriter();
-        SAXTransformerFactory factory = (SAXTransformerFactory)
-                SAXTransformerFactory.newInstance();
-        TransformerHandler handler = factory.newTransformerHandler();
-        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
-        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
-        handler.setResult(new StreamResult(sw));
-
-        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
-                "/test-documents/test-outlook2003.msg")) {
-            parser.parse(stream, handler, metadata, new ParseContext());
-        }
-
-        // As the HTML version should have been processed, ensure
-        //  we got some of the links
-        String content = sw.toString().replaceAll("<p>\\s+", "<p>");
-        assertContains("<dd>New Outlook User</dd>", content);
-        assertContains("designed <i>to help you", content);
-        assertContains("<p><a href=\"http://r.office.microsoft.com/r/rlidOutlookWelcomeMail10?clid=1033\">Cached Exchange Mode</a>", content);
-
-        // Link - check text around it, and the link itself
-        assertContains("sign up for a free subscription", content);
-        assertContains("Office Newsletter", content);
-        assertContains("newsletter will be sent to you", content);
-        assertContains("http://r.office.microsoft.com/r/rlidNewsletterSignUp?clid=1033", content);
-
-        // Make sure we don't have nested html docs
-        assertEquals(2, content.split("<body>").length);
-        assertEquals(2, content.split("<\\/body>").length);
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing Outlook files.
+ */
+public class OutlookParserTest extends TikaTest {
+
+    @Test
+    public void testOutlookParsing() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+                "/test-documents/test-outlook.msg")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        assertEquals(
+                "application/vnd.ms-outlook",
+                metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(
+                "Microsoft Outlook Express 6",
+                metadata.get(TikaCoreProperties.TITLE));
+        assertEquals(
+                "Nouvel utilisateur de Outlook Express",
+                metadata.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
+        assertEquals(
+                "L'\u00C9quipe Microsoft Outlook Express",
+                metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals(
+                "L'\u00C9quipe Microsoft Outlook Express",
+                metadata.get(Metadata.AUTHOR));
+
+        // Stored as Thu, 5 Apr 2007 09:26:06 -0700
+        assertEquals(
+                "2007-04-05T16:26:06Z",
+                metadata.get(TikaCoreProperties.CREATED));
+
+        String content = handler.toString();
+        assertContains("Microsoft Outlook Express 6", content);
+        assertContains("L'\u00C9quipe Microsoft Outlook Express", content);
+        assertContains("Nouvel utilisateur de Outlook Express", content);
+        assertContains("Messagerie et groupes de discussion", content);
+    }
+
+    /**
+     * Test case for TIKA-197
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-197">TIKA-197</a>
+     */
+    @Test
+    public void testMultipleCopies() throws Exception {
+        Parser parser = new AutoDetectParser();
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+                "/test-documents/testMSG.msg")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        assertEquals(
+                "application/vnd.ms-outlook",
+                metadata.get(Metadata.CONTENT_TYPE));
+
+        String content = handler.toString();
+        Pattern pattern = Pattern.compile("From");
+        Matcher matcher = pattern.matcher(content);
+        assertTrue(matcher.find());
+        assertFalse(matcher.find());
+    }
+
+    /**
+     * Test case for TIKA-395, to ensure parser works for new Outlook formats.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-395">TIKA-395</a>
+     */
+    @Test
+    public void testOutlookNew() throws Exception {
+        Parser parser = new AutoDetectParser();
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+                "/test-documents/test-outlook2003.msg")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        assertEquals(
+                "application/vnd.ms-outlook",
+                metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(
+                "Welcome to Microsoft Office Outlook 2003",
+                metadata.get(TikaCoreProperties.TITLE));
+
+        String content = handler.toString();
+        assertContains("Outlook 2003", content);
+        assertContains("Streamlined Mail Experience", content);
+        assertContains("Navigation Pane", content);
+    }
+
+    @Test
+    public void testOutlookHTMLVersion() throws Exception {
+        Parser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+
+        // Check the HTML version
+        StringWriter sw = new StringWriter();
+        SAXTransformerFactory factory = (SAXTransformerFactory)
+                SAXTransformerFactory.newInstance();
+        TransformerHandler handler = factory.newTransformerHandler();
+        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+        handler.setResult(new StreamResult(sw));
+
+        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+                "/test-documents/testMSG_chinese.msg")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        // As the HTML version should have been processed, ensure
+        //  we got some of the links
+        String content = sw.toString();
+        assertContains("<dd>tests.chang@fengttt.com</dd>", content);
+        assertContains("<p>Alfresco MSG format testing", content);
+        assertContains("<li>1", content);
+        assertContains("<li>2", content);
+
+        // Make sure we don't have nested html docs
+        assertEquals(2, content.split("<body>").length);
+        assertEquals(2, content.split("<\\/body>").length);
+
+        // Make sure that the Chinese actually came through
+        assertContains("\u5F35\u6BD3\u502B", metadata.get(TikaCoreProperties.CREATOR));
+        assertContains("\u9673\u60E0\u73CD", content);
+    }
+
+    @Test
+    public void testOutlookForwarded() throws Exception {
+        Parser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+
+        // Check the HTML version
+        StringWriter sw = new StringWriter();
+        SAXTransformerFactory factory = (SAXTransformerFactory)
+                SAXTransformerFactory.newInstance();
+        TransformerHandler handler = factory.newTransformerHandler();
+        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+        handler.setResult(new StreamResult(sw));
+
+        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+                "/test-documents/testMSG_forwarded.msg")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        // Make sure we don't have nested docs
+        String content = sw.toString();
+        assertEquals(2, content.split("<body>").length);
+        assertEquals(2, content.split("<\\/body>").length);
+    }
+
+    @Test
+    public void testOutlookHTMLfromRTF() throws Exception {
+        Parser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+
+        // Check the HTML version
+        StringWriter sw = new StringWriter();
+        SAXTransformerFactory factory = (SAXTransformerFactory)
+                SAXTransformerFactory.newInstance();
+        TransformerHandler handler = factory.newTransformerHandler();
+        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+        handler.setResult(new StreamResult(sw));
+
+        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+                "/test-documents/test-outlook2003.msg")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        // As the HTML version should have been processed, ensure
+        //  we got some of the links
+        String content = sw.toString().replaceAll("<p>\\s+", "<p>");
+        assertContains("<dd>New Outlook User</dd>", content);
+        assertContains("designed <i>to help you", content);
+        assertContains("<p><a href=\"http://r.office.microsoft.com/r/rlidOutlookWelcomeMail10?clid=1033\">Cached Exchange Mode</a>", content);
+
+        // Link - check text around it, and the link itself
+        assertContains("sign up for a free subscription", content);
+        assertContains("Office Newsletter", content);
+        assertContains("newsletter will be sent to you", content);
+        assertContains("http://r.office.microsoft.com/r/rlidNewsletterSignUp?clid=1033", content);
+
+        // Make sure we don't have nested html docs
+        assertEquals(2, content.split("<body>").length);
+        assertEquals(2, content.split("<\\/body>").length);
+    }
+}

[09/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
index 77773e0..f9df9e0 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
@@ -1,544 +1,544 @@
-/**
- * ******************************************************************************
- * Copyright (C) 2005-2009, International Business Machines Corporation and    *
- * others. All Rights Reserved.                                                *
- * ******************************************************************************
- */
-package org.apache.tika.parser.txt;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.Reader;
-import java.nio.charset.Charset;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-
-
-/**
- * <code>CharsetDetector</code> provides a facility for detecting the
- * charset or encoding of character data in an unknown format.
- * The input data can either be from an input stream or an array of bytes.
- * The result of the detection operation is a list of possibly matching
- * charsets, or, for simple use, you can just ask for a Java Reader that
- * will will work over the input data.
- * <p/>
- * Character set detection is at best an imprecise operation.  The detection
- * process will attempt to identify the charset that best matches the characteristics
- * of the byte data, but the process is partly statistical in nature, and
- * the results can not be guaranteed to always be correct.
- * <p/>
- * For best accuracy in charset detection, the input data should be primarily
- * in a single language, and a minimum of a few hundred bytes worth of plain text
- * in the language are needed.  The detection process will attempt to
- * ignore html or xml style markup that could otherwise obscure the content.
- * <p/>
- * @stable ICU 3.4
- */
-public class CharsetDetector {
-
-//   Question: Should we have getters corresponding to the setters for input text
-//   and declared encoding?
-
-//   A thought: If we were to create our own type of Java Reader, we could defer
-//   figuring out an actual charset for data that starts out with too much English
-//   only ASCII until the user actually read through to something that didn't look
-//   like 7 bit English.  If  nothing else ever appeared, we would never need to
-//   actually choose the "real" charset.  All assuming that the application just
-//   wants the data, and doesn't care about a char set name.
-
-    private static final int kBufSize = 12000;
-    private static final int MAX_CONFIDENCE = 100;
-    private static String[] fCharsetNames;
-    /*
-     * List of recognizers for all charsets known to the implementation.
-     */
-    private static ArrayList<CharsetRecognizer> fCSRecognizers = createRecognizers();
-    /*
-     *  The following items are accessed by individual CharsetRecongizers during
-     *     the recognition process
-     *
-     */
-    byte[] fInputBytes =       // The text to be checked.  Markup will have been
-            new byte[kBufSize];  //   removed if appropriate.
-    int fInputLen;          // Length of the byte data in fInputText.
-    short fByteStats[] =      // byte frequency statistics for the input text.
-            new short[256];  //   Value is percent, not absolute.
-    boolean fC1Bytes =          // True if any bytes in the range 0x80 - 0x9F are in the input;
-            false;
-    String fDeclaredEncoding;
-    //
-    //  Stuff private to CharsetDetector
-    //
-    byte[] fRawInput;     // Original, untouched input bytes.
-    //  If user gave us a byte array, this is it.
-    //  If user gave us a stream, it's read to a
-    //  buffer here.
-    int fRawLength;    // Length of data in fRawInput array.
-    InputStream fInputStream;  // User's input stream, or null if the user
-    boolean fStripTags =   // If true, setText() will strip tags from input text.
-            false;
-
-    /**
-     *   Constructor
-     *
-     * @stable ICU 3.4
-     */
-    public CharsetDetector() {
-    }
-
-    /**
-     * Get the names of all char sets that can be recognized by the char set detector.
-     *
-     * @return an array of the names of all charsets that can be recognized
-     * by the charset detector.
-     *
-     * @stable ICU 3.4
-     */
-    public static String[] getAllDetectableCharsets() {
-        return fCharsetNames;
-    }
-
-    /*
-     * Create the singleton instances of the CharsetRecognizer classes
-     */
-    private static ArrayList<CharsetRecognizer> createRecognizers() {
-        ArrayList<CharsetRecognizer> recognizers = new ArrayList<CharsetRecognizer>();
-
-        recognizers.add(new CharsetRecog_UTF8());
-
-        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE());
-        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE());
-        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE());
-        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE());
-
-        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
-        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
-        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
-        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
-        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030());
-        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp());
-        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr());
-        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5());
-
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr());
-
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr());
-
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_en());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_de());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_es());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_fr());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_it());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl());
-
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM866_ru());
-
-        // Create an array of all charset names, as a side effect.
-        // Needed for the getAllDetectableCharsets() API.
-        String[] charsetNames = new String[recognizers.size()];
-        int out = 0;
-
-        for (CharsetRecognizer recognizer : recognizers) {
-            String name = recognizer.getName();
-
-            if (out == 0 || !name.equals(charsetNames[out - 1])) {
-                charsetNames[out++] = name;
-            }
-        }
-
-        fCharsetNames = new String[out];
-        System.arraycopy(charsetNames, 0, fCharsetNames, 0, out);
-
-        return recognizers;
-    }
-
-    /**
-     * Set the declared encoding for charset detection.
-     *  The declared encoding of an input text is an encoding obtained
-     *  from an http header or xml declaration or similar source that
-     *  can be provided as additional information to the charset detector.
-     *  A match between a declared encoding and a possible detected encoding
-     *  will raise the quality of that detected encoding by a small delta,
-     *  and will also appear as a "reason" for the match.
-     * <p/>
-     * A declared encoding that is incompatible with the input data being
-     * analyzed will not be added to the list of possible encodings.
-     *
-     *  @param encoding The declared encoding
-     *
-     * @stable ICU 3.4
-     */
-    public CharsetDetector setDeclaredEncoding(String encoding) {
-        setCanonicalDeclaredEncoding(encoding);
-        return this;
-    }
-
-    /**
-     * Set the input text (byte) data whose charset is to be detected.
-     *
-     * @param in the input text of unknown encoding
-     *
-     * @return This CharsetDetector
-     *
-     * @stable ICU 3.4
-     */
-    public CharsetDetector setText(byte[] in) {
-        fRawInput = in;
-        fRawLength = in.length;
-
-        MungeInput();
-
-        return this;
-    }
-    //   Value is rounded up, so zero really means zero occurences.
-
-    /**
-     * Set the input text (byte) data whose charset is to be detected.
-     *  <p/>
-     *   The input stream that supplies the character data must have markSupported()
-     *   == true; the charset detection process will read a small amount of data,
-     *   then return the stream to its original position via
-     *   the InputStream.reset() operation.  The exact amount that will
-     *   be read depends on the characteristics of the data itself.
-     *
-     * @param in the input text of unknown encoding
-     *
-     * @return This CharsetDetector
-     *
-     * @stable ICU 3.4
-     */
-
-    public CharsetDetector setText(InputStream in) throws IOException {
-        fInputStream = in;
-        fInputStream.mark(kBufSize);
-        fRawInput = new byte[kBufSize];   // Always make a new buffer because the
-        //   previous one may have come from the caller,
-        //   in which case we can't touch it.
-        fRawLength = 0;
-        int remainingLength = kBufSize;
-        while (remainingLength > 0) {
-            // read() may give data in smallish chunks, esp. for remote sources.  Hence, this loop.
-            int bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength);
-            if (bytesRead <= 0) {
-                break;
-            }
-            fRawLength += bytesRead;
-            remainingLength -= bytesRead;
-        }
-        fInputStream.reset();
-
-        MungeInput();                     // Strip html markup, collect byte stats.
-        return this;
-    }
-
-    /**
-     * Return the charset that best matches the supplied input data.
-     *
-     * Note though, that because the detection
-     * only looks at the start of the input data,
-     * there is a possibility that the returned charset will fail to handle
-     * the full set of input data.
-     * <p/>
-     * Raise an exception if
-     *  <ul>
-     *    <li>no charset appears to match the data.</li>
-     *    <li>no input text has been provided</li>
-     *  </ul>
-     *
-     * @return a CharsetMatch object representing the best matching charset, or
-     *         <code>null</code> if there are no matches.
-     *
-     * @stable ICU 3.4
-     */
-    public CharsetMatch detect() {
-//   TODO:  A better implementation would be to copy the detect loop from
-//          detectAll(), and cut it short as soon as a match with a high confidence
-//          is found.  This is something to be done later, after things are otherwise
-//          working.
-        CharsetMatch matches[] = detectAll();
-
-        if (matches == null || matches.length == 0) {
-            return null;
-        }
-
-        return matches[0];
-    }
-
-    /**
-     *  Return an array of all charsets that appear to be plausible
-     *  matches with the input data.  The array is ordered with the
-     *  best quality match first.
-     * <p/>
-     * Raise an exception if
-     *  <ul>
-     *    <li>no charsets appear to match the input data.</li>
-     *    <li>no input text has been provided</li>
-     *  </ul>
-     *
-     * @return An array of CharsetMatch objects representing possibly matching charsets.
-     *
-     * @stable ICU 3.4
-     */
-    public CharsetMatch[] detectAll() {
-        CharsetRecognizer csr;
-        int i;
-        int detectResults;
-        int confidence;
-        ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>();
-
-        //  Iterate over all possible charsets, remember all that
-        //    give a match quality > 0.
-        for (i = 0; i < fCSRecognizers.size(); i++) {
-            csr = fCSRecognizers.get(i);
-            detectResults = csr.match(this);
-            confidence = detectResults & 0x000000ff;
-            if (confidence > 0) {
-                // Just to be safe, constrain
-                confidence = Math.min(confidence, MAX_CONFIDENCE);
-
-                // Apply charset hint.
-                if ((fDeclaredEncoding != null) && (fDeclaredEncoding.equalsIgnoreCase(csr.getName()))) {
-                    // Reduce lack of confidence (delta between "sure" and current) by 50%.
-                    confidence += (MAX_CONFIDENCE - confidence) / 2;
-                }
-
-                CharsetMatch m = new CharsetMatch(this, csr, confidence);
-                matches.add(m);
-            }
-        }
-
-        Collections.sort(matches);      // CharsetMatch compares on confidence
-        Collections.reverse(matches);   //  Put best match first.
-        CharsetMatch[] resultArray = new CharsetMatch[matches.size()];
-        resultArray = matches.toArray(resultArray);
-        return resultArray;
-    }
-
-    /**
-     * Autodetect the charset of an inputStream, and return a Java Reader
-     * to access the converted input data.
-     * <p/>
-     * This is a convenience method that is equivalent to
-     *   <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
-     * <p/>
-     *   For the input stream that supplies the character data, markSupported()
-     *   must be true; the  charset detection will read a small amount of data,
-     *   then return the stream to its original position via
-     *   the InputStream.reset() operation.  The exact amount that will
-     *    be read depends on the characteristics of the data itself.
-     *<p/>
-     * Raise an exception if no charsets appear to match the input data.
-     *
-     * @param in The source of the byte data in the unknown charset.
-     *
-     * @param declaredEncoding  A declared encoding for the data, if available,
-     *           or null or an empty string if none is available.
-     *
-     * @stable ICU 3.4
-     */
-    public Reader getReader(InputStream in, String declaredEncoding) {
-        setCanonicalDeclaredEncoding(declaredEncoding);
-
-        try {
-            setText(in);
-
-            CharsetMatch match = detect();
-
-            if (match == null) {
-                return null;
-            }
-
-            return match.getReader();
-        } catch (IOException e) {
-            return null;
-        }
-    }
-
-    /**
-     * Autodetect the charset of an inputStream, and return a String
-     * containing the converted input data.
-     * <p/>
-     * This is a convenience method that is equivalent to
-     *   <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
-     *<p/>
-     * Raise an exception if no charsets appear to match the input data.
-     *
-     * @param in The source of the byte data in the unknown charset.
-     *
-     * @param declaredEncoding  A declared encoding for the data, if available,
-     *           or null or an empty string if none is available.
-     *
-     * @stable ICU 3.4
-     */
-    public String getString(byte[] in, String declaredEncoding) {
-        setCanonicalDeclaredEncoding(declaredEncoding);
-
-        try {
-            setText(in);
-
-            CharsetMatch match = detect();
-
-            if (match == null) {
-                return null;
-            }
-
-            return match.getString(-1);
-        } catch (IOException e) {
-            return null;
-        }
-    }
-    //   gave us a byte array.
-
-    /**
-     * Test whether or not input filtering is enabled.
-     *
-     * @return <code>true</code> if input text will be filtered.
-     *
-     * @see #enableInputFilter
-     *
-     * @stable ICU 3.4
-     */
-    public boolean inputFilterEnabled() {
-        return fStripTags;
-    }
-
-    /**
-     * Enable filtering of input text. If filtering is enabled,
-     * text within angle brackets ("<" and ">") will be removed
-     * before detection.
-     *
-     * @param filter <code>true</code> to enable input text filtering.
-     *
-     * @return The previous setting.
-     *
-     * @stable ICU 3.4
-     */
-    public boolean enableInputFilter(boolean filter) {
-        boolean previous = fStripTags;
-
-        fStripTags = filter;
-
-        return previous;
-    }
-
-    /**
-     * Try to set fDeclaredEncoding to the canonical name for <encoding>, if it exists.
-     *
-     * @param encoding - name of character encoding
-     */
-    private void setCanonicalDeclaredEncoding(String encoding) {
-        if ((encoding == null) || encoding.isEmpty()) {
-            return;
-        }
-
-        Charset cs = Charset.forName(encoding);
-        if (cs != null) {
-            fDeclaredEncoding = cs.name();
-        }
-    }
-
-    /*
-     *  MungeInput - after getting a set of raw input data to be analyzed, preprocess
-     *               it by removing what appears to be html markup.
-     */
-    private void MungeInput() {
-        int srci = 0;
-        int dsti = 0;
-        byte b;
-        boolean inMarkup = false;
-        int openTags = 0;
-        int badTags = 0;
-
-        //
-        //  html / xml markup stripping.
-        //     quick and dirty, not 100% accurate, but hopefully good enough, statistically.
-        //     discard everything within < brackets >
-        //     Count how many total '<' and illegal (nested) '<' occur, so we can make some
-        //     guess as to whether the input was actually marked up at all.
-        if (fStripTags) {
-            for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) {
-                b = fRawInput[srci];
-                if (b == (byte) '<') {
-                    if (inMarkup) {
-                        badTags++;
-                    }
-                    inMarkup = true;
-                    openTags++;
-                }
-
-                if (!inMarkup) {
-                    fInputBytes[dsti++] = b;
-                }
-
-                if (b == (byte) '>') {
-                    inMarkup = false;
-                }
-            }
-
-            fInputLen = dsti;
-        }
-
-        //
-        //  If it looks like this input wasn't marked up, or if it looks like it's
-        //    essentially nothing but markup abandon the markup stripping.
-        //    Detection will have to work on the unstripped input.
-        //
-        if (openTags < 5 || openTags / 5 < badTags ||
-                (fInputLen < 100 && fRawLength > 600)) {
-            int limit = fRawLength;
-
-            if (limit > kBufSize) {
-                limit = kBufSize;
-            }
-
-            for (srci = 0; srci < limit; srci++) {
-                fInputBytes[srci] = fRawInput[srci];
-            }
-            fInputLen = srci;
-        }
-
-        //
-        // Tally up the byte occurence statistics.
-        //   These are available for use by the various detectors.
-        //
-        Arrays.fill(fByteStats, (short) 0);
-        for (srci = 0; srci < fInputLen; srci++) {
-            int val = fInputBytes[srci] & 0x00ff;
-            fByteStats[val]++;
-        }
-
-        fC1Bytes = false;
-        for (int i = 0x80; i <= 0x9F; i += 1) {
-            if (fByteStats[i] != 0) {
-                fC1Bytes = true;
-                break;
-            }
-        }
-    }
-}
+/**
+ * ******************************************************************************
+ * Copyright (C) 2005-2009, International Business Machines Corporation and    *
+ * others. All Rights Reserved.                                                *
+ * ******************************************************************************
+ */
+package org.apache.tika.parser.txt;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+
+
+/**
+ * <code>CharsetDetector</code> provides a facility for detecting the
+ * charset or encoding of character data in an unknown format.
+ * The input data can either be from an input stream or an array of bytes.
+ * The result of the detection operation is a list of possibly matching
+ * charsets, or, for simple use, you can just ask for a Java Reader that
+ * will work over the input data.
+ * <p/>
+ * Character set detection is at best an imprecise operation.  The detection
+ * process will attempt to identify the charset that best matches the characteristics
+ * of the byte data, but the process is partly statistical in nature, and
+ * the results can not be guaranteed to always be correct.
+ * <p/>
+ * For best accuracy in charset detection, the input data should be primarily
+ * in a single language, and a minimum of a few hundred bytes worth of plain text
+ * in the language are needed.  The detection process will attempt to
+ * ignore html or xml style markup that could otherwise obscure the content.
+ * <p/>
+ * @stable ICU 3.4
+ */
+public class CharsetDetector {
+
+//   Question: Should we have getters corresponding to the setters for input text
+//   and declared encoding?
+
+//   A thought: If we were to create our own type of Java Reader, we could defer
+//   figuring out an actual charset for data that starts out with too much English
+//   only ASCII until the user actually read through to something that didn't look
+//   like 7 bit English.  If  nothing else ever appeared, we would never need to
+//   actually choose the "real" charset.  All assuming that the application just
+//   wants the data, and doesn't care about a char set name.
+
+    private static final int kBufSize = 12000;
+    private static final int MAX_CONFIDENCE = 100;
+    private static String[] fCharsetNames;
+    /*
+     * List of recognizers for all charsets known to the implementation.
+     */
+    private static ArrayList<CharsetRecognizer> fCSRecognizers = createRecognizers();
+    /*
+     *  The following items are accessed by individual CharsetRecognizers during
+     *     the recognition process
+     *
+     */
+    byte[] fInputBytes =       // The text to be checked.  Markup will have been
+            new byte[kBufSize];  //   removed if appropriate.
+    int fInputLen;          // Length of the byte data in fInputBytes.
+    short fByteStats[] =      // byte frequency statistics for the input text.
+            new short[256];  //   Value is an absolute per-byte count (see MungeInput), not a percent.
+    boolean fC1Bytes =          // True if any bytes in the range 0x80 - 0x9F are in the input;
+            false;
+    String fDeclaredEncoding;
+    //
+    //  Stuff private to CharsetDetector
+    //
+    byte[] fRawInput;     // Original, untouched input bytes.
+    //  If user gave us a byte array, this is it.
+    //  If user gave us a stream, it's read to a
+    //  buffer here.
+    int fRawLength;    // Length of data in fRawInput array.
+    InputStream fInputStream;  // User's input stream, or null if the user
+    boolean fStripTags =   // If true, setText() will strip tags from input text.
+            false;
+
+    /**
+     *   Constructor
+     *
+     * @stable ICU 3.4
+     */
+    public CharsetDetector() {
+    }
+
+    /**
+     * Get the names of all char sets that can be recognized by the char set detector.
+     *
+     * @return an array of the names of all charsets that can be recognized
+     * by the charset detector.
+     *
+     * @stable ICU 3.4
+     */
+    public static String[] getAllDetectableCharsets() {
+        return fCharsetNames;
+    }
+
+    /*
+     * Create the singleton instances of the CharsetRecognizer classes
+     */
+    private static ArrayList<CharsetRecognizer> createRecognizers() {
+        ArrayList<CharsetRecognizer> recognizers = new ArrayList<CharsetRecognizer>();
+
+        recognizers.add(new CharsetRecog_UTF8());
+
+        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE());
+        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE());
+        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE());
+        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE());
+
+        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
+        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
+        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
+        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
+        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030());
+        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp());
+        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr());
+        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5());
+
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr());
+
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr());
+
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_en());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_de());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_es());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_fr());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_it());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl());
+
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM866_ru());
+
+        // Create an array of all charset names, as a side effect.
+        // Needed for the getAllDetectableCharsets() API.
+        String[] charsetNames = new String[recognizers.size()];
+        int out = 0;
+
+        for (CharsetRecognizer recognizer : recognizers) {
+            String name = recognizer.getName();
+
+            if (out == 0 || !name.equals(charsetNames[out - 1])) {
+                charsetNames[out++] = name;
+            }
+        }
+
+        fCharsetNames = new String[out];
+        System.arraycopy(charsetNames, 0, fCharsetNames, 0, out);
+
+        return recognizers;
+    }
+
+    /**
+     * Set the declared encoding for charset detection.
+     *  The declared encoding of an input text is an encoding obtained
+     *  from an http header or xml declaration or similar source that
+     *  can be provided as additional information to the charset detector.
+     *  A match between a declared encoding and a possible detected encoding
+     *  will raise the quality of that detected encoding by a small delta,
+     *  and will also appear as a "reason" for the match.
+     * <p/>
+     * A declared encoding that is incompatible with the input data being
+     * analyzed will not be added to the list of possible encodings.
+     *
+     *  @param encoding The declared encoding
+     *
+     * @stable ICU 3.4
+     */
+    public CharsetDetector setDeclaredEncoding(String encoding) {
+        setCanonicalDeclaredEncoding(encoding);
+        return this;
+    }
+
+    /**
+     * Set the input text (byte) data whose charset is to be detected.
+     *
+     * @param in the input text of unknown encoding
+     *
+     * @return This CharsetDetector
+     *
+     * @stable ICU 3.4
+     */
+    public CharsetDetector setText(byte[] in) {
+        fRawInput = in;
+        fRawLength = in.length;
+
+        MungeInput();
+
+        return this;
+    }
+    //   Value is rounded up, so zero really means zero occurrences.
+
+    /**
+     * Set the input text (byte) data whose charset is to be detected.
+     *  <p/>
+     *   The input stream that supplies the character data must have markSupported()
+     *   == true; the charset detection process will read a small amount of data,
+     *   then return the stream to its original position via
+     *   the InputStream.reset() operation.  The exact amount that will
+     *   be read depends on the characteristics of the data itself.
+     *
+     * @param in the input text of unknown encoding
+     *
+     * @return This CharsetDetector
+     *
+     * @stable ICU 3.4
+     */
+
+    public CharsetDetector setText(InputStream in) throws IOException {
+        fInputStream = in;
+        fInputStream.mark(kBufSize);
+        fRawInput = new byte[kBufSize];   // Always make a new buffer because the
+        //   previous one may have come from the caller,
+        //   in which case we can't touch it.
+        fRawLength = 0;
+        int remainingLength = kBufSize;
+        while (remainingLength > 0) {
+            // read() may give data in smallish chunks, esp. for remote sources.  Hence, this loop.
+            int bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength);
+            if (bytesRead <= 0) {
+                break;
+            }
+            fRawLength += bytesRead;
+            remainingLength -= bytesRead;
+        }
+        fInputStream.reset();
+
+        MungeInput();                     // Strip html markup, collect byte stats.
+        return this;
+    }
+
+    /**
+     * Return the charset that best matches the supplied input data.
+     *
+     * Note though, that because the detection
+     * only looks at the start of the input data,
+     * there is a possibility that the returned charset will fail to handle
+     * the full set of input data.
+     * <p/>
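+     * Returns <code>null</code>, rather than raising an exception, if
+     *  <ul>
+     *    <li>no charset appears to match the data.</li>
+     *    <li>no input text has been provided (NOTE(review): not checked in this method; confirm behavior)</li>
+     *  </ul>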
+     *
+     * @return a CharsetMatch object representing the best matching charset, or
+     *         <code>null</code> if there are no matches.
+     *
+     * @stable ICU 3.4
+     */
+    public CharsetMatch detect() {
+//   TODO:  A better implementation would be to copy the detect loop from
+//          detectAll(), and cut it short as soon as a match with a high confidence
+//          is found.  This is something to be done later, after things are otherwise
+//          working.
+        CharsetMatch matches[] = detectAll();
+
+        if (matches == null || matches.length == 0) {
+            return null;
+        }
+
+        return matches[0];
+    }
+
+    /**
+     *  Return an array of all charsets that appear to be plausible
+     *  matches with the input data.  The array is ordered with the
+     *  best quality match first.
+     * <p/>
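+     * Returns an empty array, rather than raising an exception, if
+     *  <ul>
+     *    <li>no charsets appear to match the input data.</li>
+     *    <li>no input text has been provided (NOTE(review): not checked in this method; confirm behavior)</li>
+     *  </ul>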
+     *
+     * @return An array of CharsetMatch objects representing possibly matching charsets.
+     *
+     * @stable ICU 3.4
+     */
+    public CharsetMatch[] detectAll() {
+        CharsetRecognizer csr;
+        int i;
+        int detectResults;
+        int confidence;
+        ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>();
+
+        //  Iterate over all possible charsets, remember all that
+        //    give a match quality > 0.
+        for (i = 0; i < fCSRecognizers.size(); i++) {
+            csr = fCSRecognizers.get(i);
+            detectResults = csr.match(this);
+            confidence = detectResults & 0x000000ff;
+            if (confidence > 0) {
+                // Just to be safe, constrain
+                confidence = Math.min(confidence, MAX_CONFIDENCE);
+
+                // Apply charset hint.
+                if ((fDeclaredEncoding != null) && (fDeclaredEncoding.equalsIgnoreCase(csr.getName()))) {
+                    // Reduce lack of confidence (delta between "sure" and current) by 50%.
+                    confidence += (MAX_CONFIDENCE - confidence) / 2;
+                }
+
+                CharsetMatch m = new CharsetMatch(this, csr, confidence);
+                matches.add(m);
+            }
+        }
+
+        Collections.sort(matches);      // CharsetMatch compares on confidence
+        Collections.reverse(matches);   //  Put best match first.
+        CharsetMatch[] resultArray = new CharsetMatch[matches.size()];
+        resultArray = matches.toArray(resultArray);
+        return resultArray;
+    }
+
+    /**
+     * Autodetect the charset of an inputStream, and return a Java Reader
+     * to access the converted input data.
+     * <p/>
+     * This is a convenience method that is equivalent to
+     *   <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
+     * <p/>
+     *   For the input stream that supplies the character data, markSupported()
+     *   must be true; the  charset detection will read a small amount of data,
+     *   then return the stream to its original position via
+     *   the InputStream.reset() operation.  The exact amount that will
+     *    be read depends on the characteristics of the data itself.
+     *<p/>
+     * Returns <code>null</code> if no charsets appear to match the input data or if an IOException occurs.
+     *
+     * @param in The source of the byte data in the unknown charset.
+     *
+     * @param declaredEncoding  A declared encoding for the data, if available,
+     *           or null or an empty string if none is available.
+     *
+     * @stable ICU 3.4
+     */
+    public Reader getReader(InputStream in, String declaredEncoding) {
+        setCanonicalDeclaredEncoding(declaredEncoding);
+
+        try {
+            setText(in);
+
+            CharsetMatch match = detect();
+
+            if (match == null) {
+                return null;
+            }
+
+            return match.getReader();
+        } catch (IOException e) {
+            return null;
+        }
+    }
+
+    /**
+     * Autodetect the charset of a byte array, and return a String
+     * containing the converted input data.
+     * <p/>
+     * This is a convenience method that is equivalent to
+     *   <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
+     *<p/>
+     * Returns <code>null</code> if no charsets appear to match the input data or if an IOException occurs.
+     *
+     * @param in The source of the byte data in the unknown charset.
+     *
+     * @param declaredEncoding  A declared encoding for the data, if available,
+     *           or null or an empty string if none is available.
+     *
+     * @stable ICU 3.4
+     */
+    public String getString(byte[] in, String declaredEncoding) {
+        setCanonicalDeclaredEncoding(declaredEncoding);
+
+        try {
+            setText(in);
+
+            CharsetMatch match = detect();
+
+            if (match == null) {
+                return null;
+            }
+
+            return match.getString(-1);
+        } catch (IOException e) {
+            return null;
+        }
+    }
+    //   (stray tail of the fInputStream field comment: "... or null if the user gave us a byte array.")
+
+    /**
+     * Test whether or not input filtering is enabled.
+     *
+     * @return <code>true</code> if input text will be filtered.
+     *
+     * @see #enableInputFilter
+     *
+     * @stable ICU 3.4
+     */
+    public boolean inputFilterEnabled() {
+        return fStripTags;
+    }
+
+    /**
+     * Enable filtering of input text. If filtering is enabled,
+     * text within angle brackets ("&lt;" and "&gt;") will be removed
+     * before detection.
+     *
+     * @param filter <code>true</code> to enable input text filtering.
+     *
+     * @return The previous setting.
+     *
+     * @stable ICU 3.4
+     */
+    public boolean enableInputFilter(boolean filter) {
+        boolean previous = fStripTags;
+
+        fStripTags = filter;
+
+        return previous;
+    }
+
+    /**
+     * Try to set fDeclaredEncoding to the canonical name for {@code encoding}, if it exists.
+     *
+     * @param encoding - name of character encoding
+     */
+    private void setCanonicalDeclaredEncoding(String encoding) {
+        if ((encoding == null) || encoding.isEmpty()) {
+            return;
+        }
+
+        Charset cs = Charset.forName(encoding);
+        if (cs != null) {
+            fDeclaredEncoding = cs.name();
+        }
+    }
+
+    /*
+     *  MungeInput - after getting a set of raw input data to be analyzed, preprocess
+     *               it by removing what appears to be html markup.
+     */
+    private void MungeInput() {
+        int srci = 0;
+        int dsti = 0;
+        byte b;
+        boolean inMarkup = false;
+        int openTags = 0;
+        int badTags = 0;
+
+        //
+        //  html / xml markup stripping.
+        //     quick and dirty, not 100% accurate, but hopefully good enough, statistically.
+        //     discard everything within < brackets >
+        //     Count how many total '<' and illegal (nested) '<' occur, so we can make some
+        //     guess as to whether the input was actually marked up at all.
+        if (fStripTags) {
+            for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) {
+                b = fRawInput[srci];
+                if (b == (byte) '<') {
+                    if (inMarkup) {
+                        badTags++;
+                    }
+                    inMarkup = true;
+                    openTags++;
+                }
+
+                if (!inMarkup) {
+                    fInputBytes[dsti++] = b;
+                }
+
+                if (b == (byte) '>') {
+                    inMarkup = false;
+                }
+            }
+
+            fInputLen = dsti;
+        }
+
+        //
+        //  If it looks like this input wasn't marked up, or if it looks like it's
+        //    essentially nothing but markup, abandon the markup stripping.
+        //    Detection will have to work on the unstripped input.
+        //
+        if (openTags < 5 || openTags / 5 < badTags ||
+                (fInputLen < 100 && fRawLength > 600)) {
+            int limit = fRawLength;
+
+            if (limit > kBufSize) {
+                limit = kBufSize;
+            }
+
+            for (srci = 0; srci < limit; srci++) {
+                fInputBytes[srci] = fRawInput[srci];
+            }
+            fInputLen = srci;
+        }
+
+        //
+        // Tally up the byte occurrence statistics.
+        //   These are available for use by the various detectors.
+        //
+        Arrays.fill(fByteStats, (short) 0);
+        for (srci = 0; srci < fInputLen; srci++) {
+            int val = fInputBytes[srci] & 0x00ff;
+            fByteStats[val]++;
+        }
+
+        fC1Bytes = false;
+        for (int i = 0x80; i <= 0x9F; i += 1) {
+            if (fByteStats[i] != 0) {
+                fC1Bytes = true;
+                break;
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
index 9244cd9..22219ab 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
@@ -1,286 +1,286 @@
-/**
- * ******************************************************************************
- * Copyright (C) 2005-2007, International Business Machines Corporation and    *
- * others. All Rights Reserved.                                                *
- * ******************************************************************************
- */
-package org.apache.tika.parser.txt;
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.Reader;
-
-
-/**
- * This class represents a charset that has been identified by a CharsetDetector
- * as a possible encoding for a set of input data.  From an instance of this
- * class, you can ask for a confidence level in the charset identification,
- * or for Java Reader or String to access the original byte data in Unicode form.
- * <p/>
- * Instances of this class are created only by CharsetDetectors.
- * <p/>
- * Note:  this class has a natural ordering that is inconsistent with equals.
- *        The natural ordering is based on the match confidence value.
- *
- * @stable ICU 3.4
- */
-public class CharsetMatch implements Comparable<CharsetMatch> {
-
-
-    /**
-     * Bit flag indicating the match is based on the the encoding scheme.
-     *
-     * @see #getMatchType
-     * @stable ICU 3.4
-     */
-    static public final int ENCODING_SCHEME = 1;
-    /**
-     * Bit flag indicating the match is based on the presence of a BOM.
-     *
-     * @see #getMatchType
-     * @stable ICU 3.4
-     */
-    static public final int BOM = 2;
-    /**
-     * Bit flag indicating he match is based on the declared encoding.
-     *
-     * @see #getMatchType
-     * @stable ICU 3.4
-     */
-    static public final int DECLARED_ENCODING = 4;
-    /**
-     * Bit flag indicating the match is based on language statistics.
-     *
-     * @see #getMatchType
-     * @stable ICU 3.4
-     */
-    static public final int LANG_STATISTICS = 8;
-    //
-    //   Private Data
-    //
-    private int fConfidence;
-    private CharsetRecognizer fRecognizer;
-    private byte[] fRawInput = null;     // Original, untouched input bytes.
-    //  If user gave us a byte array, this is it.
-    private int fRawLength;           // Length of data in fRawInput array.
-    private InputStream fInputStream = null;  // User's input stream, or null if the user
-
-    /*
-     *  Constructor.  Implementation internal
-     */
-    CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
-        fRecognizer = rec;
-        fConfidence = conf;
-
-        // The references to the original aplication input data must be copied out
-        //   of the charset recognizer to here, in case the application resets the
-        //   recognizer before using this CharsetMatch.
-        if (det.fInputStream == null) {
-            // We only want the existing input byte data if it came straight from the user,
-            //   not if is just the head of a stream.
-            fRawInput = det.fRawInput;
-            fRawLength = det.fRawLength;
-        }
-        fInputStream = det.fInputStream;
-    }
-
-    /**
-     * Create a java.io.Reader for reading the Unicode character data corresponding
-     * to the original byte data supplied to the Charset detect operation.
-     * <p/>
-     * CAUTION:  if the source of the byte data was an InputStream, a Reader
-     * can be created for only one matching char set using this method.  If more
-     * than one charset needs to be tried, the caller will need to reset
-     * the InputStream and create InputStreamReaders itself, based on the charset name.
-     *
-     * @return the Reader for the Unicode character data.
-     *
-     * @stable ICU 3.4
-     */
-    public Reader getReader() {
-        InputStream inputStream = fInputStream;
-
-        if (inputStream == null) {
-            inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength);
-        }
-
-        try {
-            inputStream.reset();
-            return new InputStreamReader(inputStream, getName());
-        } catch (IOException e) {
-            return null;
-        }
-    }
-
-    /**
-     * Create a Java String from Unicode character data corresponding
-     * to the original byte data supplied to the Charset detect operation.
-     *
-     * @return a String created from the converted input data.
-     *
-     * @stable ICU 3.4
-     */
-    public String getString() throws java.io.IOException {
-        return getString(-1);
-
-    }
-
-    /**
-     * Create a Java String from Unicode character data corresponding
-     * to the original byte data supplied to the Charset detect operation.
-     * The length of the returned string is limited to the specified size;
-     * the string will be trunctated to this length if necessary.  A limit value of
-     * zero or less is ignored, and treated as no limit.
-     *
-     * @param maxLength The maximium length of the String to be created when the
-     *                  source of the data is an input stream, or -1 for
-     *                  unlimited length.
-     * @return a String created from the converted input data.
-     *
-     * @stable ICU 3.4
-     */
-    public String getString(int maxLength) throws java.io.IOException {
-        String result = null;
-        if (fInputStream != null) {
-            StringBuffer sb = new StringBuffer();
-            char[] buffer = new char[1024];
-            Reader reader = getReader();
-            int max = maxLength < 0 ? Integer.MAX_VALUE : maxLength;
-            int bytesRead = 0;
-
-            while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) {
-                sb.append(buffer, 0, bytesRead);
-                max -= bytesRead;
-            }
-
-            reader.close();
-
-            return sb.toString();
-        } else {
-            result = new String(fRawInput, getName());
-        }
-        return result;
-
-    }
-
-    /**
-     * Get an indication of the confidence in the charset detected.
-     * Confidence values range from 0-100, with larger numbers indicating
-     * a better match of the input data to the characteristics of the
-     * charset.
-     *
-     * @return the confidence in the charset match
-     *
-     * @stable ICU 3.4
-     */
-    public int getConfidence() {
-        return fConfidence;
-    }
-
-    /**
-     * Return flags indicating what it was about the input data
-     * that caused this charset to be considered as a possible match.
-     * The result is a bitfield containing zero or more of the flags
-     * ENCODING_SCHEME, BOM, DECLARED_ENCODING, and LANG_STATISTICS.
-     * A result of zero means no information is available.
-     * <p>
-     * Note: currently, this method always returns zero.
-     * <p>
-     *
-     * @return the type of match found for this charset.
-     *
-     * @draft ICU 3.4
-     * @provisional This API might change or be removed in a future release.
-     */
-    public int getMatchType() {
-//      TODO: create a list of enum-like constants for common combinations of types of matches.
-        return 0;
-    }
-
-    /**
-     * Get the name of the detected charset.
-     * The name will be one that can be used with other APIs on the
-     * platform that accept charset names.  It is the "Canonical name"
-     * as defined by the class java.nio.charset.Charset; for
-     * charsets that are registered with the IANA charset registry,
-     * this is the MIME-preferred registerd name.
-     *
-     * @see java.nio.charset.Charset
-     * @see java.io.InputStreamReader
-     *
-     * @return The name of the charset.
-     *
-     * @stable ICU 3.4
-     */
-    public String getName() {
-        return fRecognizer.getName();
-    }
-
-    /**
-     * Get the ISO code for the language of the detected charset.
-     *
-     * @return The ISO code for the language or <code>null</code> if the language cannot be determined.
-     *
-     * @stable ICU 3.4
-     */
-    public String getLanguage() {
-        return fRecognizer.getLanguage();
-    }
-
-    /**
-     * Compare to other CharsetMatch objects.
-     * Comparison is based on the match confidence value, which
-     *   allows CharsetDetector.detectAll() to order its results.
-     *
-     * @param o the CharsetMatch object to compare against.
-     * @return a negative integer, zero, or a positive integer as the
-     *          confidence level of this CharsetMatch
-     *          is less than, equal to, or greater than that of
-     *          the argument.
-     * @throws ClassCastException if the argument is not a CharsetMatch.
-     * @stable ICU 3.4
-     */
-    public int compareTo(CharsetMatch other) {
-        int compareResult = 0;
-        if (this.fConfidence > other.fConfidence) {
-            compareResult = 1;
-        } else if (this.fConfidence < other.fConfidence) {
-            compareResult = -1;
-        }
-        return compareResult;
-    }
-
-    /**
-     * compare this CharsetMatch to another based on confidence value
-     * @param o the CharsetMatch object to compare against
-     * @return true if equal
-     */
-    public boolean equals(Object o) {
-        if (o instanceof CharsetMatch) {
-            CharsetMatch that = (CharsetMatch) o;
-            return (this.fConfidence == that.fConfidence);
-        }
-
-        return false;
-    }
-
-    /**
-     * generates a hashCode based on the confidence value
-     * @return the hashCode
-     */
-    public int hashCode() {
-        return fConfidence;
-    }
-    //   gave us a byte array.
-
-    public String toString() {
-        String s = "Match of " + fRecognizer.getName();
-        if (fRecognizer.getLanguage() != null) {
-            s += " in " + fRecognizer.getLanguage();
-        }
-        s += " with confidence " + fConfidence;
-        return s;
-    }
-}
+/**
+ * ******************************************************************************
+ * Copyright (C) 2005-2007, International Business Machines Corporation and    *
+ * others. All Rights Reserved.                                                *
+ * ******************************************************************************
+ */
+package org.apache.tika.parser.txt;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+
+
+/**
+ * This class represents a charset that has been identified by a CharsetDetector
+ * as a possible encoding for a set of input data.  From an instance of this
+ * class, you can ask for a confidence level in the charset identification,
+ * or for Java Reader or String to access the original byte data in Unicode form.
+ * <p/>
+ * Instances of this class are created only by CharsetDetectors.
+ * <p/>
+ * Note:  this class has a natural ordering that is inconsistent with equals.
+ *        The natural ordering is based on the match confidence value.
+ *
+ * @stable ICU 3.4
+ */
+public class CharsetMatch implements Comparable<CharsetMatch> {
+
+
+    /**
+     * Bit flag indicating the match is based on the the encoding scheme.
+     *
+     * @see #getMatchType
+     * @stable ICU 3.4
+     */
+    static public final int ENCODING_SCHEME = 1;
+    /**
+     * Bit flag indicating the match is based on the presence of a BOM.
+     *
+     * @see #getMatchType
+     * @stable ICU 3.4
+     */
+    static public final int BOM = 2;
+    /**
+     * Bit flag indicating he match is based on the declared encoding.
+     *
+     * @see #getMatchType
+     * @stable ICU 3.4
+     */
+    static public final int DECLARED_ENCODING = 4;
+    /**
+     * Bit flag indicating the match is based on language statistics.
+     *
+     * @see #getMatchType
+     * @stable ICU 3.4
+     */
+    static public final int LANG_STATISTICS = 8;
+    //
+    //   Private Data
+    //
+    private int fConfidence;
+    private CharsetRecognizer fRecognizer;
+    private byte[] fRawInput = null;     // Original, untouched input bytes.
+    //  If user gave us a byte array, this is it.
+    private int fRawLength;           // Length of data in fRawInput array.
+    private InputStream fInputStream = null;  // User's input stream, or null if the user
+
+    /*
+     *  Constructor.  Implementation internal
+     */
+    CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
+        fRecognizer = rec;
+        fConfidence = conf;
+
+        // The references to the original aplication input data must be copied out
+        //   of the charset recognizer to here, in case the application resets the
+        //   recognizer before using this CharsetMatch.
+        if (det.fInputStream == null) {
+            // We only want the existing input byte data if it came straight from the user,
+            //   not if is just the head of a stream.
+            fRawInput = det.fRawInput;
+            fRawLength = det.fRawLength;
+        }
+        fInputStream = det.fInputStream;
+    }
+
+    /**
+     * Create a java.io.Reader for reading the Unicode character data corresponding
+     * to the original byte data supplied to the Charset detect operation.
+     * <p/>
+     * CAUTION:  if the source of the byte data was an InputStream, a Reader
+     * can be created for only one matching char set using this method.  If more
+     * than one charset needs to be tried, the caller will need to reset
+     * the InputStream and create InputStreamReaders itself, based on the charset name.
+     *
+     * @return the Reader for the Unicode character data.
+     *
+     * @stable ICU 3.4
+     */
+    public Reader getReader() {
+        InputStream inputStream = fInputStream;
+
+        if (inputStream == null) {
+            inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength);
+        }
+
+        try {
+            inputStream.reset();
+            return new InputStreamReader(inputStream, getName());
+        } catch (IOException e) {
+            return null;
+        }
+    }
+
+    /**
+     * Create a Java String from Unicode character data corresponding
+     * to the original byte data supplied to the Charset detect operation.
+     *
+     * @return a String created from the converted input data.
+     *
+     * @stable ICU 3.4
+     */
+    public String getString() throws java.io.IOException {
+        return getString(-1);
+
+    }
+
+    /**
+     * Create a Java String from Unicode character data corresponding
+     * to the original byte data supplied to the Charset detect operation.
+     * The length of the returned string is limited to the specified size;
+     * the string will be trunctated to this length if necessary.  A limit value of
+     * zero or less is ignored, and treated as no limit.
+     *
+     * @param maxLength The maximium length of the String to be created when the
+     *                  source of the data is an input stream, or -1 for
+     *                  unlimited length.
+     * @return a String created from the converted input data.
+     *
+     * @stable ICU 3.4
+     */
+    public String getString(int maxLength) throws java.io.IOException {
+        String result = null;
+        if (fInputStream != null) {
+            StringBuffer sb = new StringBuffer();
+            char[] buffer = new char[1024];
+            Reader reader = getReader();
+            int max = maxLength < 0 ? Integer.MAX_VALUE : maxLength;
+            int bytesRead = 0;
+
+            while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) {
+                sb.append(buffer, 0, bytesRead);
+                max -= bytesRead;
+            }
+
+            reader.close();
+
+            return sb.toString();
+        } else {
+            result = new String(fRawInput, getName());
+        }
+        return result;
+
+    }
+
+    /**
+     * Get an indication of the confidence in the charset detected.
+     * Confidence values range from 0-100, with larger numbers indicating
+     * a better match of the input data to the characteristics of the
+     * charset.
+     *
+     * @return the confidence in the charset match
+     *
+     * @stable ICU 3.4
+     */
+    public int getConfidence() {
+        return fConfidence;
+    }
+
+    /**
+     * Return flags indicating what it was about the input data
+     * that caused this charset to be considered as a possible match.
+     * The result is a bitfield containing zero or more of the flags
+     * ENCODING_SCHEME, BOM, DECLARED_ENCODING, and LANG_STATISTICS.
+     * A result of zero means no information is available.
+     * <p>
+     * Note: currently, this method always returns zero.
+     * <p>
+     *
+     * @return the type of match found for this charset.
+     *
+     * @draft ICU 3.4
+     * @provisional This API might change or be removed in a future release.
+     */
+    public int getMatchType() {
+//      TODO: create a list of enum-like constants for common combinations of types of matches.
+        return 0;
+    }
+
+    /**
+     * Get the name of the detected charset.
+     * The name will be one that can be used with other APIs on the
+     * platform that accept charset names.  It is the "Canonical name"
+     * as defined by the class java.nio.charset.Charset; for
+     * charsets that are registered with the IANA charset registry,
+     * this is the MIME-preferred registerd name.
+     *
+     * @see java.nio.charset.Charset
+     * @see java.io.InputStreamReader
+     *
+     * @return The name of the charset.
+     *
+     * @stable ICU 3.4
+     */
+    public String getName() {
+        return fRecognizer.getName();
+    }
+
+    /**
+     * Get the ISO code for the language of the detected charset.
+     *
+     * @return The ISO code for the language or <code>null</code> if the language cannot be determined.
+     *
+     * @stable ICU 3.4
+     */
+    public String getLanguage() {
+        return fRecognizer.getLanguage();
+    }
+
+    /**
+     * Compare to other CharsetMatch objects.
+     * Comparison is based on the match confidence value, which
+     *   allows CharsetDetector.detectAll() to order its results.
+     *
+     * @param o the CharsetMatch object to compare against.
+     * @return a negative integer, zero, or a positive integer as the
+     *          confidence level of this CharsetMatch
+     *          is less than, equal to, or greater than that of
+     *          the argument.
+     * @throws ClassCastException if the argument is not a CharsetMatch.
+     * @stable ICU 3.4
+     */
+    public int compareTo(CharsetMatch other) {
+        int compareResult = 0;
+        if (this.fConfidence > other.fConfidence) {
+            compareResult = 1;
+        } else if (this.fConfidence < other.fConfidence) {
+            compareResult = -1;
+        }
+        return compareResult;
+    }
+
+    /**
+     * compare this CharsetMatch to another based on confidence value
+     * @param o the CharsetMatch object to compare against
+     * @return true if equal
+     */
+    public boolean equals(Object o) {
+        if (o instanceof CharsetMatch) {
+            CharsetMatch that = (CharsetMatch) o;
+            return (this.fConfidence == that.fConfidence);
+        }
+
+        return false;
+    }
+
+    /**
+     * generates a hashCode based on the confidence value
+     * @return the hashCode
+     */
+    public int hashCode() {
+        return fConfidence;
+    }
+    //   gave us a byte array.
+
+    public String toString() {
+        String s = "Match of " + fRecognizer.getName();
+        if (fRecognizer.getLanguage() != null) {
+            s += " in " + fRecognizer.getLanguage();
+        }
+        s += " with confidence " + fConfidence;
+        return s;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
index 16835d6..129c9a8 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
@@ -1,163 +1,163 @@
-/*
-*******************************************************************************
-* Copyright (C) 2005 - 2008, International Business Machines Corporation and  *
-* others. All Rights Reserved.                                                *
-*******************************************************************************
-*/
-package org.apache.tika.parser.txt;
-
-/**
- * class CharsetRecog_2022  part of the ICU charset detection imlementation.
- * This is a superclass for the individual detectors for
- * each of the detectable members of the ISO 2022 family
- * of encodings.
- * <p/>
- * The separate classes are nested within this class.
- *
- * @internal
- */
-abstract class CharsetRecog_2022 extends CharsetRecognizer {
-
-
-    /**
-     * Matching function shared among the 2022 detectors JP, CN and KR
-     * Counts up the number of legal an unrecognized escape sequences in
-     * the sample of text, and computes a score based on the total number &
-     * the proportion that fit the encoding.
-     *
-     * @param text            the byte buffer containing text to analyse
-     * @param textLen         the size of the text in the byte.
-     * @param escapeSequences the byte escape sequences to test for.
-     * @return match quality, in the range of 0-100.
-     */
-    int match(byte[] text, int textLen, byte[][] escapeSequences) {
-        int i, j;
-        int escN;
-        int hits = 0;
-        int misses = 0;
-        int shifts = 0;
-        int quality;
-        scanInput:
-        for (i = 0; i < textLen; i++) {
-            if (text[i] == 0x1b) {
-                checkEscapes:
-                for (escN = 0; escN < escapeSequences.length; escN++) {
-                    byte[] seq = escapeSequences[escN];
-
-                    if ((textLen - i) < seq.length) {
-                        continue checkEscapes;
-                    }
-
-                    for (j = 1; j < seq.length; j++) {
-                        if (seq[j] != text[i + j]) {
-                            continue checkEscapes;
-                        }
-                    }
-
-                    hits++;
-                    i += seq.length - 1;
-                    continue scanInput;
-                }
-
-                misses++;
-            }
-
-            if (text[i] == 0x0e || text[i] == 0x0f) {
-                // Shift in/out
-                shifts++;
-            }
-        }
-
-        if (hits == 0) {
-            return 0;
-        }
-
-        //
-        // Initial quality is based on relative proportion of recongized vs.
-        //   unrecognized escape sequences. 
-        //   All good:  quality = 100;
-        //   half or less good: quality = 0;
-        //   linear inbetween.
-        quality = (100 * hits - 100 * misses) / (hits + misses);
-
-        // Back off quality if there were too few escape sequences seen.
-        //   Include shifts in this computation, so that KR does not get penalized
-        //   for having only a single Escape sequence, but many shifts.
-        if (hits + shifts < 5) {
-            quality -= (5 - (hits + shifts)) * 10;
-        }
-
-        if (quality < 0) {
-            quality = 0;
-        }
-        return quality;
-    }
-
-
-    static class CharsetRecog_2022JP extends CharsetRecog_2022 {
-        private byte[][] escapeSequences = {
-                {0x1b, 0x24, 0x28, 0x43},   // KS X 1001:1992
-                {0x1b, 0x24, 0x28, 0x44},   // JIS X 212-1990
-                {0x1b, 0x24, 0x40},         // JIS C 6226-1978
-                {0x1b, 0x24, 0x41},         // GB 2312-80
-                {0x1b, 0x24, 0x42},         // JIS X 208-1983
-                {0x1b, 0x26, 0x40},         // JIS X 208 1990, 1997
-                {0x1b, 0x28, 0x42},         // ASCII
-                {0x1b, 0x28, 0x48},         // JIS-Roman
-                {0x1b, 0x28, 0x49},         // Half-width katakana
-                {0x1b, 0x28, 0x4a},         // JIS-Roman
-                {0x1b, 0x2e, 0x41},         // ISO 8859-1
-                {0x1b, 0x2e, 0x46}          // ISO 8859-7
-        };
-
-        String getName() {
-            return "ISO-2022-JP";
-        }
-
-        int match(CharsetDetector det) {
-            return match(det.fInputBytes, det.fInputLen, escapeSequences);
-        }
-    }
-
-    static class CharsetRecog_2022KR extends CharsetRecog_2022 {
-        private byte[][] escapeSequences = {
-                {0x1b, 0x24, 0x29, 0x43}
-        };
-
-        String getName() {
-            return "ISO-2022-KR";
-        }
-
-        int match(CharsetDetector det) {
-            return match(det.fInputBytes, det.fInputLen, escapeSequences);
-        }
-
-    }
-
-    static class CharsetRecog_2022CN extends CharsetRecog_2022 {
-        private byte[][] escapeSequences = {
-                {0x1b, 0x24, 0x29, 0x41},   // GB 2312-80
-                {0x1b, 0x24, 0x29, 0x47},   // CNS 11643-1992 Plane 1
-                {0x1b, 0x24, 0x2A, 0x48},   // CNS 11643-1992 Plane 2
-                {0x1b, 0x24, 0x29, 0x45},   // ISO-IR-165
-                {0x1b, 0x24, 0x2B, 0x49},   // CNS 11643-1992 Plane 3
-                {0x1b, 0x24, 0x2B, 0x4A},   // CNS 11643-1992 Plane 4
-                {0x1b, 0x24, 0x2B, 0x4B},   // CNS 11643-1992 Plane 5
-                {0x1b, 0x24, 0x2B, 0x4C},   // CNS 11643-1992 Plane 6
-                {0x1b, 0x24, 0x2B, 0x4D},   // CNS 11643-1992 Plane 7
-                {0x1b, 0x4e},               // SS2
-                {0x1b, 0x4f},               // SS3
-        };
-
-        String getName() {
-            return "ISO-2022-CN";
-        }
-
-
-        int match(CharsetDetector det) {
-            return match(det.fInputBytes, det.fInputLen, escapeSequences);
-        }
-    }
-
-}
-
+/*
+*******************************************************************************
+* Copyright (C) 2005 - 2008, International Business Machines Corporation and  *
+* others. All Rights Reserved.                                                *
+*******************************************************************************
+*/
+package org.apache.tika.parser.txt;
+
+/**
+ * class CharsetRecog_2022  part of the ICU charset detection imlementation.
+ * This is a superclass for the individual detectors for
+ * each of the detectable members of the ISO 2022 family
+ * of encodings.
+ * <p/>
+ * The separate classes are nested within this class.
+ *
+ * @internal
+ */
+abstract class CharsetRecog_2022 extends CharsetRecognizer {
+
+
+    /**
+     * Matching function shared among the 2022 detectors JP, CN and KR
+     * Counts up the number of legal an unrecognized escape sequences in
+     * the sample of text, and computes a score based on the total number &
+     * the proportion that fit the encoding.
+     *
+     * @param text            the byte buffer containing text to analyse
+     * @param textLen         the size of the text in the byte.
+     * @param escapeSequences the byte escape sequences to test for.
+     * @return match quality, in the range of 0-100.
+     */
+    int match(byte[] text, int textLen, byte[][] escapeSequences) {
+        int i, j;
+        int escN;
+        int hits = 0;
+        int misses = 0;
+        int shifts = 0;
+        int quality;
+        scanInput:
+        for (i = 0; i < textLen; i++) {
+            if (text[i] == 0x1b) {
+                checkEscapes:
+                for (escN = 0; escN < escapeSequences.length; escN++) {
+                    byte[] seq = escapeSequences[escN];
+
+                    if ((textLen - i) < seq.length) {
+                        continue checkEscapes;
+                    }
+
+                    for (j = 1; j < seq.length; j++) {
+                        if (seq[j] != text[i + j]) {
+                            continue checkEscapes;
+                        }
+                    }
+
+                    hits++;
+                    i += seq.length - 1;
+                    continue scanInput;
+                }
+
+                misses++;
+            }
+
+            if (text[i] == 0x0e || text[i] == 0x0f) {
+                // Shift in/out
+                shifts++;
+            }
+        }
+
+        if (hits == 0) {
+            return 0;
+        }
+
+        //
+        // Initial quality is based on relative proportion of recongized vs.
+        //   unrecognized escape sequences. 
+        //   All good:  quality = 100;
+        //   half or less good: quality = 0;
+        //   linear inbetween.
+        quality = (100 * hits - 100 * misses) / (hits + misses);
+
+        // Back off quality if there were too few escape sequences seen.
+        //   Include shifts in this computation, so that KR does not get penalized
+        //   for having only a single Escape sequence, but many shifts.
+        if (hits + shifts < 5) {
+            quality -= (5 - (hits + shifts)) * 10;
+        }
+
+        if (quality < 0) {
+            quality = 0;
+        }
+        return quality;
+    }
+
+
+    static class CharsetRecog_2022JP extends CharsetRecog_2022 {
+        private byte[][] escapeSequences = {
+                {0x1b, 0x24, 0x28, 0x43},   // KS X 1001:1992
+                {0x1b, 0x24, 0x28, 0x44},   // JIS X 212-1990
+                {0x1b, 0x24, 0x40},         // JIS C 6226-1978
+                {0x1b, 0x24, 0x41},         // GB 2312-80
+                {0x1b, 0x24, 0x42},         // JIS X 208-1983
+                {0x1b, 0x26, 0x40},         // JIS X 208 1990, 1997
+                {0x1b, 0x28, 0x42},         // ASCII
+                {0x1b, 0x28, 0x48},         // JIS-Roman
+                {0x1b, 0x28, 0x49},         // Half-width katakana
+                {0x1b, 0x28, 0x4a},         // JIS-Roman
+                {0x1b, 0x2e, 0x41},         // ISO 8859-1
+                {0x1b, 0x2e, 0x46}          // ISO 8859-7
+        };
+
+        String getName() {
+            return "ISO-2022-JP";
+        }
+
+        int match(CharsetDetector det) {
+            return match(det.fInputBytes, det.fInputLen, escapeSequences);
+        }
+    }
+
+    static class CharsetRecog_2022KR extends CharsetRecog_2022 {
+        private byte[][] escapeSequences = {
+                {0x1b, 0x24, 0x29, 0x43}
+        };
+
+        String getName() {
+            return "ISO-2022-KR";
+        }
+
+        int match(CharsetDetector det) {
+            return match(det.fInputBytes, det.fInputLen, escapeSequences);
+        }
+
+    }
+
+    static class CharsetRecog_2022CN extends CharsetRecog_2022 {
+        private byte[][] escapeSequences = {
+                {0x1b, 0x24, 0x29, 0x41},   // GB 2312-80
+                {0x1b, 0x24, 0x29, 0x47},   // CNS 11643-1992 Plane 1
+                {0x1b, 0x24, 0x2A, 0x48},   // CNS 11643-1992 Plane 2
+                {0x1b, 0x24, 0x29, 0x45},   // ISO-IR-165
+                {0x1b, 0x24, 0x2B, 0x49},   // CNS 11643-1992 Plane 3
+                {0x1b, 0x24, 0x2B, 0x4A},   // CNS 11643-1992 Plane 4
+                {0x1b, 0x24, 0x2B, 0x4B},   // CNS 11643-1992 Plane 5
+                {0x1b, 0x24, 0x2B, 0x4C},   // CNS 11643-1992 Plane 6
+                {0x1b, 0x24, 0x2B, 0x4D},   // CNS 11643-1992 Plane 7
+                {0x1b, 0x4e},               // SS2
+                {0x1b, 0x4f},               // SS3
+        };
+
+        String getName() {
+            return "ISO-2022-CN";
+        }
+
+
+        int match(CharsetDetector det) {
+            return match(det.fInputBytes, det.fInputLen, escapeSequences);
+        }
+    }
+
+}
+

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
index ad69fa0..55a3957 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
@@ -1,99 +1,99 @@
-/**
- * ******************************************************************************
- * Copyright (C) 2005 - 2007, International Business Machines Corporation and  *
- * others. All Rights Reserved.                                                *
- * ******************************************************************************
- */
-package org.apache.tika.parser.txt;
-
-/**
- * Charset recognizer for UTF-8
- *
- * @internal
- */
-class CharsetRecog_UTF8 extends CharsetRecognizer {
-
-    String getName() {
-        return "UTF-8";
-    }
-
-    /* (non-Javadoc)
-     * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
-     */
-    int match(CharsetDetector det) {
-        boolean hasBOM = false;
-        int numValid = 0;
-        int numInvalid = 0;
-        byte input[] = det.fRawInput;
-        int i;
-        int trailBytes = 0;
-        int confidence;
-
-        if (det.fRawLength >= 3 &&
-                (input[0] & 0xFF) == 0xef && (input[1] & 0xFF) == 0xbb && (input[2] & 0xFF) == 0xbf) {
-            hasBOM = true;
-        }
-
-        // Scan for multi-byte sequences
-        for (i = 0; i < det.fRawLength; i++) {
-            int b = input[i];
-            if ((b & 0x80) == 0) {
-                continue;   // ASCII
-            }
-
-            // Hi bit on char found.  Figure out how long the sequence should be
-            if ((b & 0x0e0) == 0x0c0) {
-                trailBytes = 1;
-            } else if ((b & 0x0f0) == 0x0e0) {
-                trailBytes = 2;
-            } else if ((b & 0x0f8) == 0xf0) {
-                trailBytes = 3;
-            } else {
-                numInvalid++;
-                if (numInvalid > 5) {
-                    break;
-                }
-                trailBytes = 0;
-            }
-
-            // Verify that we've got the right number of trail bytes in the sequence
-            for (; ; ) {
-                i++;
-                if (i >= det.fRawLength) {
-                    break;
-                }
-                b = input[i];
-                if ((b & 0xc0) != 0x080) {
-                    numInvalid++;
-                    break;
-                }
-                if (--trailBytes == 0) {
-                    numValid++;
-                    break;
-                }
-            }
-
-        }
-
-        // Cook up some sort of confidence score, based on presense of a BOM
-        //    and the existence of valid and/or invalid multi-byte sequences.
-        confidence = 0;
-        if (hasBOM && numInvalid == 0) {
-            confidence = 100;
-        } else if (hasBOM && numValid > numInvalid * 10) {
-            confidence = 80;
-        } else if (numValid > 3 && numInvalid == 0) {
-            confidence = 100;
-        } else if (numValid > 0 && numInvalid == 0) {
-            confidence = 80;
-        } else if (numValid == 0 && numInvalid == 0) {
-            // Plain ASCII.  
-            confidence = 10;
-        } else if (numValid > numInvalid * 10) {
-            // Probably corruput utf-8 data.  Valid sequences aren't likely by chance.
-            confidence = 25;
-        }
-        return confidence;
-    }
-
-}
+/**
+ * ******************************************************************************
+ * Copyright (C) 2005 - 2007, International Business Machines Corporation and  *
+ * others. All Rights Reserved.                                                *
+ * ******************************************************************************
+ */
+package org.apache.tika.parser.txt;
+
+/**
+ * Charset recognizer for UTF-8
+ *
+ * @internal
+ */
+class CharsetRecog_UTF8 extends CharsetRecognizer {
+
+    String getName() {
+        return "UTF-8";
+    }
+
+    /* (non-Javadoc)
+     * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
+     */
+    int match(CharsetDetector det) {
+        boolean hasBOM = false;
+        int numValid = 0;
+        int numInvalid = 0;
+        byte input[] = det.fRawInput;
+        int i;
+        int trailBytes = 0;
+        int confidence;
+
+        if (det.fRawLength >= 3 &&
+                (input[0] & 0xFF) == 0xef && (input[1] & 0xFF) == 0xbb && (input[2] & 0xFF) == 0xbf) {
+            hasBOM = true;
+        }
+
+        // Scan for multi-byte sequences
+        for (i = 0; i < det.fRawLength; i++) {
+            int b = input[i];
+            if ((b & 0x80) == 0) {
+                continue;   // ASCII
+            }
+
+            // Hi bit on char found.  Figure out how long the sequence should be
+            if ((b & 0x0e0) == 0x0c0) {
+                trailBytes = 1;
+            } else if ((b & 0x0f0) == 0x0e0) {
+                trailBytes = 2;
+            } else if ((b & 0x0f8) == 0xf0) {
+                trailBytes = 3;
+            } else {
+                numInvalid++;
+                if (numInvalid > 5) {
+                    break;
+                }
+                trailBytes = 0;
+            }
+
+            // Verify that we've got the right number of trail bytes in the sequence
+            for (; ; ) {
+                i++;
+                if (i >= det.fRawLength) {
+                    break;
+                }
+                b = input[i];
+                if ((b & 0xc0) != 0x080) {
+                    numInvalid++;
+                    break;
+                }
+                if (--trailBytes == 0) {
+                    numValid++;
+                    break;
+                }
+            }
+
+        }
+
+        // Cook up some sort of confidence score, based on presense of a BOM
+        //    and the existence of valid and/or invalid multi-byte sequences.
+        confidence = 0;
+        if (hasBOM && numInvalid == 0) {
+            confidence = 100;
+        } else if (hasBOM && numValid > numInvalid * 10) {
+            confidence = 80;
+        } else if (numValid > 3 && numInvalid == 0) {
+            confidence = 100;
+        } else if (numValid > 0 && numInvalid == 0) {
+            confidence = 80;
+        } else if (numValid == 0 && numInvalid == 0) {
+            // Plain ASCII.  
+            confidence = 10;
+        } else if (numValid > numInvalid * 10) {
+            // Probably corruput utf-8 data.  Valid sequences aren't likely by chance.
+            confidence = 25;
+        }
+        return confidence;
+    }
+
+}

[36/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-advanced-module/src/main/java/org/apache/tika/module/advanced/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-advanced-module/src/main/java/org/apache/tika/module/advanced/internal/Activator.java b/tika-parser-modules/tika-parser-advanced-module/src/main/java/org/apache/tika/module/advanced/internal/Activator.java
index cc22347..1695859 100644
--- a/tika-parser-modules/tika-parser-advanced-module/src/main/java/org/apache/tika/module/advanced/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-advanced-module/src/main/java/org/apache/tika/module/advanced/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.advanced.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
-    @Override
-    public void start(BundleContext context) throws Exception {
-
-        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
-    }
-
-    @Override
-    public void stop(BundleContext context) throws Exception {
-
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.advanced.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+    @Override
+    public void start(BundleContext context) throws Exception {
+
+        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+    }
+
+    @Override
+    public void stop(BundleContext context) throws Exception {
+
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-cad-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-cad-module/pom.xml b/tika-parser-modules/tika-parser-cad-module/pom.xml
index 6e7efb6..a9f8f31 100644
--- a/tika-parser-modules/tika-parser-cad-module/pom.xml
+++ b/tika-parser-modules/tika-parser-cad-module/pom.xml
@@ -1,56 +1,56 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-modules</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-cad-module</artifactId>
-  <name>Apache Tika parser CAD module</name>
-  <url>http://tika.apache.org/</url>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-core</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    
-    <dependency>
-      <groupId>commons-lang</groupId>
-      <artifactId>commons-lang</artifactId>
-      <version>2.6</version>
-    </dependency>
-
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-text-module</artifactId>
-      <version>${project.version}</version>
-      <scope>test</scope>
-    </dependency>
-  </dependencies>
-  
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-modules</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-cad-module</artifactId>
+  <name>Apache Tika parser CAD module</name>
+  <url>http://tika.apache.org/</url>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    
+    <dependency>
+      <groupId>commons-lang</groupId>
+      <artifactId>commons-lang</artifactId>
+      <version>2.6</version>
+    </dependency>
+
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-text-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+  
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/module/cad/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/module/cad/internal/Activator.java b/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/module/cad/internal/Activator.java
index 4a23b73..29a099c 100644
--- a/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/module/cad/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/module/cad/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.cad.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
-    @Override
-    public void start(BundleContext context) throws Exception {
-
-        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
-    }
-
-    @Override
-    public void stop(BundleContext context) throws Exception {
-
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.cad.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+    @Override
+    public void start(BundleContext context) throws Exception {
+
+        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+    }
+
+    @Override
+    public void stop(BundleContext context) throws Exception {
+
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java b/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
index 3f29c1f..875c4ee 100644
--- a/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
+++ b/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
@@ -1,356 +1,356 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.dwg;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.io.StringUtil;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.EndianUtils;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * DWG (CAD Drawing) parser. This is a very basic parser, which just
- *  looks for bits of the headers.
- * Note that we use Apache POI for various parts of the processing, as
- *  lots of the low level string/int/short concepts are the same.
- */
-public class DWGParser extends AbstractParser {
-
-    /** Serial version UID */
-    private static final long serialVersionUID = -7744232583079169119L;
-
-    private static MediaType TYPE = MediaType.image("vnd.dwg");
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return Collections.singleton(TYPE);
-    }
-
-    /** The order of the fields in the header */
-    private static final Property[] HEADER_PROPERTIES_ENTRIES = {
-        TikaCoreProperties.TITLE, 
-        TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION,
-        TikaCoreProperties.CREATOR,
-        TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT,
-        TikaCoreProperties.COMMENTS,
-        TikaCoreProperties.MODIFIER,
-        null, // Unknown?
-        TikaCoreProperties.RELATION, // Hyperlink
-    };
-
-    /** For the 2000 file, they're indexed */
-    private static final Property[] HEADER_2000_PROPERTIES_ENTRIES = {
-       null, 
-       TikaCoreProperties.RELATION, // 0x01
-       TikaCoreProperties.TITLE,    // 0x02
-       TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION,  // 0x03
-       TikaCoreProperties.CREATOR,   // 0x04
-       null,
-       TikaCoreProperties.COMMENTS,// 0x06 
-       TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT,    // 0x07
-       TikaCoreProperties.MODIFIER, // 0x08
-   };
-
-    private static final String HEADER_2000_PROPERTIES_MARKER_STR =
-            "DWGPROPS COOKIE";
-
-    private static final byte[] HEADER_2000_PROPERTIES_MARKER =
-            new byte[HEADER_2000_PROPERTIES_MARKER_STR.length()];
-
-    static {
-        StringUtil.putCompressedUnicode(
-                HEADER_2000_PROPERTIES_MARKER_STR,
-                HEADER_2000_PROPERTIES_MARKER, 0);
-    }
-
-    /** 
-     * How far to skip after the last standard property, before
-     *  we find any custom properties that might be there.
-     */
-    private static final int CUSTOM_PROPERTIES_SKIP = 20;
-    
-    /** 
-     * The value of padding bytes other than 0 in some DWG files.
-     */
-    private static final int[] CUSTOM_PROPERTIES_ALT_PADDING_VALUES = new int[] {0x2, 0, 0, 0};
-
-    public void parse(
-            InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
-            throws IOException, TikaException, SAXException {
-        // First up, which version of the format are we handling?
-        byte[] header = new byte[128];
-        IOUtils.readFully(stream, header);
-        String version = new String(header, 0, 6, "US-ASCII");
-
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-        xhtml.startDocument();
-
-        if (version.equals("AC1015")) {
-            metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
-            if (skipTo2000PropertyInfoSection(stream, header)) {
-                get2000Props(stream,metadata,xhtml);
-            }
-        } else if (version.equals("AC1018")) {
-            metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
-            if (skipToPropertyInfoSection(stream, header)) {
-                get2004Props(stream,metadata,xhtml);
-            }
-        } else if (version.equals("AC1021") || version.equals("AC1024")) {
-            metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
-            if (skipToPropertyInfoSection(stream, header)) {
-                get2007and2010Props(stream,metadata,xhtml);
-            }
-        } else {
-            throw new TikaException(
-                    "Unsupported AutoCAD drawing version: " + version);
-        }
-
-        xhtml.endDocument();
-    }
-
-    /**
-     * Stored as US-ASCII
-     */
-    private void get2004Props(
-            InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
-            throws IOException, TikaException, SAXException {
-       // Standard properties
-        for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
-            String headerValue = read2004String(stream);
-            handleHeader(i, headerValue, metadata, xhtml);
-        }
-
-        // Custom properties
-        int customCount = skipToCustomProperties(stream);
-        for (int i = 0; i < customCount; i++) {
-           String propName = read2004String(stream);
-           String propValue = read2004String(stream);
-           if(propName.length() > 0 && propValue.length() > 0) {
-              metadata.add(propName, propValue);
-           }
-        }
-    }
-
-    private String read2004String(InputStream stream) throws IOException, TikaException {
-       int stringLen = EndianUtils.readUShortLE(stream);
-
-       byte[] stringData = new byte[stringLen];
-       IOUtils.readFully(stream, stringData);
-
-       // Often but not always null terminated
-       if (stringData[stringLen-1] == 0) {
-           stringLen--;
-       }
-       String value = StringUtil.getFromCompressedUnicode(stringData, 0, stringLen);
-       return value;
-    }
-
-    /**
-     * Stored as UCS2, so 16 bit "unicode"
-     */
-    private void get2007and2010Props(
-            InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
-            throws IOException, TikaException, SAXException {
-        // Standard properties
-        for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
-            String headerValue = read2007and2010String(stream);
-            handleHeader(i, headerValue, metadata, xhtml);
-        }
-
-        // Custom properties
-        int customCount = skipToCustomProperties(stream);
-        for (int i = 0; i < customCount; i++) {
-           String propName = read2007and2010String(stream);
-           String propValue = read2007and2010String(stream);
-           if(propName.length() > 0 && propValue.length() > 0) {
-              metadata.add(propName, propValue);
-           }
-        }
-    }
-
-    private String read2007and2010String(InputStream stream) throws IOException, TikaException {
-       int stringLen = EndianUtils.readUShortLE(stream);
-
-       byte[] stringData = new byte[stringLen * 2];
-       IOUtils.readFully(stream, stringData);
-       String value = StringUtil.getFromUnicodeLE(stringData);
-
-       // Some strings are null terminated
-       if(value.charAt(value.length()-1) == 0) {
-           value = value.substring(0, value.length()-1);
-       }
-
-       return value;
-    }
-
-    private void get2000Props(
-            InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
-            throws IOException, TikaException, SAXException {
-        int propCount = 0;
-        while(propCount < 30) {
-            int propIdx = EndianUtils.readUShortLE(stream);
-            int length = EndianUtils.readUShortLE(stream);
-            int valueType = stream.read();
-            
-            if(propIdx == 0x28) {
-               // This one seems not to follow the pattern
-               length = 0x19;
-            } else if(propIdx == 90) {
-               // We think this means the end of properties
-               break;
-            }
-
-            byte[] value = new byte[length];
-            IOUtils.readFully(stream, value);
-            if(valueType == 0x1e) {
-                // Normal string, good
-                String val = StringUtil.getFromCompressedUnicode(value, 0, length);
-                
-                // Is it one we can look up by index?
-                if(propIdx < HEADER_2000_PROPERTIES_ENTRIES.length) {
-                   metadata.add(HEADER_2000_PROPERTIES_ENTRIES[propIdx], val);
-                   xhtml.element("p", val);
-                } else if(propIdx == 0x012c) {
-                   int splitAt = val.indexOf('='); 
-                   if(splitAt > -1) {
-                      String propName = val.substring(0, splitAt);
-                      String propVal = val.substring(splitAt+1);
-                      metadata.add(propName, propVal);
-                   }
-                }
-            } else {
-                // No idea...
-            }
-            
-            propCount++;
-        }
-    }
-
-    private void handleHeader(
-            int headerNumber, String value, Metadata metadata,
-            XHTMLContentHandler xhtml) throws SAXException {
-        if(value == null || value.length() == 0) {
-            return;
-        }
-
-        Property headerProp = HEADER_PROPERTIES_ENTRIES[headerNumber];
-        if(headerProp != null) {
-            metadata.set(headerProp, value);
-        }
-
-        xhtml.element("p", value);
-    }
-
-    /**
-     * Grab the offset, then skip there
-     */
-    private boolean skipToPropertyInfoSection(InputStream stream, byte[] header)
-            throws IOException, TikaException {
-        // The offset is stored in the header from 0x20 onwards
-        long offsetToSection = EndianUtils.getLongLE(header, 0x20);
-        
-        // Sanity check the offset. Some files seem to use a different format,
-        //  and the offset isn't available at 0x20. Until we can work out how
-        //  to find the offset in those files, skip them if detected
-        if (offsetToSection > 0xa00000l) {
-           // Header should never be more than 10mb into the file, something is wrong
-           offsetToSection = 0;
-        }
-        
-        // Work out how far to skip, and sanity check
-        long toSkip = offsetToSection - header.length;
-        if(offsetToSection == 0){
-            return false;
-        }        
-        while (toSkip > 0) {
-            byte[] skip = new byte[Math.min((int) toSkip, 0x4000)];
-            IOUtils.readFully(stream, skip);
-            toSkip -= skip.length;
-        }
-        return true;
-    }
-
-    /**
-     * We think it can be anywhere...
-     */
-    private boolean skipTo2000PropertyInfoSection(InputStream stream, byte[] header)
-            throws IOException {
-       int val = 0;
-       while(val != -1) {
-          val = stream.read();
-          if(val == HEADER_2000_PROPERTIES_MARKER[0]) {
-             boolean going = true;
-             for(int i=1; i<HEADER_2000_PROPERTIES_MARKER.length && going; i++) {
-                val = stream.read();
-                if(val != HEADER_2000_PROPERTIES_MARKER[i]) going = false;
-             }
-             if(going) {
-                // Bingo, found it
-                return true;
-             }
-          }
-       }
-       return false;
-    }
-
-    private int skipToCustomProperties(InputStream stream) 
-            throws IOException, TikaException {
-       // There should be 4 zero bytes or CUSTOM_PROPERTIES_ALT_PADDING_VALUES next
-       byte[] padding = new byte[4];
-       IOUtils.readFully(stream, padding);
-       if((padding[0] == 0 && padding[1] == 0 &&
-             padding[2] == 0 && padding[3] == 0) ||
-             (padding[0] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[0] && 
-               padding[1] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[1] &&
-               padding[2] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[2] &&
-               padding[3] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[3])) {
-           
-          // Looks hopeful, skip on
-          padding = new byte[CUSTOM_PROPERTIES_SKIP];
-          IOUtils.readFully(stream, padding);
-          
-          // We should now have the count
-          int count = EndianUtils.readUShortLE(stream);
-          
-          // Sanity check it
-          if(count > 0 && count < 0x7f) {
-             // Looks plausible
-             return count;
-          } else {
-             // No properties / count is too high to trust
-             return 0;
-          }
-       } else {
-          // No padding. That probably means no custom props
-          return 0;
-       }
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dwg;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.io.StringUtil;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.EndianUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * DWG (CAD Drawing) parser. This is a very basic parser, which just
+ *  looks for bits of the headers.
+ * Note that we use Apache POI for various parts of the processing, as
+ *  lots of the low level string/int/short concepts are the same.
+ */
+public class DWGParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = -7744232583079169119L;
+
+    private static MediaType TYPE = MediaType.image("vnd.dwg");
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return Collections.singleton(TYPE);
+    }
+
+    /** The order of the fields in the header */
+    private static final Property[] HEADER_PROPERTIES_ENTRIES = {
+        TikaCoreProperties.TITLE, 
+        TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION,
+        TikaCoreProperties.CREATOR,
+        TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT,
+        TikaCoreProperties.COMMENTS,
+        TikaCoreProperties.MODIFIER,
+        null, // Unknown?
+        TikaCoreProperties.RELATION, // Hyperlink
+    };
+
+    /** For the 2000 file, they're indexed */
+    private static final Property[] HEADER_2000_PROPERTIES_ENTRIES = {
+       null, 
+       TikaCoreProperties.RELATION, // 0x01
+       TikaCoreProperties.TITLE,    // 0x02
+       TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION,  // 0x03
+       TikaCoreProperties.CREATOR,   // 0x04
+       null,
+       TikaCoreProperties.COMMENTS,// 0x06 
+       TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT,    // 0x07
+       TikaCoreProperties.MODIFIER, // 0x08
+   };
+
+    private static final String HEADER_2000_PROPERTIES_MARKER_STR =
+            "DWGPROPS COOKIE";
+
+    private static final byte[] HEADER_2000_PROPERTIES_MARKER =
+            new byte[HEADER_2000_PROPERTIES_MARKER_STR.length()];
+
+    static {
+        StringUtil.putCompressedUnicode(
+                HEADER_2000_PROPERTIES_MARKER_STR,
+                HEADER_2000_PROPERTIES_MARKER, 0);
+    }
+
+    /** 
+     * How far to skip after the last standard property, before
+     *  we find any custom properties that might be there.
+     */
+    private static final int CUSTOM_PROPERTIES_SKIP = 20;
+    
+    /** 
+     * The value of padding bytes other than 0 in some DWG files.
+     */
+    private static final int[] CUSTOM_PROPERTIES_ALT_PADDING_VALUES = new int[] {0x2, 0, 0, 0};
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, TikaException, SAXException {
+        // First up, which version of the format are we handling?
+        byte[] header = new byte[128];
+        IOUtils.readFully(stream, header);
+        String version = new String(header, 0, 6, "US-ASCII");
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        if (version.equals("AC1015")) {
+            metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+            if (skipTo2000PropertyInfoSection(stream, header)) {
+                get2000Props(stream,metadata,xhtml);
+            }
+        } else if (version.equals("AC1018")) {
+            metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+            if (skipToPropertyInfoSection(stream, header)) {
+                get2004Props(stream,metadata,xhtml);
+            }
+        } else if (version.equals("AC1021") || version.equals("AC1024")) {
+            metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+            if (skipToPropertyInfoSection(stream, header)) {
+                get2007and2010Props(stream,metadata,xhtml);
+            }
+        } else {
+            throw new TikaException(
+                    "Unsupported AutoCAD drawing version: " + version);
+        }
+
+        xhtml.endDocument();
+    }
+
+    /**
+     * Stored as US-ASCII
+     */
+    private void get2004Props(
+            InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
+            throws IOException, TikaException, SAXException {
+       // Standard properties
+        for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
+            String headerValue = read2004String(stream);
+            handleHeader(i, headerValue, metadata, xhtml);
+        }
+
+        // Custom properties
+        int customCount = skipToCustomProperties(stream);
+        for (int i = 0; i < customCount; i++) {
+           String propName = read2004String(stream);
+           String propValue = read2004String(stream);
+           if(propName.length() > 0 && propValue.length() > 0) {
+              metadata.add(propName, propValue);
+           }
+        }
+    }
+
+    private String read2004String(InputStream stream) throws IOException, TikaException {
+       int stringLen = EndianUtils.readUShortLE(stream);
+
+       byte[] stringData = new byte[stringLen];
+       IOUtils.readFully(stream, stringData);
+
+       // Often but not always null terminated
+       if (stringData[stringLen-1] == 0) {
+           stringLen--;
+       }
+       String value = StringUtil.getFromCompressedUnicode(stringData, 0, stringLen);
+       return value;
+    }
+
+    /**
+     * Stored as UCS2, so 16 bit "unicode"
+     */
+    private void get2007and2010Props(
+            InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
+            throws IOException, TikaException, SAXException {
+        // Standard properties
+        for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
+            String headerValue = read2007and2010String(stream);
+            handleHeader(i, headerValue, metadata, xhtml);
+        }
+
+        // Custom properties
+        int customCount = skipToCustomProperties(stream);
+        for (int i = 0; i < customCount; i++) {
+           String propName = read2007and2010String(stream);
+           String propValue = read2007and2010String(stream);
+           if(propName.length() > 0 && propValue.length() > 0) {
+              metadata.add(propName, propValue);
+           }
+        }
+    }
+
+    private String read2007and2010String(InputStream stream) throws IOException, TikaException {
+       int stringLen = EndianUtils.readUShortLE(stream);
+
+       byte[] stringData = new byte[stringLen * 2];
+       IOUtils.readFully(stream, stringData);
+       String value = StringUtil.getFromUnicodeLE(stringData);
+
+       // Some strings are null terminated
+       if(value.charAt(value.length()-1) == 0) {
+           value = value.substring(0, value.length()-1);
+       }
+
+       return value;
+    }
+
+    private void get2000Props(
+            InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
+            throws IOException, TikaException, SAXException {
+        int propCount = 0;
+        while(propCount < 30) {
+            int propIdx = EndianUtils.readUShortLE(stream);
+            int length = EndianUtils.readUShortLE(stream);
+            int valueType = stream.read();
+            
+            if(propIdx == 0x28) {
+               // This one seems not to follow the pattern
+               length = 0x19;
+            } else if(propIdx == 90) {
+               // We think this means the end of properties
+               break;
+            }
+
+            byte[] value = new byte[length];
+            IOUtils.readFully(stream, value);
+            if(valueType == 0x1e) {
+                // Normal string, good
+                String val = StringUtil.getFromCompressedUnicode(value, 0, length);
+                
+                // Is it one we can look up by index?
+                if(propIdx < HEADER_2000_PROPERTIES_ENTRIES.length) {
+                   metadata.add(HEADER_2000_PROPERTIES_ENTRIES[propIdx], val);
+                   xhtml.element("p", val);
+                } else if(propIdx == 0x012c) {
+                   int splitAt = val.indexOf('='); 
+                   if(splitAt > -1) {
+                      String propName = val.substring(0, splitAt);
+                      String propVal = val.substring(splitAt+1);
+                      metadata.add(propName, propVal);
+                   }
+                }
+            } else {
+                // No idea...
+            }
+            
+            propCount++;
+        }
+    }
+
+    private void handleHeader(
+            int headerNumber, String value, Metadata metadata,
+            XHTMLContentHandler xhtml) throws SAXException {
+        if(value == null || value.length() == 0) {
+            return;
+        }
+
+        Property headerProp = HEADER_PROPERTIES_ENTRIES[headerNumber];
+        if(headerProp != null) {
+            metadata.set(headerProp, value);
+        }
+
+        xhtml.element("p", value);
+    }
+
+    /**
+     * Grab the offset, then skip there
+     */
+    private boolean skipToPropertyInfoSection(InputStream stream, byte[] header)
+            throws IOException, TikaException {
+        // The offset is stored in the header from 0x20 onwards
+        long offsetToSection = EndianUtils.getLongLE(header, 0x20);
+        
+        // Sanity check the offset. Some files seem to use a different format,
+        //  and the offset isn't available at 0x20. Until we can work out how
+        //  to find the offset in those files, skip them if detected
+        if (offsetToSection > 0xa00000l) {
+           // Header should never be more than 10mb into the file, something is wrong
+           offsetToSection = 0;
+        }
+        
+        // Work out how far to skip, and sanity check
+        long toSkip = offsetToSection - header.length;
+        if(offsetToSection == 0){
+            return false;
+        }        
+        while (toSkip > 0) {
+            byte[] skip = new byte[Math.min((int) toSkip, 0x4000)];
+            IOUtils.readFully(stream, skip);
+            toSkip -= skip.length;
+        }
+        return true;
+    }
+
+    /**
+     * We think it can be anywhere...
+     */
+    private boolean skipTo2000PropertyInfoSection(InputStream stream, byte[] header)
+            throws IOException {
+       int val = 0;
+       while(val != -1) {
+          val = stream.read();
+          if(val == HEADER_2000_PROPERTIES_MARKER[0]) {
+             boolean going = true;
+             for(int i=1; i<HEADER_2000_PROPERTIES_MARKER.length && going; i++) {
+                val = stream.read();
+                if(val != HEADER_2000_PROPERTIES_MARKER[i]) going = false;
+             }
+             if(going) {
+                // Bingo, found it
+                return true;
+             }
+          }
+       }
+       return false;
+    }
+
+    private int skipToCustomProperties(InputStream stream) 
+            throws IOException, TikaException {
+       // There should be 4 zero bytes or CUSTOM_PROPERTIES_ALT_PADDING_VALUES next
+       byte[] padding = new byte[4];
+       IOUtils.readFully(stream, padding);
+       if((padding[0] == 0 && padding[1] == 0 &&
+             padding[2] == 0 && padding[3] == 0) ||
+             (padding[0] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[0] && 
+               padding[1] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[1] &&
+               padding[2] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[2] &&
+               padding[3] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[3])) {
+           
+          // Looks hopeful, skip on
+          padding = new byte[CUSTOM_PROPERTIES_SKIP];
+          IOUtils.readFully(stream, padding);
+          
+          // We should now have the count
+          int count = EndianUtils.readUShortLE(stream);
+          
+          // Sanity check it
+          if(count > 0 && count < 0x7f) {
+             // Looks plausible
+             return count;
+          } else {
+             // No properties / count is too high to trust
+             return 0;
+          }
+       } else {
+          // No padding. That probably means no custom props
+          return 0;
+       }
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-code-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-code-module/pom.xml b/tika-parser-modules/tika-parser-code-module/pom.xml
index cf59c0e..5d33f82 100644
--- a/tika-parser-modules/tika-parser-code-module/pom.xml
+++ b/tika-parser-modules/tika-parser-code-module/pom.xml
@@ -1,69 +1,69 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-modules</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-code-module</artifactId>
-  <name>Apache Tika parser code module</name>
-  <url>http://tika.apache.org/</url>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-core</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.ow2.asm</groupId>
-      <artifactId>asm</artifactId>
-      <version>5.0.4</version>
-    </dependency>
-    <dependency>
-      <groupId>org.codelibs</groupId>
-      <artifactId>jhighlight</artifactId>
-      <version>1.0.2</version>
-    </dependency>
-    <dependency>
-      <groupId>org.ccil.cowan.tagsoup</groupId>
-      <artifactId>tagsoup</artifactId>
-      <version>1.2.1</version>
-    </dependency>
-    <dependency>
-      <groupId>commons-io</groupId>
-      <artifactId>commons-io</artifactId>
-      <version>${commons.io.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-text-module</artifactId>
-      <version>${project.version}</version>
-      <scope>test</scope>
-    </dependency>
-  </dependencies>
-  
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-modules</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-code-module</artifactId>
+  <name>Apache Tika parser code module</name>
+  <url>http://tika.apache.org/</url>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.ow2.asm</groupId>
+      <artifactId>asm</artifactId>
+      <version>5.0.4</version>
+    </dependency>
+    <dependency>
+      <groupId>org.codelibs</groupId>
+      <artifactId>jhighlight</artifactId>
+      <version>1.0.2</version>
+    </dependency>
+    <dependency>
+      <groupId>org.ccil.cowan.tagsoup</groupId>
+      <artifactId>tagsoup</artifactId>
+      <version>1.2.1</version>
+    </dependency>
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <version>${commons.io.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-text-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+  
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/module/code/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/module/code/internal/Activator.java b/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/module/code/internal/Activator.java
index 040618d..095e643 100644
--- a/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/module/code/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/module/code/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.code.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
-    @Override
-    public void start(BundleContext context) throws Exception {
-
-        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
-    }
-
-    @Override
-    public void stop(BundleContext context) throws Exception {
-
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.code.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+    @Override
+    public void start(BundleContext context) throws Exception {
+
+        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+    }
+
+    @Override
+    public void stop(BundleContext context) throws Exception {
+
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java b/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java
index 48f8cbf..481046f 100644
--- a/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java
+++ b/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java
@@ -1,54 +1,54 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.asm;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Parser for Java .class files.
- */
-public class ClassParser extends AbstractParser {
-
-    /** Serial version UID */
-    private static final long serialVersionUID = -3531388963354454357L;
-
-    private static final Set<MediaType> SUPPORTED_TYPES =
-        Collections.singleton(MediaType.application("java-vm"));
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-    public void parse(
-            InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-        new XHTMLClassVisitor(handler, metadata).parse(stream);
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.asm;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for Java .class files.
+ */
+public class ClassParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = -3531388963354454357L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+        Collections.singleton(MediaType.application("java-vm"));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        new XHTMLClassVisitor(handler, metadata).parse(stream);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java b/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java
index 03deb43..c8ea317 100644
--- a/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java
+++ b/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java
@@ -1,323 +1,323 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.asm;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.objectweb.asm.AnnotationVisitor;
-import org.objectweb.asm.Attribute;
-import org.objectweb.asm.ClassReader;
-import org.objectweb.asm.ClassVisitor;
-import org.objectweb.asm.FieldVisitor;
-import org.objectweb.asm.MethodVisitor;
-import org.objectweb.asm.Opcodes;
-import org.objectweb.asm.Type;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Class visitor that generates XHTML SAX events to describe the
- * contents of the visited class.
- */
-class XHTMLClassVisitor extends ClassVisitor {
-
-    private final XHTMLContentHandler xhtml;
-
-    private final Metadata metadata;
-
-    private Type type;
-
-    private String packageName;
-
-    public XHTMLClassVisitor(ContentHandler handler, Metadata metadata) {
-        super(Opcodes.ASM5);
-        this.xhtml = new XHTMLContentHandler(handler, metadata);
-        this.metadata = metadata;
-    }
-
-    public void parse(InputStream stream)
-            throws TikaException, SAXException, IOException {
-        try {
-            ClassReader reader = new ClassReader(stream);
-            reader.accept(this, ClassReader.SKIP_FRAMES | ClassReader.SKIP_CODE);
-        } catch (RuntimeException e) {
-            if (e.getCause() instanceof SAXException) {
-                throw (SAXException) e.getCause();
-            } else {
-                throw new TikaException("Failed to parse a Java class", e);
-            }
-        }
-    }
-
-    public void visit(
-            int version, int access, String name, String signature,
-            String superName, String[] interfaces) {
-        type = Type.getObjectType(name);
-
-        String className = type.getClassName();
-        int dot = className.lastIndexOf('.');
-        if (dot != -1) {
-            packageName = className.substring(0, dot);
-            className = className.substring(dot + 1);
-        }
-
-        metadata.set(TikaCoreProperties.TITLE, className);
-        metadata.set(Metadata.RESOURCE_NAME_KEY, className + ".class");
-
-        try {
-            xhtml.startDocument();
-            xhtml.startElement("pre");
-
-            if (packageName != null) {
-                writeKeyword("package");
-                xhtml.characters(" " + packageName + ";\n");
-            }
-
-            writeAccess(access);
-            if (isSet(access, Opcodes.ACC_INTERFACE)) {
-                writeKeyword("interface");
-                writeSpace();
-                writeType(type);
-                writeSpace();
-                writeInterfaces("extends", interfaces);
-            } else if (isSet(access, Opcodes.ACC_ENUM)) {
-                writeKeyword("enum");
-                writeSpace();
-                writeType(type);
-                writeSpace();
-            } else {
-                writeKeyword("class");
-                writeSpace();
-                writeType(type);
-                writeSpace();
-                if (superName != null) {
-                    Type superType = Type.getObjectType(superName);
-                    if (!superType.getClassName().equals("java.lang.Object")) {
-                        writeKeyword("extends");
-                        writeSpace();
-                        writeType(superType);
-                        writeSpace();
-                    }
-                }
-                writeInterfaces("implements", interfaces);
-            }
-            xhtml.characters("{\n");
-        } catch (SAXException e) {
-            throw new RuntimeException(e);
-        }
-    }
-
-    private void writeInterfaces(String keyword, String[] interfaces)
-            throws SAXException {
-        if (interfaces != null && interfaces.length > 0) {
-            writeKeyword(keyword);
-            String separator = " ";
-            for (String iface : interfaces) {
-                xhtml.characters(separator);
-                writeType(Type.getObjectType(iface));
-                separator = ", ";
-            }
-            writeSpace();
-        }
-    }
-
-    public void visitEnd() {
-        try {
-            xhtml.characters("}\n");
-            xhtml.endElement("pre");
-            xhtml.endDocument();
-        } catch (SAXException e) {
-            throw new RuntimeException(e);
-        }
-    }
-
-    /**
-     * Ignored.
-     */
-    public void visitOuterClass(String owner, String name, String desc) {
-    }
-
-    /**
-     * Ignored.
-     */
-    public void visitSource(String source, String debug) {
-    }
-
-
-    /**
-     * Ignored.
-     */
-    public AnnotationVisitor visitAnnotation(String desc, boolean visible) {
-        return null;
-    }
-
-    /**
-     * Ignored.
-     */
-    public void visitAttribute(Attribute attr) {
-    }
-
-    /**
-     * Ignored.
-     */
-    public void visitInnerClass(
-            String name, String outerName, String innerName, int access) {
-    }
-
-    /**
-     * Visits a field.
-     */
-    public FieldVisitor visitField(
-            int access, String name, String desc, String signature,
-            Object value) {
-        if (!isSet(access, Opcodes.ACC_SYNTHETIC)) {
-            try {
-                xhtml.characters("    ");
-                writeAccess(access);
-                writeType(Type.getType(desc));
-                writeSpace();
-                writeIdentifier(name);
-
-                if (isSet(access, Opcodes.ACC_STATIC) && value != null) {
-                    xhtml.characters(" = ");
-                    xhtml.characters(value.toString());
-                }
-
-                writeSemicolon();
-                writeNewline();
-            } catch (SAXException e) {
-                throw new RuntimeException(e);
-            }
-        }
-
-        return null;
-    }
-
-    /**
-     * Visits a method.
-     */
-    public MethodVisitor visitMethod(
-            int access, String name, String desc, String signature,
-            String[] exceptions) {
-        if (!isSet(access, Opcodes.ACC_SYNTHETIC)) {
-            try {
-                xhtml.characters("    ");
-                writeAccess(access);
-                writeType(Type.getReturnType(desc));
-                writeSpace();
-                if ("<init>".equals(name)) {
-                    writeType(type);
-                } else {
-                    writeIdentifier(name);
-                }
-
-                xhtml.characters("(");
-                String separator = "";
-                for (Type arg : Type.getArgumentTypes(desc)) {
-                    xhtml.characters(separator);
-                    writeType(arg);
-                    separator = ", ";
-                }
-                xhtml.characters(")");
-
-                if (exceptions != null && exceptions.length > 0) {
-                    writeSpace();
-                    writeKeyword("throws");
-                    separator = " ";
-                    for (String exception : exceptions) {
-                        xhtml.characters(separator);
-                        writeType(Type.getObjectType(exception));
-                        separator = ", ";
-                    }
-                }
-
-                writeSemicolon();
-                writeNewline();
-            } catch (SAXException e) {
-                throw new RuntimeException(e);
-            }
-        }
-
-        return null;
-    }
-
-    private void writeIdentifier(String identifier) throws SAXException {
-        xhtml.startElement("span", "class", "java-identifier");
-        xhtml.characters(identifier);
-        xhtml.endElement("span");
-    }
-
-    private void writeKeyword(String keyword) throws SAXException {
-        xhtml.startElement("span", "class", "java-keyword");
-        xhtml.characters(keyword);
-        xhtml.endElement("span");
-    }
-
-    private void writeSemicolon() throws SAXException {
-        xhtml.characters(";");
-    }
-
-    private void writeSpace() throws SAXException {
-        xhtml.characters(" ");
-    }
-
-    private void writeNewline() throws SAXException {
-        xhtml.characters("\n");
-    }
-
-    private void writeAccess(int access) throws SAXException {
-        writeAccess(access, Opcodes.ACC_PRIVATE, "private");
-        writeAccess(access, Opcodes.ACC_PROTECTED, "protected");
-        writeAccess(access, Opcodes.ACC_PUBLIC, "public");
-        writeAccess(access, Opcodes.ACC_STATIC, "static");
-        writeAccess(access, Opcodes.ACC_FINAL, "final");
-        writeAccess(access, Opcodes.ACC_ABSTRACT, "abstract");
-        writeAccess(access, Opcodes.ACC_SYNCHRONIZED, "synchronized");
-        writeAccess(access, Opcodes.ACC_TRANSIENT, "transient");
-        writeAccess(access, Opcodes.ACC_VOLATILE, "volatile");
-        writeAccess(access, Opcodes.ACC_NATIVE, "native");
-    }
-
-    private void writeAccess(int access, int code, String keyword)
-            throws SAXException {
-        if (isSet(access, code)) {
-            writeKeyword(keyword);
-            xhtml.characters(" ");
-        }
-    }
-
-    private void writeType(Type type) throws SAXException {
-        String name = type.getClassName();
-        if (name.startsWith(packageName + ".")) {
-            xhtml.characters(name.substring(packageName.length() + 1));
-        } else if (name.startsWith("java.lang.")) {
-            xhtml.characters(name.substring("java.lang.".length()));
-        } else {
-            xhtml.characters(name);
-        }
-    }
-
-    private static boolean isSet(int value, int flag) {
-        return (value & flag) != 0;
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.asm;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.objectweb.asm.AnnotationVisitor;
+import org.objectweb.asm.Attribute;
+import org.objectweb.asm.ClassReader;
+import org.objectweb.asm.ClassVisitor;
+import org.objectweb.asm.FieldVisitor;
+import org.objectweb.asm.MethodVisitor;
+import org.objectweb.asm.Opcodes;
+import org.objectweb.asm.Type;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Class visitor that generates XHTML SAX events to describe the
+ * contents of the visited class.
+ */
+class XHTMLClassVisitor extends ClassVisitor {
+
+    private final XHTMLContentHandler xhtml;
+
+    private final Metadata metadata;
+
+    private Type type;
+
+    private String packageName;
+
+    public XHTMLClassVisitor(ContentHandler handler, Metadata metadata) {
+        super(Opcodes.ASM5);
+        this.xhtml = new XHTMLContentHandler(handler, metadata);
+        this.metadata = metadata;
+    }
+
+    public void parse(InputStream stream)
+            throws TikaException, SAXException, IOException {
+        try {
+            ClassReader reader = new ClassReader(stream);
+            reader.accept(this, ClassReader.SKIP_FRAMES | ClassReader.SKIP_CODE);
+        } catch (RuntimeException e) {
+            if (e.getCause() instanceof SAXException) {
+                throw (SAXException) e.getCause();
+            } else {
+                throw new TikaException("Failed to parse a Java class", e);
+            }
+        }
+    }
+
+    public void visit(
+            int version, int access, String name, String signature,
+            String superName, String[] interfaces) {
+        type = Type.getObjectType(name);
+
+        String className = type.getClassName();
+        int dot = className.lastIndexOf('.');
+        if (dot != -1) {
+            packageName = className.substring(0, dot);
+            className = className.substring(dot + 1);
+        }
+
+        metadata.set(TikaCoreProperties.TITLE, className);
+        metadata.set(Metadata.RESOURCE_NAME_KEY, className + ".class");
+
+        try {
+            xhtml.startDocument();
+            xhtml.startElement("pre");
+
+            if (packageName != null) {
+                writeKeyword("package");
+                xhtml.characters(" " + packageName + ";\n");
+            }
+
+            writeAccess(access);
+            if (isSet(access, Opcodes.ACC_INTERFACE)) {
+                writeKeyword("interface");
+                writeSpace();
+                writeType(type);
+                writeSpace();
+                writeInterfaces("extends", interfaces);
+            } else if (isSet(access, Opcodes.ACC_ENUM)) {
+                writeKeyword("enum");
+                writeSpace();
+                writeType(type);
+                writeSpace();
+            } else {
+                writeKeyword("class");
+                writeSpace();
+                writeType(type);
+                writeSpace();
+                if (superName != null) {
+                    Type superType = Type.getObjectType(superName);
+                    if (!superType.getClassName().equals("java.lang.Object")) {
+                        writeKeyword("extends");
+                        writeSpace();
+                        writeType(superType);
+                        writeSpace();
+                    }
+                }
+                writeInterfaces("implements", interfaces);
+            }
+            xhtml.characters("{\n");
+        } catch (SAXException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    private void writeInterfaces(String keyword, String[] interfaces)
+            throws SAXException {
+        if (interfaces != null && interfaces.length > 0) {
+            writeKeyword(keyword);
+            String separator = " ";
+            for (String iface : interfaces) {
+                xhtml.characters(separator);
+                writeType(Type.getObjectType(iface));
+                separator = ", ";
+            }
+            writeSpace();
+        }
+    }
+
+    public void visitEnd() {
+        try {
+            xhtml.characters("}\n");
+            xhtml.endElement("pre");
+            xhtml.endDocument();
+        } catch (SAXException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Ignored.
+     */
+    public void visitOuterClass(String owner, String name, String desc) {
+    }
+
+    /**
+     * Ignored.
+     */
+    public void visitSource(String source, String debug) {
+    }
+
+
+    /**
+     * Ignored.
+     */
+    public AnnotationVisitor visitAnnotation(String desc, boolean visible) {
+        return null;
+    }
+
+    /**
+     * Ignored.
+     */
+    public void visitAttribute(Attribute attr) {
+    }
+
+    /**
+     * Ignored.
+     */
+    public void visitInnerClass(
+            String name, String outerName, String innerName, int access) {
+    }
+
+    /**
+     * Visits a field.
+     */
+    public FieldVisitor visitField(
+            int access, String name, String desc, String signature,
+            Object value) {
+        if (!isSet(access, Opcodes.ACC_SYNTHETIC)) {
+            try {
+                xhtml.characters("    ");
+                writeAccess(access);
+                writeType(Type.getType(desc));
+                writeSpace();
+                writeIdentifier(name);
+
+                if (isSet(access, Opcodes.ACC_STATIC) && value != null) {
+                    xhtml.characters(" = ");
+                    xhtml.characters(value.toString());
+                }
+
+                writeSemicolon();
+                writeNewline();
+            } catch (SAXException e) {
+                throw new RuntimeException(e);
+            }
+        }
+
+        return null;
+    }
+
+    /**
+     * Visits a method.
+     */
+    public MethodVisitor visitMethod(
+            int access, String name, String desc, String signature,
+            String[] exceptions) {
+        if (!isSet(access, Opcodes.ACC_SYNTHETIC)) {
+            try {
+                xhtml.characters("    ");
+                writeAccess(access);
+                writeType(Type.getReturnType(desc));
+                writeSpace();
+                if ("<init>".equals(name)) {
+                    writeType(type);
+                } else {
+                    writeIdentifier(name);
+                }
+
+                xhtml.characters("(");
+                String separator = "";
+                for (Type arg : Type.getArgumentTypes(desc)) {
+                    xhtml.characters(separator);
+                    writeType(arg);
+                    separator = ", ";
+                }
+                xhtml.characters(")");
+
+                if (exceptions != null && exceptions.length > 0) {
+                    writeSpace();
+                    writeKeyword("throws");
+                    separator = " ";
+                    for (String exception : exceptions) {
+                        xhtml.characters(separator);
+                        writeType(Type.getObjectType(exception));
+                        separator = ", ";
+                    }
+                }
+
+                writeSemicolon();
+                writeNewline();
+            } catch (SAXException e) {
+                throw new RuntimeException(e);
+            }
+        }
+
+        return null;
+    }
+
+    private void writeIdentifier(String identifier) throws SAXException {
+        xhtml.startElement("span", "class", "java-identifier");
+        xhtml.characters(identifier);
+        xhtml.endElement("span");
+    }
+
+    private void writeKeyword(String keyword) throws SAXException {
+        xhtml.startElement("span", "class", "java-keyword");
+        xhtml.characters(keyword);
+        xhtml.endElement("span");
+    }
+
+    private void writeSemicolon() throws SAXException {
+        xhtml.characters(";");
+    }
+
+    private void writeSpace() throws SAXException {
+        xhtml.characters(" ");
+    }
+
+    private void writeNewline() throws SAXException {
+        xhtml.characters("\n");
+    }
+
+    private void writeAccess(int access) throws SAXException {
+        writeAccess(access, Opcodes.ACC_PRIVATE, "private");
+        writeAccess(access, Opcodes.ACC_PROTECTED, "protected");
+        writeAccess(access, Opcodes.ACC_PUBLIC, "public");
+        writeAccess(access, Opcodes.ACC_STATIC, "static");
+        writeAccess(access, Opcodes.ACC_FINAL, "final");
+        writeAccess(access, Opcodes.ACC_ABSTRACT, "abstract");
+        writeAccess(access, Opcodes.ACC_SYNCHRONIZED, "synchronized");
+        writeAccess(access, Opcodes.ACC_TRANSIENT, "transient");
+        writeAccess(access, Opcodes.ACC_VOLATILE, "volatile");
+        writeAccess(access, Opcodes.ACC_NATIVE, "native");
+    }
+
+    private void writeAccess(int access, int code, String keyword)
+            throws SAXException {
+        if (isSet(access, code)) {
+            writeKeyword(keyword);
+            xhtml.characters(" ");
+        }
+    }
+
+    private void writeType(Type type) throws SAXException {
+        String name = type.getClassName();
+        if (name.startsWith(packageName + ".")) {
+            xhtml.characters(name.substring(packageName.length() + 1));
+        } else if (name.startsWith("java.lang.")) {
+            xhtml.characters(name.substring("java.lang.".length()));
+        } else {
+            xhtml.characters(name);
+        }
+    }
+
+    private static boolean isSet(int value, int flag) {
+        return (value & flag) != 0;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java b/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
index 63e4bf6..d17bde7 100644
--- a/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
+++ b/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
@@ -1,142 +1,142 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.code;
-
-import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.CPP;
-import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.GROOVY;
-import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.JAVA;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.StringReader;
-import java.nio.charset.Charset;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.config.ServiceLoader;
-import org.apache.tika.detect.AutoDetectReader;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.ccil.cowan.tagsoup.HTMLSchema;
-import org.ccil.cowan.tagsoup.Schema;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
-
-import com.uwyn.jhighlight.renderer.Renderer;
-import com.uwyn.jhighlight.renderer.XhtmlRendererFactory;
-/**
- * Generic Source code parser for Java, Groovy, C++.
- * Aware: This parser uses JHightlight library (https://github.com/codelibs/jhighlight) under CDDL/LGPL dual license
- *
- * @author Hong-Thai.Nguyen
- * @since 1.6
- */
-public class SourceCodeParser implements Parser {
-
-  private static final long serialVersionUID = -4543476498190054160L;
-
-  private static final Pattern authorPattern = Pattern.compile("(?im)@author (.*) *$");
-
-  private static final Map<MediaType, String> TYPES_TO_RENDERER = new HashMap<MediaType, String>() {
-    private static final long serialVersionUID = -741976157563751152L;
-    {
-      put(MediaType.text("x-c++src"), CPP);
-      put(MediaType.text("x-java-source"), JAVA);
-      put(MediaType.text("x-groovy"), GROOVY);
-    }
-  };
-
-  private static final ServiceLoader LOADER = new ServiceLoader(SourceCodeParser.class.getClassLoader());
-  
-  //Parse the HTML document
-  private static final Schema HTML_SCHEMA = new HTMLSchema();
-  
-  @Override
-  public Set<MediaType> getSupportedTypes(ParseContext context) {
-    return TYPES_TO_RENDERER.keySet();
-  }
-
-  @Override
-  public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
-      throws IOException, SAXException, TikaException {
-
-    try (AutoDetectReader reader = new AutoDetectReader(
-            new CloseShieldInputStream(stream), metadata,
-            context.get(ServiceLoader.class, LOADER))) {
-      Charset charset = reader.getCharset();
-      String mediaType = metadata.get(Metadata.CONTENT_TYPE);
-      String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
-      if (mediaType != null && name != null) {
-        MediaType type = MediaType.parse(mediaType);
-        metadata.set(Metadata.CONTENT_TYPE, type.toString());
-        metadata.set(Metadata.CONTENT_ENCODING, charset.name());
-
-        StringBuilder out = new StringBuilder();
-        String line;
-        int nbLines =  0;
-        while ((line = reader.readLine()) != null) {
-            out.append(line + System.getProperty("line.separator"));
-            String author = parserAuthor(line);
-            if (author != null) {
-              metadata.add(TikaCoreProperties.CREATOR, author);
-            }
-            nbLines ++;
-        }
-        metadata.set("LoC", String.valueOf(nbLines));
-        Renderer renderer = getRenderer(type.toString());
-
-        String codeAsHtml = renderer.highlight(name, out.toString(), charset.name(), false);
-
-        Schema schema = context.get(Schema.class, HTML_SCHEMA);
-
-        org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();
-        parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
-        parser.setContentHandler(handler);
-        parser.parse(new InputSource(new StringReader(codeAsHtml)));
-      }
-    }
-
-  }
-
-  private Renderer getRenderer(String mimeType) {
-    MediaType mt = MediaType.parse(mimeType);
-    String type = TYPES_TO_RENDERER.get(mt);
-    if (type == null) {
-      throw new RuntimeException("unparseable content type " + mimeType);
-    }
-    return XhtmlRendererFactory.getRenderer(type);
-  }
-
-
-  private String parserAuthor(String line) {
-    Matcher m = authorPattern.matcher(line);
-    if (m.find()) {
-      return m.group(1).trim();
-    }
-
-    return null;
-  }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.code;
+
+import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.CPP;
+import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.GROOVY;
+import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.JAVA;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringReader;
+import java.nio.charset.Charset;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.ccil.cowan.tagsoup.HTMLSchema;
+import org.ccil.cowan.tagsoup.Schema;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import com.uwyn.jhighlight.renderer.Renderer;
+import com.uwyn.jhighlight.renderer.XhtmlRendererFactory;
+/**
+ * Generic Source code parser for Java, Groovy, C++.
+ * Aware: This parser uses JHightlight library (https://github.com/codelibs/jhighlight) under CDDL/LGPL dual license
+ *
+ * @author Hong-Thai.Nguyen
+ * @since 1.6
+ */
+public class SourceCodeParser implements Parser {
+
+  private static final long serialVersionUID = -4543476498190054160L;
+
+  private static final Pattern authorPattern = Pattern.compile("(?im)@author (.*) *$");
+
+  private static final Map<MediaType, String> TYPES_TO_RENDERER = new HashMap<MediaType, String>() {
+    private static final long serialVersionUID = -741976157563751152L;
+    {
+      put(MediaType.text("x-c++src"), CPP);
+      put(MediaType.text("x-java-source"), JAVA);
+      put(MediaType.text("x-groovy"), GROOVY);
+    }
+  };
+
+  private static final ServiceLoader LOADER = new ServiceLoader(SourceCodeParser.class.getClassLoader());
+  
+  //Parse the HTML document
+  private static final Schema HTML_SCHEMA = new HTMLSchema();
+  
+  @Override
+  public Set<MediaType> getSupportedTypes(ParseContext context) {
+    return TYPES_TO_RENDERER.keySet();
+  }
+
+  @Override
+  public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+      throws IOException, SAXException, TikaException {
+
+    try (AutoDetectReader reader = new AutoDetectReader(
+            new CloseShieldInputStream(stream), metadata,
+            context.get(ServiceLoader.class, LOADER))) {
+      Charset charset = reader.getCharset();
+      String mediaType = metadata.get(Metadata.CONTENT_TYPE);
+      String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+      if (mediaType != null && name != null) {
+        MediaType type = MediaType.parse(mediaType);
+        metadata.set(Metadata.CONTENT_TYPE, type.toString());
+        metadata.set(Metadata.CONTENT_ENCODING, charset.name());
+
+        StringBuilder out = new StringBuilder();
+        String line;
+        int nbLines =  0;
+        while ((line = reader.readLine()) != null) {
+            out.append(line + System.getProperty("line.separator"));
+            String author = parserAuthor(line);
+            if (author != null) {
+              metadata.add(TikaCoreProperties.CREATOR, author);
+            }
+            nbLines ++;
+        }
+        metadata.set("LoC", String.valueOf(nbLines));
+        Renderer renderer = getRenderer(type.toString());
+
+        String codeAsHtml = renderer.highlight(name, out.toString(), charset.name(), false);
+
+        Schema schema = context.get(Schema.class, HTML_SCHEMA);
+
+        org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();
+        parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
+        parser.setContentHandler(handler);
+        parser.parse(new InputSource(new StringReader(codeAsHtml)));
+      }
+    }
+
+  }
+
+  private Renderer getRenderer(String mimeType) {
+    MediaType mt = MediaType.parse(mimeType);
+    String type = TYPES_TO_RENDERER.get(mt);
+    if (type == null) {
+      throw new RuntimeException("unparseable content type " + mimeType);
+    }
+    return XhtmlRendererFactory.getRenderer(type);
+  }
+
+
+  private String parserAuthor(String line) {
+    Matcher m = authorPattern.matcher(line);
+    if (m.find()) {
+      return m.group(1).trim();
+    }
+
+    return null;
+  }
+}

[23/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java
index 101b26b..51dc5a5 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java
@@ -1,327 +1,327 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.lzx;
-
-import java.util.concurrent.CancellationException;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.core.ChmCommons.IntelState;
-import org.apache.tika.parser.chm.core.ChmCommons.LzxState;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.apache.tika.parser.chm.exception.ChmParsingException;
-
-public class ChmLzxState implements Cloneable {
-    /* Class' members */
-    private int window; /* the actual decoding window */
-    private long window_size; /* window size (32Kb through 2Mb) */
-    private int window_position; /* current offset within the window */
-    private int main_tree_elements; /* number of main tree elements */
-    private LzxState hadStarted; /* have we started decoding at all yet? */
-    private int block_type; /* type of this block */
-    private int block_length; /* uncompressed length of this block */
-    private int block_remaining; /* uncompressed bytes still left to decode */
-    private int frames_read; /* the number of CFDATA blocks processed */
-    private int intel_file_size; /* magic header value used for transform */
-    private long intel_current_possition; /* current offset in transform space */
-    private IntelState intel_state; /* have we seen any translatable data yet? */
-    private long R0; /* for the LRU offset system */
-    private long R1; /* for the LRU offset system */
-    private long R2; /* for the LRU offset system */
-
-    // Trees - PRETREE, MAINTREE, LENGTH, ALIGNED
-    protected short[] mainTreeLengtsTable;
-    protected short[] mainTreeTable;
-
-    protected short[] lengthTreeTable;
-    protected short[] lengthTreeLengtsTable;
-
-    protected short[] alignedLenTable;
-    protected short[] alignedTreeTable;
-
-    @Override
-    public ChmLzxState clone() {
-        try {
-          ChmLzxState clone = (ChmLzxState)super.clone();
-          clone.mainTreeLengtsTable = arrayClone(mainTreeLengtsTable);
-          clone.mainTreeTable = arrayClone(mainTreeTable);
-          clone.lengthTreeTable = arrayClone(lengthTreeTable);
-          clone.lengthTreeLengtsTable = arrayClone(lengthTreeLengtsTable);
-          clone.alignedLenTable = arrayClone(alignedLenTable);
-          clone.alignedTreeTable = arrayClone(alignedTreeTable);
-          return clone;
-        } catch (CloneNotSupportedException ex) {
-           return null;
-        }
-    }
-    
-    protected short[] getMainTreeTable() {
-        return mainTreeTable;
-    }
-
-    protected short[] getAlignedTreeTable() {
-        return alignedTreeTable;
-    }
-
-    protected void setAlignedTreeTable(short[] alignedTreeTable) {
-        this.alignedTreeTable = alignedTreeTable;
-    }
-
-    protected short[] getLengthTreeTable() throws TikaException {
-        if (lengthTreeTable != null)
-            return this.lengthTreeTable;
-        else
-            throw new ChmParsingException("lengthTreeTable is null");
-    }
-
-    protected void setLengthTreeTable(short[] lengthTreeTable) {
-        this.lengthTreeTable = lengthTreeTable;
-    }
-
-    protected void setMainTreeTable(short[] mainTreeTable) {
-        this.mainTreeTable = mainTreeTable;
-    }
-
-    protected short[] getAlignedLenTable() {
-        return this.alignedLenTable;
-    }
-
-    protected void setAlignedLenTable(short[] alignedLenTable) {
-        this.alignedLenTable = alignedLenTable;
-    }
-
-    /**
-     * It suits for informative outlook
-     */
-    public String toString() {
-        StringBuilder sb = new StringBuilder();
-        sb.append("actual decoding window:=" + getWindow()
-                + System.getProperty("line.separator"));
-        sb.append("window size (32Kb through 2Mb):=" + getWindowSize()
-                + System.getProperty("line.separator"));
-        sb.append("current offset within the window:=" + getWindowPosition()
-                + System.getProperty("line.separator"));
-        sb.append("number of main tree elements:=" + getMainTreeElements()
-                + System.getProperty("line.separator"));
-        sb.append("have we started decoding at all yet?:=" + getHadStarted()
-                + System.getProperty("line.separator"));
-        sb.append("type of this block:=" + getBlockType()
-                + System.getProperty("line.separator"));
-        sb.append("uncompressed length of this block:=" + getBlockLength()
-                + System.getProperty("line.separator"));
-        sb.append("uncompressed bytes still left to decode:="
-                + getBlockRemaining() + System.getProperty("line.separator"));
-        sb.append("the number of CFDATA blocks processed:=" + getFramesRead()
-                + System.getProperty("line.separator"));
-        sb.append("magic header value used for transform:="
-                + getIntelFileSize() + System.getProperty("line.separator"));
-        sb.append("current offset in transform space:="
-                + getIntelCurrentPossition()
-                + System.getProperty("line.separator"));
-        sb.append("have we seen any translatable data yet?:=" + getIntelState()
-                + System.getProperty("line.separator"));
-        sb.append("R0 for the LRU offset system:=" + getR0()
-                + System.getProperty("line.separator"));
-        sb.append("R1 for the LRU offset system:=" + getR1()
-                + System.getProperty("line.separator"));
-        sb.append("R2 for the LRU offset system:=" + getR2()
-                + System.getProperty("line.separator"));
-        sb.append("main tree length:=" + getMainTreeLengtsTable().length
-                + System.getProperty("line.separator"));
-        sb.append("secondary tree length:=" + getLengthTreeLengtsTable().length
-                + System.getProperty("line.separator"));
-        return sb.toString();
-    }
-
-    public ChmLzxState(int window) throws TikaException {
-        if (window >= 0) {
-            int position_slots;
-            int win = ChmCommons.getWindowSize(window);
-            setWindowSize(1 << win);
-            /* LZX supports window sizes of 2^15 (32Kb) through 2^21 (2Mb) */
-            if (win < 15 || win > 21)
-                throw new ChmParsingException("window less than 15 or window greater than 21");
-
-            /* Calculates required position slots */
-            if (win == 20)
-                position_slots = 42;
-            else if (win == 21)
-                position_slots = 50;
-            else
-                position_slots = win << 1;
-            //TODO: position_slots is not used ?
-            setR0(1);
-            setR1(1);
-            setR2(1);
-            setMainTreeElements(512);
-            setHadStarted(LzxState.NOT_STARTED_DECODING);
-            setFramesRead(0);
-            setBlockRemaining(0);
-            setBlockType(ChmConstants.LZX_BLOCKTYPE_INVALID);
-            setIntelCurrentPossition(0);
-            setIntelState(IntelState.NOT_STARTED);
-            setWindowPosition(0);
-            setMainTreeLengtsTable(new short[getMainTreeElements()]);
-            setLengthTreeLengtsTable(new short[ChmConstants.LZX_NUM_SECONDARY_LENGTHS]);
-        } else
-            throw new CancellationException(
-                    "window size should be more than zero");
-    }
-
-    protected void setWindow(int window) {
-        this.window = window;
-    }
-
-    protected int getWindow() {
-        return window;
-    }
-
-    protected void setWindowSize(long window_size) {
-        this.window_size = window_size;
-    }
-
-    protected long getWindowSize() {
-        return window_size;
-    }
-
-    protected void setWindowPosition(int window_position) {
-        this.window_position = window_position;
-    }
-
-    protected int getWindowPosition() {
-        return window_position;
-    }
-
-    protected void setMainTreeElements(int main_tree_elements) {
-        this.main_tree_elements = main_tree_elements;
-    }
-
-    protected int getMainTreeElements() {
-        return main_tree_elements;
-    }
-
-    protected void setHadStarted(LzxState hadStarted) {
-        this.hadStarted = hadStarted;
-    }
-
-    protected LzxState getHadStarted() {
-        return hadStarted;
-    }
-
-    protected void setBlockType(int block_type) {
-        this.block_type = block_type;
-    }
-
-    public int getBlockType() {
-        return block_type;
-    }
-
-    protected void setBlockLength(int block_length) {
-        this.block_length = block_length;
-    }
-
-    protected int getBlockLength() {
-        return block_length;
-    }
-
-    protected void setBlockRemaining(int block_remaining) {
-        this.block_remaining = block_remaining;
-    }
-
-    protected int getBlockRemaining() {
-        return block_remaining;
-    }
-
-    protected void setFramesRead(int frames_read) {
-        this.frames_read = frames_read;
-    }
-
-    protected void increaseFramesRead() {
-        this.frames_read = getFramesRead() + 1;
-    }
-
-    protected int getFramesRead() {
-        return frames_read;
-    }
-
-    protected void setIntelFileSize(int intel_file_size) {
-        this.intel_file_size = intel_file_size;
-    }
-
-    protected int getIntelFileSize() {
-        return intel_file_size;
-    }
-
-    protected void setIntelCurrentPossition(long intel_current_possition) {
-        this.intel_current_possition = intel_current_possition;
-    }
-
-    protected long getIntelCurrentPossition() {
-        return intel_current_possition;
-    }
-
-    protected void setIntelState(IntelState intel_state) {
-        this.intel_state = intel_state;
-    }
-
-    protected IntelState getIntelState() {
-        return intel_state;
-    }
-
-    protected void setR0(long r0) {
-        R0 = r0;
-    }
-
-    protected long getR0() {
-        return R0;
-    }
-
-    protected void setR1(long r1) {
-        R1 = r1;
-    }
-
-    protected long getR1() {
-        return R1;
-    }
-
-    protected void setR2(long r2) {
-        R2 = r2;
-    }
-
-    protected long getR2() {
-        return R2;
-    }
-
-    public void setMainTreeLengtsTable(short[] mainTreeLengtsTable) {
-        this.mainTreeLengtsTable = mainTreeLengtsTable;
-    }
-
-    public short[] getMainTreeLengtsTable() {
-        return mainTreeLengtsTable;
-    }
-
-    public void setLengthTreeLengtsTable(short[] lengthTreeLengtsTable) {
-        this.lengthTreeLengtsTable = lengthTreeLengtsTable;
-    }
-
-    public short[] getLengthTreeLengtsTable() {
-        return lengthTreeLengtsTable;
-    }
-    
-    private static short[] arrayClone(short[] a) {
-        return a==null ? null : (short[]) a.clone();
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.lzx;
+
+import java.util.concurrent.CancellationException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmCommons.IntelState;
+import org.apache.tika.parser.chm.core.ChmCommons.LzxState;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+public class ChmLzxState implements Cloneable {
+    /* Class' members */
+    private int window; /* the actual decoding window */
+    private long window_size; /* window size (32Kb through 2Mb) */
+    private int window_position; /* current offset within the window */
+    private int main_tree_elements; /* number of main tree elements */
+    private LzxState hadStarted; /* have we started decoding at all yet? */
+    private int block_type; /* type of this block */
+    private int block_length; /* uncompressed length of this block */
+    private int block_remaining; /* uncompressed bytes still left to decode */
+    private int frames_read; /* the number of CFDATA blocks processed */
+    private int intel_file_size; /* magic header value used for transform */
+    private long intel_current_possition; /* current offset in transform space */
+    private IntelState intel_state; /* have we seen any translatable data yet? */
+    private long R0; /* for the LRU offset system */
+    private long R1; /* for the LRU offset system */
+    private long R2; /* for the LRU offset system */
+
+    // Trees - PRETREE, MAINTREE, LENGTH, ALIGNED
+    protected short[] mainTreeLengtsTable;
+    protected short[] mainTreeTable;
+
+    protected short[] lengthTreeTable;
+    protected short[] lengthTreeLengtsTable;
+
+    protected short[] alignedLenTable;
+    protected short[] alignedTreeTable;
+
+    @Override
+    public ChmLzxState clone() {
+        try {
+          ChmLzxState clone = (ChmLzxState)super.clone();
+          clone.mainTreeLengtsTable = arrayClone(mainTreeLengtsTable);
+          clone.mainTreeTable = arrayClone(mainTreeTable);
+          clone.lengthTreeTable = arrayClone(lengthTreeTable);
+          clone.lengthTreeLengtsTable = arrayClone(lengthTreeLengtsTable);
+          clone.alignedLenTable = arrayClone(alignedLenTable);
+          clone.alignedTreeTable = arrayClone(alignedTreeTable);
+          return clone;
+        } catch (CloneNotSupportedException ex) {
+           return null;
+        }
+    }
+    
+    protected short[] getMainTreeTable() {
+        return mainTreeTable;
+    }
+
+    protected short[] getAlignedTreeTable() {
+        return alignedTreeTable;
+    }
+
+    protected void setAlignedTreeTable(short[] alignedTreeTable) {
+        this.alignedTreeTable = alignedTreeTable;
+    }
+
+    protected short[] getLengthTreeTable() throws TikaException {
+        if (lengthTreeTable != null)
+            return this.lengthTreeTable;
+        else
+            throw new ChmParsingException("lengthTreeTable is null");
+    }
+
+    protected void setLengthTreeTable(short[] lengthTreeTable) {
+        this.lengthTreeTable = lengthTreeTable;
+    }
+
+    protected void setMainTreeTable(short[] mainTreeTable) {
+        this.mainTreeTable = mainTreeTable;
+    }
+
+    protected short[] getAlignedLenTable() {
+        return this.alignedLenTable;
+    }
+
+    protected void setAlignedLenTable(short[] alignedLenTable) {
+        this.alignedLenTable = alignedLenTable;
+    }
+
+    /**
+     * It suits for informative outlook
+     */
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("actual decoding window:=" + getWindow()
+                + System.getProperty("line.separator"));
+        sb.append("window size (32Kb through 2Mb):=" + getWindowSize()
+                + System.getProperty("line.separator"));
+        sb.append("current offset within the window:=" + getWindowPosition()
+                + System.getProperty("line.separator"));
+        sb.append("number of main tree elements:=" + getMainTreeElements()
+                + System.getProperty("line.separator"));
+        sb.append("have we started decoding at all yet?:=" + getHadStarted()
+                + System.getProperty("line.separator"));
+        sb.append("type of this block:=" + getBlockType()
+                + System.getProperty("line.separator"));
+        sb.append("uncompressed length of this block:=" + getBlockLength()
+                + System.getProperty("line.separator"));
+        sb.append("uncompressed bytes still left to decode:="
+                + getBlockRemaining() + System.getProperty("line.separator"));
+        sb.append("the number of CFDATA blocks processed:=" + getFramesRead()
+                + System.getProperty("line.separator"));
+        sb.append("magic header value used for transform:="
+                + getIntelFileSize() + System.getProperty("line.separator"));
+        sb.append("current offset in transform space:="
+                + getIntelCurrentPossition()
+                + System.getProperty("line.separator"));
+        sb.append("have we seen any translatable data yet?:=" + getIntelState()
+                + System.getProperty("line.separator"));
+        sb.append("R0 for the LRU offset system:=" + getR0()
+                + System.getProperty("line.separator"));
+        sb.append("R1 for the LRU offset system:=" + getR1()
+                + System.getProperty("line.separator"));
+        sb.append("R2 for the LRU offset system:=" + getR2()
+                + System.getProperty("line.separator"));
+        sb.append("main tree length:=" + getMainTreeLengtsTable().length
+                + System.getProperty("line.separator"));
+        sb.append("secondary tree length:=" + getLengthTreeLengtsTable().length
+                + System.getProperty("line.separator"));
+        return sb.toString();
+    }
+
+    public ChmLzxState(int window) throws TikaException {
+        if (window >= 0) {
+            int position_slots;
+            int win = ChmCommons.getWindowSize(window);
+            setWindowSize(1 << win);
+            /* LZX supports window sizes of 2^15 (32Kb) through 2^21 (2Mb) */
+            if (win < 15 || win > 21)
+                throw new ChmParsingException("window less than 15 or window greater than 21");
+
+            /* Calculates required position slots */
+            if (win == 20)
+                position_slots = 42;
+            else if (win == 21)
+                position_slots = 50;
+            else
+                position_slots = win << 1;
+            //TODO: position_slots is not used ?
+            setR0(1);
+            setR1(1);
+            setR2(1);
+            setMainTreeElements(512);
+            setHadStarted(LzxState.NOT_STARTED_DECODING);
+            setFramesRead(0);
+            setBlockRemaining(0);
+            setBlockType(ChmConstants.LZX_BLOCKTYPE_INVALID);
+            setIntelCurrentPossition(0);
+            setIntelState(IntelState.NOT_STARTED);
+            setWindowPosition(0);
+            setMainTreeLengtsTable(new short[getMainTreeElements()]);
+            setLengthTreeLengtsTable(new short[ChmConstants.LZX_NUM_SECONDARY_LENGTHS]);
+        } else
+            throw new CancellationException(
+                    "window size should be more than zero");
+    }
+
+    protected void setWindow(int window) {
+        this.window = window;
+    }
+
+    protected int getWindow() {
+        return window;
+    }
+
+    protected void setWindowSize(long window_size) {
+        this.window_size = window_size;
+    }
+
+    protected long getWindowSize() {
+        return window_size;
+    }
+
+    protected void setWindowPosition(int window_position) {
+        this.window_position = window_position;
+    }
+
+    protected int getWindowPosition() {
+        return window_position;
+    }
+
+    protected void setMainTreeElements(int main_tree_elements) {
+        this.main_tree_elements = main_tree_elements;
+    }
+
+    protected int getMainTreeElements() {
+        return main_tree_elements;
+    }
+
+    protected void setHadStarted(LzxState hadStarted) {
+        this.hadStarted = hadStarted;
+    }
+
+    protected LzxState getHadStarted() {
+        return hadStarted;
+    }
+
+    protected void setBlockType(int block_type) {
+        this.block_type = block_type;
+    }
+
+    public int getBlockType() {
+        return block_type;
+    }
+
+    protected void setBlockLength(int block_length) {
+        this.block_length = block_length;
+    }
+
+    protected int getBlockLength() {
+        return block_length;
+    }
+
+    protected void setBlockRemaining(int block_remaining) {
+        this.block_remaining = block_remaining;
+    }
+
+    protected int getBlockRemaining() {
+        return block_remaining;
+    }
+
+    protected void setFramesRead(int frames_read) {
+        this.frames_read = frames_read;
+    }
+
+    protected void increaseFramesRead() {
+        this.frames_read = getFramesRead() + 1;
+    }
+
+    protected int getFramesRead() {
+        return frames_read;
+    }
+
+    protected void setIntelFileSize(int intel_file_size) {
+        this.intel_file_size = intel_file_size;
+    }
+
+    protected int getIntelFileSize() {
+        return intel_file_size;
+    }
+
+    protected void setIntelCurrentPossition(long intel_current_possition) {
+        this.intel_current_possition = intel_current_possition;
+    }
+
+    protected long getIntelCurrentPossition() {
+        return intel_current_possition;
+    }
+
+    protected void setIntelState(IntelState intel_state) {
+        this.intel_state = intel_state;
+    }
+
+    protected IntelState getIntelState() {
+        return intel_state;
+    }
+
+    protected void setR0(long r0) {
+        R0 = r0;
+    }
+
+    protected long getR0() {
+        return R0;
+    }
+
+    protected void setR1(long r1) {
+        R1 = r1;
+    }
+
+    protected long getR1() {
+        return R1;
+    }
+
+    protected void setR2(long r2) {
+        R2 = r2;
+    }
+
+    protected long getR2() {
+        return R2;
+    }
+
+    public void setMainTreeLengtsTable(short[] mainTreeLengtsTable) {
+        this.mainTreeLengtsTable = mainTreeLengtsTable;
+    }
+
+    public short[] getMainTreeLengtsTable() {
+        return mainTreeLengtsTable;
+    }
+
+    public void setLengthTreeLengtsTable(short[] lengthTreeLengtsTable) {
+        this.lengthTreeLengtsTable = lengthTreeLengtsTable;
+    }
+
+    public short[] getLengthTreeLengtsTable() {
+        return lengthTreeLengtsTable;
+    }
+    
+    private static short[] arrayClone(short[] a) {
+        return a==null ? null : (short[]) a.clone();
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java
index c8944be..77f9b3a 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java
@@ -1,222 +1,222 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.lzx;
-
-import java.math.BigInteger;
-import java.util.Arrays;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.core.ChmCommons;
-
-public class ChmSection {
-    final private byte[] data;
-    final private byte[] prevcontent;
-    private int swath;// kiks
-    private int total;// remains
-    private int buffer;// val
-
-    public ChmSection(byte[] data) throws TikaException {
-        this(data, null);
-    }
-
-    public ChmSection(byte[] data, byte[] prevconent) throws TikaException {
-        ChmCommons.assertByteArrayNotNull(data);
-        this.data = data;
-        this.prevcontent = prevconent;
-        //setData(data);
-    }
-    
-    /* Utilities */
-    public byte[] reverseByteOrder(byte[] toBeReversed) throws TikaException {
-        ChmCommons.assertByteArrayNotNull(toBeReversed);
-        ChmCommons.reverse(toBeReversed);
-        return toBeReversed;
-    }
-
-    public int checkBit(int i) {
-        return ((getBuffer() & (1 << (getTotal() - i))) == 0) ? 0 : 1;
-    }
-
-    public int getSyncBits(int bit) {
-        return getDesyncBits(bit, bit);
-    }
-
-    public int peekBits(int bit) {
-        return getDesyncBits(bit, 0);
-    }
-    
-    private int getDesyncBits(int bit, int removeBit) {
-        while (getTotal() < 16) {
-            setBuffer((getBuffer() << 16) + unmarshalUByte()
-                    + (unmarshalUByte() << 8));
-            setTotal(getTotal() + 16);
-        }
-        int tmp = (getBuffer() >>> (getTotal() - bit));
-        setTotal(getTotal() - removeBit);
-        setBuffer(getBuffer() - ((getBuffer() >>> getTotal()) << getTotal()));
-        return tmp;
-    }
-
-    public int unmarshalUByte() {
-        return getByte() & 255;
-    }
-
-    public byte getByte() {
-        if (getSwath() < getData().length) {
-            setSwath(getSwath() + 1);
-            return getData()[getSwath() - 1];
-        } else
-            return 0;
-    }
-
-    public int getLeft() {
-        return (getData().length - getSwath());
-    }
-
-    public byte[] getData() {
-        return data;
-    }
-
-    public byte[] getPrevContent() {
-        return prevcontent;
-    }
-    
-    public BigInteger getBigInteger(int i) {
-        if (getData() == null)
-            return BigInteger.ZERO;
-        if (getData().length - getSwath() < i)
-            i = getData().length - getSwath();
-        byte[] tmp = new byte[i];
-        for (int j = i - 1; j >= 0; j--) {
-            tmp[i - j - 1] = getData()[getSwath() + j];
-        }
-        setSwath(getSwath() + i);
-        return new BigInteger(tmp);
-    }
-
-    public byte[] stringToAsciiBytes(String s) {
-        char[] c = s.toCharArray();
-        byte[] byteval = new byte[c.length];
-        for (int i = 0; i < c.length; i++)
-            byteval[i] = (byte) c[i];
-        return byteval;
-    }
-
-    public BigInteger unmarshalUlong() {
-        return getBigInteger(8);
-    }
-
-    public long unmarshalUInt() {
-        return getBigInteger(4).longValue();
-    }
-
-    public int unmarshalInt() {
-        return getBigInteger(4).intValue();
-    }
-
-    public byte[] unmarshalBytes(int i) {
-        if (i == 0)
-            return new byte[1];
-        byte[] t = new byte[i];
-        for (int j = 0; j < i; j++)
-            t[j] = getData()[j + getSwath()];
-        setSwath(getSwath() + i);
-        return t;
-    }
-
-    public BigInteger getEncint() {
-        byte ob;
-        BigInteger bi = BigInteger.ZERO;
-        byte[] nb = new byte[1];
-        while ((ob = this.getByte()) < 0) {
-            nb[0] = (byte) ((ob & 0x7f));
-            bi = bi.shiftLeft(7).add(new BigInteger(nb));
-        }
-        nb[0] = (byte) ((ob & 0x7f));
-        bi = bi.shiftLeft(7).add(new BigInteger(nb));
-        return bi;
-    }
-
-    public char unmarshalUtfChar() {
-        byte ob;
-        int i = 1;
-        byte[] ba;
-        ob = this.getByte();
-        if (ob < 0) {
-            i = 2;
-            while ((ob << (24 + i)) < 0)
-                i++;
-        }
-        ba = new byte[i];
-        ba[0] = ob;
-        int j = 1;
-        while (j < i) {
-            ba[j] = this.getByte();
-            j++;
-        }
-        i = ba.length;
-        if (i == 1)
-            return (char) ba[0];
-        else {
-            int n;
-            n = ba[0] & 15; // 00001111b, gets last 4 bits
-            j = 1;
-            while (j < i)
-                n = (n << 6) + (ba[j++] & 63);// 00111111b,gets last 6 bits
-            return (char) n;
-        }
-    }
-
-//    private void setData(byte[] data) {
-//        this.data = data;
-//    }
-
-    public int getSwath() {
-        return swath;
-    }
-
-    public void setSwath(int swath) {
-        this.swath = swath;
-    }
-
-    public int getTotal() {
-        return total;
-    }
-
-    public void setTotal(int total) {
-        this.total = total;
-    }
-
-    private int getBuffer() {
-        return buffer;
-    }
-
-    private void setBuffer(int buffer) {
-        this.buffer = buffer;
-    }
-
-    /**
-     * @param args
-     * @throws TikaException 
-     */
-    public static void main(String[] args) throws TikaException {
-        byte[] array = { 4, 78, -67, 90, 1, -33 };
-        ChmSection chmSection = new ChmSection(array);
-        System.out.println("before " + Arrays.toString(array));
-        System.out.println("after " + Arrays.toString(chmSection.reverseByteOrder(array)));
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.lzx;
+
+import java.math.BigInteger;
+import java.util.Arrays;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.core.ChmCommons;
+
+public class ChmSection {
+    final private byte[] data;
+    final private byte[] prevcontent;
+    private int swath;// kiks
+    private int total;// remains
+    private int buffer;// val
+
+    public ChmSection(byte[] data) throws TikaException {
+        this(data, null);
+    }
+
+    public ChmSection(byte[] data, byte[] prevconent) throws TikaException {
+        ChmCommons.assertByteArrayNotNull(data);
+        this.data = data;
+        this.prevcontent = prevconent;
+        //setData(data);
+    }
+    
+    /* Utilities */
+    public byte[] reverseByteOrder(byte[] toBeReversed) throws TikaException {
+        ChmCommons.assertByteArrayNotNull(toBeReversed);
+        ChmCommons.reverse(toBeReversed);
+        return toBeReversed;
+    }
+
+    public int checkBit(int i) {
+        return ((getBuffer() & (1 << (getTotal() - i))) == 0) ? 0 : 1;
+    }
+
+    public int getSyncBits(int bit) {
+        return getDesyncBits(bit, bit);
+    }
+
+    public int peekBits(int bit) {
+        return getDesyncBits(bit, 0);
+    }
+    
+    private int getDesyncBits(int bit, int removeBit) {
+        while (getTotal() < 16) {
+            setBuffer((getBuffer() << 16) + unmarshalUByte()
+                    + (unmarshalUByte() << 8));
+            setTotal(getTotal() + 16);
+        }
+        int tmp = (getBuffer() >>> (getTotal() - bit));
+        setTotal(getTotal() - removeBit);
+        setBuffer(getBuffer() - ((getBuffer() >>> getTotal()) << getTotal()));
+        return tmp;
+    }
+
+    public int unmarshalUByte() {
+        return getByte() & 255;
+    }
+
+    public byte getByte() {
+        if (getSwath() < getData().length) {
+            setSwath(getSwath() + 1);
+            return getData()[getSwath() - 1];
+        } else
+            return 0;
+    }
+
+    public int getLeft() {
+        return (getData().length - getSwath());
+    }
+
+    public byte[] getData() {
+        return data;
+    }
+
+    public byte[] getPrevContent() {
+        return prevcontent;
+    }
+    
+    public BigInteger getBigInteger(int i) {
+        if (getData() == null)
+            return BigInteger.ZERO;
+        if (getData().length - getSwath() < i)
+            i = getData().length - getSwath();
+        byte[] tmp = new byte[i];
+        for (int j = i - 1; j >= 0; j--) {
+            tmp[i - j - 1] = getData()[getSwath() + j];
+        }
+        setSwath(getSwath() + i);
+        return new BigInteger(tmp);
+    }
+
+    public byte[] stringToAsciiBytes(String s) {
+        char[] c = s.toCharArray();
+        byte[] byteval = new byte[c.length];
+        for (int i = 0; i < c.length; i++)
+            byteval[i] = (byte) c[i];
+        return byteval;
+    }
+
+    public BigInteger unmarshalUlong() {
+        return getBigInteger(8);
+    }
+
+    public long unmarshalUInt() {
+        return getBigInteger(4).longValue();
+    }
+
+    public int unmarshalInt() {
+        return getBigInteger(4).intValue();
+    }
+
+    public byte[] unmarshalBytes(int i) {
+        if (i == 0)
+            return new byte[1];
+        byte[] t = new byte[i];
+        for (int j = 0; j < i; j++)
+            t[j] = getData()[j + getSwath()];
+        setSwath(getSwath() + i);
+        return t;
+    }
+
+    public BigInteger getEncint() {
+        byte ob;
+        BigInteger bi = BigInteger.ZERO;
+        byte[] nb = new byte[1];
+        while ((ob = this.getByte()) < 0) {
+            nb[0] = (byte) ((ob & 0x7f));
+            bi = bi.shiftLeft(7).add(new BigInteger(nb));
+        }
+        nb[0] = (byte) ((ob & 0x7f));
+        bi = bi.shiftLeft(7).add(new BigInteger(nb));
+        return bi;
+    }
+
+    public char unmarshalUtfChar() {
+        byte ob;
+        int i = 1;
+        byte[] ba;
+        ob = this.getByte();
+        if (ob < 0) {
+            i = 2;
+            while ((ob << (24 + i)) < 0)
+                i++;
+        }
+        ba = new byte[i];
+        ba[0] = ob;
+        int j = 1;
+        while (j < i) {
+            ba[j] = this.getByte();
+            j++;
+        }
+        i = ba.length;
+        if (i == 1)
+            return (char) ba[0];
+        else {
+            int n;
+            n = ba[0] & 15; // 00001111b, gets last 4 bits
+            j = 1;
+            while (j < i)
+                n = (n << 6) + (ba[j++] & 63);// 00111111b,gets last 6 bits
+            return (char) n;
+        }
+    }
+
+//    private void setData(byte[] data) {
+//        this.data = data;
+//    }
+
+    public int getSwath() {
+        return swath;
+    }
+
+    public void setSwath(int swath) {
+        this.swath = swath;
+    }
+
+    public int getTotal() {
+        return total;
+    }
+
+    public void setTotal(int total) {
+        this.total = total;
+    }
+
+    private int getBuffer() {
+        return buffer;
+    }
+
+    private void setBuffer(int buffer) {
+        this.buffer = buffer;
+    }
+
+    /**
+     * @param args
+     * @throws TikaException 
+     */
+    public static void main(String[] args) throws TikaException {
+        byte[] array = { 4, 78, -67, 90, 1, -33 };
+        ChmSection chmSection = new ChmSection(array);
+        System.out.println("before " + Arrays.toString(array));
+        System.out.println("after " + Arrays.toString(chmSection.reverseByteOrder(array)));
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
index 0e0e3da..86b1dd4 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
@@ -1,209 +1,209 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mbox;
-
-import java.io.BufferedReader;
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.text.ParseException;
-import java.text.SimpleDateFormat;
-import java.util.Collections;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.LinkedList;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Queue;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Mbox (mailbox) parser. This version extracts each mail from Mbox and uses the
- * DelegatingParser to process each mail.
- */
-public class MboxParser extends AbstractParser {
-
-    public static final String MBOX_MIME_TYPE = "application/mbox";
-    public static final String MBOX_RECORD_DIVIDER = "From ";
-    public static final int MAIL_MAX_SIZE = 50000000;
-    /**
-     * Serial version UID
-     */
-    private static final long serialVersionUID = -1762689436731160661L;
-    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("mbox"));
-    private static final Pattern EMAIL_HEADER_PATTERN = Pattern.compile("([^ ]+):[ \t]*(.*)");
-    private static final Pattern EMAIL_ADDRESS_PATTERN = Pattern.compile("<(.*@.*)>");
-
-    private static final String EMAIL_HEADER_METADATA_PREFIX = "MboxParser-";
-    private static final String EMAIL_FROMLINE_METADATA = EMAIL_HEADER_METADATA_PREFIX + "from";
-    private final Map<Integer, Metadata> trackingMetadata = new HashMap<Integer, Metadata>();
-    private boolean tracking = false;
-
-    public static Date parseDate(String headerContent) throws ParseException {
-        SimpleDateFormat dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US);
-        return dateFormat.parse(headerContent);
-    }
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
-            throws IOException, TikaException, SAXException {
-
-        EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class,
-                new ParsingEmbeddedDocumentExtractor(context));
-
-        String charsetName = "windows-1252";
-
-        metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
-        metadata.set(Metadata.CONTENT_ENCODING, charsetName);
-
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-        xhtml.startDocument();
-
-        InputStreamReader isr = new InputStreamReader(stream, charsetName);
-        try (BufferedReader reader = new BufferedReader(isr)) {
-            String curLine = reader.readLine();
-            int mailItem = 0;
-            do {
-                if (curLine.startsWith(MBOX_RECORD_DIVIDER)) {
-                    Metadata mailMetadata = new Metadata();
-                    Queue<String> multiline = new LinkedList<String>();
-                    mailMetadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
-                    mailMetadata.set(Metadata.CONTENT_TYPE, "message/rfc822");
-                    curLine = reader.readLine();
-
-                    ByteArrayOutputStream message = new ByteArrayOutputStream(100000);
-                    do {
-                        if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
-                            String latestLine = multiline.poll();
-                            latestLine += " " + curLine.trim();
-                            multiline.add(latestLine);
-                        } else {
-                            multiline.add(curLine);
-                        }
-
-                        message.write(curLine.getBytes(charsetName));
-                        message.write(0x0A);
-                        curLine = reader.readLine();
-                    }
-                    while (curLine != null && !curLine.startsWith(MBOX_RECORD_DIVIDER) && message.size() < MAIL_MAX_SIZE);
-
-                    for (String item : multiline) {
-                        saveHeaderInMetadata(mailMetadata, item);
-                    }
-
-                    ByteArrayInputStream messageStream = new ByteArrayInputStream(message.toByteArray());
-                    message = null;
-
-                    if (extractor.shouldParseEmbedded(mailMetadata)) {
-                        extractor.parseEmbedded(messageStream, xhtml, mailMetadata, true);
-                    }
-
-                    if (tracking) {
-                        getTrackingMetadata().put(mailItem++, mailMetadata);
-                    }
-                } else {
-                    curLine = reader.readLine();
-                }
-
-            } while (curLine != null && !Thread.currentThread().isInterrupted());
-        }
-
-        xhtml.endDocument();
-    }
-
-    public boolean isTracking() {
-        return tracking;
-    }
-
-    public void setTracking(boolean tracking) {
-        this.tracking = tracking;
-    }
-
-    public Map<Integer, Metadata> getTrackingMetadata() {
-        return trackingMetadata;
-    }
-
-    private void saveHeaderInMetadata(Metadata metadata, String curLine) {
-        Matcher headerMatcher = EMAIL_HEADER_PATTERN.matcher(curLine);
-        if (!headerMatcher.matches()) {
-            return; // ignore malformed header lines
-        }
-
-        String headerTag = headerMatcher.group(1).toLowerCase(Locale.ROOT);
-        String headerContent = headerMatcher.group(2);
-
-        if (headerTag.equalsIgnoreCase("From")) {
-            metadata.set(TikaCoreProperties.CREATOR, headerContent);
-        } else if (headerTag.equalsIgnoreCase("To") || headerTag.equalsIgnoreCase("Cc")
-                || headerTag.equalsIgnoreCase("Bcc")) {
-            Matcher address = EMAIL_ADDRESS_PATTERN.matcher(headerContent);
-            if (address.find()) {
-                metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, address.group(1));
-            } else if (headerContent.indexOf('@') > -1) {
-                metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, headerContent);
-            }
-
-            String property = Metadata.MESSAGE_TO;
-            if (headerTag.equalsIgnoreCase("Cc")) {
-                property = Metadata.MESSAGE_CC;
-            } else if (headerTag.equalsIgnoreCase("Bcc")) {
-                property = Metadata.MESSAGE_BCC;
-            }
-            metadata.add(property, headerContent);
-        } else if (headerTag.equalsIgnoreCase("Subject")) {
-            metadata.add(Metadata.SUBJECT, headerContent);
-        } else if (headerTag.equalsIgnoreCase("Date")) {
-            try {
-                Date date = parseDate(headerContent);
-                metadata.set(TikaCoreProperties.CREATED, date);
-            } catch (ParseException e) {
-                // ignoring date because format was not understood
-            }
-        } else if (headerTag.equalsIgnoreCase("Message-Id")) {
-            metadata.set(TikaCoreProperties.IDENTIFIER, headerContent);
-        } else if (headerTag.equalsIgnoreCase("In-Reply-To")) {
-            metadata.set(TikaCoreProperties.RELATION, headerContent);
-        } else if (headerTag.equalsIgnoreCase("Content-Type")) {
-            // TODO - key off content-type in headers to
-            // set mapping to use for content and convert if necessary.
-
-            metadata.add(Metadata.CONTENT_TYPE, headerContent);
-            metadata.set(TikaCoreProperties.FORMAT, headerContent);
-        } else {
-            metadata.add(EMAIL_HEADER_METADATA_PREFIX + headerTag, headerContent);
-        }
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Collections;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Queue;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Mbox (mailbox) parser. This version extracts each mail from Mbox and uses the
+ * DelegatingParser to process each mail.
+ */
+public class MboxParser extends AbstractParser {
+
+    public static final String MBOX_MIME_TYPE = "application/mbox";
+    public static final String MBOX_RECORD_DIVIDER = "From ";
+    public static final int MAIL_MAX_SIZE = 50000000;
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -1762689436731160661L;
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("mbox"));
+    private static final Pattern EMAIL_HEADER_PATTERN = Pattern.compile("([^ ]+):[ \t]*(.*)");
+    private static final Pattern EMAIL_ADDRESS_PATTERN = Pattern.compile("<(.*@.*)>");
+
+    private static final String EMAIL_HEADER_METADATA_PREFIX = "MboxParser-";
+    private static final String EMAIL_FROMLINE_METADATA = EMAIL_HEADER_METADATA_PREFIX + "from";
+    private final Map<Integer, Metadata> trackingMetadata = new HashMap<Integer, Metadata>();
+    private boolean tracking = false;
+
+    public static Date parseDate(String headerContent) throws ParseException {
+        SimpleDateFormat dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US);
+        return dateFormat.parse(headerContent);
+    }
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, TikaException, SAXException {
+
+        EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class,
+                new ParsingEmbeddedDocumentExtractor(context));
+
+        String charsetName = "windows-1252";
+
+        metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
+        metadata.set(Metadata.CONTENT_ENCODING, charsetName);
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        InputStreamReader isr = new InputStreamReader(stream, charsetName);
+        try (BufferedReader reader = new BufferedReader(isr)) {
+            String curLine = reader.readLine();
+            int mailItem = 0;
+            do {
+                if (curLine.startsWith(MBOX_RECORD_DIVIDER)) {
+                    Metadata mailMetadata = new Metadata();
+                    Queue<String> multiline = new LinkedList<String>();
+                    mailMetadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
+                    mailMetadata.set(Metadata.CONTENT_TYPE, "message/rfc822");
+                    curLine = reader.readLine();
+
+                    ByteArrayOutputStream message = new ByteArrayOutputStream(100000);
+                    do {
+                        if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
+                            String latestLine = multiline.poll();
+                            latestLine += " " + curLine.trim();
+                            multiline.add(latestLine);
+                        } else {
+                            multiline.add(curLine);
+                        }
+
+                        message.write(curLine.getBytes(charsetName));
+                        message.write(0x0A);
+                        curLine = reader.readLine();
+                    }
+                    while (curLine != null && !curLine.startsWith(MBOX_RECORD_DIVIDER) && message.size() < MAIL_MAX_SIZE);
+
+                    for (String item : multiline) {
+                        saveHeaderInMetadata(mailMetadata, item);
+                    }
+
+                    ByteArrayInputStream messageStream = new ByteArrayInputStream(message.toByteArray());
+                    message = null;
+
+                    if (extractor.shouldParseEmbedded(mailMetadata)) {
+                        extractor.parseEmbedded(messageStream, xhtml, mailMetadata, true);
+                    }
+
+                    if (tracking) {
+                        getTrackingMetadata().put(mailItem++, mailMetadata);
+                    }
+                } else {
+                    curLine = reader.readLine();
+                }
+
+            } while (curLine != null && !Thread.currentThread().isInterrupted());
+        }
+
+        xhtml.endDocument();
+    }
+
+    public boolean isTracking() {
+        return tracking;
+    }
+
+    public void setTracking(boolean tracking) {
+        this.tracking = tracking;
+    }
+
+    public Map<Integer, Metadata> getTrackingMetadata() {
+        return trackingMetadata;
+    }
+
+    private void saveHeaderInMetadata(Metadata metadata, String curLine) {
+        Matcher headerMatcher = EMAIL_HEADER_PATTERN.matcher(curLine);
+        if (!headerMatcher.matches()) {
+            return; // ignore malformed header lines
+        }
+
+        String headerTag = headerMatcher.group(1).toLowerCase(Locale.ROOT);
+        String headerContent = headerMatcher.group(2);
+
+        if (headerTag.equalsIgnoreCase("From")) {
+            metadata.set(TikaCoreProperties.CREATOR, headerContent);
+        } else if (headerTag.equalsIgnoreCase("To") || headerTag.equalsIgnoreCase("Cc")
+                || headerTag.equalsIgnoreCase("Bcc")) {
+            Matcher address = EMAIL_ADDRESS_PATTERN.matcher(headerContent);
+            if (address.find()) {
+                metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, address.group(1));
+            } else if (headerContent.indexOf('@') > -1) {
+                metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, headerContent);
+            }
+
+            String property = Metadata.MESSAGE_TO;
+            if (headerTag.equalsIgnoreCase("Cc")) {
+                property = Metadata.MESSAGE_CC;
+            } else if (headerTag.equalsIgnoreCase("Bcc")) {
+                property = Metadata.MESSAGE_BCC;
+            }
+            metadata.add(property, headerContent);
+        } else if (headerTag.equalsIgnoreCase("Subject")) {
+            metadata.add(Metadata.SUBJECT, headerContent);
+        } else if (headerTag.equalsIgnoreCase("Date")) {
+            try {
+                Date date = parseDate(headerContent);
+                metadata.set(TikaCoreProperties.CREATED, date);
+            } catch (ParseException e) {
+                // ignoring date because format was not understood
+            }
+        } else if (headerTag.equalsIgnoreCase("Message-Id")) {
+            metadata.set(TikaCoreProperties.IDENTIFIER, headerContent);
+        } else if (headerTag.equalsIgnoreCase("In-Reply-To")) {
+            metadata.set(TikaCoreProperties.RELATION, headerContent);
+        } else if (headerTag.equalsIgnoreCase("Content-Type")) {
+            // TODO - key off content-type in headers to
+            // set mapping to use for content and convert if necessary.
+
+            metadata.add(Metadata.CONTENT_TYPE, headerContent);
+            metadata.set(TikaCoreProperties.FORMAT, headerContent);
+        } else {
+            metadata.add(EMAIL_HEADER_METADATA_PREFIX + headerTag, headerContent);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
index f7eec91..5883bd5 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
@@ -1,203 +1,203 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mbox;
-
-import static java.lang.String.valueOf;
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static java.util.Collections.singleton;
-
-import java.io.ByteArrayInputStream;
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Set;
-
-import com.pff.PSTAttachment;
-import com.pff.PSTFile;
-import com.pff.PSTFolder;
-import com.pff.PSTMessage;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-/**
- * Parser for MS Outlook PST email storage files
- */
-public class OutlookPSTParser extends AbstractParser {
-
-    private static final long serialVersionUID = 620998217748364063L;
-
-    public static final MediaType MS_OUTLOOK_PST_MIMETYPE = MediaType.application("vnd.ms-outlook-pst");
-    private static final Set<MediaType> SUPPORTED_TYPES = singleton(MS_OUTLOOK_PST_MIMETYPE);
-
-    private static AttributesImpl createAttribute(String attName, String attValue) {
-        AttributesImpl attributes = new AttributesImpl();
-        attributes.addAttribute("", attName, attName, "CDATA", attValue);
-        return attributes;
-    }
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-
-        // Use the delegate parser to parse the contained document
-        EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class,
-                new ParsingEmbeddedDocumentExtractor(context));
-
-        metadata.set(Metadata.CONTENT_TYPE, MS_OUTLOOK_PST_MIMETYPE.toString());
-
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-        xhtml.startDocument();
-
-        TikaInputStream in = TikaInputStream.get(stream);
-        PSTFile pstFile = null;
-        try {
-            pstFile = new PSTFile(in.getFile().getPath());
-            metadata.set(Metadata.CONTENT_LENGTH, valueOf(pstFile.getFileHandle().length()));
-            boolean isValid = pstFile.getFileHandle().getFD().valid();
-            metadata.set("isValid", valueOf(isValid));
-            if (isValid) {
-                parseFolder(xhtml, pstFile.getRootFolder(), embeddedExtractor);
-            }
-        } catch (Exception e) {
-            throw new TikaException(e.getMessage(), e);
-        } finally {
-            if (pstFile != null && pstFile.getFileHandle() != null) {
-                try {
-                    pstFile.getFileHandle().close();
-                } catch (IOException e) {
-                    //swallow closing exception
-                }
-            }
-        }
-
-        xhtml.endDocument();
-    }
-
-    private void parseFolder(XHTMLContentHandler handler, PSTFolder pstFolder, EmbeddedDocumentExtractor embeddedExtractor)
-            throws Exception {
-        if (pstFolder.getContentCount() > 0) {
-            PSTMessage pstMail = (PSTMessage) pstFolder.getNextChild();
-            while (pstMail != null) {
-                AttributesImpl attributes = new AttributesImpl();
-                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
-                attributes.addAttribute("", "id", "id", "CDATA", pstMail.getInternetMessageId());
-                handler.startElement("div", attributes);
-                handler.element("h1", pstMail.getSubject());
-
-                parserMailItem(handler, pstMail, embeddedExtractor);
-                parseMailAttachments(handler, pstMail, embeddedExtractor);
-
-                handler.endElement("div");
-
-                pstMail = (PSTMessage) pstFolder.getNextChild();
-            }
-        }
-
-        if (pstFolder.hasSubfolders()) {
-            for (PSTFolder pstSubFolder : pstFolder.getSubFolders()) {
-                handler.startElement("div", createAttribute("class", "email-folder"));
-                handler.element("h1", pstSubFolder.getDisplayName());
-                parseFolder(handler, pstSubFolder, embeddedExtractor);
-                handler.endElement("div");
-            }
-        }
-    }
-
-    private void parserMailItem(XHTMLContentHandler handler, PSTMessage pstMail, EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException {
-        Metadata mailMetadata = new Metadata();
-        mailMetadata.set(Metadata.RESOURCE_NAME_KEY, pstMail.getInternetMessageId());
-        mailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, pstMail.getInternetMessageId());
-        mailMetadata.set(TikaCoreProperties.IDENTIFIER, pstMail.getInternetMessageId());
-        mailMetadata.set(TikaCoreProperties.TITLE, pstMail.getSubject());
-        mailMetadata.set(Metadata.MESSAGE_FROM, pstMail.getSenderName());
-        mailMetadata.set(TikaCoreProperties.CREATOR, pstMail.getSenderName());
-        mailMetadata.set(TikaCoreProperties.CREATED, pstMail.getCreationTime());
-        mailMetadata.set(TikaCoreProperties.MODIFIED, pstMail.getLastModificationTime());
-        mailMetadata.set(TikaCoreProperties.COMMENTS, pstMail.getComment());
-        mailMetadata.set("descriptorNodeId", valueOf(pstMail.getDescriptorNodeId()));
-        mailMetadata.set("senderEmailAddress", pstMail.getSenderEmailAddress());
-        mailMetadata.set("recipients", pstMail.getRecipientsString());
-        mailMetadata.set("displayTo", pstMail.getDisplayTo());
-        mailMetadata.set("displayCC", pstMail.getDisplayCC());
-        mailMetadata.set("displayBCC", pstMail.getDisplayBCC());
-        mailMetadata.set("importance", valueOf(pstMail.getImportance()));
-        mailMetadata.set("priority", valueOf(pstMail.getPriority()));
-        mailMetadata.set("flagged", valueOf(pstMail.isFlagged()));
-
-        byte[] mailContent = pstMail.getBody().getBytes(UTF_8);
-        embeddedExtractor.parseEmbedded(new ByteArrayInputStream(mailContent), handler, mailMetadata, true);
-    }
-
-    private void parseMailAttachments(XHTMLContentHandler xhtml, PSTMessage email, EmbeddedDocumentExtractor embeddedExtractor)
-            throws TikaException {
-        int numberOfAttachments = email.getNumberOfAttachments();
-        for (int i = 0; i < numberOfAttachments; i++) {
-            File tempFile = null;
-            try {
-                PSTAttachment attach = email.getAttachment(i);
-
-                // Get the filename; both long and short filenames can be used for attachments
-                String filename = attach.getLongFilename();
-                if (filename.isEmpty()) {
-                    filename = attach.getFilename();
-                }
-
-                xhtml.element("p", filename);
-
-                Metadata attachMeta = new Metadata();
-                attachMeta.set(Metadata.RESOURCE_NAME_KEY, filename);
-                attachMeta.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filename);
-                AttributesImpl attributes = new AttributesImpl();
-                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
-                attributes.addAttribute("", "id", "id", "CDATA", filename);
-                xhtml.startElement("div", attributes);
-                if (embeddedExtractor.shouldParseEmbedded(attachMeta)) {
-                    TemporaryResources tmp = new TemporaryResources();
-                    try {
-                        TikaInputStream tis = TikaInputStream.get(attach.getFileInputStream(), tmp);
-                        embeddedExtractor.parseEmbedded(tis, xhtml, attachMeta, true);
-                    } finally {
-                        tmp.dispose();
-                    }
-                }
-                xhtml.endElement("div");
-
-            } catch (Exception e) {
-                throw new TikaException("Unable to unpack document stream", e);
-            } finally {
-                if (tempFile != null)
-                    tempFile.delete();
-            }
-        }
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import static java.lang.String.valueOf;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static java.util.Collections.singleton;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Set;
+
+import com.pff.PSTAttachment;
+import com.pff.PSTFile;
+import com.pff.PSTFolder;
+import com.pff.PSTMessage;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Parser for MS Outlook PST email storage files
+ */
+public class OutlookPSTParser extends AbstractParser {
+
+    private static final long serialVersionUID = 620998217748364063L;
+
+    public static final MediaType MS_OUTLOOK_PST_MIMETYPE = MediaType.application("vnd.ms-outlook-pst");
+    private static final Set<MediaType> SUPPORTED_TYPES = singleton(MS_OUTLOOK_PST_MIMETYPE);
+
+    private static AttributesImpl createAttribute(String attName, String attValue) {
+        AttributesImpl attributes = new AttributesImpl();
+        attributes.addAttribute("", attName, attName, "CDATA", attValue);
+        return attributes;
+    }
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        // Use the delegate parser to parse the contained document
+        EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class,
+                new ParsingEmbeddedDocumentExtractor(context));
+
+        metadata.set(Metadata.CONTENT_TYPE, MS_OUTLOOK_PST_MIMETYPE.toString());
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        TikaInputStream in = TikaInputStream.get(stream);
+        PSTFile pstFile = null;
+        try {
+            pstFile = new PSTFile(in.getFile().getPath());
+            metadata.set(Metadata.CONTENT_LENGTH, valueOf(pstFile.getFileHandle().length()));
+            boolean isValid = pstFile.getFileHandle().getFD().valid();
+            metadata.set("isValid", valueOf(isValid));
+            if (isValid) {
+                parseFolder(xhtml, pstFile.getRootFolder(), embeddedExtractor);
+            }
+        } catch (Exception e) {
+            throw new TikaException(e.getMessage(), e);
+        } finally {
+            if (pstFile != null && pstFile.getFileHandle() != null) {
+                try {
+                    pstFile.getFileHandle().close();
+                } catch (IOException e) {
+                    //swallow closing exception
+                }
+            }
+        }
+
+        xhtml.endDocument();
+    }
+
+    private void parseFolder(XHTMLContentHandler handler, PSTFolder pstFolder, EmbeddedDocumentExtractor embeddedExtractor)
+            throws Exception {
+        if (pstFolder.getContentCount() > 0) {
+            PSTMessage pstMail = (PSTMessage) pstFolder.getNextChild();
+            while (pstMail != null) {
+                AttributesImpl attributes = new AttributesImpl();
+                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+                attributes.addAttribute("", "id", "id", "CDATA", pstMail.getInternetMessageId());
+                handler.startElement("div", attributes);
+                handler.element("h1", pstMail.getSubject());
+
+                parserMailItem(handler, pstMail, embeddedExtractor);
+                parseMailAttachments(handler, pstMail, embeddedExtractor);
+
+                handler.endElement("div");
+
+                pstMail = (PSTMessage) pstFolder.getNextChild();
+            }
+        }
+
+        if (pstFolder.hasSubfolders()) {
+            for (PSTFolder pstSubFolder : pstFolder.getSubFolders()) {
+                handler.startElement("div", createAttribute("class", "email-folder"));
+                handler.element("h1", pstSubFolder.getDisplayName());
+                parseFolder(handler, pstSubFolder, embeddedExtractor);
+                handler.endElement("div");
+            }
+        }
+    }
+
+    private void parserMailItem(XHTMLContentHandler handler, PSTMessage pstMail, EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException {
+        Metadata mailMetadata = new Metadata();
+        mailMetadata.set(Metadata.RESOURCE_NAME_KEY, pstMail.getInternetMessageId());
+        mailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, pstMail.getInternetMessageId());
+        mailMetadata.set(TikaCoreProperties.IDENTIFIER, pstMail.getInternetMessageId());
+        mailMetadata.set(TikaCoreProperties.TITLE, pstMail.getSubject());
+        mailMetadata.set(Metadata.MESSAGE_FROM, pstMail.getSenderName());
+        mailMetadata.set(TikaCoreProperties.CREATOR, pstMail.getSenderName());
+        mailMetadata.set(TikaCoreProperties.CREATED, pstMail.getCreationTime());
+        mailMetadata.set(TikaCoreProperties.MODIFIED, pstMail.getLastModificationTime());
+        mailMetadata.set(TikaCoreProperties.COMMENTS, pstMail.getComment());
+        mailMetadata.set("descriptorNodeId", valueOf(pstMail.getDescriptorNodeId()));
+        mailMetadata.set("senderEmailAddress", pstMail.getSenderEmailAddress());
+        mailMetadata.set("recipients", pstMail.getRecipientsString());
+        mailMetadata.set("displayTo", pstMail.getDisplayTo());
+        mailMetadata.set("displayCC", pstMail.getDisplayCC());
+        mailMetadata.set("displayBCC", pstMail.getDisplayBCC());
+        mailMetadata.set("importance", valueOf(pstMail.getImportance()));
+        mailMetadata.set("priority", valueOf(pstMail.getPriority()));
+        mailMetadata.set("flagged", valueOf(pstMail.isFlagged()));
+
+        byte[] mailContent = pstMail.getBody().getBytes(UTF_8);
+        embeddedExtractor.parseEmbedded(new ByteArrayInputStream(mailContent), handler, mailMetadata, true);
+    }
+
+    private void parseMailAttachments(XHTMLContentHandler xhtml, PSTMessage email, EmbeddedDocumentExtractor embeddedExtractor)
+            throws TikaException {
+        int numberOfAttachments = email.getNumberOfAttachments();
+        for (int i = 0; i < numberOfAttachments; i++) {
+            File tempFile = null;
+            try {
+                PSTAttachment attach = email.getAttachment(i);
+
+                // Get the filename; both long and short filenames can be used for attachments
+                String filename = attach.getLongFilename();
+                if (filename.isEmpty()) {
+                    filename = attach.getFilename();
+                }
+
+                xhtml.element("p", filename);
+
+                Metadata attachMeta = new Metadata();
+                attachMeta.set(Metadata.RESOURCE_NAME_KEY, filename);
+                attachMeta.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filename);
+                AttributesImpl attributes = new AttributesImpl();
+                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+                attributes.addAttribute("", "id", "id", "CDATA", filename);
+                xhtml.startElement("div", attributes);
+                if (embeddedExtractor.shouldParseEmbedded(attachMeta)) {
+                    TemporaryResources tmp = new TemporaryResources();
+                    try {
+                        TikaInputStream tis = TikaInputStream.get(attach.getFileInputStream(), tmp);
+                        embeddedExtractor.parseEmbedded(tis, xhtml, attachMeta, true);
+                    } finally {
+                        tmp.dispose();
+                    }
+                }
+                xhtml.endElement("div");
+
+            } catch (Exception e) {
+                throw new TikaException("Unable to unpack document stream", e);
+            } finally {
+                if (tempFile != null)
+                    tempFile.delete();
+            }
+        }
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
index 36439b8..fa932a6 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
@@ -1,99 +1,99 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.odf;
-
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.Locale;
-
-import org.apache.tika.sax.ContentHandlerDecorator;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-/**
- * Content handler decorator that:<ul>
- * <li>Maps old OpenOffice 1.0 Namespaces to the OpenDocument ones</li>
- * <li>Returns a fake DTD when parser requests OpenOffice DTD</li>
- * </ul>
- */
-public class NSNormalizerContentHandler extends ContentHandlerDecorator {
-
-    private static final String OLD_NS =
-            "http://openoffice.org/2000/";
-
-    private static final String NEW_NS =
-            "urn:oasis:names:tc:opendocument:xmlns:";
-
-    private static final String DTD_PUBLIC_ID =
-            "-//OpenOffice.org//DTD OfficeDocument 1.0//EN";
-
-    public NSNormalizerContentHandler(ContentHandler handler) {
-        super(handler);
-    }
-
-    private String mapOldNS(String ns) {
-        if (ns != null && ns.startsWith(OLD_NS)) {
-            return NEW_NS + ns.substring(OLD_NS.length()) + ":1.0";
-        } else {
-            return ns;
-        }
-    }
-
-    @Override
-    public void startElement(
-            String namespaceURI, String localName, String qName,
-            Attributes atts) throws SAXException {
-        AttributesImpl natts = new AttributesImpl();
-        for (int i = 0; i < atts.getLength(); i++) {
-            natts.addAttribute(
-                    mapOldNS(atts.getURI(i)), atts.getLocalName(i),
-                    atts.getQName(i), atts.getType(i), atts.getValue(i));
-        }
-        super.startElement(mapOldNS(namespaceURI), localName, qName, atts);
-    }
-
-    @Override
-    public void endElement(String namespaceURI, String localName, String qName)
-            throws SAXException {
-        super.endElement(mapOldNS(namespaceURI), localName, qName);
-    }
-
-    @Override
-    public void startPrefixMapping(String prefix, String uri)
-            throws SAXException {
-        super.startPrefixMapping(prefix, mapOldNS(uri));
-    }
-
-    /**
-     * do not load any DTDs (may be requested by parser). Fake the DTD by
-     * returning a empty string as InputSource
-     */
-    @Override
-    public InputSource resolveEntity(String publicId, String systemId)
-            throws IOException, SAXException {
-        if ((systemId != null && systemId.toLowerCase(Locale.ROOT).endsWith(".dtd"))
-                || DTD_PUBLIC_ID.equals(publicId)) {
-            return new InputSource(new StringReader(""));
-        } else {
-            return super.resolveEntity(publicId, systemId);
-        }
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Locale;
+
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Content handler decorator that:<ul>
+ * <li>Maps old OpenOffice 1.0 Namespaces to the OpenDocument ones</li>
+ * <li>Returns a fake DTD when parser requests OpenOffice DTD</li>
+ * </ul>
+ */
+public class NSNormalizerContentHandler extends ContentHandlerDecorator {
+
+    private static final String OLD_NS =
+            "http://openoffice.org/2000/";
+
+    private static final String NEW_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:";
+
+    private static final String DTD_PUBLIC_ID =
+            "-//OpenOffice.org//DTD OfficeDocument 1.0//EN";
+
+    public NSNormalizerContentHandler(ContentHandler handler) {
+        super(handler);
+    }
+
+    private String mapOldNS(String ns) {
+        if (ns != null && ns.startsWith(OLD_NS)) {
+            return NEW_NS + ns.substring(OLD_NS.length()) + ":1.0";
+        } else {
+            return ns;
+        }
+    }
+
+    @Override
+    public void startElement(
+            String namespaceURI, String localName, String qName,
+            Attributes atts) throws SAXException {
+        AttributesImpl natts = new AttributesImpl();
+        for (int i = 0; i < atts.getLength(); i++) {
+            natts.addAttribute(
+                    mapOldNS(atts.getURI(i)), atts.getLocalName(i),
+                    atts.getQName(i), atts.getType(i), atts.getValue(i));
+        }
+        super.startElement(mapOldNS(namespaceURI), localName, qName, atts);
+    }
+
+    @Override
+    public void endElement(String namespaceURI, String localName, String qName)
+            throws SAXException {
+        super.endElement(mapOldNS(namespaceURI), localName, qName);
+    }
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri)
+            throws SAXException {
+        super.startPrefixMapping(prefix, mapOldNS(uri));
+    }
+
+    /**
+     * do not load any DTDs (may be requested by parser). Fake the DTD by
+     * returning a empty string as InputSource
+     */
+    @Override
+    public InputSource resolveEntity(String publicId, String systemId)
+            throws IOException, SAXException {
+        if ((systemId != null && systemId.toLowerCase(Locale.ROOT).endsWith(".dtd"))
+                || DTD_PUBLIC_ID.equals(publicId)) {
+            return new InputSource(new StringReader(""));
+        } else {
+            return super.resolveEntity(publicId, systemId);
+        }
+    }
+
+}

[29/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
index 4105dfa..1c615f6 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
@@ -1,414 +1,414 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mp3;
-
-import static org.apache.tika.TikaTest.assertContains;
-import static org.junit.Assert.assertEquals;
-
-import java.io.ByteArrayInputStream;
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.XMPDM;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-/**
- * Test case for parsing mp3 files.
- */
-public class Mp3ParserTest {
-
-    /**
-     * Checks the duration of an MP3 file.
-     * @param metadata the metadata object
-     * @param expected the expected duration, rounded as seconds
-     */
-    private static void checkDuration(Metadata metadata, int expected) {
-        assertEquals("Wrong duration", expected,
-                Math.round(Float.valueOf(metadata.get(XMPDM.DURATION)) / 1000));
-    }
-
-    /**
-     * Test that with only ID3v1 tags, we get some information out   
-     */
-    @Test
-    public void testMp3ParsingID3v1() throws Exception {
-        Parser parser = new AutoDetectParser(); // Should auto-detect!
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
-                "/test-documents/testMP3id3v1.mp3")) {
-            parser.parse(stream, handler, metadata, new ParseContext());
-        }
-
-        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
-        assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
-
-        String content = handler.toString();
-        assertContains("Test Title", content);
-        assertContains("Test Artist", content);
-        assertContains("Test Album", content);
-        assertContains("2008", content);
-        assertContains("Test Comment", content);
-        assertContains("Rock", content);
-        
-        assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
-        assertEquals("44100", metadata.get("samplerate"));
-        assertEquals("1", metadata.get("channels"));
-        checkDuration(metadata, 2);
-    }
-
-    /**
-     * Test that with only ID3v2 tags, we get the full
-     *  set of information out.
-     */
-    @Test
-    public void testMp3ParsingID3v2() throws Exception {
-        Parser parser = new AutoDetectParser(); // Should auto-detect!
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
-                "/test-documents/testMP3id3v2.mp3")) {
-            parser.parse(stream, handler, metadata, new ParseContext());
-        }
-
-        // Check core properties
-        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
-        assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
-
-        // Check the textual contents
-        String content = handler.toString();
-        assertContains("Test Title", content);
-        assertContains("Test Artist", content);
-        assertContains("Test Album", content);
-        assertContains("2008", content);
-        assertContains("Test Comment", content);
-        assertContains("Rock", content);
-        assertContains(", track 1", content);
-        assertContains(", disc 1", content);
-        
-        // Check un-typed audio properties
-        assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
-        assertEquals("44100", metadata.get("samplerate"));
-        assertEquals("1", metadata.get("channels"));
-        
-        // Check XMPDM-typed audio properties
-        assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
-        assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
-        assertEquals("Test Album Artist", metadata.get(XMPDM.ALBUM_ARTIST));
-        assertEquals(null, metadata.get(XMPDM.COMPOSER));
-        assertEquals("2008", metadata.get(XMPDM.RELEASE_DATE));
-        assertEquals("Rock", metadata.get(XMPDM.GENRE));
-        assertEquals("XXX - ID3v1 Comment\nTest Comment", metadata.get(XMPDM.LOG_COMMENT.getName()));
-        assertEquals("1", metadata.get(XMPDM.TRACK_NUMBER));
-        assertEquals("1/1", metadata.get(XMPDM.DISC_NUMBER));
-        assertEquals("1", metadata.get(XMPDM.COMPILATION));
-        
-        assertEquals("44100", metadata.get(XMPDM.AUDIO_SAMPLE_RATE));
-        assertEquals("Mono", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE));
-        assertEquals("MP3", metadata.get(XMPDM.AUDIO_COMPRESSOR));
-        checkDuration(metadata, 2);
-    }
-
-    /**
-     * Test that with both id3v2 and id3v1, we prefer the
-     *  details from id3v2
-     */
-    @Test
-    public void testMp3ParsingID3v1v2() throws Exception {
-        Parser parser = new AutoDetectParser(); // Should auto-detect!
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
-                "/test-documents/testMP3id3v1_v2.mp3")) {
-            parser.parse(stream, handler, metadata, new ParseContext());
-        }
-
-        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
-        assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
-
-        String content = handler.toString();
-        assertContains("Test Title", content);
-        assertContains("Test Artist", content);
-        assertContains("Test Album", content);
-        assertContains("2008", content);
-        assertContains("Test Comment", content);
-        assertContains("Rock", content);
-        
-        assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
-        assertEquals("44100", metadata.get("samplerate"));
-        assertEquals("1", metadata.get("channels"));
-        checkDuration(metadata, 2);
-    }
-
-    /**
-     * Test that with only ID3v2 tags, of version 2.4, we get the full
-     *  set of information out.
-     */
-    @Test
-    public void testMp3ParsingID3v24() throws Exception {
-        Parser parser = new AutoDetectParser(); // Should auto-detect!
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
-                "/test-documents/testMP3id3v24.mp3")) {
-            parser.parse(stream, handler, metadata, new ParseContext());
-        }
-
-        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
-        assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
-
-        String content = handler.toString();
-        assertContains("Test Title", content);
-        assertContains("Test Artist", content);
-        assertContains("Test Album", content);
-        assertContains("2008", content);
-        assertContains("Test Comment", content);
-        assertContains("Rock", content);
-        assertContains(", disc 1", content);
-        
-        assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
-        assertEquals("44100", metadata.get("samplerate"));
-        assertEquals("1", metadata.get("channels"));
-        checkDuration(metadata, 2);
-
-        // Check XMPDM-typed audio properties
-        assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
-        assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
-        assertEquals("Test Album Artist", metadata.get(XMPDM.ALBUM_ARTIST));
-        assertEquals(null, metadata.get(XMPDM.COMPOSER));
-        assertEquals("2008", metadata.get(XMPDM.RELEASE_DATE));
-        assertEquals("Rock", metadata.get(XMPDM.GENRE));
-        assertEquals("1", metadata.get(XMPDM.COMPILATION));
-        
-        assertEquals(null, metadata.get(XMPDM.TRACK_NUMBER));
-        assertEquals("1", metadata.get(XMPDM.DISC_NUMBER));
-    }
-    
-    /**
-     * Tests that a file with characters not in the ISO 8859-1
-     *  range is correctly handled
-     */
-    @Test
-    public void testMp3ParsingID3i18n() throws Exception {
-       Parser parser = new AutoDetectParser(); // Should auto-detect!
-       ContentHandler handler = new BodyContentHandler();
-       Metadata metadata = new Metadata();
-
-        try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
-                "/test-documents/testMP3i18n.mp3")) {
-            parser.parse(stream, handler, metadata, new ParseContext());
-        }
-
-       assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
-       assertEquals("Une chason en Fran\u00e7ais", metadata.get(TikaCoreProperties.TITLE));
-       assertEquals("Test Artist \u2468\u2460", metadata.get(TikaCoreProperties.CREATOR));
-       assertEquals("Test Artist \u2468\u2460", metadata.get(Metadata.AUTHOR));
-       assertEquals("Test Artist \u2468\u2460", metadata.get(XMPDM.ARTIST));
-       assertEquals("Test Album \u2460\u2468", metadata.get(XMPDM.ALBUM));
-
-       assertEquals(
-             "Eng - Comment Desc\nThis is a \u1357\u2468\u2460 Comment", 
-             metadata.get(XMPDM.LOG_COMMENT)
-       );
-       
-       assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
-       assertEquals("44100", metadata.get("samplerate"));
-       assertEquals("1", metadata.get("channels"));
-       checkDuration(metadata, 2);
-   }
-    
-    
-    /**
-     * Tests that a file with both lyrics and
-     *  ID3v2 tags gets both extracted correctly
-     */
-    @Test
-    public void testMp3ParsingLyrics() throws Exception {
-        Parser parser = new AutoDetectParser(); // Should auto-detect!
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        // Note - our test file has a lyrics tag, but lacks any
-        //  lyrics in the tags, so we can't test that bit
-        // TODO Find a better sample file
-
-        try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
-                "/test-documents/testMP3lyrics.mp3")) {
-            parser.parse(stream, handler, metadata, new ParseContext());
-        }
-
-        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
-        assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
-
-        String content = handler.toString();
-        assertContains("Test Title", content);
-        assertContains("Test Artist", content);
-        assertContains("Test Album", content);
-        assertContains("2008", content);
-        assertContains("Test Comment", content);
-        assertContains("Rock", content);
-        
-        assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
-        assertEquals("44100", metadata.get("samplerate"));
-        assertEquals("2", metadata.get("channels"));
-        checkDuration(metadata, 1);
-    }
-    
-    @Test
-    public void testID3v2Frame() throws Exception {
-       byte[] empty = new byte[] {
-             0x49, 0x44, 0x33, 3, 1, 0,
-             0, 0, 0, 0
-       };
-       
-       assertEquals(11, ID3v2Frame.getInt(new byte[] {0,0,0,0x0b}));
-       assertEquals(257, ID3v2Frame.getInt(new byte[] {0,0,1,1}));
-       
-       ID3v2Frame f = (ID3v2Frame)
-            ID3v2Frame.createFrameIfPresent(new ByteArrayInputStream(empty));
-       assertEquals(3, f.getMajorVersion());
-       assertEquals(1, f.getMinorVersion());
-       assertEquals(0, f.getFlags());
-       assertEquals(0, f.getLength());
-       assertEquals(0, f.getData().length);
-       
-       assertEquals("", ID3v2Frame.getTagString(f.getData(), 0, 0));
-       assertEquals("", ID3v2Frame.getTagString(new byte[] {0,0,0,0}, 0, 3));
-       assertEquals("A", ID3v2Frame.getTagString(new byte[] {(byte)'A',0,0,0}, 0, 3));
-    }
-
-    @Test
-    public void testTIKA1589_noId3ReturnsDurationCorrectly() throws Exception {
-        Parser parser = new AutoDetectParser(); // Should auto-detect!
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
-                "/test-documents/testMP3noid3.mp3")) {
-            parser.parse(stream, handler, metadata, new ParseContext());
-        }
-
-        assertEquals("2455.510986328125", metadata.get(XMPDM.DURATION));
-    }
-    
-    /**
-     * This test will do nothing, unless you've downloaded the
-     *  mp3 file from TIKA-424 - the file cannot be
-     *  distributed with Tika.
-     * This test will check for the complicated set of ID3v2.4
-     *  tags.
-     */
-    @Test
-    public void testTIKA424() throws Exception {
-       Parser parser = new AutoDetectParser(); // Should auto-detect!
-       ContentHandler handler = new BodyContentHandler();
-       Metadata metadata = new Metadata();
-
-        try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
-                "/test-documents/test2.mp3")) {
-            if (stream == null) {
-                // You haven't downloaded the file
-                // Skip the test
-                return;
-            }
-            parser.parse(stream, handler, metadata, new ParseContext());
-        }
-
-       assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
-       assertEquals("Plus loin vers l'ouest", metadata.get(TikaCoreProperties.TITLE));
-       assertEquals("Merzhin", metadata.get(TikaCoreProperties.CREATOR));
-       assertEquals("Merzhin", metadata.get(Metadata.AUTHOR));
-
-       String content = handler.toString();
-       assertContains("Plus loin vers l'ouest", content);
-       
-       assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
-       assertEquals("44100", metadata.get("samplerate"));
-       assertEquals("2", metadata.get("channels"));
-    }
-    
-    /**
-     * This tests that we can handle without errors (but perhaps not
-     *  all content) a file with a very very large ID3 frame that
-     *  has been truncated before the end of the ID3 tags.
-     * In this case, it is a file with JPEG data in the ID3, which
-     *  is trunacted before the end of the JPEG bit of the ID3 frame.
-     */
-    @Test
-    public void testTIKA474() throws Exception {
-       Parser parser = new AutoDetectParser(); // Should auto-detect!
-       ContentHandler handler = new BodyContentHandler();
-       Metadata metadata = new Metadata();
-
-        try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
-                "/test-documents/testMP3truncated.mp3")) {
-            parser.parse(stream, handler, metadata, new ParseContext());
-        }
-
-       // Check we could get the headers from the start
-       assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
-       assertEquals("Girl you have no faith in medicine", metadata.get(TikaCoreProperties.TITLE));
-       assertEquals("The White Stripes", metadata.get(TikaCoreProperties.CREATOR));
-       assertEquals("The White Stripes", metadata.get(Metadata.AUTHOR));
-
-       String content = handler.toString();
-       assertContains("Girl you have no faith in medicine", content);
-       assertContains("The White Stripes", content);
-       assertContains("Elephant", content);
-       assertContains("2003", content);
-       
-       // File lacks any audio frames, so we can't know these
-       assertEquals(null, metadata.get("version"));
-       assertEquals(null, metadata.get("samplerate"));
-       assertEquals(null, metadata.get("channels"));
-    }
-
-    // TIKA-1024
-    @Test
-    public void testNakedUTF16BOM() throws Exception {
-       Parser parser = new AutoDetectParser(); // Should auto-detect!
-       ContentHandler handler = new BodyContentHandler();
-       Metadata metadata = new Metadata();
-
-        try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
-                "/test-documents/testNakedUTF16BOM.mp3")) {
-            parser.parse(stream, handler, metadata, new ParseContext());
-        }
-       assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
-       assertEquals("", metadata.get(XMPDM.GENRE));
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMPDM;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing mp3 files.
+ */
+public class Mp3ParserTest {
+
+    /**
+     * Checks the duration of an MP3 file.
+     * @param metadata the metadata object
+     * @param expected the expected duration, rounded as seconds
+     */
+    private static void checkDuration(Metadata metadata, int expected) {
+        assertEquals("Wrong duration", expected,
+                Math.round(Float.valueOf(metadata.get(XMPDM.DURATION)) / 1000));
+    }
+
+    /**
+     * Test that with only ID3v1 tags, we get some information out   
+     */
+    @Test
+    public void testMp3ParsingID3v1() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+                "/test-documents/testMP3id3v1.mp3")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
+
+        String content = handler.toString();
+        assertContains("Test Title", content);
+        assertContains("Test Artist", content);
+        assertContains("Test Album", content);
+        assertContains("2008", content);
+        assertContains("Test Comment", content);
+        assertContains("Rock", content);
+        
+        assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
+        assertEquals("44100", metadata.get("samplerate"));
+        assertEquals("1", metadata.get("channels"));
+        checkDuration(metadata, 2);
+    }
+
+    /**
+     * Test that with only ID3v2 tags, we get the full
+     *  set of information out.
+     */
+    @Test
+    public void testMp3ParsingID3v2() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+                "/test-documents/testMP3id3v2.mp3")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        // Check core properties
+        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
+
+        // Check the textual contents
+        String content = handler.toString();
+        assertContains("Test Title", content);
+        assertContains("Test Artist", content);
+        assertContains("Test Album", content);
+        assertContains("2008", content);
+        assertContains("Test Comment", content);
+        assertContains("Rock", content);
+        assertContains(", track 1", content);
+        assertContains(", disc 1", content);
+        
+        // Check un-typed audio properties
+        assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
+        assertEquals("44100", metadata.get("samplerate"));
+        assertEquals("1", metadata.get("channels"));
+        
+        // Check XMPDM-typed audio properties
+        assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
+        assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
+        assertEquals("Test Album Artist", metadata.get(XMPDM.ALBUM_ARTIST));
+        assertEquals(null, metadata.get(XMPDM.COMPOSER));
+        assertEquals("2008", metadata.get(XMPDM.RELEASE_DATE));
+        assertEquals("Rock", metadata.get(XMPDM.GENRE));
+        assertEquals("XXX - ID3v1 Comment\nTest Comment", metadata.get(XMPDM.LOG_COMMENT.getName()));
+        assertEquals("1", metadata.get(XMPDM.TRACK_NUMBER));
+        assertEquals("1/1", metadata.get(XMPDM.DISC_NUMBER));
+        assertEquals("1", metadata.get(XMPDM.COMPILATION));
+        
+        assertEquals("44100", metadata.get(XMPDM.AUDIO_SAMPLE_RATE));
+        assertEquals("Mono", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE));
+        assertEquals("MP3", metadata.get(XMPDM.AUDIO_COMPRESSOR));
+        checkDuration(metadata, 2);
+    }
+
+    /**
+     * Test that with both id3v2 and id3v1, we prefer the
+     *  details from id3v2
+     */
+    @Test
+    public void testMp3ParsingID3v1v2() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+                "/test-documents/testMP3id3v1_v2.mp3")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
+
+        String content = handler.toString();
+        assertContains("Test Title", content);
+        assertContains("Test Artist", content);
+        assertContains("Test Album", content);
+        assertContains("2008", content);
+        assertContains("Test Comment", content);
+        assertContains("Rock", content);
+        
+        assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
+        assertEquals("44100", metadata.get("samplerate"));
+        assertEquals("1", metadata.get("channels"));
+        checkDuration(metadata, 2);
+    }
+
+    /**
+     * Test that with only ID3v2 tags, of version 2.4, we get the full
+     *  set of information out.
+     */
+    @Test
+    public void testMp3ParsingID3v24() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+                "/test-documents/testMP3id3v24.mp3")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
+
+        String content = handler.toString();
+        assertContains("Test Title", content);
+        assertContains("Test Artist", content);
+        assertContains("Test Album", content);
+        assertContains("2008", content);
+        assertContains("Test Comment", content);
+        assertContains("Rock", content);
+        assertContains(", disc 1", content);
+        
+        assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
+        assertEquals("44100", metadata.get("samplerate"));
+        assertEquals("1", metadata.get("channels"));
+        checkDuration(metadata, 2);
+
+        // Check XMPDM-typed audio properties
+        assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
+        assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
+        assertEquals("Test Album Artist", metadata.get(XMPDM.ALBUM_ARTIST));
+        assertEquals(null, metadata.get(XMPDM.COMPOSER));
+        assertEquals("2008", metadata.get(XMPDM.RELEASE_DATE));
+        assertEquals("Rock", metadata.get(XMPDM.GENRE));
+        assertEquals("1", metadata.get(XMPDM.COMPILATION));
+        
+        assertEquals(null, metadata.get(XMPDM.TRACK_NUMBER));
+        assertEquals("1", metadata.get(XMPDM.DISC_NUMBER));
+    }
+    
+    /**
+     * Tests that a file with characters not in the ISO 8859-1
+     *  range is correctly handled
+     */
+    @Test
+    public void testMp3ParsingID3i18n() throws Exception {
+       Parser parser = new AutoDetectParser(); // Should auto-detect!
+       ContentHandler handler = new BodyContentHandler();
+       Metadata metadata = new Metadata();
+
+        try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+                "/test-documents/testMP3i18n.mp3")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+       assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+       assertEquals("Une chason en Fran\u00e7ais", metadata.get(TikaCoreProperties.TITLE));
+       assertEquals("Test Artist \u2468\u2460", metadata.get(TikaCoreProperties.CREATOR));
+       assertEquals("Test Artist \u2468\u2460", metadata.get(Metadata.AUTHOR));
+       assertEquals("Test Artist \u2468\u2460", metadata.get(XMPDM.ARTIST));
+       assertEquals("Test Album \u2460\u2468", metadata.get(XMPDM.ALBUM));
+
+       assertEquals(
+             "Eng - Comment Desc\nThis is a \u1357\u2468\u2460 Comment", 
+             metadata.get(XMPDM.LOG_COMMENT)
+       );
+       
+       assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
+       assertEquals("44100", metadata.get("samplerate"));
+       assertEquals("1", metadata.get("channels"));
+       checkDuration(metadata, 2);
+   }
+    
+    
+    /**
+     * Tests that a file with both lyrics and
+     *  ID3v2 tags gets both extracted correctly
+     */
+    @Test
+    public void testMp3ParsingLyrics() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        // Note - our test file has a lyrics tag, but lacks any
+        //  lyrics in the tags, so we can't test that bit
+        // TODO Find a better sample file
+
+        try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+                "/test-documents/testMP3lyrics.mp3")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
+
+        String content = handler.toString();
+        assertContains("Test Title", content);
+        assertContains("Test Artist", content);
+        assertContains("Test Album", content);
+        assertContains("2008", content);
+        assertContains("Test Comment", content);
+        assertContains("Rock", content);
+        
+        assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
+        assertEquals("44100", metadata.get("samplerate"));
+        assertEquals("2", metadata.get("channels"));
+        checkDuration(metadata, 1);
+    }
+    
+    @Test
+    public void testID3v2Frame() throws Exception {
+       byte[] empty = new byte[] {
+             0x49, 0x44, 0x33, 3, 1, 0,
+             0, 0, 0, 0
+       };
+       
+       assertEquals(11, ID3v2Frame.getInt(new byte[] {0,0,0,0x0b}));
+       assertEquals(257, ID3v2Frame.getInt(new byte[] {0,0,1,1}));
+       
+       ID3v2Frame f = (ID3v2Frame)
+            ID3v2Frame.createFrameIfPresent(new ByteArrayInputStream(empty));
+       assertEquals(3, f.getMajorVersion());
+       assertEquals(1, f.getMinorVersion());
+       assertEquals(0, f.getFlags());
+       assertEquals(0, f.getLength());
+       assertEquals(0, f.getData().length);
+       
+       assertEquals("", ID3v2Frame.getTagString(f.getData(), 0, 0));
+       assertEquals("", ID3v2Frame.getTagString(new byte[] {0,0,0,0}, 0, 3));
+       assertEquals("A", ID3v2Frame.getTagString(new byte[] {(byte)'A',0,0,0}, 0, 3));
+    }
+
+    @Test
+    public void testTIKA1589_noId3ReturnsDurationCorrectly() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+                "/test-documents/testMP3noid3.mp3")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        assertEquals("2455.510986328125", metadata.get(XMPDM.DURATION));
+    }
+    
+    /**
+     * This test will do nothing, unless you've downloaded the
+     *  mp3 file from TIKA-424 - the file cannot be
+     *  distributed with Tika.
+     * This test will check for the complicated set of ID3v2.4
+     *  tags.
+     */
+    @Test
+    public void testTIKA424() throws Exception {
+       Parser parser = new AutoDetectParser(); // Should auto-detect!
+       ContentHandler handler = new BodyContentHandler();
+       Metadata metadata = new Metadata();
+
+        try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+                "/test-documents/test2.mp3")) {
+            if (stream == null) {
+                // You haven't downloaded the file
+                // Skip the test
+                return;
+            }
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+       assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+       assertEquals("Plus loin vers l'ouest", metadata.get(TikaCoreProperties.TITLE));
+       assertEquals("Merzhin", metadata.get(TikaCoreProperties.CREATOR));
+       assertEquals("Merzhin", metadata.get(Metadata.AUTHOR));
+
+       String content = handler.toString();
+       assertContains("Plus loin vers l'ouest", content);
+       
+       assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
+       assertEquals("44100", metadata.get("samplerate"));
+       assertEquals("2", metadata.get("channels"));
+    }
+    
+    /**
+     * This tests that we can handle without errors (but perhaps not
+     *  all content) a file with a very very large ID3 frame that
+     *  has been truncated before the end of the ID3 tags.
+     * In this case, it is a file with JPEG data in the ID3, which
+     *  is trunacted before the end of the JPEG bit of the ID3 frame.
+     */
+    @Test
+    public void testTIKA474() throws Exception {
+       Parser parser = new AutoDetectParser(); // Should auto-detect!
+       ContentHandler handler = new BodyContentHandler();
+       Metadata metadata = new Metadata();
+
+        try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+                "/test-documents/testMP3truncated.mp3")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+       // Check we could get the headers from the start
+       assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+       assertEquals("Girl you have no faith in medicine", metadata.get(TikaCoreProperties.TITLE));
+       assertEquals("The White Stripes", metadata.get(TikaCoreProperties.CREATOR));
+       assertEquals("The White Stripes", metadata.get(Metadata.AUTHOR));
+
+       String content = handler.toString();
+       assertContains("Girl you have no faith in medicine", content);
+       assertContains("The White Stripes", content);
+       assertContains("Elephant", content);
+       assertContains("2003", content);
+       
+       // File lacks any audio frames, so we can't know these
+       assertEquals(null, metadata.get("version"));
+       assertEquals(null, metadata.get("samplerate"));
+       assertEquals(null, metadata.get("channels"));
+    }
+
+    // TIKA-1024
+    @Test
+    public void testNakedUTF16BOM() throws Exception {
+       Parser parser = new AutoDetectParser(); // Should auto-detect!
+       ContentHandler handler = new BodyContentHandler();
+       Metadata metadata = new Metadata();
+
+        try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+                "/test-documents/testNakedUTF16BOM.mp3")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+       assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+       assertEquals("", metadata.get(XMPDM.GENRE));
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
index 36c0efe..aeaf71e 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
@@ -1,92 +1,92 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.ocr;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import java.io.File;
-import java.io.InputStream;
-
-import org.apache.tika.TikaTest;
-import org.junit.Test;
-
-public class TesseractOCRConfigTest extends TikaTest {
-
-    @Test
-    public void testNoConfig() throws Exception {
-        TesseractOCRConfig config = new TesseractOCRConfig();
-        assertEquals("Invalid default tesseractPath value", "", config.getTesseractPath());
-        assertEquals("Invalid default tessdataPath value", "", config.getTessdataPath());
-        assertEquals("Invalid default language value", "eng", config.getLanguage());
-        assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode());
-        assertEquals("Invalid default minFileSizeToOcr value", 0, config.getMinFileSizeToOcr());
-        assertEquals("Invalid default maxFileSizeToOcr value", Integer.MAX_VALUE, config.getMaxFileSizeToOcr());
-        assertEquals("Invalid default timeout value", 120, config.getTimeout());
-    }
-
-    @Test
-    public void testPartialConfig() throws Exception {
-
-        InputStream stream = TesseractOCRConfigTest.class.getResourceAsStream(
-                "/test-properties/TesseractOCRConfig-partial.properties");
-
-        TesseractOCRConfig config = new TesseractOCRConfig(stream);
-        assertEquals("Invalid default tesseractPath value", "", config.getTesseractPath());
-        assertEquals("Invalid default tessdataPath value", "", config.getTessdataPath());
-        assertEquals("Invalid overridden language value", "fra+deu", config.getLanguage());
-        assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode());
-        assertEquals("Invalid overridden minFileSizeToOcr value", 1, config.getMinFileSizeToOcr());
-        assertEquals("Invalid default maxFileSizeToOcr value", Integer.MAX_VALUE, config.getMaxFileSizeToOcr());
-        assertEquals("Invalid overridden timeout value", 240, config.getTimeout());
-    }
-
-    @Test
-    public void testFullConfig() throws Exception {
-
-        InputStream stream = TesseractOCRConfigTest.class.getResourceAsStream(
-                "/test-properties/TesseractOCRConfig-full.properties");
-
-        TesseractOCRConfig config = new TesseractOCRConfig(stream);
-        assertEquals("Invalid overridden tesseractPath value", "/opt/tesseract" + File.separator, config.getTesseractPath());
-        assertEquals("Invalid overridden tesseractPath value", "/usr/local/share" + File.separator, config.getTessdataPath());
-        assertEquals("Invalid overridden language value", "fra+deu", config.getLanguage());
-        assertEquals("Invalid overridden pageSegMode value", "2", config.getPageSegMode());
-        assertEquals("Invalid overridden minFileSizeToOcr value", 1, config.getMinFileSizeToOcr());
-        assertEquals("Invalid overridden maxFileSizeToOcr value", 2000000, config.getMaxFileSizeToOcr());
-        assertEquals("Invalid overridden timeout value", 240, config.getTimeout());
-    }
-
-    @Test(expected=IllegalArgumentException.class)
-    public void testValidateLanguage() {
-        TesseractOCRConfig config = new TesseractOCRConfig();
-        config.setLanguage("eng");
-        config.setLanguage("eng+fra");
-        assertTrue("Couldn't set valid values", true);
-        config.setLanguage("rm -Rf *");
-    }
-
-    @Test(expected=IllegalArgumentException.class)
-    public void testValidatePageSegMode() {
-        TesseractOCRConfig config = new TesseractOCRConfig();
-        config.setPageSegMode("0");
-        config.setPageSegMode("10");
-        assertTrue("Couldn't set valid values", true);
-        config.setPageSegMode("11");
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.junit.Test;
+
+public class TesseractOCRConfigTest extends TikaTest {
+
+    @Test
+    public void testNoConfig() throws Exception {
+        TesseractOCRConfig config = new TesseractOCRConfig();
+        assertEquals("Invalid default tesseractPath value", "", config.getTesseractPath());
+        assertEquals("Invalid default tessdataPath value", "", config.getTessdataPath());
+        assertEquals("Invalid default language value", "eng", config.getLanguage());
+        assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode());
+        assertEquals("Invalid default minFileSizeToOcr value", 0, config.getMinFileSizeToOcr());
+        assertEquals("Invalid default maxFileSizeToOcr value", Integer.MAX_VALUE, config.getMaxFileSizeToOcr());
+        assertEquals("Invalid default timeout value", 120, config.getTimeout());
+    }
+
+    @Test
+    public void testPartialConfig() throws Exception {
+
+        InputStream stream = TesseractOCRConfigTest.class.getResourceAsStream(
+                "/test-properties/TesseractOCRConfig-partial.properties");
+
+        TesseractOCRConfig config = new TesseractOCRConfig(stream);
+        assertEquals("Invalid default tesseractPath value", "", config.getTesseractPath());
+        assertEquals("Invalid default tessdataPath value", "", config.getTessdataPath());
+        assertEquals("Invalid overridden language value", "fra+deu", config.getLanguage());
+        assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode());
+        assertEquals("Invalid overridden minFileSizeToOcr value", 1, config.getMinFileSizeToOcr());
+        assertEquals("Invalid default maxFileSizeToOcr value", Integer.MAX_VALUE, config.getMaxFileSizeToOcr());
+        assertEquals("Invalid overridden timeout value", 240, config.getTimeout());
+    }
+
+    @Test
+    public void testFullConfig() throws Exception {
+
+        InputStream stream = TesseractOCRConfigTest.class.getResourceAsStream(
+                "/test-properties/TesseractOCRConfig-full.properties");
+
+        TesseractOCRConfig config = new TesseractOCRConfig(stream);
+        assertEquals("Invalid overridden tesseractPath value", "/opt/tesseract" + File.separator, config.getTesseractPath());
+        assertEquals("Invalid overridden tesseractPath value", "/usr/local/share" + File.separator, config.getTessdataPath());
+        assertEquals("Invalid overridden language value", "fra+deu", config.getLanguage());
+        assertEquals("Invalid overridden pageSegMode value", "2", config.getPageSegMode());
+        assertEquals("Invalid overridden minFileSizeToOcr value", 1, config.getMinFileSizeToOcr());
+        assertEquals("Invalid overridden maxFileSizeToOcr value", 2000000, config.getMaxFileSizeToOcr());
+        assertEquals("Invalid overridden timeout value", 240, config.getTimeout());
+    }
+
+    @Test(expected=IllegalArgumentException.class)
+    public void testValidateLanguage() {
+        TesseractOCRConfig config = new TesseractOCRConfig();
+        config.setLanguage("eng");
+        config.setLanguage("eng+fra");
+        assertTrue("Couldn't set valid values", true);
+        config.setLanguage("rm -Rf *");
+    }
+
+    @Test(expected=IllegalArgumentException.class)
+    public void testValidatePageSegMode() {
+        TesseractOCRConfig config = new TesseractOCRConfig();
+        config.setPageSegMode("0");
+        config.setPageSegMode("10");
+        assertTrue("Couldn't set valid values", true);
+        config.setPageSegMode("11");
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/video/FLVParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/video/FLVParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/video/FLVParserTest.java
index 147113e..d3a876e 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/video/FLVParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/video/FLVParserTest.java
@@ -1,44 +1,44 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.video;
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.tika.Tika;
-import org.apache.tika.metadata.Metadata;
-import org.junit.Test;
-
-public class FLVParserTest {
-
-    @Test
-    public void testFLV() throws Exception {
-        String path = "/test-documents/testFLV.flv";
-        Metadata metadata = new Metadata();
-
-        String content = new Tika().parseToString(
-                FLVParserTest.class.getResourceAsStream(path), metadata);
-
-        assertEquals("", content);
-        assertEquals("video/x-flv", metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("true", metadata.get("hasVideo"));
-        assertEquals("false", metadata.get("stereo"));
-        assertEquals("true", metadata.get("hasAudio"));
-        assertEquals("120.0", metadata.get("height"));
-        assertEquals("16.0", metadata.get("audiosamplesize"));
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.video;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.tika.Tika;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Test;
+
+public class FLVParserTest {
+
+    @Test
+    public void testFLV() throws Exception {
+        String path = "/test-documents/testFLV.flv";
+        Metadata metadata = new Metadata();
+
+        String content = new Tika().parseToString(
+                FLVParserTest.class.getResourceAsStream(path), metadata);
+
+        assertEquals("", content);
+        assertEquals("video/x-flv", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("true", metadata.get("hasVideo"));
+        assertEquals("false", metadata.get("stereo"));
+        assertEquals("true", metadata.get("hasAudio"));
+        assertEquals("120.0", metadata.get("height"));
+        assertEquals("16.0", metadata.get("audiosamplesize"));
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/pom.xml b/tika-parser-modules/tika-parser-office-module/pom.xml
index 4756328..4825076 100644
--- a/tika-parser-modules/tika-parser-office-module/pom.xml
+++ b/tika-parser-modules/tika-parser-office-module/pom.xml
@@ -1,126 +1,126 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-modules</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-office-module</artifactId>
-  <name>Apache Tika parser office module</name>
-  <url>http://tika.apache.org/</url>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-core</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>commons-io</groupId>
-      <artifactId>commons-io</artifactId>
-      <version>${commons.io.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.poi</groupId>
-      <artifactId>poi</artifactId>
-      <version>${poi.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.poi</groupId>
-      <artifactId>poi-scratchpad</artifactId>
-      <version>${poi.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.poi</groupId>
-      <artifactId>poi-ooxml</artifactId>
-      <version>${poi.version}</version>
-      <exclusions>
-        <exclusion>
-          <groupId>stax</groupId>
-          <artifactId>stax-api</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>xml-apis</groupId>
-          <artifactId>xml-apis</artifactId>
-        </exclusion>
-      </exclusions>
-    </dependency>
-    <dependency>
-      <groupId>com.healthmarketscience.jackcess</groupId>
-      <artifactId>jackcess</artifactId>
-      <version>2.1.3</version>
-    </dependency>
-    <dependency>
-      <groupId>com.healthmarketscience.jackcess</groupId>
-      <artifactId>jackcess-encrypt</artifactId>
-      <version>2.1.1</version>
-      <exclusions>
-        <exclusion>
-          <groupId>org.bouncycastle</groupId>
-          <artifactId>bcprov-jdk15on</artifactId>
-        </exclusion>
-      </exclusions>
-    </dependency>
-    <!-- PDFBox (in the tika-parser-pdf-module) and poi's ooxml
-         code relies on bouncy castle, as does jackcess-encrypt
-         Need to exclude the older library and include the newer one
-         if there is a conflict.
-         -->
-    <dependency>
-      <groupId>org.bouncycastle</groupId>
-      <artifactId>bcprov-jdk15on</artifactId>
-      <version>${bouncycastle.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>com.pff</groupId>
-      <artifactId>java-libpst</artifactId>
-      <version>0.8.1</version>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-package-module</artifactId>
-      <version>${project.version}</version>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-web-module</artifactId>
-      <version>${project.version}</version>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-text-module</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.slf4j</groupId>
-      <artifactId>slf4j-log4j12</artifactId>
-      <scope>test</scope>
-    </dependency>
-  </dependencies>
-  
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-modules</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-office-module</artifactId>
+  <name>Apache Tika parser office module</name>
+  <url>http://tika.apache.org/</url>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <version>${commons.io.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.poi</groupId>
+      <artifactId>poi</artifactId>
+      <version>${poi.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.poi</groupId>
+      <artifactId>poi-scratchpad</artifactId>
+      <version>${poi.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.poi</groupId>
+      <artifactId>poi-ooxml</artifactId>
+      <version>${poi.version}</version>
+      <exclusions>
+        <exclusion>
+          <groupId>stax</groupId>
+          <artifactId>stax-api</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>xml-apis</groupId>
+          <artifactId>xml-apis</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>com.healthmarketscience.jackcess</groupId>
+      <artifactId>jackcess</artifactId>
+      <version>2.1.3</version>
+    </dependency>
+    <dependency>
+      <groupId>com.healthmarketscience.jackcess</groupId>
+      <artifactId>jackcess-encrypt</artifactId>
+      <version>2.1.1</version>
+      <exclusions>
+        <exclusion>
+          <groupId>org.bouncycastle</groupId>
+          <artifactId>bcprov-jdk15on</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <!-- PDFBox (in the tika-parser-pdf-module) and poi's ooxml
+         code relies on bouncy castle, as does jackcess-encrypt
+         Need to exclude the older library and include the newer one
+         if there is a conflict.
+         -->
+    <dependency>
+      <groupId>org.bouncycastle</groupId>
+      <artifactId>bcprov-jdk15on</artifactId>
+      <version>${bouncycastle.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>com.pff</groupId>
+      <artifactId>java-libpst</artifactId>
+      <version>0.8.1</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-package-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-web-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-text-module</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-log4j12</artifactId>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+  
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/module/office/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/module/office/internal/Activator.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/module/office/internal/Activator.java
index 32a41ab..8f34381 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/module/office/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/module/office/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.office.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
-    @Override
-    public void start(BundleContext context) throws Exception {
-
-        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
-    }
-
-    @Override
-    public void stop(BundleContext context) throws Exception {
-
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.office.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+    @Override
+    public void start(BundleContext context) throws Exception {
+
+        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+    }
+
+    @Override
+    public void stop(BundleContext context) throws Exception {
+
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java
index c3e85c1..94c5aa5 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java
@@ -1,112 +1,112 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm;
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Set;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.ParserProxy;
-import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
-import org.apache.tika.parser.chm.core.ChmExtractor;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.EmbeddedContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-public class ChmParser extends AbstractParser {
-
-    /** Serial version UID */
-    private static final long serialVersionUID = 5938777307516469802L;
-
-    private static final Set<MediaType> SUPPORTED_TYPES =
-            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
-                    MediaType.application("vnd.ms-htmlhelp"),
-                    MediaType.application("chm"),
-                    MediaType.application("x-chm"))));
-
-    private final Parser htmlProxy;
-    
-    public ChmParser() {
-        this.htmlProxy = createParserProxy("org.apache.tika.parser.html.HtmlParser");
-    }
-    @Override
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-    @Override
-    public void parse(InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context) throws IOException,
-            SAXException, TikaException {
-        ChmExtractor chmExtractor = new ChmExtractor(stream);
-
-        // metadata
-        metadata.set(Metadata.CONTENT_TYPE, "application/vnd.ms-htmlhelp");
-
-        // content
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-        xhtml.startDocument();
-
-        for (DirectoryListingEntry entry : chmExtractor.getChmDirList().getDirectoryListingEntryList()) {
-            final String entryName = entry.getName();
-            if (entryName.endsWith(".html") 
-                    || entryName.endsWith(".htm")
-            ) {
-//                AttributesImpl attrs = new AttributesImpl();
-//                attrs.addAttribute("", "name", "name", "String", entryName);
-//                xhtml.startElement("", "document", "document", attrs);
-                
-                byte[] data = chmExtractor.extractChmEntry(entry);
-
-                parsePage(data, xhtml);
-                
-//                xhtml.endElement("", "", "document");
-            }
-        }
-
-        xhtml.endDocument();
-    }
-
-
-    private void parsePage(byte[] byteObject, ContentHandler xhtml) throws TikaException {// throws IOException
-        InputStream stream = null;
-        Metadata metadata = new Metadata();
-        ContentHandler handler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));// -1
-        ParseContext parser = new ParseContext();
-        try {
-            stream = new ByteArrayInputStream(byteObject);
-            htmlProxy.parse(stream, handler, metadata, parser);
-        } catch (SAXException e) {
-            throw new RuntimeException(e);
-        } catch (IOException e) {
-            // Pushback overflow from tagsoup
-        }
-    }
-    
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserProxy;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.core.ChmExtractor;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class ChmParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 5938777307516469802L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.application("vnd.ms-htmlhelp"),
+                    MediaType.application("chm"),
+                    MediaType.application("x-chm"))));
+
+    private final Parser htmlProxy;
+    
+    public ChmParser() {
+        this.htmlProxy = createParserProxy("org.apache.tika.parser.html.HtmlParser");
+    }
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+        ChmExtractor chmExtractor = new ChmExtractor(stream);
+
+        // metadata
+        metadata.set(Metadata.CONTENT_TYPE, "application/vnd.ms-htmlhelp");
+
+        // content
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        for (DirectoryListingEntry entry : chmExtractor.getChmDirList().getDirectoryListingEntryList()) {
+            final String entryName = entry.getName();
+            if (entryName.endsWith(".html") 
+                    || entryName.endsWith(".htm")
+            ) {
+//                AttributesImpl attrs = new AttributesImpl();
+//                attrs.addAttribute("", "name", "name", "String", entryName);
+//                xhtml.startElement("", "document", "document", attrs);
+                
+                byte[] data = chmExtractor.extractChmEntry(entry);
+
+                parsePage(data, xhtml);
+                
+//                xhtml.endElement("", "", "document");
+            }
+        }
+
+        xhtml.endDocument();
+    }
+
+
+    private void parsePage(byte[] byteObject, ContentHandler xhtml) throws TikaException {// throws IOException
+        InputStream stream = null;
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));// -1
+        ParseContext parser = new ParseContext();
+        try {
+            stream = new ByteArrayInputStream(byteObject);
+            htmlProxy.parse(stream, handler, metadata, parser);
+        } catch (SAXException e) {
+            throw new RuntimeException(e);
+        } catch (IOException e) {
+            // Pushback overflow from tagsoup
+        }
+    }
+    
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java
index 42b0830..e8bf1cc 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java
@@ -1,39 +1,39 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.accessor;
-
-import java.io.Serializable;
-
-import org.apache.tika.exception.TikaException;
-
-/**
- * 
- * Defines an accessor interface
- * 
- * @param <T>
- */
-public interface ChmAccessor<T> extends Serializable {
-    /**
-     * Parses chm accessor
-     * 
-     * @param data
-     *            chm file
-     * @param chmAccessor
-     * @throws TikaException 
-     */
-    void parse(byte[] data, T chmAccessor) throws TikaException;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.io.Serializable;
+
+import org.apache.tika.exception.TikaException;
+
+/**
+ * 
+ * Defines an accessor interface
+ * 
+ * @param <T>
+ */
+public interface ChmAccessor<T> extends Serializable {
+    /**
+     * Parses chm accessor
+     * 
+     * @param data
+     *            chm file
+     * @param chmAccessor
+     * @throws TikaException 
+     */
+    void parse(byte[] data, T chmAccessor) throws TikaException;
+}

[15/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index 9d9d372..bfec2ad 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -1,506 +1,506 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Locale;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
-import org.apache.log4j.Level;
-import org.apache.log4j.Logger;
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLCore;
-import org.apache.tika.metadata.OfficeOpenXMLExtended;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Ignore;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class WordParserTest extends TikaTest {
-
-    @Test
-    public void testWordParser() throws Exception {
-        try (InputStream input = WordParserTest.class.getResourceAsStream(
-                "/test-documents/testWORD.doc")) {
-            ContentHandler handler = new BodyContentHandler();
-            Metadata metadata = new Metadata();
-            new OfficeParser().parse(input, handler, metadata, new ParseContext());
-
-            assertEquals(
-                    "application/msword",
-                    metadata.get(Metadata.CONTENT_TYPE));
-            assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE));
-            assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
-            assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
-            assertContains("Sample Word Document", handler.toString());
-        }
-    }
-
-    @Test
-    public void testWordWithWAV() throws Exception {
-        try (InputStream input = WordParserTest.class.getResourceAsStream(
-                "/test-documents/Doc1_ole.doc")) {
-            ContentHandler handler = new BodyContentHandler();
-            Metadata metadata = new Metadata();
-            new OfficeParser().parse(input, handler, metadata, new ParseContext());
-
-            assertContains("MSj00974840000[1].wav", handler.toString());
-        }
-    }
-
-    /**
-     * Test that the word converter is able to generate the
-     *  correct HTML for the document
-     */
-    @Test
-    public void testWordHTML() throws Exception {
-
-        // Try with a document containing various tables and
-        // formattings
-        XMLResult result = getXML("testWORD.doc");
-        String xml = result.xml;
-        Metadata metadata = result.metadata;
-
-        assertEquals(
-                "application/msword",
-                metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
-        assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
-        assertTrue(xml.contains("Sample Word Document"));
-
-        // Check that custom headings came through
-        assertTrue(xml.contains("<h1 class=\"title\">"));
-        // Regular headings
-        assertTrue(xml.contains("<h1>Heading Level 1</h1>"));
-        assertTrue(xml.contains("<h3>Heading Level 3</h3>"));
-        // Bold and italic
-        assertTrue(xml.contains("<b>BOLD</b>"));
-        assertTrue(xml.contains("<i>ITALIC</i>"));
-        // Table
-        assertTrue(xml.contains("<table>"));
-        assertTrue(xml.contains("<td>"));
-        // TODO - Check for the nested table
-        // Links
-        assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));
-        // Paragraphs with other styles
-        assertTrue(xml.contains("<p class=\"signature\">This one"));
-
-        // Try with a document that contains images
-        xml = getXML("testWORD_3imgs.doc").xml;
-
-        // Images 1-3
-        assertTrue("Image not found in:\n" + xml, xml.contains("src=\"embedded:image1.png\""));
-        assertTrue("Image not found in:\n" + xml, xml.contains("src=\"embedded:image2.jpg\""));
-        assertTrue("Image not found in:\n" + xml, xml.contains("src=\"embedded:image3.png\""));
-
-        // Text too
-        assertTrue(xml.contains("<p>The end!"));
-
-        // TIKA-692: test document containing multiple
-        // character runs within a bold tag:
-        xml = getXML("testWORD_bold_character_runs.doc").xml;
-
-        // Make sure bold text arrived as single
-        // contiguous string even though Word parser
-        // handled this as 3 character runs
-        assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
-
-        // TIKA-692: test document containing multiple
-        // character runs within a bold tag:
-        xml = getXML("testWORD_bold_character_runs2.doc").xml;
-
-        // Make sure bold text arrived as single
-        // contiguous string even though Word parser
-        // handled this as 3 character runs
-        assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
-    }
-
-    @Test
-    public void testEmbeddedNames() throws Exception {
-        String result = getXML("testWORD_embedded_pdf.doc").xml;
-
-        // Make sure the embedded div comes out after "Here
-        // is the pdf file" and before "Bye Bye":
-        int i = result.indexOf("Here is the pdf file:");
-        assertTrue(i != -1);
-        int j = result.indexOf("<div class=\"embedded\" id=\"_1402837031\" />");
-        assertTrue(j != -1);
-        int k = result.indexOf("Bye Bye");
-        assertTrue(k != -1);
-
-        assertTrue(i < j);
-        assertTrue(j < k);
-    }
-
-    // TIKA-982
-    @Test
-    public void testEmbeddedRTF() throws Exception {
-        String result = getXML("testWORD_embedded_rtf.doc").xml;
-        assertTrue(result.contains("<div class=\"embedded\" id=\"_1404039792\" />"));
-        assertTrue(result.contains("_1404039792.rtf"));
-    }
-
-    // TIKA-1019
-    @Test
-    public void testDocumentLink() throws Exception {
-        String result = getXML("testDocumentLink.doc").xml;
-        assertTrue(result.contains("<div class=\"embedded\" id=\"_1327495610\" />"));
-        assertTrue(result.contains("_1327495610.unknown"));
-    }
-
-    @Test
-    public void testWord6Parser() throws Exception {
-        try (InputStream input = WordParserTest.class.getResourceAsStream(
-                "/test-documents/testWORD6.doc")) {
-            ContentHandler handler = new BodyContentHandler();
-            Metadata metadata = new Metadata();
-            new OfficeParser().parse(input, handler, metadata, new ParseContext());
-
-            assertEquals(
-                    "application/msword",
-                    metadata.get(Metadata.CONTENT_TYPE));
-            assertEquals("The quick brown fox jumps over the lazy dog", metadata.get(TikaCoreProperties.TITLE));
-            assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(OfficeOpenXMLCore.SUBJECT));
-            assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(Metadata.SUBJECT));
-            assertEquals("Nevin Nollop", metadata.get(TikaCoreProperties.CREATOR));
-            assertEquals("Nevin Nollop", metadata.get(Metadata.AUTHOR));
-            assertContains("The quick brown fox jumps over the lazy dog", handler.toString());
-        }
-    }
-
-    @Test
-    public void testVarious() throws Exception {
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = WordParserTest.class.getResourceAsStream(
-                "/test-documents/testWORD_various.doc")) {
-            new OfficeParser().parse(stream, handler, metadata, new ParseContext());
-        }
-
-        String content = handler.toString();
-        //content = content.replaceAll("\\s+"," ");
-        assertContains("Footnote appears here", content);
-        assertContains("This is a footnote.", content);
-        assertContains("This is the header text.", content);
-        assertContains("This is the footer text.", content);
-        assertContains("Here is a text box", content);
-        assertContains("Bold", content);
-        assertContains("italic", content);
-        assertContains("underline", content);
-        assertContains("superscript", content);
-        assertContains("subscript", content);
-        assertContains("Here is a citation:", content);
-        assertContains("Figure 1 This is a caption for Figure 1", content);
-        assertContains("(Kramer)", content);
-        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
-        assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," "));
-        assertContains("This is a hyperlink", content);
-        assertContains("Here is a list:", content);
-        for(int row=1;row<=3;row++) {
-            //assertContains("�\tBullet " + row, content);
-            //assertContains("\u00b7\tBullet " + row, content);
-            assertContains("Bullet " + row, content);
-        }
-        assertContains("Here is a numbered list:", content);
-        for(int row=1;row<=3;row++) {
-            //assertContains(row + ")\tNumber bullet " + row, content);
-            //assertContains(row + ") Number bullet " + row, content);
-            // TODO: WordExtractor fails to number the bullets:
-            assertContains("Number bullet " + row, content);
-        }
-
-        for(int row=1;row<=2;row++) {
-            for(int col=1;col<=3;col++) {
-                assertContains("Row " + row + " Col " + col, content);
-            }
-        }
-
-        assertContains("Keyword1 Keyword2", content);
-        assertEquals("Keyword1 Keyword2",
-                     metadata.get(TikaCoreProperties.KEYWORDS));
-
-        assertContains("Subject is here", content);
-        // TODO: Move to OO subject in Tika 2.0
-        assertEquals("Subject is here",
-                     metadata.get(Metadata.SUBJECT));
-        assertEquals("Subject is here",
-                     metadata.get(OfficeOpenXMLCore.SUBJECT));
-
-        assertContains("Suddenly some Japanese text:", content);
-        // Special version of (GHQ)
-        assertContains("\uff08\uff27\uff28\uff31\uff09", content);
-        // 6 other characters
-        assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content);
-
-        assertContains("And then some Gothic text:", content);
-        assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
-    }
-
-    /**
-     * TIKA-1044 - Handle documents where parts of the
-     *  text have no formatting or styles applied to them
-     */
-    @Test
-    public void testNoFormat() throws Exception {
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = WordParserTest.class.getResourceAsStream(
-                "/test-documents/testWORD_no_format.doc")) {
-            new OfficeParser().parse(stream, handler, metadata, new ParseContext());
-        }
-
-        String content = handler.toString();
-        assertContains("Will generate an exception", content);
-    }
-
-    /**
-     * Ensures that custom OLE2 (HPSF) properties are extracted
-     */
-    @Test
-    public void testCustomProperties() throws Exception {
-        Metadata metadata = new Metadata();
-
-        try (InputStream input = WordParserTest.class.getResourceAsStream(
-                "/test-documents/testWORD_custom_props.doc")) {
-            ContentHandler handler = new BodyContentHandler(-1);
-            ParseContext context = new ParseContext();
-            context.set(Locale.class, Locale.US);
-            new OfficeParser().parse(input, handler, metadata, context);
-        }
-
-        assertEquals("application/msword", metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("EJ04325S", metadata.get(TikaCoreProperties.CREATOR));
-        assertEquals("Etienne Jouvin", metadata.get(TikaCoreProperties.MODIFIER));
-        assertEquals("Etienne Jouvin", metadata.get(Metadata.LAST_AUTHOR));
-        assertEquals("2012-01-03T22:14:00Z", metadata.get(TikaCoreProperties.MODIFIED));
-        assertEquals("2012-01-03T22:14:00Z", metadata.get(Metadata.DATE));
-        assertEquals("2010-10-05T09:03:00Z", metadata.get(TikaCoreProperties.CREATED));
-        assertEquals("2010-10-05T09:03:00Z", metadata.get(Metadata.CREATION_DATE));
-        assertEquals("Microsoft Office Word", metadata.get(OfficeOpenXMLExtended.APPLICATION));
-        assertEquals("1", metadata.get(Office.PAGE_COUNT));
-        assertEquals("2", metadata.get(Office.WORD_COUNT));
-        assertEquals("My Title", metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("My Keyword", metadata.get(TikaCoreProperties.KEYWORDS));
-        assertEquals("Normal.dotm", metadata.get(OfficeOpenXMLExtended.TEMPLATE));
-        assertEquals("My Comments", metadata.get(TikaCoreProperties.COMMENTS));
-        // TODO: Move to OO subject in Tika 2.0
-        assertEquals("My subject", metadata.get(Metadata.SUBJECT));
-        assertEquals("My subject", metadata.get(OfficeOpenXMLCore.SUBJECT));
-        assertEquals("EDF-DIT", metadata.get(OfficeOpenXMLExtended.COMPANY));
-        assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
-        assertEquals("2010-12-30T23:00:00Z", metadata.get("custom:MyCustomDate"));
-    }
-
-    @Test
-    public void testExceptions1() throws Exception {
-        XMLResult xml;
-        Level logLevelStart = Logger.getRootLogger().getLevel();
-        Logger.getRootLogger().setLevel(Level.ERROR);
-        try {
-            xml = getXML("testException1.doc");
-            assertContains("total population", xml.xml);
-            xml = getXML("testException2.doc");
-            assertContains("electric charge", xml.xml);
-        } finally {
-            Logger.getRootLogger().setLevel(logLevelStart);
-        }
-    }
-
-    @Test
-    public void testTabularSymbol() throws Exception {
-        assertContains("one two", getXML("testWORD_tabular_symbol.doc").xml.replaceAll("\\s+", " "));
-    }
-
-    /**
-     * TIKA-1229 Hyperlinks in Headers should be output as such,
-     *  not plain text with control characters
-     */
-    @Test
-    public void testHeaderHyperlinks() throws Exception {
-        XMLResult result = getXML("testWORD_header_hyperlink.doc");
-        String xml = result.xml;
-        Metadata metadata = result.metadata;
-
-        assertEquals(
-                "application/msword",
-                metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("Lutz Theurer", metadata.get(TikaCoreProperties.CREATOR));
-        assertContains("example.com", xml);
-
-        // Check we don't have the special text HYPERLINK
-        assertFalse(xml.contains("HYPERLINK"));
-
-        // Check we do have the link
-        assertContains("<a href=\"http://tw-systemhaus.de\">http:", xml);
-
-        // Check we do have the email
-        assertContains("<a href=\"mailto:ab@example.com\">ab@", xml);
-    }
-
-    @Test
-    public void testControlCharacter() throws Exception {
-        assertContains("1. Introduzione<b> </a></b> </p>", getXML("testControlCharacters.doc").xml.replaceAll("\\s+", " "));
-    }
-
-    @Test
-    public void testParagraphsAfterTables() throws Exception {
-        XMLResult result = getXML("test_TIKA-1251.doc");
-
-        String xml = result.xml;
-        Metadata metadata = result.metadata;
-
-        assertEquals(
-                "application/msword",
-                metadata.get(Metadata.CONTENT_TYPE));
-
-        assertContains("<p>1. Organisering av vakten:</p>", xml);
-
-    }
-
-    @Test
-    public void testHyperlinkStringIOOBESmartQuote() throws Exception {
-        //TIKA-1512, one cause: closing double quote is a smart quote
-        //test file contributed by user
-        XMLResult result = getXML("testWORD_closingSmartQInHyperLink.doc");
-        assertContains("href=\"https://issues.apache.org/jira/browse/TIKA-1512", result.xml);
-    }
-
-    @Test
-    @Ignore //until we determine whether we can include test docs or not
-    public void testHyperlinkStringLongNoCloseQuote() throws Exception {
-        //TIKA-1512, one cause: no closing quote on really long string
-        //test file derived from govdocs1 012152.doc
-        XMLResult result = getXML("testWORD_longHyperLinkNoCloseQuote.doc");
-        assertContains("href=\"http://www.lexis.com", result.xml);
-    }
-
-    @Test
-    @Ignore //until we determine whether we can include test docs or not
-    public void testHyperlinkStringLongCarriageReturn() throws Exception {
-        //TIKA-1512, one cause: no closing quote, but carriage return
-        //test file derived from govdocs1 040044.doc
-        XMLResult result = getXML("testWORD_hyperLinkCarriageReturn.doc");
-        assertContains("href=\"http://www.nib.org", result.xml);
-    }
-
-    @Test
-    public void testDOCParagraphNumbering() throws Exception {
-        String xml = getXML("testWORD_numbered_list.doc").xml;
-        assertContains("1) This", xml);
-        assertContains("a) Is", xml);
-        assertContains("i) A multi", xml);
-        assertContains("ii) Level", xml);
-        assertContains("1. Within cell 1", xml);
-        assertContains("b. Cell b", xml);
-        assertContains("iii) List", xml);
-        assertContains("2) foo", xml);
-        assertContains("ii) baz", xml);
-        assertContains("ii) foo", xml);
-        assertContains("II. bar", xml);
-        assertContains("6. six", xml);
-        assertContains("7. seven", xml);
-        assertContains("a. seven a", xml);
-        assertContains("e. seven e", xml);
-        assertContains("2. A ii 2", xml);
-        assertContains("3. page break list 3", xml);
-        assertContains("Some-1-CrazyFormat Greek numbering with crazy format - alpha", xml);
-        assertContains("1.1.1. 1.1.1", xml);
-        assertContains("1.1. 1.2-&gt;1.1  //set the value", xml);
-
-        assertContains("add a list here", xml);
-        //TODO: not currently pulling numbers out of comments
-        assertContains(">comment list 1", xml);
-
-    }
-
-    @Test
-    public void testDOCOverrideParagraphNumbering() throws Exception {
-        String xml = getXML("testWORD_override_list_numbering.doc").xml;
-
-        //Test 1
-        assertContains("1.1.1.1...1 1.1.1.1...1", xml);
-        assertContains("1st.2.3someText 1st.2.3someText", xml);
-        assertContains("1st.2.2someOtherText.1 1st.2.2someOtherText.1", xml);
-        assertContains("5th 5th", xml);
-
-
-        //Test 2
-        assertContains("1.a.I 1.a.I", xml);
-        //test no reset because level 2 is not sufficient to reset
-        assertContains("1.b.III 1.b.III", xml);
-        //test restarted because of level 0's increment to 2
-        assertContains("2.a.I 2.a.I", xml);
-        //test handling of skipped level
-        assertContains("2.b 2.b", xml);
-
-        //Test 3
-        assertContains("(1)) (1))", xml);
-        //tests start level 1 at 17 and
-        assertContains("2.17 2.17", xml);
-        //tests that isLegal turns everything into decimal
-        assertContains("2.18.2.1 2.18.2.1", xml);
-        assertContains(">2 2", xml);
-
-        //Test4
-        assertContains(">1 1", xml);
-        assertContains(">A A", xml);
-        assertContains(">B B", xml);
-        assertContains(">C C", xml);
-        assertContains(">4 4", xml);
-
-        //Test5
-        assertContains(">00 00", xml);
-        assertContains(">01 01", xml);
-        assertContains(">01. 01.", xml);
-        assertContains(">01..1 01..1", xml);
-        assertContains(">02 02", xml);
-    }
-
-    @Test
-    public void testMultiAuthorsManagers() throws Exception {
-        XMLResult r = getXML("testWORD_multi_authors.doc");
-        String[] authors = r.metadata.getValues(TikaCoreProperties.CREATOR);
-        assertEquals(3, authors.length);
-        assertEquals("author2", authors[1]);
-
-        String[] managers = r.metadata.getValues(OfficeOpenXMLExtended.MANAGER);
-        assertEquals(2, managers.length);
-        assertEquals("manager1", managers[0]);
-        assertEquals("manager2", managers[1]);
-    }
-
-    @Test
-    public void testOrigLocation() throws Exception {
-        Metadata metadata = getXML("testException2.doc").metadata;
-        List<String> values = Arrays.asList(metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
-        assertContains("C:\\Lab Documents\\Lab Manuals\\Physics 275-6\\276-s00\\07-Force-on-a-current-S00.doc", values);
-        assertContains("Hard Drive:Course Folders:276:276-s00:07-Force-on-a-current-S00", values);
-    }
-}
-
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Locale;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class WordParserTest extends TikaTest {
+
+    @Test
+    public void testWordParser() throws Exception {
+        try (InputStream input = WordParserTest.class.getResourceAsStream(
+                "/test-documents/testWORD.doc")) {
+            ContentHandler handler = new BodyContentHandler();
+            Metadata metadata = new Metadata();
+            new OfficeParser().parse(input, handler, metadata, new ParseContext());
+
+            assertEquals(
+                    "application/msword",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE));
+            assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
+            assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+            assertContains("Sample Word Document", handler.toString());
+        }
+    }
+
+    @Test
+    public void testWordWithWAV() throws Exception {
+        try (InputStream input = WordParserTest.class.getResourceAsStream(
+                "/test-documents/Doc1_ole.doc")) {
+            ContentHandler handler = new BodyContentHandler();
+            Metadata metadata = new Metadata();
+            new OfficeParser().parse(input, handler, metadata, new ParseContext());
+
+            assertContains("MSj00974840000[1].wav", handler.toString());
+        }
+    }
+
+    /**
+     * Test that the word converter is able to generate the
+     *  correct HTML for the document
+     */
+    @Test
+    public void testWordHTML() throws Exception {
+
+        // Try with a document containing various tables and
+        // formattings
+        XMLResult result = getXML("testWORD.doc");
+        String xml = result.xml;
+        Metadata metadata = result.metadata;
+
+        assertEquals(
+                "application/msword",
+                metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+        assertTrue(xml.contains("Sample Word Document"));
+
+        // Check that custom headings came through
+        assertTrue(xml.contains("<h1 class=\"title\">"));
+        // Regular headings
+        assertTrue(xml.contains("<h1>Heading Level 1</h1>"));
+        assertTrue(xml.contains("<h3>Heading Level 3</h3>"));
+        // Bold and italic
+        assertTrue(xml.contains("<b>BOLD</b>"));
+        assertTrue(xml.contains("<i>ITALIC</i>"));
+        // Table
+        assertTrue(xml.contains("<table>"));
+        assertTrue(xml.contains("<td>"));
+        // TODO - Check for the nested table
+        // Links
+        assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));
+        // Paragraphs with other styles
+        assertTrue(xml.contains("<p class=\"signature\">This one"));
+
+        // Try with a document that contains images
+        xml = getXML("testWORD_3imgs.doc").xml;
+
+        // Images 1-3
+        assertTrue("Image not found in:\n" + xml, xml.contains("src=\"embedded:image1.png\""));
+        assertTrue("Image not found in:\n" + xml, xml.contains("src=\"embedded:image2.jpg\""));
+        assertTrue("Image not found in:\n" + xml, xml.contains("src=\"embedded:image3.png\""));
+
+        // Text too
+        assertTrue(xml.contains("<p>The end!"));
+
+        // TIKA-692: test document containing multiple
+        // character runs within a bold tag:
+        xml = getXML("testWORD_bold_character_runs.doc").xml;
+
+        // Make sure bold text arrived as single
+        // contiguous string even though Word parser
+        // handled this as 3 character runs
+        assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
+
+        // TIKA-692: test document containing multiple
+        // character runs within a bold tag:
+        xml = getXML("testWORD_bold_character_runs2.doc").xml;
+
+        // Make sure bold text arrived as single
+        // contiguous string even though Word parser
+        // handled this as 3 character runs
+        assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
+    }
+
+    @Test
+    public void testEmbeddedNames() throws Exception {
+        String result = getXML("testWORD_embedded_pdf.doc").xml;
+
+        // Make sure the embedded div comes out after "Here
+        // is the pdf file" and before "Bye Bye":
+        int i = result.indexOf("Here is the pdf file:");
+        assertTrue(i != -1);
+        int j = result.indexOf("<div class=\"embedded\" id=\"_1402837031\" />");
+        assertTrue(j != -1);
+        int k = result.indexOf("Bye Bye");
+        assertTrue(k != -1);
+
+        assertTrue(i < j);
+        assertTrue(j < k);
+    }
+
+    // TIKA-982
+    @Test
+    public void testEmbeddedRTF() throws Exception {
+        String result = getXML("testWORD_embedded_rtf.doc").xml;
+        assertTrue(result.contains("<div class=\"embedded\" id=\"_1404039792\" />"));
+        assertTrue(result.contains("_1404039792.rtf"));
+    }
+
+    // TIKA-1019
+    @Test
+    public void testDocumentLink() throws Exception {
+        String result = getXML("testDocumentLink.doc").xml;
+        assertTrue(result.contains("<div class=\"embedded\" id=\"_1327495610\" />"));
+        assertTrue(result.contains("_1327495610.unknown"));
+    }
+
+    @Test
+    public void testWord6Parser() throws Exception {
+        try (InputStream input = WordParserTest.class.getResourceAsStream(
+                "/test-documents/testWORD6.doc")) {
+            ContentHandler handler = new BodyContentHandler();
+            Metadata metadata = new Metadata();
+            new OfficeParser().parse(input, handler, metadata, new ParseContext());
+
+            assertEquals(
+                    "application/msword",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals("The quick brown fox jumps over the lazy dog", metadata.get(TikaCoreProperties.TITLE));
+            assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(OfficeOpenXMLCore.SUBJECT));
+            assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(Metadata.SUBJECT));
+            assertEquals("Nevin Nollop", metadata.get(TikaCoreProperties.CREATOR));
+            assertEquals("Nevin Nollop", metadata.get(Metadata.AUTHOR));
+            assertContains("The quick brown fox jumps over the lazy dog", handler.toString());
+        }
+    }
+
+    @Test
+    public void testVarious() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = WordParserTest.class.getResourceAsStream(
+                "/test-documents/testWORD_various.doc")) {
+            new OfficeParser().parse(stream, handler, metadata, new ParseContext());
+        }
+
+        String content = handler.toString();
+        //content = content.replaceAll("\\s+"," ");
+        assertContains("Footnote appears here", content);
+        assertContains("This is a footnote.", content);
+        assertContains("This is the header text.", content);
+        assertContains("This is the footer text.", content);
+        assertContains("Here is a text box", content);
+        assertContains("Bold", content);
+        assertContains("italic", content);
+        assertContains("underline", content);
+        assertContains("superscript", content);
+        assertContains("subscript", content);
+        assertContains("Here is a citation:", content);
+        assertContains("Figure 1 This is a caption for Figure 1", content);
+        assertContains("(Kramer)", content);
+        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
+        assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," "));
+        assertContains("This is a hyperlink", content);
+        assertContains("Here is a list:", content);
+        for(int row=1;row<=3;row++) {
+            //assertContains("�\tBullet " + row, content);
+            //assertContains("\u00b7\tBullet " + row, content);
+            assertContains("Bullet " + row, content);
+        }
+        assertContains("Here is a numbered list:", content);
+        for(int row=1;row<=3;row++) {
+            //assertContains(row + ")\tNumber bullet " + row, content);
+            //assertContains(row + ") Number bullet " + row, content);
+            // TODO: WordExtractor fails to number the bullets:
+            assertContains("Number bullet " + row, content);
+        }
+
+        for(int row=1;row<=2;row++) {
+            for(int col=1;col<=3;col++) {
+                assertContains("Row " + row + " Col " + col, content);
+            }
+        }
+
+        assertContains("Keyword1 Keyword2", content);
+        assertEquals("Keyword1 Keyword2",
+                     metadata.get(TikaCoreProperties.KEYWORDS));
+
+        assertContains("Subject is here", content);
+        // TODO: Move to OO subject in Tika 2.0
+        assertEquals("Subject is here",
+                     metadata.get(Metadata.SUBJECT));
+        assertEquals("Subject is here",
+                     metadata.get(OfficeOpenXMLCore.SUBJECT));
+
+        assertContains("Suddenly some Japanese text:", content);
+        // Special version of (GHQ)
+        assertContains("\uff08\uff27\uff28\uff31\uff09", content);
+        // 6 other characters
+        assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content);
+
+        assertContains("And then some Gothic text:", content);
+        assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
+    }
+
+    /**
+     * TIKA-1044 - Handle documents where parts of the
+     *  text have no formatting or styles applied to them
+     */
+    @Test
+    public void testNoFormat() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = WordParserTest.class.getResourceAsStream(
+                "/test-documents/testWORD_no_format.doc")) {
+            new OfficeParser().parse(stream, handler, metadata, new ParseContext());
+        }
+
+        String content = handler.toString();
+        assertContains("Will generate an exception", content);
+    }
+
+    /**
+     * Ensures that custom OLE2 (HPSF) properties are extracted
+     */
+    @Test
+    public void testCustomProperties() throws Exception {
+        Metadata metadata = new Metadata();
+
+        try (InputStream input = WordParserTest.class.getResourceAsStream(
+                "/test-documents/testWORD_custom_props.doc")) {
+            ContentHandler handler = new BodyContentHandler(-1);
+            ParseContext context = new ParseContext();
+            context.set(Locale.class, Locale.US);
+            new OfficeParser().parse(input, handler, metadata, context);
+        }
+
+        assertEquals("application/msword", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("EJ04325S", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Etienne Jouvin", metadata.get(TikaCoreProperties.MODIFIER));
+        assertEquals("Etienne Jouvin", metadata.get(Metadata.LAST_AUTHOR));
+        assertEquals("2012-01-03T22:14:00Z", metadata.get(TikaCoreProperties.MODIFIED));
+        assertEquals("2012-01-03T22:14:00Z", metadata.get(Metadata.DATE));
+        assertEquals("2010-10-05T09:03:00Z", metadata.get(TikaCoreProperties.CREATED));
+        assertEquals("2010-10-05T09:03:00Z", metadata.get(Metadata.CREATION_DATE));
+        assertEquals("Microsoft Office Word", metadata.get(OfficeOpenXMLExtended.APPLICATION));
+        assertEquals("1", metadata.get(Office.PAGE_COUNT));
+        assertEquals("2", metadata.get(Office.WORD_COUNT));
+        assertEquals("My Title", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("My Keyword", metadata.get(TikaCoreProperties.KEYWORDS));
+        assertEquals("Normal.dotm", metadata.get(OfficeOpenXMLExtended.TEMPLATE));
+        assertEquals("My Comments", metadata.get(TikaCoreProperties.COMMENTS));
+        // TODO: Move to OO subject in Tika 2.0
+        assertEquals("My subject", metadata.get(Metadata.SUBJECT));
+        assertEquals("My subject", metadata.get(OfficeOpenXMLCore.SUBJECT));
+        assertEquals("EDF-DIT", metadata.get(OfficeOpenXMLExtended.COMPANY));
+        assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
+        assertEquals("2010-12-30T23:00:00Z", metadata.get("custom:MyCustomDate"));
+    }
+
+    @Test
+    public void testExceptions1() throws Exception {
+        XMLResult xml;
+        Level logLevelStart = Logger.getRootLogger().getLevel();
+        Logger.getRootLogger().setLevel(Level.ERROR);
+        try {
+            xml = getXML("testException1.doc");
+            assertContains("total population", xml.xml);
+            xml = getXML("testException2.doc");
+            assertContains("electric charge", xml.xml);
+        } finally {
+            Logger.getRootLogger().setLevel(logLevelStart);
+        }
+    }
+
+    @Test
+    public void testTabularSymbol() throws Exception {
+        assertContains("one two", getXML("testWORD_tabular_symbol.doc").xml.replaceAll("\\s+", " "));
+    }
+
+    /**
+     * TIKA-1229 Hyperlinks in Headers should be output as such,
+     *  not plain text with control characters
+     */
+    @Test
+    public void testHeaderHyperlinks() throws Exception {
+        XMLResult result = getXML("testWORD_header_hyperlink.doc");
+        String xml = result.xml;
+        Metadata metadata = result.metadata;
+
+        assertEquals(
+                "application/msword",
+                metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Lutz Theurer", metadata.get(TikaCoreProperties.CREATOR));
+        assertContains("example.com", xml);
+
+        // Check we don't have the special text HYPERLINK
+        assertFalse(xml.contains("HYPERLINK"));
+
+        // Check we do have the link
+        assertContains("<a href=\"http://tw-systemhaus.de\">http:", xml);
+
+        // Check we do have the email
+        assertContains("<a href=\"mailto:ab@example.com\">ab@", xml);
+    }
+
+    @Test
+    public void testControlCharacter() throws Exception {
+        assertContains("1. Introduzione<b> </a></b> </p>", getXML("testControlCharacters.doc").xml.replaceAll("\\s+", " "));
+    }
+
+    @Test
+    public void testParagraphsAfterTables() throws Exception {
+        XMLResult result = getXML("test_TIKA-1251.doc");
+
+        String xml = result.xml;
+        Metadata metadata = result.metadata;
+
+        assertEquals(
+                "application/msword",
+                metadata.get(Metadata.CONTENT_TYPE));
+
+        assertContains("<p>1. Organisering av vakten:</p>", xml);
+
+    }
+
+    @Test
+    public void testHyperlinkStringIOOBESmartQuote() throws Exception {
+        //TIKA-1512, one cause: closing double quote is a smart quote
+        //test file contributed by user
+        XMLResult result = getXML("testWORD_closingSmartQInHyperLink.doc");
+        assertContains("href=\"https://issues.apache.org/jira/browse/TIKA-1512", result.xml);
+    }
+
+    @Test
+    @Ignore //until we determine whether we can include test docs or not
+    public void testHyperlinkStringLongNoCloseQuote() throws Exception {
+        //TIKA-1512, one cause: no closing quote on really long string
+        //test file derived from govdocs1 012152.doc
+        XMLResult result = getXML("testWORD_longHyperLinkNoCloseQuote.doc");
+        assertContains("href=\"http://www.lexis.com", result.xml);
+    }
+
+    @Test
+    @Ignore //until we determine whether we can include test docs or not
+    public void testHyperlinkStringLongCarriageReturn() throws Exception {
+        //TIKA-1512, one cause: no closing quote, but carriage return
+        //test file derived from govdocs1 040044.doc
+        XMLResult result = getXML("testWORD_hyperLinkCarriageReturn.doc");
+        assertContains("href=\"http://www.nib.org", result.xml);
+    }
+
+    @Test
+    public void testDOCParagraphNumbering() throws Exception {
+        String xml = getXML("testWORD_numbered_list.doc").xml;
+        assertContains("1) This", xml);
+        assertContains("a) Is", xml);
+        assertContains("i) A multi", xml);
+        assertContains("ii) Level", xml);
+        assertContains("1. Within cell 1", xml);
+        assertContains("b. Cell b", xml);
+        assertContains("iii) List", xml);
+        assertContains("2) foo", xml);
+        assertContains("ii) baz", xml);
+        assertContains("ii) foo", xml);
+        assertContains("II. bar", xml);
+        assertContains("6. six", xml);
+        assertContains("7. seven", xml);
+        assertContains("a. seven a", xml);
+        assertContains("e. seven e", xml);
+        assertContains("2. A ii 2", xml);
+        assertContains("3. page break list 3", xml);
+        assertContains("Some-1-CrazyFormat Greek numbering with crazy format - alpha", xml);
+        assertContains("1.1.1. 1.1.1", xml);
+        assertContains("1.1. 1.2-&gt;1.1  //set the value", xml);
+
+        assertContains("add a list here", xml);
+        //TODO: not currently pulling numbers out of comments
+        assertContains(">comment list 1", xml);
+
+    }
+
+    @Test
+    public void testDOCOverrideParagraphNumbering() throws Exception {
+        String xml = getXML("testWORD_override_list_numbering.doc").xml;
+
+        //Test 1
+        assertContains("1.1.1.1...1 1.1.1.1...1", xml);
+        assertContains("1st.2.3someText 1st.2.3someText", xml);
+        assertContains("1st.2.2someOtherText.1 1st.2.2someOtherText.1", xml);
+        assertContains("5th 5th", xml);
+
+
+        //Test 2
+        assertContains("1.a.I 1.a.I", xml);
+        //test no reset because level 2 is not sufficient to reset
+        assertContains("1.b.III 1.b.III", xml);
+        //test restarted because of level 0's increment to 2
+        assertContains("2.a.I 2.a.I", xml);
+        //test handling of skipped level
+        assertContains("2.b 2.b", xml);
+
+        //Test 3
+        assertContains("(1)) (1))", xml);
+        //tests start level 1 at 17 and
+        assertContains("2.17 2.17", xml);
+        //tests that isLegal turns everything into decimal
+        assertContains("2.18.2.1 2.18.2.1", xml);
+        assertContains(">2 2", xml);
+
+        //Test4
+        assertContains(">1 1", xml);
+        assertContains(">A A", xml);
+        assertContains(">B B", xml);
+        assertContains(">C C", xml);
+        assertContains(">4 4", xml);
+
+        //Test5
+        assertContains(">00 00", xml);
+        assertContains(">01 01", xml);
+        assertContains(">01. 01.", xml);
+        assertContains(">01..1 01..1", xml);
+        assertContains(">02 02", xml);
+    }
+
+    @Test
+    public void testMultiAuthorsManagers() throws Exception {
+        XMLResult r = getXML("testWORD_multi_authors.doc");
+        String[] authors = r.metadata.getValues(TikaCoreProperties.CREATOR);
+        assertEquals(3, authors.length);
+        assertEquals("author2", authors[1]);
+
+        String[] managers = r.metadata.getValues(OfficeOpenXMLExtended.MANAGER);
+        assertEquals(2, managers.length);
+        assertEquals("manager1", managers[0]);
+        assertEquals("manager2", managers[1]);
+    }
+
+    @Test
+    public void testOrigLocation() throws Exception {
+        Metadata metadata = getXML("testException2.doc").metadata;
+        List<String> values = Arrays.asList(metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
+        assertContains("C:\\Lab Documents\\Lab Manuals\\Physics 275-6\\276-s00\\07-Force-on-a-current-S00.doc", values);
+        assertContains("Hard Drive:Course Folders:276:276-s00:07-Force-on-a-current-S00", values);
+    }
+}
+

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
index 24551bc..15f0c74 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
@@ -1,340 +1,340 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.odf;
-
-import static org.junit.Assert.assertEquals;
-
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.StandardCopyOption;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLCore;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.opendocument.OpenOfficeParser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class ODFParserTest extends TikaTest {
-    /**
-     * For now, allow us to run some tests against both
-     *  the old and the new parser
-     */
-    private Parser[] getParsers() {
-       return new Parser[] {
-             new OpenDocumentParser(),
-             new OpenOfficeParser()
-       };
-    }
-
-    @Test
-    public void testOO3() throws Exception {
-       for (Parser parser : getParsers()) {
-           XMLResult r = getXML("testODFwithOOo3.odt", parser);
-           assertEquals(
-                   "application/vnd.oasis.opendocument.text",
-                   r.metadata.get(Metadata.CONTENT_TYPE));
-
-           String content = r.xml;
-           assertContains("Tika is part of the Lucene project.", content);
-           assertContains("Solr", content);
-           assertContains("one embedded", content);
-           assertContains("Rectangle Title", content);
-           assertContains("a blue background and dark border", content);
-
-       }
-    }
-
-    @Test
-    public void testOO2() throws Exception {
-        for (Parser parser : getParsers()) {
-            XMLResult r = getXML("testOpenOffice2.odt", parser);
-            Metadata metadata = r.metadata;
-            assertEquals(
-                    "application/vnd.oasis.opendocument.text",
-                    metadata.get(Metadata.CONTENT_TYPE));
-            assertEquals("en-US", metadata.get(Metadata.LANGUAGE));
-            assertEquals("PT1M7S", metadata.get(Metadata.EDIT_TIME));
-            assertEquals(
-                    "NeoOffice/2.2$Unix OpenOffice.org_project/680m18$Build-9161",
-                    metadata.get("generator"));
-
-            // Check date metadata, both old-style and new-style
-            assertEquals("2007-09-14T11:07:10", metadata.get(TikaCoreProperties.MODIFIED));
-            assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.MODIFIED));
-            assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.DATE));
-            assertEquals("2007-09-14T11:06:08", metadata.get(TikaCoreProperties.CREATED));
-            assertEquals("2007-09-14T11:06:08", metadata.get(Metadata.CREATION_DATE));
-
-            // Check the document statistics
-            assertEquals("1", metadata.get(Office.PAGE_COUNT));
-            assertEquals("1", metadata.get(Office.PARAGRAPH_COUNT));
-            assertEquals("14", metadata.get(Office.WORD_COUNT));
-            assertEquals("78", metadata.get(Office.CHARACTER_COUNT));
-            assertEquals("0", metadata.get(Office.TABLE_COUNT));
-            assertEquals("0", metadata.get(Office.OBJECT_COUNT));
-            assertEquals("0", metadata.get(Office.IMAGE_COUNT));
-
-            // Check the Tika-1.0 style document statistics
-            assertEquals("1", metadata.get(Metadata.PAGE_COUNT));
-            assertEquals("1", metadata.get(Metadata.PARAGRAPH_COUNT));
-            assertEquals("14", metadata.get(Metadata.WORD_COUNT));
-            assertEquals("78", metadata.get(Metadata.CHARACTER_COUNT));
-            assertEquals("0", metadata.get(Metadata.TABLE_COUNT));
-            assertEquals("0", metadata.get(Metadata.OBJECT_COUNT));
-            assertEquals("0", metadata.get(Metadata.IMAGE_COUNT));
-
-            // Check the very old style statistics (these will be removed shortly)
-            assertEquals("0", metadata.get("nbTab"));
-            assertEquals("0", metadata.get("nbObject"));
-            assertEquals("0", metadata.get("nbImg"));
-            assertEquals("1", metadata.get("nbPage"));
-            assertEquals("1", metadata.get("nbPara"));
-            assertEquals("14", metadata.get("nbWord"));
-            assertEquals("78", metadata.get("nbCharacter"));
-
-            // Custom metadata tags present but without values
-            assertEquals(null, metadata.get("custom:Info 1"));
-            assertEquals(null, metadata.get("custom:Info 2"));
-            assertEquals(null, metadata.get("custom:Info 3"));
-            assertEquals(null, metadata.get("custom:Info 4"));
-
-            assertContains(
-                    "This is a sample Open Office document,"
-                            + " written in NeoOffice 2.2.1 for the Mac.",
-                    r.xml);
-
-        }
-   }
-
-   /**
-    * Similar to {@link #testOO2()}, but using a different
-    *  OO2 file with different metadata in it
-    */
-    @Test
-    public void testOO2Metadata() throws Exception {
-        XMLResult r = getXML("testOpenOffice2.odf", new OpenDocumentParser());
-        Metadata metadata = r.metadata;
-        assertEquals(
-                "application/vnd.oasis.opendocument.formula",
-                metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals(null, metadata.get(TikaCoreProperties.MODIFIED));
-        assertEquals("2006-01-27T11:55:22", metadata.get(Metadata.CREATION_DATE));
-        assertEquals("The quick brown fox jumps over the lazy dog",
-                metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("Gym class featuring a brown fox and lazy dog",
-                metadata.get(TikaCoreProperties.DESCRIPTION));
-        assertEquals("Gym class featuring a brown fox and lazy dog",
-                metadata.get(OfficeOpenXMLCore.SUBJECT));
-        assertEquals("Gym class featuring a brown fox and lazy dog",
-                metadata.get(Metadata.SUBJECT));
-        assertEquals("PT0S", metadata.get(Metadata.EDIT_TIME));
-        assertEquals("1", metadata.get("editing-cycles"));
-        assertEquals(
-                "OpenOffice.org/2.2$Win32 OpenOffice.org_project/680m14$Build-9134",
-                metadata.get("generator"));
-        assertEquals("Pangram, fox, dog", metadata.get(Metadata.KEYWORDS));
-
-        // User defined metadata
-        assertEquals("Text 1", metadata.get("custom:Info 1"));
-        assertEquals("2", metadata.get("custom:Info 2"));
-        assertEquals("false", metadata.get("custom:Info 3"));
-        assertEquals("true", metadata.get("custom:Info 4"));
-
-        // No statistics present
-        assertEquals(null, metadata.get(Metadata.PAGE_COUNT));
-        assertEquals(null, metadata.get(Metadata.PARAGRAPH_COUNT));
-        assertEquals(null, metadata.get(Metadata.WORD_COUNT));
-        assertEquals(null, metadata.get(Metadata.CHARACTER_COUNT));
-        assertEquals(null, metadata.get(Metadata.TABLE_COUNT));
-        assertEquals(null, metadata.get(Metadata.OBJECT_COUNT));
-        assertEquals(null, metadata.get(Metadata.IMAGE_COUNT));
-        assertEquals(null, metadata.get("nbTab"));
-        assertEquals(null, metadata.get("nbObject"));
-        assertEquals(null, metadata.get("nbImg"));
-        assertEquals(null, metadata.get("nbPage"));
-        assertEquals(null, metadata.get("nbPara"));
-        assertEquals(null, metadata.get("nbWord"));
-        assertEquals(null, metadata.get("nbCharacter"));
-
-        // Note - contents of maths files not currently supported
-        assertContains("<body />", r.xml);
-
-   }
-
-   /**
-    * Similar to {@link #testOO2()} )}, but using an OO3 file
-    */
-    @Test
-   public void testOO3Metadata() throws Exception {
-        XMLResult r = getXML("testODFwithOOo3.odt", new OpenDocumentParser());
-        Metadata metadata = r.metadata;
-        assertEquals(
-                "application/vnd.oasis.opendocument.text",
-                metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("2009-10-05T21:22:38", metadata.get(TikaCoreProperties.MODIFIED));
-        assertEquals("2009-10-05T19:04:01", metadata.get(TikaCoreProperties.CREATED));
-        assertEquals("2009-10-05T19:04:01", metadata.get(Metadata.CREATION_DATE));
-        assertEquals("Apache Tika", metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("Test document", metadata.get(OfficeOpenXMLCore.SUBJECT));
-        assertEquals("Test document", metadata.get(Metadata.SUBJECT));
-        assertEquals("A rather complex document", metadata.get(TikaCoreProperties.DESCRIPTION));
-        assertEquals("Bart Hanssens", metadata.get(TikaCoreProperties.CREATOR));
-        assertEquals("Bart Hanssens", metadata.get("initial-creator"));
-        assertEquals("2", metadata.get("editing-cycles"));
-        assertEquals("PT02H03M24S", metadata.get(Metadata.EDIT_TIME));
-        assertEquals(
-                "OpenOffice.org/3.1$Unix OpenOffice.org_project/310m19$Build-9420",
-                metadata.get("generator"));
-        assertEquals("Apache, Lucene, Tika", metadata.get(Metadata.KEYWORDS));
-
-        // User defined metadata
-        assertEquals("Bart Hanssens", metadata.get("custom:Editor"));
-        assertEquals(null, metadata.get("custom:Info 2"));
-        assertEquals(null, metadata.get("custom:Info 3"));
-        assertEquals(null, metadata.get("custom:Info 4"));
-
-        // Check the document statistics
-        assertEquals("2", metadata.get(Office.PAGE_COUNT));
-        assertEquals("13", metadata.get(Office.PARAGRAPH_COUNT));
-        assertEquals("54", metadata.get(Office.WORD_COUNT));
-        assertEquals("351", metadata.get(Office.CHARACTER_COUNT));
-        assertEquals("0", metadata.get(Office.TABLE_COUNT));
-        assertEquals("2", metadata.get(Office.OBJECT_COUNT));
-        assertEquals("0", metadata.get(Office.IMAGE_COUNT));
-
-        // Check the Tika-1.0 style document statistics
-        assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
-        assertEquals("13", metadata.get(Metadata.PARAGRAPH_COUNT));
-        assertEquals("54", metadata.get(Metadata.WORD_COUNT));
-        assertEquals("351", metadata.get(Metadata.CHARACTER_COUNT));
-        assertEquals("0", metadata.get(Metadata.TABLE_COUNT));
-        assertEquals("2", metadata.get(Metadata.OBJECT_COUNT));
-        assertEquals("0", metadata.get(Metadata.IMAGE_COUNT));
-
-        // Check the old style statistics (these will be removed shortly)
-        assertEquals("0", metadata.get("nbTab"));
-        assertEquals("2", metadata.get("nbObject"));
-        assertEquals("0", metadata.get("nbImg"));
-        assertEquals("2", metadata.get("nbPage"));
-        assertEquals("13", metadata.get("nbPara"));
-        assertEquals("54", metadata.get("nbWord"));
-        assertEquals("351", metadata.get("nbCharacter"));
-
-        assertContains(
-                "Tika is part of the Lucene project.", r.xml);
-
-
-   }
-
-    @Test
-    public void testODPMasterFooter() throws Exception {
-        assertContains("Master footer is here",
-                getXML("testMasterFooter.odp").xml);
-    }
-
-    @Test
-    public void testODTFooter() throws Exception {
-        XMLResult r = getXML("testFooter.odt");
-        assertContains("Here is some text...", r.xml);
-        assertContains("Here is some text on page 2", r.xml);
-        assertContains("Here is footer text", r.xml);
-    }
-
-    @Test
-    public void testODSFooter() throws Exception {
-        assertContains("Here is a footer in the center area",
-                getXML("testFooter.ods").xml);
-
-    }  
-    
-    @Test
-    public void testFromFile() throws Exception {
-        OpenDocumentParser parser = new OpenDocumentParser();
-        Path tmp = null;
-        try {
-            tmp = Files.createTempFile("test-odf-", ".odt");
-            Files.copy(getTestDocumentAsStream("testODFwithOOo3.odt"), tmp,
-                    StandardCopyOption.REPLACE_EXISTING);
-            Metadata metadata = new Metadata();
-            TikaInputStream tis = TikaInputStream.get(tmp, metadata);
-            assertEquals(true, tis.hasFile());
-            ContentHandler handler = new BodyContentHandler();
-            parser.parse(tis, handler, metadata, new ParseContext());
-
-            assertEquals(
-                    "application/vnd.oasis.opendocument.text",
-                    metadata.get(Metadata.CONTENT_TYPE));
-
-            String content = handler.toString();
-            assertContains("Tika is part of the Lucene project.", content);
-        } finally {
-            Files.delete(tmp);
-        }
-    }
-
-    @Test
-    public void testNPEFromFile() throws Exception {
-        XMLResult r = getXML("testNPEOpenDocument.odt", new OpenDocumentParser());
-        assertEquals(
-                "application/vnd.oasis.opendocument.text",
-                r.metadata.get(Metadata.CONTENT_TYPE));
-
-        assertContains("primero hay que generar un par de claves", r.xml);
-
-    }
-
-    // TIKA-1063: Test basic style support.
-    @Test
-    public void testODTStyles() throws Exception {
-        String xml = getXML("testStyles.odt").xml;
-        assertContains("This <i>is</i> <b>just</b> a <u>test</u>", xml);
-        assertContains("<p>And <b>another <i>test</i> is</b> here.</p>", xml);
-        assertContains("<ol>\t<li><p>One</p>", xml);
-        assertContains("</ol>", xml);
-        assertContains("<ul>\t<li><p>First</p>", xml);
-        assertContains("</ul>", xml);
-    }
-
-    //TIKA-1600: Test that null pointer doesn't break parsing.
-    @Test
-    public void testNullStylesInODTFooter() throws Exception {
-
-        XMLResult r = getXML("testODT-TIKA-6000.odt", new OpenDocumentParser(), new Metadata(), new ParseContext());
-
-        assertEquals("application/vnd.oasis.opendocument.text", r.metadata.get(Metadata.CONTENT_TYPE));
-
-        String content = r.xml;
-
-        assertContains("Utilisation de ce document", content);
-        assertContains("Copyright and License", content);
-        assertContains("Changer la langue", content);
-        assertContains("La page d\u2019accueil permet de faire une recherche simple", content);
-
-    }
-    @Test  //TIKA-1916
-    public void testMissingMeta() throws Exception {
-        String xml = getXML("testODTNoMeta.odt").xml;
-        assertContains("Test text", xml);
-    }
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import static org.junit.Assert.assertEquals;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.opendocument.OpenOfficeParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class ODFParserTest extends TikaTest {
+    /**
+     * For now, allow us to run some tests against both
+     *  the old and the new parser
+     */
+    private Parser[] getParsers() {
+       return new Parser[] {
+             new OpenDocumentParser(),
+             new OpenOfficeParser()
+       };
+    }
+
+    @Test
+    public void testOO3() throws Exception {
+       for (Parser parser : getParsers()) {
+           XMLResult r = getXML("testODFwithOOo3.odt", parser);
+           assertEquals(
+                   "application/vnd.oasis.opendocument.text",
+                   r.metadata.get(Metadata.CONTENT_TYPE));
+
+           String content = r.xml;
+           assertContains("Tika is part of the Lucene project.", content);
+           assertContains("Solr", content);
+           assertContains("one embedded", content);
+           assertContains("Rectangle Title", content);
+           assertContains("a blue background and dark border", content);
+
+       }
+    }
+
+    @Test
+    public void testOO2() throws Exception {
+        for (Parser parser : getParsers()) {
+            XMLResult r = getXML("testOpenOffice2.odt", parser);
+            Metadata metadata = r.metadata;
+            assertEquals(
+                    "application/vnd.oasis.opendocument.text",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals("en-US", metadata.get(Metadata.LANGUAGE));
+            assertEquals("PT1M7S", metadata.get(Metadata.EDIT_TIME));
+            assertEquals(
+                    "NeoOffice/2.2$Unix OpenOffice.org_project/680m18$Build-9161",
+                    metadata.get("generator"));
+
+            // Check date metadata, both old-style and new-style
+            assertEquals("2007-09-14T11:07:10", metadata.get(TikaCoreProperties.MODIFIED));
+            assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.MODIFIED));
+            assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.DATE));
+            assertEquals("2007-09-14T11:06:08", metadata.get(TikaCoreProperties.CREATED));
+            assertEquals("2007-09-14T11:06:08", metadata.get(Metadata.CREATION_DATE));
+
+            // Check the document statistics
+            assertEquals("1", metadata.get(Office.PAGE_COUNT));
+            assertEquals("1", metadata.get(Office.PARAGRAPH_COUNT));
+            assertEquals("14", metadata.get(Office.WORD_COUNT));
+            assertEquals("78", metadata.get(Office.CHARACTER_COUNT));
+            assertEquals("0", metadata.get(Office.TABLE_COUNT));
+            assertEquals("0", metadata.get(Office.OBJECT_COUNT));
+            assertEquals("0", metadata.get(Office.IMAGE_COUNT));
+
+            // Check the Tika-1.0 style document statistics
+            assertEquals("1", metadata.get(Metadata.PAGE_COUNT));
+            assertEquals("1", metadata.get(Metadata.PARAGRAPH_COUNT));
+            assertEquals("14", metadata.get(Metadata.WORD_COUNT));
+            assertEquals("78", metadata.get(Metadata.CHARACTER_COUNT));
+            assertEquals("0", metadata.get(Metadata.TABLE_COUNT));
+            assertEquals("0", metadata.get(Metadata.OBJECT_COUNT));
+            assertEquals("0", metadata.get(Metadata.IMAGE_COUNT));
+
+            // Check the very old style statistics (these will be removed shortly)
+            assertEquals("0", metadata.get("nbTab"));
+            assertEquals("0", metadata.get("nbObject"));
+            assertEquals("0", metadata.get("nbImg"));
+            assertEquals("1", metadata.get("nbPage"));
+            assertEquals("1", metadata.get("nbPara"));
+            assertEquals("14", metadata.get("nbWord"));
+            assertEquals("78", metadata.get("nbCharacter"));
+
+            // Custom metadata tags present but without values
+            assertEquals(null, metadata.get("custom:Info 1"));
+            assertEquals(null, metadata.get("custom:Info 2"));
+            assertEquals(null, metadata.get("custom:Info 3"));
+            assertEquals(null, metadata.get("custom:Info 4"));
+
+            assertContains(
+                    "This is a sample Open Office document,"
+                            + " written in NeoOffice 2.2.1 for the Mac.",
+                    r.xml);
+
+        }
+   }
+
+   /**
+    * Similar to {@link #testOO2()}, but using a different
+    *  OO2 file with different metadata in it
+    */
+    @Test
+    public void testOO2Metadata() throws Exception {
+        XMLResult r = getXML("testOpenOffice2.odf", new OpenDocumentParser());
+        Metadata metadata = r.metadata;
+        assertEquals(
+                "application/vnd.oasis.opendocument.formula",
+                metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(null, metadata.get(TikaCoreProperties.MODIFIED));
+        assertEquals("2006-01-27T11:55:22", metadata.get(Metadata.CREATION_DATE));
+        assertEquals("The quick brown fox jumps over the lazy dog",
+                metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Gym class featuring a brown fox and lazy dog",
+                metadata.get(TikaCoreProperties.DESCRIPTION));
+        assertEquals("Gym class featuring a brown fox and lazy dog",
+                metadata.get(OfficeOpenXMLCore.SUBJECT));
+        assertEquals("Gym class featuring a brown fox and lazy dog",
+                metadata.get(Metadata.SUBJECT));
+        assertEquals("PT0S", metadata.get(Metadata.EDIT_TIME));
+        assertEquals("1", metadata.get("editing-cycles"));
+        assertEquals(
+                "OpenOffice.org/2.2$Win32 OpenOffice.org_project/680m14$Build-9134",
+                metadata.get("generator"));
+        assertEquals("Pangram, fox, dog", metadata.get(Metadata.KEYWORDS));
+
+        // User defined metadata
+        assertEquals("Text 1", metadata.get("custom:Info 1"));
+        assertEquals("2", metadata.get("custom:Info 2"));
+        assertEquals("false", metadata.get("custom:Info 3"));
+        assertEquals("true", metadata.get("custom:Info 4"));
+
+        // No statistics present
+        assertEquals(null, metadata.get(Metadata.PAGE_COUNT));
+        assertEquals(null, metadata.get(Metadata.PARAGRAPH_COUNT));
+        assertEquals(null, metadata.get(Metadata.WORD_COUNT));
+        assertEquals(null, metadata.get(Metadata.CHARACTER_COUNT));
+        assertEquals(null, metadata.get(Metadata.TABLE_COUNT));
+        assertEquals(null, metadata.get(Metadata.OBJECT_COUNT));
+        assertEquals(null, metadata.get(Metadata.IMAGE_COUNT));
+        assertEquals(null, metadata.get("nbTab"));
+        assertEquals(null, metadata.get("nbObject"));
+        assertEquals(null, metadata.get("nbImg"));
+        assertEquals(null, metadata.get("nbPage"));
+        assertEquals(null, metadata.get("nbPara"));
+        assertEquals(null, metadata.get("nbWord"));
+        assertEquals(null, metadata.get("nbCharacter"));
+
+        // Note - contents of maths files not currently supported
+        assertContains("<body />", r.xml);
+
+   }
+
+   /**
+    * Similar to {@link #testOO2()} )}, but using an OO3 file
+    */
+    @Test
+   public void testOO3Metadata() throws Exception {
+        XMLResult r = getXML("testODFwithOOo3.odt", new OpenDocumentParser());
+        Metadata metadata = r.metadata;
+        assertEquals(
+                "application/vnd.oasis.opendocument.text",
+                metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("2009-10-05T21:22:38", metadata.get(TikaCoreProperties.MODIFIED));
+        assertEquals("2009-10-05T19:04:01", metadata.get(TikaCoreProperties.CREATED));
+        assertEquals("2009-10-05T19:04:01", metadata.get(Metadata.CREATION_DATE));
+        assertEquals("Apache Tika", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Test document", metadata.get(OfficeOpenXMLCore.SUBJECT));
+        assertEquals("Test document", metadata.get(Metadata.SUBJECT));
+        assertEquals("A rather complex document", metadata.get(TikaCoreProperties.DESCRIPTION));
+        assertEquals("Bart Hanssens", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Bart Hanssens", metadata.get("initial-creator"));
+        assertEquals("2", metadata.get("editing-cycles"));
+        assertEquals("PT02H03M24S", metadata.get(Metadata.EDIT_TIME));
+        assertEquals(
+                "OpenOffice.org/3.1$Unix OpenOffice.org_project/310m19$Build-9420",
+                metadata.get("generator"));
+        assertEquals("Apache, Lucene, Tika", metadata.get(Metadata.KEYWORDS));
+
+        // User defined metadata
+        assertEquals("Bart Hanssens", metadata.get("custom:Editor"));
+        assertEquals(null, metadata.get("custom:Info 2"));
+        assertEquals(null, metadata.get("custom:Info 3"));
+        assertEquals(null, metadata.get("custom:Info 4"));
+
+        // Check the document statistics
+        assertEquals("2", metadata.get(Office.PAGE_COUNT));
+        assertEquals("13", metadata.get(Office.PARAGRAPH_COUNT));
+        assertEquals("54", metadata.get(Office.WORD_COUNT));
+        assertEquals("351", metadata.get(Office.CHARACTER_COUNT));
+        assertEquals("0", metadata.get(Office.TABLE_COUNT));
+        assertEquals("2", metadata.get(Office.OBJECT_COUNT));
+        assertEquals("0", metadata.get(Office.IMAGE_COUNT));
+
+        // Check the Tika-1.0 style document statistics
+        assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
+        assertEquals("13", metadata.get(Metadata.PARAGRAPH_COUNT));
+        assertEquals("54", metadata.get(Metadata.WORD_COUNT));
+        assertEquals("351", metadata.get(Metadata.CHARACTER_COUNT));
+        assertEquals("0", metadata.get(Metadata.TABLE_COUNT));
+        assertEquals("2", metadata.get(Metadata.OBJECT_COUNT));
+        assertEquals("0", metadata.get(Metadata.IMAGE_COUNT));
+
+        // Check the old style statistics (these will be removed shortly)
+        assertEquals("0", metadata.get("nbTab"));
+        assertEquals("2", metadata.get("nbObject"));
+        assertEquals("0", metadata.get("nbImg"));
+        assertEquals("2", metadata.get("nbPage"));
+        assertEquals("13", metadata.get("nbPara"));
+        assertEquals("54", metadata.get("nbWord"));
+        assertEquals("351", metadata.get("nbCharacter"));
+
+        assertContains(
+                "Tika is part of the Lucene project.", r.xml);
+
+
+   }
+
+    @Test
+    public void testODPMasterFooter() throws Exception {
+        assertContains("Master footer is here",
+                getXML("testMasterFooter.odp").xml);
+    }
+
+    @Test
+    public void testODTFooter() throws Exception {
+        XMLResult r = getXML("testFooter.odt");
+        assertContains("Here is some text...", r.xml);
+        assertContains("Here is some text on page 2", r.xml);
+        assertContains("Here is footer text", r.xml);
+    }
+
+    @Test
+    public void testODSFooter() throws Exception {
+        assertContains("Here is a footer in the center area",
+                getXML("testFooter.ods").xml);
+
+    }  
+    
+    @Test
+    public void testFromFile() throws Exception {
+        OpenDocumentParser parser = new OpenDocumentParser();
+        Path tmp = null;
+        try {
+            tmp = Files.createTempFile("test-odf-", ".odt");
+            Files.copy(getTestDocumentAsStream("testODFwithOOo3.odt"), tmp,
+                    StandardCopyOption.REPLACE_EXISTING);
+            Metadata metadata = new Metadata();
+            TikaInputStream tis = TikaInputStream.get(tmp, metadata);
+            assertEquals(true, tis.hasFile());
+            ContentHandler handler = new BodyContentHandler();
+            parser.parse(tis, handler, metadata, new ParseContext());
+
+            assertEquals(
+                    "application/vnd.oasis.opendocument.text",
+                    metadata.get(Metadata.CONTENT_TYPE));
+
+            String content = handler.toString();
+            assertContains("Tika is part of the Lucene project.", content);
+        } finally {
+            Files.delete(tmp);
+        }
+    }
+
+    @Test
+    public void testNPEFromFile() throws Exception {
+        XMLResult r = getXML("testNPEOpenDocument.odt", new OpenDocumentParser());
+        assertEquals(
+                "application/vnd.oasis.opendocument.text",
+                r.metadata.get(Metadata.CONTENT_TYPE));
+
+        assertContains("primero hay que generar un par de claves", r.xml);
+
+    }
+
+    // TIKA-1063: Test basic style support.
+    @Test
+    public void testODTStyles() throws Exception {
+        String xml = getXML("testStyles.odt").xml;
+        assertContains("This <i>is</i> <b>just</b> a <u>test</u>", xml);
+        assertContains("<p>And <b>another <i>test</i> is</b> here.</p>", xml);
+        assertContains("<ol>\t<li><p>One</p>", xml);
+        assertContains("</ol>", xml);
+        assertContains("<ul>\t<li><p>First</p>", xml);
+        assertContains("</ul>", xml);
+    }
+
+    //TIKA-1600: Test that null pointer doesn't break parsing.
+    @Test
+    public void testNullStylesInODTFooter() throws Exception {
+
+        XMLResult r = getXML("testODT-TIKA-6000.odt", new OpenDocumentParser(), new Metadata(), new ParseContext());
+
+        assertEquals("application/vnd.oasis.opendocument.text", r.metadata.get(Metadata.CONTENT_TYPE));
+
+        String content = r.xml;
+
+        assertContains("Utilisation de ce document", content);
+        assertContains("Copyright and License", content);
+        assertContains("Changer la langue", content);
+        assertContains("La page d\u2019accueil permet de faire une recherche simple", content);
+
+    }
+    @Test  //TIKA-1916
+    public void testMissingMeta() throws Exception {
+        String xml = getXML("testODTNoMeta.odt").xml;
+        assertContains("Test text", xml);
+    }
+}

[25/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java
index 119a47b..e423871 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java
@@ -1,102 +1,102 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.core;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-public class ChmConstants {
-    /* Prevents instantiation */
-    private ChmConstants() {
-    }
-
-    public static final String DEFAULT_CHARSET = UTF_8.name();
-    public static final String ITSF = "ITSF";
-    public static final String ITSP = "ITSP";
-    public static final String PMGL = "PMGL";
-    public static final String LZXC = "LZXC";
-    public static final String CHM_PMGI_MARKER = "PMGI";
-    public static final int BYTE_ARRAY_LENGHT = 16;
-    public static final int CHM_ITSF_V2_LEN = 0x58;
-    public static final int CHM_ITSF_V3_LEN = 0x60;
-    public static final int CHM_ITSP_V1_LEN = 0x54;
-    public static final int CHM_PMGL_LEN = 0x14;
-    public static final int CHM_PMGI_LEN = 0x08;
-    public static final int CHM_LZXC_RESETTABLE_V1_LEN = 0x28;
-    public static final int CHM_LZXC_MIN_LEN = 0x18;
-    public static final int CHM_LZXC_V2_LEN = 0x1c;
-    public static final int CHM_SIGNATURE_LEN = 4;
-    public static final int CHM_VER_2 = 2;
-    public static final int CHM_VER_3 = 3;
-    public static final int CHM_VER_1 = 1;
-    public static final int CHM_WINDOW_SIZE_BLOCK = 0x8000;
-
-    /* my hacking */
-    public static final int START_PMGL = 0xCC;
-    public static final String CONTROL_DATA = "ControlData";
-    public static final String RESET_TABLE = "ResetTable";
-    public static final String CONTENT = "Content";
-
-    /* some constants defined by the LZX specification */
-    public static final int LZX_MIN_MATCH = 2;
-    public static final int LZX_MAX_MATCH = 257;
-    public static final int LZX_NUM_CHARS = 256;
-    public static final int LZX_BLOCKTYPE_INVALID = 0; /*
-                                                        * also blocktypes 4-7
-                                                        * invalid
-                                                        */
-    public static final int LZX_BLOCKTYPE_VERBATIM = 1;
-    public static final int LZX_BLOCKTYPE_ALIGNED = 2;
-    public static final int LZX_BLOCKTYPE_UNCOMPRESSED = 3;
-    public static final int LZX_PRETREE_NUM_ELEMENTS_BITS = 4; /* ??? */
-    public static final int LZX_PRETREE_NUM_ELEMENTS = 20;
-    public static final int LZX_ALIGNED_NUM_ELEMENTS = 8; /*
-                                                           * aligned offset tree
-                                                           * #elements
-                                                           */
-    public static final int LZX_NUM_PRIMARY_LENGTHS = 7; /*
-                                                          * this one missing
-                                                          * from spec!
-                                                          */
-    public static final int LZX_NUM_SECONDARY_LENGTHS = 249; /*
-                                                              * length tree
-                                                              * #elements
-                                                              */
-
-    /* LZX huffman defines: tweak tablebits as desired */
-    public static final int LZX_PRETREE_MAXSYMBOLS = LZX_PRETREE_NUM_ELEMENTS;
-    public static final int LZX_PRETREE_TABLEBITS = 6;
-    public static final int LZX_MAINTREE_MAXSYMBOLS = LZX_NUM_CHARS + 50 * 8;
-    public static final int LZX_MAIN_MAXSYMBOLS = LZX_NUM_CHARS * 2;
-    public static final int LZX_MAINTREE_TABLEBITS = 12;
-    public static final int LZX_LENGTH_MAXSYMBOLS = LZX_NUM_SECONDARY_LENGTHS + 1;
-    public static final int LZX_LENGTH_TABLEBITS = 12;
-    public static final int LZX_ALIGNED_MAXSYMBOLS = LZX_ALIGNED_NUM_ELEMENTS;
-    public static final int LZX_ALIGNED_TABLEBITS = 7;
-    public static final int LZX_LENTABLE_SAFETY = 64;
-
-    public static short[] EXTRA_BITS = { 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5,
-            5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
-            15, 15, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
-            17, 17 };
-
-    public static int[] POSITION_BASE = { 0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32,
-            48, 64, 96, 128, 192, 256, 384, 512, 768, 1024, 1536, 2048, 3072,
-            4096, 6144, 8192, 12288, 16384, 24576, 32768, 49152, 65536, 98304,
-            131072, 196608, 262144, 393216, 524288, 655360, 786432, 917504,
-            1048576, 1179648, 1310720, 1441792, 1572864, 1703936, 1835008,
-            1966080, 2097152 };
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.core;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+public class ChmConstants {
+    /* Prevents instantiation */
+    private ChmConstants() {
+    }
+
+    public static final String DEFAULT_CHARSET = UTF_8.name();
+    public static final String ITSF = "ITSF";
+    public static final String ITSP = "ITSP";
+    public static final String PMGL = "PMGL";
+    public static final String LZXC = "LZXC";
+    public static final String CHM_PMGI_MARKER = "PMGI";
+    public static final int BYTE_ARRAY_LENGHT = 16;
+    public static final int CHM_ITSF_V2_LEN = 0x58;
+    public static final int CHM_ITSF_V3_LEN = 0x60;
+    public static final int CHM_ITSP_V1_LEN = 0x54;
+    public static final int CHM_PMGL_LEN = 0x14;
+    public static final int CHM_PMGI_LEN = 0x08;
+    public static final int CHM_LZXC_RESETTABLE_V1_LEN = 0x28;
+    public static final int CHM_LZXC_MIN_LEN = 0x18;
+    public static final int CHM_LZXC_V2_LEN = 0x1c;
+    public static final int CHM_SIGNATURE_LEN = 4;
+    public static final int CHM_VER_2 = 2;
+    public static final int CHM_VER_3 = 3;
+    public static final int CHM_VER_1 = 1;
+    public static final int CHM_WINDOW_SIZE_BLOCK = 0x8000;
+
+    /* my hacking */
+    public static final int START_PMGL = 0xCC;
+    public static final String CONTROL_DATA = "ControlData";
+    public static final String RESET_TABLE = "ResetTable";
+    public static final String CONTENT = "Content";
+
+    /* some constants defined by the LZX specification */
+    public static final int LZX_MIN_MATCH = 2;
+    public static final int LZX_MAX_MATCH = 257;
+    public static final int LZX_NUM_CHARS = 256;
+    public static final int LZX_BLOCKTYPE_INVALID = 0; /*
+                                                        * also blocktypes 4-7
+                                                        * invalid
+                                                        */
+    public static final int LZX_BLOCKTYPE_VERBATIM = 1;
+    public static final int LZX_BLOCKTYPE_ALIGNED = 2;
+    public static final int LZX_BLOCKTYPE_UNCOMPRESSED = 3;
+    public static final int LZX_PRETREE_NUM_ELEMENTS_BITS = 4; /* ??? */
+    public static final int LZX_PRETREE_NUM_ELEMENTS = 20;
+    public static final int LZX_ALIGNED_NUM_ELEMENTS = 8; /*
+                                                           * aligned offset tree
+                                                           * #elements
+                                                           */
+    public static final int LZX_NUM_PRIMARY_LENGTHS = 7; /*
+                                                          * this one missing
+                                                          * from spec!
+                                                          */
+    public static final int LZX_NUM_SECONDARY_LENGTHS = 249; /*
+                                                              * length tree
+                                                              * #elements
+                                                              */
+
+    /* LZX huffman defines: tweak tablebits as desired */
+    public static final int LZX_PRETREE_MAXSYMBOLS = LZX_PRETREE_NUM_ELEMENTS;
+    public static final int LZX_PRETREE_TABLEBITS = 6;
+    public static final int LZX_MAINTREE_MAXSYMBOLS = LZX_NUM_CHARS + 50 * 8;
+    public static final int LZX_MAIN_MAXSYMBOLS = LZX_NUM_CHARS * 2;
+    public static final int LZX_MAINTREE_TABLEBITS = 12;
+    public static final int LZX_LENGTH_MAXSYMBOLS = LZX_NUM_SECONDARY_LENGTHS + 1;
+    public static final int LZX_LENGTH_TABLEBITS = 12;
+    public static final int LZX_ALIGNED_MAXSYMBOLS = LZX_ALIGNED_NUM_ELEMENTS;
+    public static final int LZX_ALIGNED_TABLEBITS = 7;
+    public static final int LZX_LENTABLE_SAFETY = 64;
+
+    public static short[] EXTRA_BITS = { 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5,
+            5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
+            15, 15, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
+            17, 17 };
+
+    public static int[] POSITION_BASE = { 0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32,
+            48, 64, 96, 128, 192, 256, 384, 512, 768, 1024, 1536, 2048, 3072,
+            4096, 6144, 8192, 12288, 16384, 24576, 32768, 49152, 65536, 98304,
+            131072, 196608, 262144, 393216, 524288, 655360, 786432, 917504,
+            1048576, 1179648, 1310720, 1441792, 1572864, 1703936, 1835008,
+            1966080, 2097152 };
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
index 85f4177..454c1c4 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
@@ -1,392 +1,392 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.core;
-
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
-import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
-import org.apache.tika.parser.chm.accessor.ChmItspHeader;
-import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
-import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
-import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
-import org.apache.tika.parser.chm.assertion.ChmAssert;
-import org.apache.tika.parser.chm.core.ChmCommons.EntryType;
-import org.apache.tika.parser.chm.lzx.ChmBlockInfo;
-import org.apache.tika.parser.chm.lzx.ChmLzxBlock;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-/**
- * Extracts text from chm file. Enumerates chm entries.
- */
-public class ChmExtractor {
-    private List<ChmLzxBlock> lzxBlocksCache = null;
-    private ChmDirectoryListingSet chmDirList = null;
-    private ChmItsfHeader chmItsfHeader = null;
-    private ChmItspHeader chmItspHeader = null;
-    private ChmLzxcResetTable chmLzxcResetTable = null;
-    private ChmLzxcControlData chmLzxcControlData = null;
-    private byte[] data = null;
-    private int indexOfContent;
-    private long lzxBlockOffset;
-    private long lzxBlockLength;
-
-    /**
-     * Returns lzxc control data.
-     * 
-     * @return ChmLzxcControlData
-     */
-    private ChmLzxcControlData getChmLzxcControlData() {
-        return chmLzxcControlData;
-    }
-
-    /**
-     * Sets lzxc control data
-     * 
-     * @param chmLzxcControlData
-     */
-    private void setChmLzxcControlData(ChmLzxcControlData chmLzxcControlData) {
-        this.chmLzxcControlData = chmLzxcControlData;
-    }
-
-    private ChmItspHeader getChmItspHeader() {
-        return chmItspHeader;
-    }
-
-    private void setChmItspHeader(ChmItspHeader chmItspHeader) {
-        this.chmItspHeader = chmItspHeader;
-    }
-
-    /**
-     * Returns lzxc reset table
-     * 
-     * @return ChmLzxcResetTable
-     */
-    private ChmLzxcResetTable getChmLzxcResetTable() {
-        return chmLzxcResetTable;
-    }
-
-    /**
-     * Sets lzxc reset table
-     * 
-     * @param chmLzxcResetTable
-     */
-    private void setChmLzxcResetTable(ChmLzxcResetTable chmLzxcResetTable) {
-        this.chmLzxcResetTable = chmLzxcResetTable;
-    }
-
-    /**
-     * Returns lzxc hit_cache length
-     * 
-     * @return lzxBlockLength
-     */
-    private long getLzxBlockLength() {
-        return lzxBlockLength;
-    }
-
-    /**
-     * Sets lzxc hit_cache length
-     * 
-     * @param lzxBlockLength
-     */
-    private void setLzxBlockLength(long lzxBlockLength) {
-        this.lzxBlockLength = lzxBlockLength;
-    }
-
-    /**
-     * Returns lzxc hit_cache offset
-     * 
-     * @return lzxBlockOffset
-     */
-    private long getLzxBlockOffset() {
-        return lzxBlockOffset;
-    }
-
-    /**
-     * Sets lzxc hit_cache offset
-     */
-    private void setLzxBlockOffset(long lzxBlockOffset) {
-        this.lzxBlockOffset = lzxBlockOffset;
-    }
-
-    private int getIndexOfContent() {
-        return indexOfContent;
-    }
-
-    private void setIndexOfContent(int indexOfContent) {
-        this.indexOfContent = indexOfContent;
-    }
-
-    private byte[] getData() {
-        return data;
-    }
-
-    private void setData(byte[] data) {
-        this.data = data;
-    }
-
-    public ChmExtractor(InputStream is) throws TikaException, IOException {
-        ChmAssert.assertInputStreamNotNull(is);
-        try {
-            setData(IOUtils.toByteArray(is));
-
-            /* Creates and parses chm itsf header */
-            setChmItsfHeader(new ChmItsfHeader());
-            // getChmItsfHeader().parse(Arrays.copyOfRange(getData(), 0,
-            // ChmConstants.CHM_ITSF_V3_LEN - 1), getChmItsfHeader());
-            getChmItsfHeader().parse(ChmCommons.copyOfRange(getData(), 0,
-                            ChmConstants.CHM_ITSF_V3_LEN - 1), getChmItsfHeader());
-
-            /* Creates and parses chm itsp header */
-            setChmItspHeader(new ChmItspHeader());
-            // getChmItspHeader().parse(Arrays.copyOfRange( getData(), (int)
-            // getChmItsfHeader().getDirOffset(),
-            // (int) getChmItsfHeader().getDirOffset() +
-            // ChmConstants.CHM_ITSP_V1_LEN), getChmItspHeader());
-            getChmItspHeader().parse(
-                    ChmCommons.copyOfRange(getData(), (int) getChmItsfHeader()
-                            .getDirOffset(), (int) getChmItsfHeader().getDirOffset() + 
-                            ChmConstants.CHM_ITSP_V1_LEN), getChmItspHeader());
-
-            /* Creates instance of ChmDirListingContainer */
-            setChmDirList(new ChmDirectoryListingSet(getData(),
-                    getChmItsfHeader(), getChmItspHeader()));
-
-            int indexOfControlData = getChmDirList().getControlDataIndex();
-            int indexOfResetData = ChmCommons.indexOfResetTableBlock(getData(),
-                    ChmConstants.LZXC.getBytes(UTF_8));
-            byte[] dir_chunk = null;
-            if (indexOfResetData > 0)
-                dir_chunk = ChmCommons.copyOfRange( getData(), indexOfResetData, indexOfResetData  
-                        + getChmDirList().getDirectoryListingEntryList().get(indexOfControlData).getLength());
-            // dir_chunk = Arrays.copyOfRange(getData(), indexOfResetData,
-            // indexOfResetData
-            // +
-            // getChmDirList().getDirectoryListingEntryList().get(indexOfControlData).getLength());
-
-            /* Creates and parses chm control data */
-            setChmLzxcControlData(new ChmLzxcControlData());
-            getChmLzxcControlData().parse(dir_chunk, getChmLzxcControlData());
-
-            int indexOfResetTable = getChmDirList().getResetTableIndex();
-            setChmLzxcResetTable(new ChmLzxcResetTable());
-
-            int startIndex = (int) getChmDirList().getDataOffset()
-                    + getChmDirList().getDirectoryListingEntryList()
-                            .get(indexOfResetTable).getOffset();
-
-            // assert startIndex < data.length
-            ChmAssert.assertCopyingDataIndex(startIndex, getData().length);
-
-            // dir_chunk = Arrays.copyOfRange(getData(), startIndex, startIndex
-            // +
-            // getChmDirList().getDirectoryListingEntryList().get(indexOfResetTable).getLength());
-            dir_chunk = ChmCommons.copyOfRange(getData(), startIndex, startIndex
-                            + getChmDirList().getDirectoryListingEntryList().get(indexOfResetTable).getLength());
-
-            getChmLzxcResetTable().parse(dir_chunk, getChmLzxcResetTable());
-
-            setIndexOfContent(ChmCommons.indexOf(getChmDirList().getDirectoryListingEntryList(), 
-                    ChmConstants.CONTENT));
-            setLzxBlockOffset((getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent()).getOffset() 
-                    + getChmItsfHeader().getDataOffset()));
-            setLzxBlockLength(getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent()).getLength());
-
-            setLzxBlocksCache(new ArrayList<ChmLzxBlock>());
-
-        } catch (IOException e) {
-            e.printStackTrace();
-        }
-    }
-
-    /**
-     * Enumerates chm entities
-     * 
-     * @return list of chm entities
-     */
-    public List<String> enumerateChm() {
-        List<String> listOfEntries = new ArrayList<String>();
-        for (DirectoryListingEntry directoryListingEntry : getChmDirList().getDirectoryListingEntryList()) {
-            listOfEntries.add(directoryListingEntry.getName());
-        }
-        return listOfEntries;
-    }
-
-    /**
-     * Decompresses a chm entry
-     * 
-     * @param directoryListingEntry
-     * 
-     * @return decompressed data
-     * @throws TikaException 
-     */
-    public byte[] extractChmEntry(DirectoryListingEntry directoryListingEntry) throws TikaException {
-        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
-        ChmLzxBlock lzxBlock = null;
-        try {
-            /* UNCOMPRESSED type is easiest one */
-            if (directoryListingEntry.getEntryType() == EntryType.UNCOMPRESSED
-                    && directoryListingEntry.getLength() > 0
-                    && !ChmCommons.hasSkip(directoryListingEntry)) {
-                int dataOffset = (int) (getChmItsfHeader().getDataOffset() + directoryListingEntry
-                        .getOffset());
-                // dataSegment = Arrays.copyOfRange(getData(), dataOffset,
-                // dataOffset + directoryListingEntry.getLength());
-                buffer.write(ChmCommons.copyOfRange(
-                        getData(), dataOffset,
-                        dataOffset + directoryListingEntry.getLength()));
-            } else if (directoryListingEntry.getEntryType() == EntryType.COMPRESSED
-                    && !ChmCommons.hasSkip(directoryListingEntry)) {
-                /* Gets a chm hit_cache info */
-                ChmBlockInfo bb = ChmBlockInfo.getChmBlockInfoInstance(
-                        directoryListingEntry, (int) getChmLzxcResetTable()
-                                .getBlockLen(), getChmLzxcControlData());
-
-                int i = 0, start = 0, hit_cache = 0;
-
-                if ((getLzxBlockLength() < Integer.MAX_VALUE)
-                        && (getLzxBlockOffset() < Integer.MAX_VALUE)) {
-                    // TODO: Improve the caching
-                    // caching ... = O(n^2) - depends on startBlock and endBlock
-                    start = -1;
-                    if (!getLzxBlocksCache().isEmpty()) {
-                        for (i = 0; i < getLzxBlocksCache().size(); i++) {
-                            //lzxBlock = getLzxBlocksCache().get(i);
-                            int bn = getLzxBlocksCache().get(i).getBlockNumber();
-                            for (int j = bb.getIniBlock(); j <= bb.getStartBlock(); j++) {
-                                if (bn == j) {
-                                    if (j > start) {
-                                        start = j;
-                                        hit_cache = i;
-                                    }
-                                }
-                            }
-                            if (start == bb.getStartBlock())
-                                break;
-                        }
-                    }
-
-//                    if (i == getLzxBlocksCache().size() && i == 0) {
-                    if (start<0) {
-                        start = bb.getIniBlock();
-
-                        byte[] dataSegment = ChmCommons.getChmBlockSegment(
-                                getData(),
-                                getChmLzxcResetTable(), start,
-                                (int) getLzxBlockOffset(),
-                                (int) getLzxBlockLength());
-
-                        lzxBlock = new ChmLzxBlock(start, dataSegment,
-                                getChmLzxcResetTable().getBlockLen(), null);
-
-                        getLzxBlocksCache().add(lzxBlock);
-                    } else {
-                        lzxBlock = getLzxBlocksCache().get(hit_cache);
-                    }
-
-                    for (i = start; i <= bb.getEndBlock();) {
-                        if (i == bb.getStartBlock() && i == bb.getEndBlock()) {
-                            buffer.write(lzxBlock.getContent(
-                                    bb.getStartOffset(), bb.getEndOffset()));
-                            break;
-                        }
-
-                        if (i == bb.getStartBlock()) {
-                            buffer.write(lzxBlock.getContent(
-                                    bb.getStartOffset()));
-                        }
-
-                        if (i > bb.getStartBlock() && i < bb.getEndBlock()) {
-                            buffer.write(lzxBlock.getContent());
-                        }
-
-                        if (i == bb.getEndBlock()) {
-                            buffer.write(lzxBlock.getContent(
-                                    0, bb.getEndOffset()));
-                            break;
-                        }
-
-                        i++;
-
-                        if (i % getChmLzxcControlData().getResetInterval() == 0) {
-                            lzxBlock = new ChmLzxBlock(i,
-                                    ChmCommons.getChmBlockSegment(getData(),
-                                            getChmLzxcResetTable(), i,
-                                            (int) getLzxBlockOffset(),
-                                            (int) getLzxBlockLength()),
-                                    getChmLzxcResetTable().getBlockLen(), null);
-                        } else {
-                            lzxBlock = new ChmLzxBlock(i,
-                                    ChmCommons.getChmBlockSegment(getData(),
-                                            getChmLzxcResetTable(), i,
-                                            (int) getLzxBlockOffset(),
-                                            (int) getLzxBlockLength()),
-                                    getChmLzxcResetTable().getBlockLen(),
-                                    lzxBlock);
-                        }
-
-                        getLzxBlocksCache().add(lzxBlock);
-                    }
-
-                    if (getLzxBlocksCache().size() > getChmLzxcResetTable()
-                            .getBlockCount()) {
-                        getLzxBlocksCache().clear();
-                    }
-                } //end of if
-                
-                if (buffer.size() != directoryListingEntry.getLength()) {
-                    throw new TikaException("CHM file extract error: extracted Length is wrong.");
-                }
-            } //end of if compressed
-        } catch (Exception e) {
-            throw new TikaException(e.getMessage());
-        }
-
-        return buffer.toByteArray();
-    }
-
-    private void setLzxBlocksCache(List<ChmLzxBlock> lzxBlocksCache) {
-        this.lzxBlocksCache = lzxBlocksCache;
-    }
-
-    private List<ChmLzxBlock> getLzxBlocksCache() {
-        return lzxBlocksCache;
-    }
-
-    private void setChmDirList(ChmDirectoryListingSet chmDirList) {
-        this.chmDirList = chmDirList;
-    }
-
-    public ChmDirectoryListingSet getChmDirList() {
-        return chmDirList;
-    }
-
-    private void setChmItsfHeader(ChmItsfHeader chmItsfHeader) {
-        this.chmItsfHeader = chmItsfHeader;
-    }
-
-    private ChmItsfHeader getChmItsfHeader() {
-        return chmItsfHeader;
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.core;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
+import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
+import org.apache.tika.parser.chm.accessor.ChmItspHeader;
+import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
+import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmCommons.EntryType;
+import org.apache.tika.parser.chm.lzx.ChmBlockInfo;
+import org.apache.tika.parser.chm.lzx.ChmLzxBlock;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Extracts text from chm file. Enumerates chm entries.
+ */
+public class ChmExtractor {
+    private List<ChmLzxBlock> lzxBlocksCache = null;
+    private ChmDirectoryListingSet chmDirList = null;
+    private ChmItsfHeader chmItsfHeader = null;
+    private ChmItspHeader chmItspHeader = null;
+    private ChmLzxcResetTable chmLzxcResetTable = null;
+    private ChmLzxcControlData chmLzxcControlData = null;
+    private byte[] data = null;
+    private int indexOfContent;
+    private long lzxBlockOffset;
+    private long lzxBlockLength;
+
+    /**
+     * Returns lzxc control data.
+     * 
+     * @return ChmLzxcControlData
+     */
+    private ChmLzxcControlData getChmLzxcControlData() {
+        return chmLzxcControlData;
+    }
+
+    /**
+     * Sets lzxc control data
+     * 
+     * @param chmLzxcControlData
+     */
+    private void setChmLzxcControlData(ChmLzxcControlData chmLzxcControlData) {
+        this.chmLzxcControlData = chmLzxcControlData;
+    }
+
+    private ChmItspHeader getChmItspHeader() {
+        return chmItspHeader;
+    }
+
+    private void setChmItspHeader(ChmItspHeader chmItspHeader) {
+        this.chmItspHeader = chmItspHeader;
+    }
+
+    /**
+     * Returns lzxc reset table
+     * 
+     * @return ChmLzxcResetTable
+     */
+    private ChmLzxcResetTable getChmLzxcResetTable() {
+        return chmLzxcResetTable;
+    }
+
+    /**
+     * Sets lzxc reset table
+     * 
+     * @param chmLzxcResetTable
+     */
+    private void setChmLzxcResetTable(ChmLzxcResetTable chmLzxcResetTable) {
+        this.chmLzxcResetTable = chmLzxcResetTable;
+    }
+
+    /**
+     * Returns lzxc hit_cache length
+     * 
+     * @return lzxBlockLength
+     */
+    private long getLzxBlockLength() {
+        return lzxBlockLength;
+    }
+
+    /**
+     * Sets lzxc hit_cache length
+     * 
+     * @param lzxBlockLength
+     */
+    private void setLzxBlockLength(long lzxBlockLength) {
+        this.lzxBlockLength = lzxBlockLength;
+    }
+
+    /**
+     * Returns lzxc hit_cache offset
+     * 
+     * @return lzxBlockOffset
+     */
+    private long getLzxBlockOffset() {
+        return lzxBlockOffset;
+    }
+
+    /**
+     * Sets lzxc hit_cache offset
+     */
+    private void setLzxBlockOffset(long lzxBlockOffset) {
+        this.lzxBlockOffset = lzxBlockOffset;
+    }
+
+    private int getIndexOfContent() {
+        return indexOfContent;
+    }
+
+    private void setIndexOfContent(int indexOfContent) {
+        this.indexOfContent = indexOfContent;
+    }
+
+    private byte[] getData() {
+        return data;
+    }
+
+    private void setData(byte[] data) {
+        this.data = data;
+    }
+
+    public ChmExtractor(InputStream is) throws TikaException, IOException {
+        ChmAssert.assertInputStreamNotNull(is);
+        try {
+            setData(IOUtils.toByteArray(is));
+
+            /* Creates and parses chm itsf header */
+            setChmItsfHeader(new ChmItsfHeader());
+            // getChmItsfHeader().parse(Arrays.copyOfRange(getData(), 0,
+            // ChmConstants.CHM_ITSF_V3_LEN - 1), getChmItsfHeader());
+            getChmItsfHeader().parse(ChmCommons.copyOfRange(getData(), 0,
+                            ChmConstants.CHM_ITSF_V3_LEN - 1), getChmItsfHeader());
+
+            /* Creates and parses chm itsp header */
+            setChmItspHeader(new ChmItspHeader());
+            // getChmItspHeader().parse(Arrays.copyOfRange( getData(), (int)
+            // getChmItsfHeader().getDirOffset(),
+            // (int) getChmItsfHeader().getDirOffset() +
+            // ChmConstants.CHM_ITSP_V1_LEN), getChmItspHeader());
+            getChmItspHeader().parse(
+                    ChmCommons.copyOfRange(getData(), (int) getChmItsfHeader()
+                            .getDirOffset(), (int) getChmItsfHeader().getDirOffset() + 
+                            ChmConstants.CHM_ITSP_V1_LEN), getChmItspHeader());
+
+            /* Creates instance of ChmDirListingContainer */
+            setChmDirList(new ChmDirectoryListingSet(getData(),
+                    getChmItsfHeader(), getChmItspHeader()));
+
+            int indexOfControlData = getChmDirList().getControlDataIndex();
+            int indexOfResetData = ChmCommons.indexOfResetTableBlock(getData(),
+                    ChmConstants.LZXC.getBytes(UTF_8));
+            byte[] dir_chunk = null;
+            if (indexOfResetData > 0)
+                dir_chunk = ChmCommons.copyOfRange( getData(), indexOfResetData, indexOfResetData  
+                        + getChmDirList().getDirectoryListingEntryList().get(indexOfControlData).getLength());
+            // dir_chunk = Arrays.copyOfRange(getData(), indexOfResetData,
+            // indexOfResetData
+            // +
+            // getChmDirList().getDirectoryListingEntryList().get(indexOfControlData).getLength());
+
+            /* Creates and parses chm control data */
+            setChmLzxcControlData(new ChmLzxcControlData());
+            getChmLzxcControlData().parse(dir_chunk, getChmLzxcControlData());
+
+            int indexOfResetTable = getChmDirList().getResetTableIndex();
+            setChmLzxcResetTable(new ChmLzxcResetTable());
+
+            int startIndex = (int) getChmDirList().getDataOffset()
+                    + getChmDirList().getDirectoryListingEntryList()
+                            .get(indexOfResetTable).getOffset();
+
+            // assert startIndex < data.length
+            ChmAssert.assertCopyingDataIndex(startIndex, getData().length);
+
+            // dir_chunk = Arrays.copyOfRange(getData(), startIndex, startIndex
+            // +
+            // getChmDirList().getDirectoryListingEntryList().get(indexOfResetTable).getLength());
+            dir_chunk = ChmCommons.copyOfRange(getData(), startIndex, startIndex
+                            + getChmDirList().getDirectoryListingEntryList().get(indexOfResetTable).getLength());
+
+            getChmLzxcResetTable().parse(dir_chunk, getChmLzxcResetTable());
+
+            setIndexOfContent(ChmCommons.indexOf(getChmDirList().getDirectoryListingEntryList(), 
+                    ChmConstants.CONTENT));
+            setLzxBlockOffset((getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent()).getOffset() 
+                    + getChmItsfHeader().getDataOffset()));
+            setLzxBlockLength(getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent()).getLength());
+
+            setLzxBlocksCache(new ArrayList<ChmLzxBlock>());
+
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Enumerates chm entities
+     * 
+     * @return list of chm entities
+     */
+    public List<String> enumerateChm() {
+        List<String> listOfEntries = new ArrayList<String>();
+        for (DirectoryListingEntry directoryListingEntry : getChmDirList().getDirectoryListingEntryList()) {
+            listOfEntries.add(directoryListingEntry.getName());
+        }
+        return listOfEntries;
+    }
+
+    /**
+     * Decompresses a chm entry
+     * 
+     * @param directoryListingEntry
+     * 
+     * @return decompressed data
+     * @throws TikaException 
+     */
+    public byte[] extractChmEntry(DirectoryListingEntry directoryListingEntry) throws TikaException {
+        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+        ChmLzxBlock lzxBlock = null;
+        try {
+            /* UNCOMPRESSED type is easiest one */
+            if (directoryListingEntry.getEntryType() == EntryType.UNCOMPRESSED
+                    && directoryListingEntry.getLength() > 0
+                    && !ChmCommons.hasSkip(directoryListingEntry)) {
+                int dataOffset = (int) (getChmItsfHeader().getDataOffset() + directoryListingEntry
+                        .getOffset());
+                // dataSegment = Arrays.copyOfRange(getData(), dataOffset,
+                // dataOffset + directoryListingEntry.getLength());
+                buffer.write(ChmCommons.copyOfRange(
+                        getData(), dataOffset,
+                        dataOffset + directoryListingEntry.getLength()));
+            } else if (directoryListingEntry.getEntryType() == EntryType.COMPRESSED
+                    && !ChmCommons.hasSkip(directoryListingEntry)) {
+                /* Gets a chm hit_cache info */
+                ChmBlockInfo bb = ChmBlockInfo.getChmBlockInfoInstance(
+                        directoryListingEntry, (int) getChmLzxcResetTable()
+                                .getBlockLen(), getChmLzxcControlData());
+
+                int i = 0, start = 0, hit_cache = 0;
+
+                if ((getLzxBlockLength() < Integer.MAX_VALUE)
+                        && (getLzxBlockOffset() < Integer.MAX_VALUE)) {
+                    // TODO: Improve the caching
+                    // caching ... = O(n^2) - depends on startBlock and endBlock
+                    start = -1;
+                    if (!getLzxBlocksCache().isEmpty()) {
+                        for (i = 0; i < getLzxBlocksCache().size(); i++) {
+                            //lzxBlock = getLzxBlocksCache().get(i);
+                            int bn = getLzxBlocksCache().get(i).getBlockNumber();
+                            for (int j = bb.getIniBlock(); j <= bb.getStartBlock(); j++) {
+                                if (bn == j) {
+                                    if (j > start) {
+                                        start = j;
+                                        hit_cache = i;
+                                    }
+                                }
+                            }
+                            if (start == bb.getStartBlock())
+                                break;
+                        }
+                    }
+
+//                    if (i == getLzxBlocksCache().size() && i == 0) {
+                    if (start<0) {
+                        start = bb.getIniBlock();
+
+                        byte[] dataSegment = ChmCommons.getChmBlockSegment(
+                                getData(),
+                                getChmLzxcResetTable(), start,
+                                (int) getLzxBlockOffset(),
+                                (int) getLzxBlockLength());
+
+                        lzxBlock = new ChmLzxBlock(start, dataSegment,
+                                getChmLzxcResetTable().getBlockLen(), null);
+
+                        getLzxBlocksCache().add(lzxBlock);
+                    } else {
+                        lzxBlock = getLzxBlocksCache().get(hit_cache);
+                    }
+
+                    for (i = start; i <= bb.getEndBlock();) {
+                        if (i == bb.getStartBlock() && i == bb.getEndBlock()) {
+                            buffer.write(lzxBlock.getContent(
+                                    bb.getStartOffset(), bb.getEndOffset()));
+                            break;
+                        }
+
+                        if (i == bb.getStartBlock()) {
+                            buffer.write(lzxBlock.getContent(
+                                    bb.getStartOffset()));
+                        }
+
+                        if (i > bb.getStartBlock() && i < bb.getEndBlock()) {
+                            buffer.write(lzxBlock.getContent());
+                        }
+
+                        if (i == bb.getEndBlock()) {
+                            buffer.write(lzxBlock.getContent(
+                                    0, bb.getEndOffset()));
+                            break;
+                        }
+
+                        i++;
+
+                        if (i % getChmLzxcControlData().getResetInterval() == 0) {
+                            lzxBlock = new ChmLzxBlock(i,
+                                    ChmCommons.getChmBlockSegment(getData(),
+                                            getChmLzxcResetTable(), i,
+                                            (int) getLzxBlockOffset(),
+                                            (int) getLzxBlockLength()),
+                                    getChmLzxcResetTable().getBlockLen(), null);
+                        } else {
+                            lzxBlock = new ChmLzxBlock(i,
+                                    ChmCommons.getChmBlockSegment(getData(),
+                                            getChmLzxcResetTable(), i,
+                                            (int) getLzxBlockOffset(),
+                                            (int) getLzxBlockLength()),
+                                    getChmLzxcResetTable().getBlockLen(),
+                                    lzxBlock);
+                        }
+
+                        getLzxBlocksCache().add(lzxBlock);
+                    }
+
+                    if (getLzxBlocksCache().size() > getChmLzxcResetTable()
+                            .getBlockCount()) {
+                        getLzxBlocksCache().clear();
+                    }
+                } //end of if
+                
+                if (buffer.size() != directoryListingEntry.getLength()) {
+                    throw new TikaException("CHM file extract error: extracted Length is wrong.");
+                }
+            } //end of if compressed
+        } catch (Exception e) {
+            throw new TikaException(e.getMessage());
+        }
+
+        return buffer.toByteArray();
+    }
+
+    private void setLzxBlocksCache(List<ChmLzxBlock> lzxBlocksCache) {
+        this.lzxBlocksCache = lzxBlocksCache;
+    }
+
+    private List<ChmLzxBlock> getLzxBlocksCache() {
+        return lzxBlocksCache;
+    }
+
+    private void setChmDirList(ChmDirectoryListingSet chmDirList) {
+        this.chmDirList = chmDirList;
+    }
+
+    public ChmDirectoryListingSet getChmDirList() {
+        return chmDirList;
+    }
+
+    private void setChmItsfHeader(ChmItsfHeader chmItsfHeader) {
+        this.chmItsfHeader = chmItsfHeader;
+    }
+
+    private ChmItsfHeader getChmItsfHeader() {
+        return chmItsfHeader;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmWrapper.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmWrapper.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmWrapper.java
index 03f81d3..9ed1898 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmWrapper.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmWrapper.java
@@ -1,147 +1,147 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.chm.core;
-
-import java.util.List;
-
-import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
-import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
-import org.apache.tika.parser.chm.accessor.ChmItspHeader;
-import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
-import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
-import org.apache.tika.parser.chm.lzx.ChmLzxBlock;
-
-public class ChmWrapper {
-    private List<ChmLzxBlock> lzxBlocksCache = null;
-    private ChmDirectoryListingSet chmDirList = null;
-    private ChmItsfHeader chmItsfHeader = null;
-    private ChmItspHeader chmItspHeader = null;
-    private ChmLzxcResetTable chmLzxcResetTable = null;
-    private ChmLzxcControlData chmLzxcControlData = null;
-    private byte[] data = null;
-    private int indexOfContent;
-    private long lzxBlockOffset;
-    private long lzxBlockLength;
-    private int indexOfResetData;
-    private int indexOfResetTable;
-    private int startIndex;
-
-    protected int getStartIndex() {
-        return startIndex;
-    }
-
-    protected void setStartIndex(int startIndex) {
-        this.startIndex = startIndex;
-    }
-
-    protected int getIndexOfResetTable() {
-        return indexOfResetTable;
-    }
-
-    protected void setIndexOfResetTable(int indexOfResetTable) {
-        this.indexOfResetTable = indexOfResetTable;
-    }
-
-    protected List<ChmLzxBlock> getLzxBlocksCache() {
-        return lzxBlocksCache;
-    }
-
-    protected void setLzxBlocksCache(List<ChmLzxBlock> lzxBlocksCache) {
-        this.lzxBlocksCache = lzxBlocksCache;
-    }
-
-    protected ChmDirectoryListingSet getChmDirList() {
-        return chmDirList;
-    }
-
-    protected void setChmDirList(ChmDirectoryListingSet chmDirList) {
-        this.chmDirList = chmDirList;
-    }
-
-    protected ChmItsfHeader getChmItsfHeader() {
-        return chmItsfHeader;
-    }
-
-    protected void setChmItsfHeader(ChmItsfHeader chmItsfHeader) {
-        this.chmItsfHeader = chmItsfHeader;
-    }
-
-    protected ChmLzxcResetTable getChmLzxcResetTable() {
-        return chmLzxcResetTable;
-    }
-
-    protected void setChmLzxcResetTable(ChmLzxcResetTable chmLzxcResetTable) {
-        this.chmLzxcResetTable = chmLzxcResetTable;
-    }
-
-    protected ChmLzxcControlData getChmLzxcControlData() {
-        return chmLzxcControlData;
-    }
-
-    protected void setChmLzxcControlData(ChmLzxcControlData chmLzxcControlData) {
-        this.chmLzxcControlData = chmLzxcControlData;
-    }
-
-    protected byte[] getData() {
-        return data;
-    }
-
-    protected void setData(byte[] data) {
-        this.data = data;
-    }
-
-    protected int getIndexOfContent() {
-        return indexOfContent;
-    }
-
-    protected void setIndexOfContent(int indexOfContent) {
-        this.indexOfContent = indexOfContent;
-    }
-
-    protected long getLzxBlockOffset() {
-        return lzxBlockOffset;
-    }
-
-    protected void setLzxBlockOffset(long lzxBlockOffset) {
-        this.lzxBlockOffset = lzxBlockOffset;
-    }
-
-    protected long getLzxBlockLength() {
-        return lzxBlockLength;
-    }
-
-    protected void setLzxBlockLength(long lzxBlockLength) {
-        this.lzxBlockLength = lzxBlockLength;
-    }
-
-    protected void setChmItspHeader(ChmItspHeader chmItspHeader) {
-        this.chmItspHeader = chmItspHeader;
-    }
-
-    protected ChmItspHeader getChmItspHeader() {
-        return chmItspHeader;
-    }
-
-    protected void setIndexOfResetData(int indexOfResetData) {
-        this.indexOfResetData = indexOfResetData;
-    }
-
-    protected int getIndexOfResetData() {
-        return indexOfResetData;
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.chm.core;
+
+import java.util.List;
+
+import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
+import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
+import org.apache.tika.parser.chm.accessor.ChmItspHeader;
+import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
+import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
+import org.apache.tika.parser.chm.lzx.ChmLzxBlock;
+
+public class ChmWrapper {
+    private List<ChmLzxBlock> lzxBlocksCache = null;
+    private ChmDirectoryListingSet chmDirList = null;
+    private ChmItsfHeader chmItsfHeader = null;
+    private ChmItspHeader chmItspHeader = null;
+    private ChmLzxcResetTable chmLzxcResetTable = null;
+    private ChmLzxcControlData chmLzxcControlData = null;
+    private byte[] data = null;
+    private int indexOfContent;
+    private long lzxBlockOffset;
+    private long lzxBlockLength;
+    private int indexOfResetData;
+    private int indexOfResetTable;
+    private int startIndex;
+
+    protected int getStartIndex() {
+        return startIndex;
+    }
+
+    protected void setStartIndex(int startIndex) {
+        this.startIndex = startIndex;
+    }
+
+    protected int getIndexOfResetTable() {
+        return indexOfResetTable;
+    }
+
+    protected void setIndexOfResetTable(int indexOfResetTable) {
+        this.indexOfResetTable = indexOfResetTable;
+    }
+
+    protected List<ChmLzxBlock> getLzxBlocksCache() {
+        return lzxBlocksCache;
+    }
+
+    protected void setLzxBlocksCache(List<ChmLzxBlock> lzxBlocksCache) {
+        this.lzxBlocksCache = lzxBlocksCache;
+    }
+
+    protected ChmDirectoryListingSet getChmDirList() {
+        return chmDirList;
+    }
+
+    protected void setChmDirList(ChmDirectoryListingSet chmDirList) {
+        this.chmDirList = chmDirList;
+    }
+
+    protected ChmItsfHeader getChmItsfHeader() {
+        return chmItsfHeader;
+    }
+
+    protected void setChmItsfHeader(ChmItsfHeader chmItsfHeader) {
+        this.chmItsfHeader = chmItsfHeader;
+    }
+
+    protected ChmLzxcResetTable getChmLzxcResetTable() {
+        return chmLzxcResetTable;
+    }
+
+    protected void setChmLzxcResetTable(ChmLzxcResetTable chmLzxcResetTable) {
+        this.chmLzxcResetTable = chmLzxcResetTable;
+    }
+
+    protected ChmLzxcControlData getChmLzxcControlData() {
+        return chmLzxcControlData;
+    }
+
+    protected void setChmLzxcControlData(ChmLzxcControlData chmLzxcControlData) {
+        this.chmLzxcControlData = chmLzxcControlData;
+    }
+
+    protected byte[] getData() {
+        return data;
+    }
+
+    protected void setData(byte[] data) {
+        this.data = data;
+    }
+
+    protected int getIndexOfContent() {
+        return indexOfContent;
+    }
+
+    protected void setIndexOfContent(int indexOfContent) {
+        this.indexOfContent = indexOfContent;
+    }
+
+    protected long getLzxBlockOffset() {
+        return lzxBlockOffset;
+    }
+
+    protected void setLzxBlockOffset(long lzxBlockOffset) {
+        this.lzxBlockOffset = lzxBlockOffset;
+    }
+
+    protected long getLzxBlockLength() {
+        return lzxBlockLength;
+    }
+
+    protected void setLzxBlockLength(long lzxBlockLength) {
+        this.lzxBlockLength = lzxBlockLength;
+    }
+
+    protected void setChmItspHeader(ChmItspHeader chmItspHeader) {
+        this.chmItspHeader = chmItspHeader;
+    }
+
+    protected ChmItspHeader getChmItspHeader() {
+        return chmItspHeader;
+    }
+
+    protected void setIndexOfResetData(int indexOfResetData) {
+        this.indexOfResetData = indexOfResetData;
+    }
+
+    protected int getIndexOfResetData() {
+        return indexOfResetData;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/exception/ChmParsingException.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/exception/ChmParsingException.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/exception/ChmParsingException.java
index fbed908..46c522b 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/exception/ChmParsingException.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/exception/ChmParsingException.java
@@ -1,27 +1,27 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.exception;
-
-import org.apache.tika.exception.TikaException;
-
-public class ChmParsingException extends TikaException {
-    private static final long serialVersionUID = 6497936044733665210L;
-
-    public ChmParsingException(String description) {
-        super(description);
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.exception;
+
+import org.apache.tika.exception.TikaException;
+
+public class ChmParsingException extends TikaException {
+    private static final long serialVersionUID = 6497936044733665210L;
+
+    public ChmParsingException(String description) {
+        super(description);
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmBlockInfo.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmBlockInfo.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmBlockInfo.java
index 7f7564d..cda829c 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmBlockInfo.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmBlockInfo.java
@@ -1,235 +1,235 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.lzx;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
-import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
-import org.apache.tika.parser.chm.exception.ChmParsingException;
-
-/**
- * A container that contains chm block information such as: i. initial block is
- * using to reset main tree ii. start block is using for knowing where to start
- * iii. end block is using for knowing where to stop iv. start offset is using
- * for knowing where to start reading v. end offset is using for knowing where
- * to stop reading
- * 
- */
-public class ChmBlockInfo {
-    /* class members */
-    private int iniBlock;
-    private int startBlock;
-    private int endBlock;
-    private int startOffset;
-    private int endOffset;
-
-    private static ChmBlockInfo chmBlockInfo = null;
-
-    private ChmBlockInfo() {
-
-    }
-
-    /**
-     * Returns an information related to the chmBlockInfo
-     * 
-     * @param dle
-     *            - DirectoryListingEntry
-     * @param bytesPerBlock
-     *            - int, = chmLzxcResetTable.block_length
-     * @param clcd
-     *            - ChmLzxcControlData
-     * @param chmBlockInfo
-     *            - ChmBlockInfo
-     * 
-     * @return ChmBlockInfo
-     * @throws TikaException 
-     */
-    protected ChmBlockInfo getChmBlockInfo(DirectoryListingEntry dle,
-            int bytesPerBlock, ChmLzxcControlData clcd,
-            ChmBlockInfo chmBlockInfo) throws TikaException {
-        if (!validateParameters(dle, bytesPerBlock, clcd, chmBlockInfo))
-            throw new ChmParsingException("Please check you parameters");
-
-        chmBlockInfo.setStartBlock(dle.getOffset() / bytesPerBlock);
-        chmBlockInfo.setEndBlock((dle.getOffset() + dle.getLength())
-                / bytesPerBlock);
-        chmBlockInfo.setStartOffset(dle.getOffset() % bytesPerBlock);
-        chmBlockInfo.setEndOffset((dle.getOffset() + dle.getLength())
-                % bytesPerBlock);
-        // potential problem with casting long to int
-        chmBlockInfo
-                .setIniBlock(chmBlockInfo.startBlock - 
-                        chmBlockInfo.startBlock % (int) clcd.getResetInterval());
-//                .setIniBlock((chmBlockInfo.startBlock - chmBlockInfo.startBlock)
-//                        % (int) clcd.getResetInterval());
-        return chmBlockInfo;
-    }
-
-    public static ChmBlockInfo getChmBlockInfoInstance(
-            DirectoryListingEntry dle, int bytesPerBlock,
-            ChmLzxcControlData clcd) {
-        setChmBlockInfo(new ChmBlockInfo());
-        getChmBlockInfo().setStartBlock(dle.getOffset() / bytesPerBlock);
-        getChmBlockInfo().setEndBlock(
-                (dle.getOffset() + dle.getLength()) / bytesPerBlock);
-        getChmBlockInfo().setStartOffset(dle.getOffset() % bytesPerBlock);
-        getChmBlockInfo().setEndOffset(
-                (dle.getOffset() + dle.getLength()) % bytesPerBlock);
-        // potential problem with casting long to int
-        getChmBlockInfo().setIniBlock(
-                getChmBlockInfo().startBlock - getChmBlockInfo().startBlock
-                        % (int) clcd.getResetInterval());
-//                (getChmBlockInfo().startBlock - getChmBlockInfo().startBlock)
-//                        % (int) clcd.getResetInterval());
-        return getChmBlockInfo();
-    }
-
-    /**
-     * Returns textual representation of ChmBlockInfo
-     */
-    public String toString() {
-        StringBuilder sb = new StringBuilder();
-        sb.append("iniBlock:=" + getIniBlock() + ", ");
-        sb.append("startBlock:=" + getStartBlock() + ", ");
-        sb.append("endBlock:=" + getEndBlock() + ", ");
-        sb.append("startOffset:=" + getStartOffset() + ", ");
-        sb.append("endOffset:=" + getEndOffset()
-                + System.getProperty("line.separator"));
-        return sb.toString();
-    }
-
-    private boolean validateParameters(DirectoryListingEntry dle,
-            int bytesPerBlock, ChmLzxcControlData clcd,
-            ChmBlockInfo chmBlockInfo) {
-        int goodParameter = 0;
-        if (dle != null)
-            ++goodParameter;
-        if (bytesPerBlock > 0)
-            ++goodParameter;
-        if (clcd != null)
-            ++goodParameter;
-        if (chmBlockInfo != null)
-            ++goodParameter;
-        return (goodParameter == 4);
-    }
-
-    public static void main(String[] args) {
-    }
-
-    /**
-     * Returns an initial block index
-     * 
-     * @return int
-     */
-    public int getIniBlock() {
-        return iniBlock;
-    }
-
-    /**
-     * Sets the initial block index
-     * 
-     * @param iniBlock
-     *            - int
-     */
-    private void setIniBlock(int iniBlock) {
-        this.iniBlock = iniBlock;
-    }
-
-    /**
-     * Returns the start block index
-     * 
-     * @return int
-     */
-    public int getStartBlock() {
-        return startBlock;
-    }
-
-    /**
-     * Sets the start block index
-     * 
-     * @param startBlock
-     *            - int
-     */
-    private void setStartBlock(int startBlock) {
-        this.startBlock = startBlock;
-    }
-
-    /**
-     * Returns the end block index
-     * 
-     * @return - int
-     */
-    public int getEndBlock() {
-        return endBlock;
-    }
-
-    /**
-     * Sets the end block index
-     * 
-     * @param endBlock
-     *            - int
-     */
-    private void setEndBlock(int endBlock) {
-        this.endBlock = endBlock;
-    }
-
-    /**
-     * Returns the start offset index
-     * 
-     * @return - int
-     */
-    public int getStartOffset() {
-        return startOffset;
-    }
-
-    /**
-     * Sets the start offset index
-     * 
-     * @param startOffset
-     *            - int
-     */
-    private void setStartOffset(int startOffset) {
-        this.startOffset = startOffset;
-    }
-
-    /**
-     * Returns the end offset index
-     * 
-     * @return - int
-     */
-    public int getEndOffset() {
-        return endOffset;
-    }
-
-    /**
-     * Sets the end offset index
-     * 
-     * @param endOffset
-     *            - int
-     */
-    private void setEndOffset(int endOffset) {
-        this.endOffset = endOffset;
-    }
-
-    public static void setChmBlockInfo(ChmBlockInfo chmBlockInfo) {
-        ChmBlockInfo.chmBlockInfo = chmBlockInfo;
-    }
-
-    public static ChmBlockInfo getChmBlockInfo() {
-        return chmBlockInfo;
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.lzx;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * A container that contains chm block information such as: i. initial block is
+ * using to reset main tree ii. start block is using for knowing where to start
+ * iii. end block is using for knowing where to stop iv. start offset is using
+ * for knowing where to start reading v. end offset is using for knowing where
+ * to stop reading
+ * 
+ */
+public class ChmBlockInfo {
+    /* class members */
+    private int iniBlock;
+    private int startBlock;
+    private int endBlock;
+    private int startOffset;
+    private int endOffset;
+
+    private static ChmBlockInfo chmBlockInfo = null;
+
+    private ChmBlockInfo() {
+
+    }
+
+    /**
+     * Returns an information related to the chmBlockInfo
+     * 
+     * @param dle
+     *            - DirectoryListingEntry
+     * @param bytesPerBlock
+     *            - int, = chmLzxcResetTable.block_length
+     * @param clcd
+     *            - ChmLzxcControlData
+     * @param chmBlockInfo
+     *            - ChmBlockInfo
+     * 
+     * @return ChmBlockInfo
+     * @throws TikaException 
+     */
+    protected ChmBlockInfo getChmBlockInfo(DirectoryListingEntry dle,
+            int bytesPerBlock, ChmLzxcControlData clcd,
+            ChmBlockInfo chmBlockInfo) throws TikaException {
+        if (!validateParameters(dle, bytesPerBlock, clcd, chmBlockInfo))
+            throw new ChmParsingException("Please check you parameters");
+
+        chmBlockInfo.setStartBlock(dle.getOffset() / bytesPerBlock);
+        chmBlockInfo.setEndBlock((dle.getOffset() + dle.getLength())
+                / bytesPerBlock);
+        chmBlockInfo.setStartOffset(dle.getOffset() % bytesPerBlock);
+        chmBlockInfo.setEndOffset((dle.getOffset() + dle.getLength())
+                % bytesPerBlock);
+        // potential problem with casting long to int
+        chmBlockInfo
+                .setIniBlock(chmBlockInfo.startBlock - 
+                        chmBlockInfo.startBlock % (int) clcd.getResetInterval());
+//                .setIniBlock((chmBlockInfo.startBlock - chmBlockInfo.startBlock)
+//                        % (int) clcd.getResetInterval());
+        return chmBlockInfo;
+    }
+
+    public static ChmBlockInfo getChmBlockInfoInstance(
+            DirectoryListingEntry dle, int bytesPerBlock,
+            ChmLzxcControlData clcd) {
+        setChmBlockInfo(new ChmBlockInfo());
+        getChmBlockInfo().setStartBlock(dle.getOffset() / bytesPerBlock);
+        getChmBlockInfo().setEndBlock(
+                (dle.getOffset() + dle.getLength()) / bytesPerBlock);
+        getChmBlockInfo().setStartOffset(dle.getOffset() % bytesPerBlock);
+        getChmBlockInfo().setEndOffset(
+                (dle.getOffset() + dle.getLength()) % bytesPerBlock);
+        // potential problem with casting long to int
+        getChmBlockInfo().setIniBlock(
+                getChmBlockInfo().startBlock - getChmBlockInfo().startBlock
+                        % (int) clcd.getResetInterval());
+//                (getChmBlockInfo().startBlock - getChmBlockInfo().startBlock)
+//                        % (int) clcd.getResetInterval());
+        return getChmBlockInfo();
+    }
+
+    /**
+     * Returns textual representation of ChmBlockInfo
+     */
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("iniBlock:=" + getIniBlock() + ", ");
+        sb.append("startBlock:=" + getStartBlock() + ", ");
+        sb.append("endBlock:=" + getEndBlock() + ", ");
+        sb.append("startOffset:=" + getStartOffset() + ", ");
+        sb.append("endOffset:=" + getEndOffset()
+                + System.getProperty("line.separator"));
+        return sb.toString();
+    }
+
+    private boolean validateParameters(DirectoryListingEntry dle,
+            int bytesPerBlock, ChmLzxcControlData clcd,
+            ChmBlockInfo chmBlockInfo) {
+        int goodParameter = 0;
+        if (dle != null)
+            ++goodParameter;
+        if (bytesPerBlock > 0)
+            ++goodParameter;
+        if (clcd != null)
+            ++goodParameter;
+        if (chmBlockInfo != null)
+            ++goodParameter;
+        return (goodParameter == 4);
+    }
+
+    public static void main(String[] args) {
+    }
+
+    /**
+     * Returns an initial block index
+     * 
+     * @return int
+     */
+    public int getIniBlock() {
+        return iniBlock;
+    }
+
+    /**
+     * Sets the initial block index
+     * 
+     * @param iniBlock
+     *            - int
+     */
+    private void setIniBlock(int iniBlock) {
+        this.iniBlock = iniBlock;
+    }
+
+    /**
+     * Returns the start block index
+     * 
+     * @return int
+     */
+    public int getStartBlock() {
+        return startBlock;
+    }
+
+    /**
+     * Sets the start block index
+     * 
+     * @param startBlock
+     *            - int
+     */
+    private void setStartBlock(int startBlock) {
+        this.startBlock = startBlock;
+    }
+
+    /**
+     * Returns the end block index
+     * 
+     * @return - int
+     */
+    public int getEndBlock() {
+        return endBlock;
+    }
+
+    /**
+     * Sets the end block index
+     * 
+     * @param endBlock
+     *            - int
+     */
+    private void setEndBlock(int endBlock) {
+        this.endBlock = endBlock;
+    }
+
+    /**
+     * Returns the start offset index
+     * 
+     * @return - int
+     */
+    public int getStartOffset() {
+        return startOffset;
+    }
+
+    /**
+     * Sets the start offset index
+     * 
+     * @param startOffset
+     *            - int
+     */
+    private void setStartOffset(int startOffset) {
+        this.startOffset = startOffset;
+    }
+
+    /**
+     * Returns the end offset index
+     * 
+     * @return - int
+     */
+    public int getEndOffset() {
+        return endOffset;
+    }
+
+    /**
+     * Sets the end offset index
+     * 
+     * @param endOffset
+     *            - int
+     */
+    private void setEndOffset(int endOffset) {
+        this.endOffset = endOffset;
+    }
+
+    public static void setChmBlockInfo(ChmBlockInfo chmBlockInfo) {
+        ChmBlockInfo.chmBlockInfo = chmBlockInfo;
+    }
+
+    public static ChmBlockInfo getChmBlockInfo() {
+        return chmBlockInfo;
+    }
+}

[31/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
index 53bf241..3b79f31 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
@@ -1,246 +1,246 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mp3;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TailStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.XMPDM;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.mp3.ID3Tags.ID3Comment;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * The <code>Mp3Parser</code> is used to parse ID3 Version 1 Tag information
- * from an MP3 file, if available.
- *
- * @see <a href="http://www.id3.org/ID3v1">MP3 ID3 Version 1 specification</a>
- * @see <a href="http://www.id3.org/id3v2.4.0-structure">MP3 ID3 Version 2.4 Structure Specification</a>
- * @see <a href="http://www.id3.org/id3v2.4.0-frames">MP3 ID3 Version 2.4 Frames Specification</a>
- */
-public class Mp3Parser extends AbstractParser {
-
-    /** Serial version UID */
-    private static final long serialVersionUID = 8537074922934844370L;
-
-    private static final Set<MediaType> SUPPORTED_TYPES =
-        Collections.singleton(MediaType.audio("mpeg"));
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-
-    public void parse(
-            InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-        metadata.set(Metadata.CONTENT_TYPE, "audio/mpeg");
-        metadata.set(XMPDM.AUDIO_COMPRESSOR, "MP3");
-
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-        xhtml.startDocument();
-
-        // Create handlers for the various kinds of ID3 tags
-        ID3TagsAndAudio audioAndTags = getAllTagHandlers(stream, handler);
-
-        // Process tags metadata if the file has supported tags
-        if (audioAndTags.tags.length > 0) {
-           CompositeTagHandler tag = new CompositeTagHandler(audioAndTags.tags);
-
-           metadata.set(TikaCoreProperties.TITLE, tag.getTitle());
-           metadata.set(TikaCoreProperties.CREATOR, tag.getArtist());
-           metadata.set(XMPDM.ARTIST, tag.getArtist());
-           metadata.set(XMPDM.ALBUM_ARTIST, tag.getAlbumArtist());
-           metadata.set(XMPDM.COMPOSER, tag.getComposer());
-           metadata.set(XMPDM.ALBUM, tag.getAlbum());
-           metadata.set(XMPDM.COMPILATION, tag.getCompilation());
-           metadata.set(XMPDM.RELEASE_DATE, tag.getYear());
-           metadata.set(XMPDM.GENRE, tag.getGenre());
-
-           List<String> comments = new ArrayList<String>();
-           for (ID3Comment comment : tag.getComments()) {
-              StringBuffer cmt = new StringBuffer();
-              if (comment.getLanguage() != null) {
-                 cmt.append(comment.getLanguage());
-                 cmt.append(" - ");
-              }
-              if (comment.getDescription() != null) {
-                 cmt.append(comment.getDescription());
-                 if (comment.getText() != null) {
-                    cmt.append("\n");
-                 }
-              }
-              if (comment.getText() != null) {
-                 cmt.append(comment.getText());
-              }
-              
-              comments.add(cmt.toString());
-              metadata.add(XMPDM.LOG_COMMENT.getName(), cmt.toString());
-           }
-
-           xhtml.element("h1", tag.getTitle());
-           xhtml.element("p", tag.getArtist());
-
-            // ID3v1.1 Track addition
-            StringBuilder sb = new StringBuilder();
-            sb.append(tag.getAlbum());
-            if (tag.getTrackNumber() != null) {
-                sb.append(", track ").append(tag.getTrackNumber());
-                metadata.set(XMPDM.TRACK_NUMBER, tag.getTrackNumber());
-            }
-            if (tag.getDisc() != null) {
-                sb.append(", disc ").append(tag.getDisc());
-                metadata.set(XMPDM.DISC_NUMBER, tag.getDisc());
-            }
-            xhtml.element("p", sb.toString());
-            
-            xhtml.element("p", tag.getYear());
-            xhtml.element("p", tag.getGenre());
-            xhtml.element("p", String.valueOf(audioAndTags.duration));
-            for (String comment : comments) {
-               xhtml.element("p", comment);
-            }
-        }
-        if (audioAndTags.duration > 0) {
-            metadata.set(XMPDM.DURATION, audioAndTags.duration);
-        }
-        if (audioAndTags.audio != null) {
-            metadata.set("samplerate", String.valueOf(audioAndTags.audio.getSampleRate()));
-            metadata.set("channels", String.valueOf(audioAndTags.audio.getChannels()));
-            metadata.set("version", audioAndTags.audio.getVersion());
-            
-            metadata.set(
-                    XMPDM.AUDIO_SAMPLE_RATE,
-                    Integer.toString(audioAndTags.audio.getSampleRate()));
-            if(audioAndTags.audio.getChannels() == 1) {
-               metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Mono");
-            } else if(audioAndTags.audio.getChannels() == 2) {
-               metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Stereo");
-            } else if(audioAndTags.audio.getChannels() == 5) {
-               metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "5.1");
-            } else if(audioAndTags.audio.getChannels() == 7) {
-               metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "7.1");
-            }
-        }
-        if (audioAndTags.lyrics != null && audioAndTags.lyrics.hasLyrics()) {
-           xhtml.startElement("p", "class", "lyrics");
-           xhtml.characters(audioAndTags.lyrics.lyricsText);
-           xhtml.endElement("p");
-        }
-
-        xhtml.endDocument();
-    }
-
-    /**
-     * Scans the MP3 frames for ID3 tags, and creates ID3Tag Handlers
-     *  for each supported set of tags. 
-     */
-    protected static ID3TagsAndAudio getAllTagHandlers(InputStream stream, ContentHandler handler)
-           throws IOException, SAXException, TikaException {
-       ID3v24Handler v24 = null;
-       ID3v23Handler v23 = null;
-       ID3v22Handler v22 = null;
-       ID3v1Handler v1 = null;
-       LyricsHandler lyrics = null;
-       AudioFrame firstAudio = null;
-
-       TailStream tailStream = new TailStream(stream, 10240+128);
-       MpegStream mpegStream = new MpegStream(tailStream);
-
-       // ID3v2 tags live at the start of the file
-       // You can apparently have several different ID3 tag blocks
-       // So, keep going until we don't find any more
-       MP3Frame f;
-       while ((f = ID3v2Frame.createFrameIfPresent(mpegStream)) != null) {
-           if(f instanceof ID3v2Frame) {
-               ID3v2Frame id3F = (ID3v2Frame)f;
-               if (id3F.getMajorVersion() == 4) {
-                   v24 = new ID3v24Handler(id3F);
-               } else if(id3F.getMajorVersion() == 3) {
-                   v23 = new ID3v23Handler(id3F);
-               } else if(id3F.getMajorVersion() == 2) {
-                   v22 = new ID3v22Handler(id3F);
-               }
-           }
-       }
-
-        // Now iterate over all audio frames in the file
-        AudioFrame frame = mpegStream.nextFrame();
-        float duration = 0;
-        while (frame != null)
-        {
-            duration += frame.getDuration();
-            if (firstAudio == null)
-            {
-                firstAudio = frame;
-            }
-            mpegStream.skipFrame();
-            frame = mpegStream.nextFrame();
-        }
-
-       // ID3v1 tags live at the end of the file
-       // Lyrics live just before ID3v1, at the end of the file
-       // Search for both (handlers seek to the end for us)
-       lyrics = new LyricsHandler(tailStream.getTail());
-       v1 = lyrics.id3v1;
-
-       // Go in order of preference
-       // Currently, that's newest to oldest
-       List<ID3Tags> tags = new ArrayList<ID3Tags>();
-
-       if(v24 != null && v24.getTagsPresent()) {
-          tags.add(v24);
-       }
-       if(v23 != null && v23.getTagsPresent()) {
-          tags.add(v23);
-       }
-       if(v22 != null && v22.getTagsPresent()) {
-          tags.add(v22);
-       }
-       if(v1 != null && v1.getTagsPresent()) {
-          tags.add(v1);
-       }
-       
-       ID3TagsAndAudio ret = new ID3TagsAndAudio();
-       ret.audio = firstAudio;
-       ret.lyrics = lyrics;
-       ret.tags = tags.toArray(new ID3Tags[tags.size()]);
-       ret.duration = duration;
-       return ret;
-    }
-
-    protected static class ID3TagsAndAudio {
-        private ID3Tags[] tags;
-        private AudioFrame audio;
-        private LyricsHandler lyrics;
-        private float duration;
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TailStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMPDM;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.mp3.ID3Tags.ID3Comment;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * The <code>Mp3Parser</code> is used to parse ID3 Version 1 Tag information
+ * from an MP3 file, if available.
+ *
+ * @see <a href="http://www.id3.org/ID3v1">MP3 ID3 Version 1 specification</a>
+ * @see <a href="http://www.id3.org/id3v2.4.0-structure">MP3 ID3 Version 2.4 Structure Specification</a>
+ * @see <a href="http://www.id3.org/id3v2.4.0-frames">MP3 ID3 Version 2.4 Frames Specification</a>
+ */
+public class Mp3Parser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 8537074922934844370L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+        Collections.singleton(MediaType.audio("mpeg"));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        metadata.set(Metadata.CONTENT_TYPE, "audio/mpeg");
+        metadata.set(XMPDM.AUDIO_COMPRESSOR, "MP3");
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        // Create handlers for the various kinds of ID3 tags
+        ID3TagsAndAudio audioAndTags = getAllTagHandlers(stream, handler);
+
+        // Process tags metadata if the file has supported tags
+        if (audioAndTags.tags.length > 0) {
+           CompositeTagHandler tag = new CompositeTagHandler(audioAndTags.tags);
+
+           metadata.set(TikaCoreProperties.TITLE, tag.getTitle());
+           metadata.set(TikaCoreProperties.CREATOR, tag.getArtist());
+           metadata.set(XMPDM.ARTIST, tag.getArtist());
+           metadata.set(XMPDM.ALBUM_ARTIST, tag.getAlbumArtist());
+           metadata.set(XMPDM.COMPOSER, tag.getComposer());
+           metadata.set(XMPDM.ALBUM, tag.getAlbum());
+           metadata.set(XMPDM.COMPILATION, tag.getCompilation());
+           metadata.set(XMPDM.RELEASE_DATE, tag.getYear());
+           metadata.set(XMPDM.GENRE, tag.getGenre());
+
+           List<String> comments = new ArrayList<String>();
+           for (ID3Comment comment : tag.getComments()) {
+              StringBuffer cmt = new StringBuffer();
+              if (comment.getLanguage() != null) {
+                 cmt.append(comment.getLanguage());
+                 cmt.append(" - ");
+              }
+              if (comment.getDescription() != null) {
+                 cmt.append(comment.getDescription());
+                 if (comment.getText() != null) {
+                    cmt.append("\n");
+                 }
+              }
+              if (comment.getText() != null) {
+                 cmt.append(comment.getText());
+              }
+              
+              comments.add(cmt.toString());
+              metadata.add(XMPDM.LOG_COMMENT.getName(), cmt.toString());
+           }
+
+           xhtml.element("h1", tag.getTitle());
+           xhtml.element("p", tag.getArtist());
+
+            // ID3v1.1 Track addition
+            StringBuilder sb = new StringBuilder();
+            sb.append(tag.getAlbum());
+            if (tag.getTrackNumber() != null) {
+                sb.append(", track ").append(tag.getTrackNumber());
+                metadata.set(XMPDM.TRACK_NUMBER, tag.getTrackNumber());
+            }
+            if (tag.getDisc() != null) {
+                sb.append(", disc ").append(tag.getDisc());
+                metadata.set(XMPDM.DISC_NUMBER, tag.getDisc());
+            }
+            xhtml.element("p", sb.toString());
+            
+            xhtml.element("p", tag.getYear());
+            xhtml.element("p", tag.getGenre());
+            xhtml.element("p", String.valueOf(audioAndTags.duration));
+            for (String comment : comments) {
+               xhtml.element("p", comment);
+            }
+        }
+        if (audioAndTags.duration > 0) {
+            metadata.set(XMPDM.DURATION, audioAndTags.duration);
+        }
+        if (audioAndTags.audio != null) {
+            metadata.set("samplerate", String.valueOf(audioAndTags.audio.getSampleRate()));
+            metadata.set("channels", String.valueOf(audioAndTags.audio.getChannels()));
+            metadata.set("version", audioAndTags.audio.getVersion());
+            
+            metadata.set(
+                    XMPDM.AUDIO_SAMPLE_RATE,
+                    Integer.toString(audioAndTags.audio.getSampleRate()));
+            if(audioAndTags.audio.getChannels() == 1) {
+               metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Mono");
+            } else if(audioAndTags.audio.getChannels() == 2) {
+               metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Stereo");
+            } else if(audioAndTags.audio.getChannels() == 5) {
+               metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "5.1");
+            } else if(audioAndTags.audio.getChannels() == 7) {
+               metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "7.1");
+            }
+        }
+        if (audioAndTags.lyrics != null && audioAndTags.lyrics.hasLyrics()) {
+           xhtml.startElement("p", "class", "lyrics");
+           xhtml.characters(audioAndTags.lyrics.lyricsText);
+           xhtml.endElement("p");
+        }
+
+        xhtml.endDocument();
+    }
+
+    /**
+     * Scans the MP3 frames for ID3 tags, and creates ID3Tag Handlers
+     *  for each supported set of tags. 
+     */
+    protected static ID3TagsAndAudio getAllTagHandlers(InputStream stream, ContentHandler handler)
+           throws IOException, SAXException, TikaException {
+       ID3v24Handler v24 = null;
+       ID3v23Handler v23 = null;
+       ID3v22Handler v22 = null;
+       ID3v1Handler v1 = null;
+       LyricsHandler lyrics = null;
+       AudioFrame firstAudio = null;
+
+       TailStream tailStream = new TailStream(stream, 10240+128);
+       MpegStream mpegStream = new MpegStream(tailStream);
+
+       // ID3v2 tags live at the start of the file
+       // You can apparently have several different ID3 tag blocks
+       // So, keep going until we don't find any more
+       MP3Frame f;
+       while ((f = ID3v2Frame.createFrameIfPresent(mpegStream)) != null) {
+           if(f instanceof ID3v2Frame) {
+               ID3v2Frame id3F = (ID3v2Frame)f;
+               if (id3F.getMajorVersion() == 4) {
+                   v24 = new ID3v24Handler(id3F);
+               } else if(id3F.getMajorVersion() == 3) {
+                   v23 = new ID3v23Handler(id3F);
+               } else if(id3F.getMajorVersion() == 2) {
+                   v22 = new ID3v22Handler(id3F);
+               }
+           }
+       }
+
+        // Now iterate over all audio frames in the file
+        AudioFrame frame = mpegStream.nextFrame();
+        float duration = 0;
+        while (frame != null)
+        {
+            duration += frame.getDuration();
+            if (firstAudio == null)
+            {
+                firstAudio = frame;
+            }
+            mpegStream.skipFrame();
+            frame = mpegStream.nextFrame();
+        }
+
+       // ID3v1 tags live at the end of the file
+       // Lyrics live just before ID3v1, at the end of the file
+       // Search for both (handlers seek to the end for us)
+       lyrics = new LyricsHandler(tailStream.getTail());
+       v1 = lyrics.id3v1;
+
+       // Go in order of preference
+       // Currently, that's newest to oldest
+       List<ID3Tags> tags = new ArrayList<ID3Tags>();
+
+       if(v24 != null && v24.getTagsPresent()) {
+          tags.add(v24);
+       }
+       if(v23 != null && v23.getTagsPresent()) {
+          tags.add(v23);
+       }
+       if(v22 != null && v22.getTagsPresent()) {
+          tags.add(v22);
+       }
+       if(v1 != null && v1.getTagsPresent()) {
+          tags.add(v1);
+       }
+       
+       ID3TagsAndAudio ret = new ID3TagsAndAudio();
+       ret.audio = firstAudio;
+       ret.lyrics = lyrics;
+       ret.tags = tags.toArray(new ID3Tags[tags.size()]);
+       ret.duration = duration;
+       return ret;
+    }
+
+    protected static class ID3TagsAndAudio {
+        private ID3Tags[] tags;
+        private AudioFrame audio;
+        private LyricsHandler lyrics;
+        private float duration;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/video/FLVParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/video/FLVParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/video/FLVParser.java
index 1a0b1b9..947b694 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/video/FLVParser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/video/FLVParser.java
@@ -1,268 +1,268 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.video;
-
-import java.io.ByteArrayInputStream;
-import java.io.DataInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Map.Entry;
-import java.util.Set;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-/**
- * <p>
- * Parser for metadata contained in Flash Videos (.flv). Resources:
- * http://osflash.org/flv and for AMF:
- * http://download.macromedia.com/pub/labs/amf/amf0_spec_121207.pdf
- * <p>
- * This parser is capable of extracting the general metadata from header as well
- * as embedded metadata.
- * <p>
- * Known keys for metadata (from file header):
- * <ol>
- * <li>hasVideo: true|false
- * <li>hasSound: true|false
- * </ol>
- * <p>
- * In addition to the above values also metadata that is inserted in to the
- * actual stream will be picked. Usually there are keys like:
- * hasKeyframes, lastkeyframetimestamp, audiocodecid, keyframes, filepositions,
- * hasMetadata, audiosamplerate, videodatarate metadatadate, videocodecid,
- * metadatacreator, audiosize, hasVideo, height, audiosamplesize, framerate,
- * hasCuePoints width, cuePoints, lasttimestamp, canSeekToEnd, datasize,
- * duration, videosize, filesize, audiodatarate, hasAudio, stereo audiodelay
- */
-public class FLVParser extends AbstractParser {
-
-    /** Serial version UID */
-    private static final long serialVersionUID = -8718013155719197679L;
-
-    private static int TYPE_METADATA = 0x12;
-    private static byte MASK_AUDIO = 1;
-    private static byte MASK_VIDEO = 4;
-
-    private static final Set<MediaType> SUPPORTED_TYPES =
-        Collections.singleton(MediaType.video("x-flv"));
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-    private long readUInt32(DataInputStream input) throws IOException {
-        return input.readInt() & 0xFFFFFFFFL;
-    }
-
-    private int readUInt24(DataInputStream input) throws IOException {
-        int uint = input.read()<<16;
-        uint += input.read()<<8;
-        uint += input.read(); 
-        return uint;
-    }
-
-    private Object readAMFData(DataInputStream input, int type)
-            throws IOException {
-        if (type == -1) {
-            type = input.readUnsignedByte();
-        }
-        switch (type) {
-        case 0:
-            return input.readDouble();
-        case 1:
-            return input.readUnsignedByte() == 1;
-        case 2:
-            return readAMFString(input);
-        case 3:
-            return readAMFObject(input);
-        case 8:
-            return readAMFEcmaArray(input);
-        case 10:
-            return readAMFStrictArray(input);
-        case 11:
-            final Date date = new Date((long) input.readDouble());
-            input.readShort(); // time zone
-            return date;
-        case 13:
-            return "UNDEFINED";
-        default:
-            return null;
-        }
-    }
-
-    private Object readAMFStrictArray(DataInputStream input) throws IOException {
-        long count = readUInt32(input);
-        ArrayList<Object> list = new ArrayList<Object>();
-        for (int i = 0; i < count; i++) {
-            list.add(readAMFData(input, -1));
-        }
-        return list;
-    }
-
-
-    private String readAMFString(DataInputStream input) throws IOException {
-        int size = input.readUnsignedShort();
-        byte[] chars = new byte[size];
-        input.readFully(chars);
-        return new String(chars, UTF_8);
-    }
-
-    private Object readAMFObject(DataInputStream input) throws IOException {
-        HashMap<String, Object> array = new HashMap<String, Object>();
-        while (true) {
-            String key = readAMFString(input);
-            int dataType = input.read();
-            if (dataType == 9) { // object end marker
-                break;
-            }
-            array.put(key, readAMFData(input, dataType));
-        }
-        return array;
-    }
-
-    private Object readAMFEcmaArray(DataInputStream input) throws IOException {
-        long size = readUInt32(input);
-        HashMap<String, Object> array = new HashMap<String, Object>();
-        for (int i = 0; i < size; i++) {
-            String key = readAMFString(input);
-            int dataType = input.read();
-            array.put(key, readAMFData(input, dataType));
-        }
-        return array;
-    }
-
-    private boolean checkSignature(DataInputStream fis) throws IOException {
-        return fis.read() == 'F' && fis.read() == 'L' && fis.read() == 'V';
-    }
-
-    public void parse(
-            InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-        DataInputStream datainput = new DataInputStream(stream);
-        if (!checkSignature(datainput)) {
-            throw new TikaException("FLV signature not detected");
-        }
-
-        // header
-        int version = datainput.readUnsignedByte();
-        if (version != 1) {
-            // should be 1, perhaps this is not flv?
-            throw new TikaException("Unpexpected FLV version: " + version);
-        }
-
-        int typeFlags = datainput.readUnsignedByte();
-
-        long len = readUInt32(datainput);
-        if (len != 9) {
-            // we only know about format with header of 9 bytes
-            throw new TikaException("Unpexpected FLV header length: " + len);
-        }
-
-        long sizePrev = readUInt32(datainput);
-        if (sizePrev != 0) {
-            // should be 0, perhaps this is not flv?
-            throw new TikaException(
-                    "Unpexpected FLV first previous block size: " + sizePrev);
-        }
-
-        metadata.set(Metadata.CONTENT_TYPE, "video/x-flv");
-        metadata.set("hasVideo", Boolean.toString((typeFlags & MASK_VIDEO) != 0));
-        metadata.set("hasAudio", Boolean.toString((typeFlags & MASK_AUDIO) != 0));
-
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-        xhtml.startDocument();
-
-        // flv tag stream follows...
-        while (true) {
-            int type = datainput.read();
-            if (type == -1) {
-                // EOF
-                break;
-            }
-
-            int datalen = readUInt24(datainput); //body length
-            readUInt32(datainput); // timestamp
-            readUInt24(datainput); // streamid
-
-            if (type == TYPE_METADATA) {
-                // found metadata Tag, read content to buffer
-                byte[] metaBytes = new byte[datalen];
-                for (int readCount = 0; readCount < datalen;) {
-                    int r = stream.read(metaBytes, readCount, datalen - readCount);
-                    if(r!=-1) {
-                        readCount += r;
-
-                    } else {
-                        break;
-                    }
-                }
-
-                ByteArrayInputStream is = new ByteArrayInputStream(metaBytes);
-
-                DataInputStream dis = new DataInputStream(is);
-
-                Object data = null;
-
-                for (int i = 0; i < 2; i++) {
-                    data = readAMFData(dis, -1);
-                }
-
-                if (data instanceof Map) {
-                    // TODO if there are multiple metadata values with same key (in
-                    // separate AMF blocks, we currently loose previous values)
-                    Map<String, Object> extractedMetadata = (Map<String, Object>) data;
-                    for (Entry<String, Object> entry : extractedMetadata.entrySet()) {
-                        if (entry.getValue() == null) {
-                            continue;
-                        }
-                        metadata.set(entry.getKey(), entry.getValue().toString());
-                    }
-                }
-
-            } else {
-                // Tag was not metadata, skip over data we cannot handle
-                for (int i = 0; i < datalen; i++) {
-                    datainput.readByte();
-                }
-            }
-
-            sizePrev = readUInt32(datainput); // previous block size
-            if (sizePrev != datalen + 11) {
-                // file was corrupt or we could not parse it...
-                break;
-            }
-        }
-
-        xhtml.endDocument();
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.video;
+
+import java.io.ByteArrayInputStream;
+import java.io.DataInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * <p>
+ * Parser for metadata contained in Flash Videos (.flv). Resources:
+ * http://osflash.org/flv and for AMF:
+ * http://download.macromedia.com/pub/labs/amf/amf0_spec_121207.pdf
+ * <p>
+ * This parser is capable of extracting the general metadata from header as well
+ * as embedded metadata.
+ * <p>
+ * Known keys for metadata (from file header):
+ * <ol>
+ * <li>hasVideo: true|false
+ * <li>hasSound: true|false
+ * </ol>
+ * <p>
+ * In addition to the above values also metadata that is inserted in to the
+ * actual stream will be picked. Usually there are keys like:
+ * hasKeyframes, lastkeyframetimestamp, audiocodecid, keyframes, filepositions,
+ * hasMetadata, audiosamplerate, videodatarate metadatadate, videocodecid,
+ * metadatacreator, audiosize, hasVideo, height, audiosamplesize, framerate,
+ * hasCuePoints width, cuePoints, lasttimestamp, canSeekToEnd, datasize,
+ * duration, videosize, filesize, audiodatarate, hasAudio, stereo audiodelay
+ */
+public class FLVParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = -8718013155719197679L;
+
+    private static int TYPE_METADATA = 0x12;
+    private static byte MASK_AUDIO = 1;
+    private static byte MASK_VIDEO = 4;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+        Collections.singleton(MediaType.video("x-flv"));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    private long readUInt32(DataInputStream input) throws IOException {
+        return input.readInt() & 0xFFFFFFFFL;
+    }
+
+    private int readUInt24(DataInputStream input) throws IOException {
+        int uint = input.read()<<16;
+        uint += input.read()<<8;
+        uint += input.read(); 
+        return uint;
+    }
+
+    private Object readAMFData(DataInputStream input, int type)
+            throws IOException {
+        if (type == -1) {
+            type = input.readUnsignedByte();
+        }
+        switch (type) {
+        case 0:
+            return input.readDouble();
+        case 1:
+            return input.readUnsignedByte() == 1;
+        case 2:
+            return readAMFString(input);
+        case 3:
+            return readAMFObject(input);
+        case 8:
+            return readAMFEcmaArray(input);
+        case 10:
+            return readAMFStrictArray(input);
+        case 11:
+            final Date date = new Date((long) input.readDouble());
+            input.readShort(); // time zone
+            return date;
+        case 13:
+            return "UNDEFINED";
+        default:
+            return null;
+        }
+    }
+
+    private Object readAMFStrictArray(DataInputStream input) throws IOException {
+        long count = readUInt32(input);
+        ArrayList<Object> list = new ArrayList<Object>();
+        for (int i = 0; i < count; i++) {
+            list.add(readAMFData(input, -1));
+        }
+        return list;
+    }
+
+
+    private String readAMFString(DataInputStream input) throws IOException {
+        int size = input.readUnsignedShort();
+        byte[] chars = new byte[size];
+        input.readFully(chars);
+        return new String(chars, UTF_8);
+    }
+
+    private Object readAMFObject(DataInputStream input) throws IOException {
+        HashMap<String, Object> array = new HashMap<String, Object>();
+        while (true) {
+            String key = readAMFString(input);
+            int dataType = input.read();
+            if (dataType == 9) { // object end marker
+                break;
+            }
+            array.put(key, readAMFData(input, dataType));
+        }
+        return array;
+    }
+
+    private Object readAMFEcmaArray(DataInputStream input) throws IOException {
+        long size = readUInt32(input);
+        HashMap<String, Object> array = new HashMap<String, Object>();
+        for (int i = 0; i < size; i++) {
+            String key = readAMFString(input);
+            int dataType = input.read();
+            array.put(key, readAMFData(input, dataType));
+        }
+        return array;
+    }
+
+    private boolean checkSignature(DataInputStream fis) throws IOException {
+        return fis.read() == 'F' && fis.read() == 'L' && fis.read() == 'V';
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        DataInputStream datainput = new DataInputStream(stream);
+        if (!checkSignature(datainput)) {
+            throw new TikaException("FLV signature not detected");
+        }
+
+        // header
+        int version = datainput.readUnsignedByte();
+        if (version != 1) {
+            // should be 1, perhaps this is not flv?
+            throw new TikaException("Unpexpected FLV version: " + version);
+        }
+
+        int typeFlags = datainput.readUnsignedByte();
+
+        long len = readUInt32(datainput);
+        if (len != 9) {
+            // we only know about format with header of 9 bytes
+            throw new TikaException("Unpexpected FLV header length: " + len);
+        }
+
+        long sizePrev = readUInt32(datainput);
+        if (sizePrev != 0) {
+            // should be 0, perhaps this is not flv?
+            throw new TikaException(
+                    "Unpexpected FLV first previous block size: " + sizePrev);
+        }
+
+        metadata.set(Metadata.CONTENT_TYPE, "video/x-flv");
+        metadata.set("hasVideo", Boolean.toString((typeFlags & MASK_VIDEO) != 0));
+        metadata.set("hasAudio", Boolean.toString((typeFlags & MASK_AUDIO) != 0));
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        // flv tag stream follows...
+        while (true) {
+            int type = datainput.read();
+            if (type == -1) {
+                // EOF
+                break;
+            }
+
+            int datalen = readUInt24(datainput); //body length
+            readUInt32(datainput); // timestamp
+            readUInt24(datainput); // streamid
+
+            if (type == TYPE_METADATA) {
+                // found metadata Tag, read content to buffer
+                byte[] metaBytes = new byte[datalen];
+                for (int readCount = 0; readCount < datalen;) {
+                    int r = stream.read(metaBytes, readCount, datalen - readCount);
+                    if(r!=-1) {
+                        readCount += r;
+
+                    } else {
+                        break;
+                    }
+                }
+
+                ByteArrayInputStream is = new ByteArrayInputStream(metaBytes);
+
+                DataInputStream dis = new DataInputStream(is);
+
+                Object data = null;
+
+                for (int i = 0; i < 2; i++) {
+                    data = readAMFData(dis, -1);
+                }
+
+                if (data instanceof Map) {
+                    // TODO if there are multiple metadata values with same key (in
+                    // separate AMF blocks, we currently loose previous values)
+                    Map<String, Object> extractedMetadata = (Map<String, Object>) data;
+                    for (Entry<String, Object> entry : extractedMetadata.entrySet()) {
+                        if (entry.getValue() == null) {
+                            continue;
+                        }
+                        metadata.set(entry.getKey(), entry.getValue().toString());
+                    }
+                }
+
+            } else {
+                // Tag was not metadata, skip over data we cannot handle
+                for (int i = 0; i < datalen; i++) {
+                    datainput.readByte();
+                }
+            }
+
+            sizePrev = readUInt32(datainput); // previous block size
+            if (sizePrev != datalen + 11) {
+                // file was corrupt or we could not parse it...
+                break;
+            }
+        }
+
+        xhtml.endDocument();
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties b/tika-parser-modules/tika-parser-multimedia-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
index 904e536..cb2151c 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
@@ -1,21 +1,21 @@
-#  Licensed to the Apache Software Foundation (ASF) under one or more
-#  contributor license agreements.  See the NOTICE file distributed with
-#  this work for additional information regarding copyright ownership.
-#  The ASF licenses this file to You under the Apache License, Version 2.0
-#  (the "License"); you may not use this file except in compliance with
-#  the License.  You may obtain a copy of the License at
-#
-#       http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-tesseractPath=
-language=eng
-pageSegMode=1
-maxFileSizeToOcr=2147483647
-minFileSizeToOcr=0
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+tesseractPath=
+language=eng
+pageSegMode=1
+maxFileSizeToOcr=2147483647
+minFileSizeToOcr=0
 timeout=120
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java
index ae30df3..d35de32 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java
@@ -1,75 +1,75 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.audio;
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.tika.Tika;
-import org.apache.tika.metadata.Metadata;
-import org.junit.Test;
-
-public class AudioParserTest {
-
-    @Test
-    public void testWAV() throws Exception {
-        String path = "/test-documents/testWAV.wav";
-        Metadata metadata = new Metadata();
-        String content = new Tika().parseToString(
-                AudioParserTest.class.getResourceAsStream(path), metadata);
-
-        assertEquals("audio/x-wav", metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("44100.0", metadata.get("samplerate"));
-        assertEquals("2", metadata.get("channels"));
-        assertEquals("16", metadata.get("bits"));
-        assertEquals("PCM_SIGNED", metadata.get("encoding"));
-
-        assertEquals("", content);
-    }
-
-    @Test
-    public void testAIFF() throws Exception {
-        String path = "/test-documents/testAIFF.aif";
-        Metadata metadata = new Metadata();
-        String content = new Tika().parseToString(
-                AudioParserTest.class.getResourceAsStream(path), metadata);
-
-        assertEquals("audio/x-aiff", metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("44100.0", metadata.get("samplerate"));
-        assertEquals("2", metadata.get("channels"));
-        assertEquals("16", metadata.get("bits"));
-        assertEquals("PCM_SIGNED", metadata.get("encoding"));
-
-        assertEquals("", content);
-    }
-
-    @Test
-    public void testAU() throws Exception {
-        String path = "/test-documents/testAU.au";
-        Metadata metadata = new Metadata();
-        String content = new Tika().parseToString(
-                AudioParserTest.class.getResourceAsStream(path), metadata);
-
-        assertEquals("audio/basic", metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("44100.0", metadata.get("samplerate"));
-        assertEquals("2", metadata.get("channels"));
-        assertEquals("16", metadata.get("bits"));
-        assertEquals("PCM_SIGNED", metadata.get("encoding"));
-
-        assertEquals("", content);
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.audio;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.tika.Tika;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Test;
+
+public class AudioParserTest {
+
+    @Test
+    public void testWAV() throws Exception {
+        String path = "/test-documents/testWAV.wav";
+        Metadata metadata = new Metadata();
+        String content = new Tika().parseToString(
+                AudioParserTest.class.getResourceAsStream(path), metadata);
+
+        assertEquals("audio/x-wav", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("44100.0", metadata.get("samplerate"));
+        assertEquals("2", metadata.get("channels"));
+        assertEquals("16", metadata.get("bits"));
+        assertEquals("PCM_SIGNED", metadata.get("encoding"));
+
+        assertEquals("", content);
+    }
+
+    @Test
+    public void testAIFF() throws Exception {
+        String path = "/test-documents/testAIFF.aif";
+        Metadata metadata = new Metadata();
+        String content = new Tika().parseToString(
+                AudioParserTest.class.getResourceAsStream(path), metadata);
+
+        assertEquals("audio/x-aiff", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("44100.0", metadata.get("samplerate"));
+        assertEquals("2", metadata.get("channels"));
+        assertEquals("16", metadata.get("bits"));
+        assertEquals("PCM_SIGNED", metadata.get("encoding"));
+
+        assertEquals("", content);
+    }
+
+    @Test
+    public void testAU() throws Exception {
+        String path = "/test-documents/testAU.au";
+        Metadata metadata = new Metadata();
+        String content = new Tika().parseToString(
+                AudioParserTest.class.getResourceAsStream(path), metadata);
+
+        assertEquals("audio/basic", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("44100.0", metadata.get("samplerate"));
+        assertEquals("2", metadata.get("channels"));
+        assertEquals("16", metadata.get("bits"));
+        assertEquals("PCM_SIGNED", metadata.get("encoding"));
+
+        assertEquals("", content);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java
index 9336444..344f2d7 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java
@@ -1,42 +1,42 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.audio;
-
-import static org.junit.Assert.assertEquals;
-import static org.apache.tika.TikaTest.assertContains;
-
-import org.apache.tika.Tika;
-import org.apache.tika.metadata.Metadata;
-import org.junit.Test;
-
-public class MidiParserTest {
-
-    @Test
-    public void testMID() throws Exception {
-        String path = "/test-documents/testMID.mid";
-        Metadata metadata = new Metadata();
-        String content = new Tika().parseToString(
-                MidiParserTest.class.getResourceAsStream(path), metadata);
-
-        assertEquals("audio/midi", metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("2", metadata.get("tracks"));
-        assertEquals("0", metadata.get("patches"));
-        assertEquals("PPQ", metadata.get("divisionType"));
-
-        assertContains("Untitled", content);
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.audio;
+
+import static org.junit.Assert.assertEquals;
+import static org.apache.tika.TikaTest.assertContains;
+
+import org.apache.tika.Tika;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Test;
+
+public class MidiParserTest {
+
+    @Test
+    public void testMID() throws Exception {
+        String path = "/test-documents/testMID.mid";
+        Metadata metadata = new Metadata();
+        String content = new Tika().parseToString(
+                MidiParserTest.class.getResourceAsStream(path), metadata);
+
+        assertEquals("audio/midi", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("2", metadata.get("tracks"));
+        assertEquals("0", metadata.get("patches"));
+        assertEquals("PPQ", metadata.get("divisionType"));
+
+        assertContains("Untitled", content);
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java
index 51f99db..fdac337 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java
@@ -1,139 +1,139 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.image;
-
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.verify;
-import static org.mockito.Mockito.when;
-
-import java.util.Arrays;
-import java.util.GregorianCalendar;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Locale;
-import java.util.TimeZone;
-
-import com.drew.metadata.Directory;
-import com.drew.metadata.MetadataException;
-import com.drew.metadata.Tag;
-import com.drew.metadata.exif.ExifIFD0Directory;
-import com.drew.metadata.exif.ExifSubIFDDirectory;
-import com.drew.metadata.jpeg.JpegCommentDirectory;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.junit.Test;
-
-public class ImageMetadataExtractorTest {
-
-    @SuppressWarnings({"rawtypes", "unchecked"})
-    @Test
-    public void testHandleDirectories() throws MetadataException {
-        Metadata metadata = mock(Metadata.class);
-        ImageMetadataExtractor.DirectoryHandler handler1 = mock(ImageMetadataExtractor.DirectoryHandler.class);
-        ImageMetadataExtractor e = new ImageMetadataExtractor(metadata, handler1);
-
-        Directory directory = new JpegCommentDirectory();
-        Iterator directories = mock(Iterator.class);
-        when(directories.hasNext()).thenReturn(true, false);
-        when(directories.next()).thenReturn(directory);
-        when(handler1.supports(JpegCommentDirectory.class)).thenReturn(true);
-
-        e.handle(directories);
-        verify(handler1).supports(JpegCommentDirectory.class);
-        verify(handler1).handle(directory, metadata);
-    }
-
-    @Test
-    public void testExifHandlerSupports() {
-        assertTrue(new ImageMetadataExtractor.ExifHandler().supports(ExifIFD0Directory.class));
-        assertTrue(new ImageMetadataExtractor.ExifHandler().supports(ExifSubIFDDirectory.class));
-        assertFalse(new ImageMetadataExtractor.ExifHandler().supports(Directory.class));
-        assertFalse(new ImageMetadataExtractor.ExifHandler().supports(JpegCommentDirectory.class));
-    }
-
-    @Test
-    public void testExifHandlerParseDate() throws MetadataException {
-        ExifSubIFDDirectory exif = mock(ExifSubIFDDirectory.class);
-        when(exif.containsTag(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(true);
-        GregorianCalendar calendar = new GregorianCalendar(TimeZone.getDefault(), Locale.ROOT);
-        calendar.setTimeInMillis(0);
-        calendar.set(2000, 0, 1, 0, 0, 0);
-        when(exif.getDate(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(
-                calendar.getTime()); // jvm default timezone as in Metadata Extractor
-        Metadata metadata = new Metadata();
-
-        new ImageMetadataExtractor.ExifHandler().handle(exif, metadata);
-        assertEquals("Should be ISO date without time zone", "2000-01-01T00:00:00",
-                metadata.get(TikaCoreProperties.CREATED));
-    }
-
-    @Test
-    public void testExifHandlerParseDateFallback() throws MetadataException {
-        ExifIFD0Directory exif = mock(ExifIFD0Directory.class);
-        when(exif.containsTag(ExifIFD0Directory.TAG_DATETIME)).thenReturn(true);
-        GregorianCalendar calendar = new GregorianCalendar(TimeZone.getDefault(), Locale.ROOT);
-        calendar.setTimeInMillis(0);
-        calendar.set(1999, 0, 1, 0, 0, 0);
-        when(exif.getDate(ExifIFD0Directory.TAG_DATETIME)).thenReturn(
-                calendar.getTime()); // jvm default timezone as in Metadata Extractor
-        Metadata metadata = new Metadata();
-
-        new ImageMetadataExtractor.ExifHandler().handle(exif, metadata);
-        assertEquals("Should try EXIF Date/Time if Original is not set", "1999-01-01T00:00:00",
-                metadata.get(TikaCoreProperties.CREATED));
-    }
-
-    @Test
-    public void testExifHandlerParseDateError() throws MetadataException {
-        ExifIFD0Directory exif = mock(ExifIFD0Directory.class);
-        when(exif.containsTag(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(true);
-        when(exif.getDate(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(null);
-        Metadata metadata = new Metadata();
-
-        new ImageMetadataExtractor.ExifHandler().handle(exif, metadata);
-        assertEquals("Parsing should proceed without date", null,
-                metadata.get(TikaCoreProperties.CREATED));
-    }
-
-    @Test
-    public void testCopyUnknownFieldsHandler() throws MetadataException {
-        Directory d = mock(Directory.class);
-        Tag t1 = mock(Tag.class);
-        when(t1.getTagName()).thenReturn("Image Description");
-        when(t1.getDescription()).thenReturn("t1");
-        Tag t2 = mock(Tag.class);
-        when(t2.getTagName()).thenReturn(Metadata.KEYWORDS);
-        when(t2.getDescription()).thenReturn("known");
-        Tag t3 = mock(Tag.class);
-        when(t3.getTagName()).thenReturn(TikaCoreProperties.DESCRIPTION.getName());
-        when(t3.getDescription()).thenReturn("known");
-        List<Tag> tags = Arrays.asList(t1, t2, t3);
-        when(d.getTags()).thenReturn(tags);
-        Metadata metadata = new Metadata();
-        new ImageMetadataExtractor.CopyUnknownFieldsHandler().handle(d, metadata);
-        assertEquals("t1", metadata.get("Image Description"));
-        assertNull("keywords should be excluded from bulk copy because it is a defined field",
-                metadata.get(Metadata.KEYWORDS));
-        assertNull(metadata.get(TikaCoreProperties.DESCRIPTION));
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+import java.util.Arrays;
+import java.util.GregorianCalendar;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Locale;
+import java.util.TimeZone;
+
+import com.drew.metadata.Directory;
+import com.drew.metadata.MetadataException;
+import com.drew.metadata.Tag;
+import com.drew.metadata.exif.ExifIFD0Directory;
+import com.drew.metadata.exif.ExifSubIFDDirectory;
+import com.drew.metadata.jpeg.JpegCommentDirectory;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Test;
+
+public class ImageMetadataExtractorTest {
+
+    @SuppressWarnings({"rawtypes", "unchecked"})
+    @Test
+    public void testHandleDirectories() throws MetadataException {
+        Metadata metadata = mock(Metadata.class);
+        ImageMetadataExtractor.DirectoryHandler handler1 = mock(ImageMetadataExtractor.DirectoryHandler.class);
+        ImageMetadataExtractor e = new ImageMetadataExtractor(metadata, handler1);
+
+        Directory directory = new JpegCommentDirectory();
+        Iterator directories = mock(Iterator.class);
+        when(directories.hasNext()).thenReturn(true, false);
+        when(directories.next()).thenReturn(directory);
+        when(handler1.supports(JpegCommentDirectory.class)).thenReturn(true);
+
+        e.handle(directories);
+        verify(handler1).supports(JpegCommentDirectory.class);
+        verify(handler1).handle(directory, metadata);
+    }
+
+    @Test
+    public void testExifHandlerSupports() {
+        assertTrue(new ImageMetadataExtractor.ExifHandler().supports(ExifIFD0Directory.class));
+        assertTrue(new ImageMetadataExtractor.ExifHandler().supports(ExifSubIFDDirectory.class));
+        assertFalse(new ImageMetadataExtractor.ExifHandler().supports(Directory.class));
+        assertFalse(new ImageMetadataExtractor.ExifHandler().supports(JpegCommentDirectory.class));
+    }
+
+    @Test
+    public void testExifHandlerParseDate() throws MetadataException {
+        ExifSubIFDDirectory exif = mock(ExifSubIFDDirectory.class);
+        when(exif.containsTag(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(true);
+        GregorianCalendar calendar = new GregorianCalendar(TimeZone.getDefault(), Locale.ROOT);
+        calendar.setTimeInMillis(0);
+        calendar.set(2000, 0, 1, 0, 0, 0);
+        when(exif.getDate(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(
+                calendar.getTime()); // jvm default timezone as in Metadata Extractor
+        Metadata metadata = new Metadata();
+
+        new ImageMetadataExtractor.ExifHandler().handle(exif, metadata);
+        assertEquals("Should be ISO date without time zone", "2000-01-01T00:00:00",
+                metadata.get(TikaCoreProperties.CREATED));
+    }
+
+    @Test
+    public void testExifHandlerParseDateFallback() throws MetadataException {
+        ExifIFD0Directory exif = mock(ExifIFD0Directory.class);
+        when(exif.containsTag(ExifIFD0Directory.TAG_DATETIME)).thenReturn(true);
+        GregorianCalendar calendar = new GregorianCalendar(TimeZone.getDefault(), Locale.ROOT);
+        calendar.setTimeInMillis(0);
+        calendar.set(1999, 0, 1, 0, 0, 0);
+        when(exif.getDate(ExifIFD0Directory.TAG_DATETIME)).thenReturn(
+                calendar.getTime()); // jvm default timezone as in Metadata Extractor
+        Metadata metadata = new Metadata();
+
+        new ImageMetadataExtractor.ExifHandler().handle(exif, metadata);
+        assertEquals("Should try EXIF Date/Time if Original is not set", "1999-01-01T00:00:00",
+                metadata.get(TikaCoreProperties.CREATED));
+    }
+
+    @Test
+    public void testExifHandlerParseDateError() throws MetadataException {
+        ExifIFD0Directory exif = mock(ExifIFD0Directory.class);
+        when(exif.containsTag(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(true);
+        when(exif.getDate(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(null);
+        Metadata metadata = new Metadata();
+
+        new ImageMetadataExtractor.ExifHandler().handle(exif, metadata);
+        assertEquals("Parsing should proceed without date", null,
+                metadata.get(TikaCoreProperties.CREATED));
+    }
+
+    @Test
+    public void testCopyUnknownFieldsHandler() throws MetadataException {
+        Directory d = mock(Directory.class);
+        Tag t1 = mock(Tag.class);
+        when(t1.getTagName()).thenReturn("Image Description");
+        when(t1.getDescription()).thenReturn("t1");
+        Tag t2 = mock(Tag.class);
+        when(t2.getTagName()).thenReturn(Metadata.KEYWORDS);
+        when(t2.getDescription()).thenReturn("known");
+        Tag t3 = mock(Tag.class);
+        when(t3.getTagName()).thenReturn(TikaCoreProperties.DESCRIPTION.getName());
+        when(t3.getDescription()).thenReturn("known");
+        List<Tag> tags = Arrays.asList(t1, t2, t3);
+        when(d.getTags()).thenReturn(tags);
+        Metadata metadata = new Metadata();
+        new ImageMetadataExtractor.CopyUnknownFieldsHandler().handle(d, metadata);
+        assertEquals("t1", metadata.get("Image Description"));
+        assertNull("keywords should be excluded from bulk copy because it is a defined field",
+                metadata.get(Metadata.KEYWORDS));
+        assertNull(metadata.get(TikaCoreProperties.DESCRIPTION));
+    }
+
+}

[10/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
index 01dd436..30f9c98 100644
--- a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
+++ b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
@@ -1,192 +1,192 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.pkg;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-
-import java.io.InputStream;
-import java.util.HashSet;
-import java.util.Set;
-
-import org.apache.commons.codec.binary.Base64;
-import org.apache.commons.compress.archivers.ArchiveStreamFactory;
-import org.apache.tika.Tika;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.helpers.DefaultHandler;
-
-/**
- * Test case for parsing zip files.
- */
-public class ZipParserTest extends AbstractPkgTest {
-
-    @Test
-    public void testZipParsing() throws Exception {
-        Parser parser = new AutoDetectParser(); // Should auto-detect!
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = ZipParserTest.class.getResourceAsStream(
-                "/test-documents/test-documents.zip")) {
-            parser.parse(stream, handler, metadata, recursingContext);
-        }
-
-        assertEquals("application/zip", metadata.get(Metadata.CONTENT_TYPE));
-        String content = handler.toString();
-        assertContains("testEXCEL.xls", content);
-        assertContains("testHTML.html", content);
-        assertContains("testOpenOffice2.odt", content);
-        assertContains("testPDF.pdf", content);
-        assertContains("testPPT.ppt", content);
-        assertContains("testRTF.rtf", content);
-        assertContains("testTXT.txt", content);
-        assertContains("testWORD.doc", content);
-        assertContains("testXML.xml", content);
-    }
-
-    /**
-     * Tests that the ParseContext parser is correctly
-     *  fired for all the embedded entries.
-     */
-    @Test
-    public void testEmbedded() throws Exception {
-       Parser parser = new AutoDetectParser(); // Should auto-detect!
-       ContentHandler handler = new BodyContentHandler();
-       Metadata metadata = new Metadata();
-
-        try (InputStream stream = ZipParserTest.class.getResourceAsStream(
-                "/test-documents/test-documents.zip")) {
-            parser.parse(stream, handler, metadata, trackingContext);
-        }
-       
-       // Should have found all 9 documents
-       assertEquals(9, tracker.filenames.size());
-       assertEquals(9, tracker.mediatypes.size());
-       assertEquals(9, tracker.modifiedAts.size());
-       
-       // Should have names and modified dates, but not content types, 
-       //  as zip doesn't store the content types
-       assertEquals("testEXCEL.xls", tracker.filenames.get(0));
-       assertEquals("testHTML.html", tracker.filenames.get(1));
-       assertEquals("testOpenOffice2.odt", tracker.filenames.get(2));
-       assertEquals("testPDF.pdf", tracker.filenames.get(3));
-       assertEquals("testPPT.ppt", tracker.filenames.get(4));
-       assertEquals("testRTF.rtf", tracker.filenames.get(5));
-       assertEquals("testTXT.txt", tracker.filenames.get(6));
-       assertEquals("testWORD.doc", tracker.filenames.get(7));
-       assertEquals("testXML.xml", tracker.filenames.get(8));
-       
-       for(String type : tracker.mediatypes) {
-          assertNull(type);
-       }
-       for(String crt : tracker.createdAts) {
-           assertNull(crt);
-       }
-       for(String mod : tracker.modifiedAts) {
-           assertNotNull(mod);
-           assertTrue("Modified at " + mod, mod.startsWith("20"));
-       }
-    }
-
-    /**
-     * Test case for the ability of the ZIP parser to extract the name of
-     * a ZIP entry even if the content of the entry is unreadable due to an
-     * unsupported compression method.
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-346">TIKA-346</a>
-     */
-    @Test
-    public void testUnsupportedZipCompressionMethod() throws Exception {
-        String content = new Tika().parseToString(
-                ZipParserTest.class.getResourceAsStream(
-                        "/test-documents/moby.zip"));
-        assertContains("README", content);
-    }
-
-    private class GatherRelIDsDocumentExtractor implements EmbeddedDocumentExtractor {
-        public Set<String> allRelIDs = new HashSet<String>();
-        public boolean shouldParseEmbedded(Metadata metadata) {      
-            String relID = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID);
-            if (relID != null) {
-                allRelIDs.add(relID);
-            }
-            return false;
-        }
-
-        public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, boolean outputHtml) {
-            throw new UnsupportedOperationException("should never be called");
-        }
-    }
-
-    // TIKA-1036
-    @Test
-    public void testPlaceholders() throws Exception {
-        String xml = getXML("testEmbedded.zip").xml;
-        assertContains("<div class=\"embedded\" id=\"test1.txt\" />", xml);
-        assertContains("<div class=\"embedded\" id=\"test2.txt\" />", xml);
-
-        // Also make sure EMBEDDED_RELATIONSHIP_ID was
-        // passed when parsing the embedded docs:
-        Parser parser = new AutoDetectParser();
-        ParseContext context = new ParseContext();
-        context.set(Parser.class, parser);
-        GatherRelIDsDocumentExtractor relIDs = new GatherRelIDsDocumentExtractor();
-        context.set(EmbeddedDocumentExtractor.class, relIDs);
-        try (InputStream input = getResourceAsStream("/test-documents/testEmbedded.zip")) {
-            parser.parse(input,
-                    new BodyContentHandler(),
-                    new Metadata(),
-                    context);
-        }
-
-        assertTrue(relIDs.allRelIDs.contains("test1.txt"));
-        assertTrue(relIDs.allRelIDs.contains("test2.txt"));
-    }
-
-    @Test // TIKA-936
-    public void testCustomEncoding() throws Exception {
-        ArchiveStreamFactory factory = new ArchiveStreamFactory();
-        factory.setEntryEncoding("SJIS");
-        trackingContext.set(ArchiveStreamFactory.class, factory);
-
-        try (InputStream stream = TikaInputStream.get(Base64.decodeBase64(
-                "UEsDBBQAAAAIAI+CvUCDo3+zIgAAACgAAAAOAAAAk/qWe4zqg4GDgi50"
-                        + "eHRr2tj0qulsc2pzRHN609Gm7Y1OvFxNYLHJv6ZV97yCiQEAUEsBAh"
-                        + "QLFAAAAAgAj4K9QIOjf7MiAAAAKAAAAA4AAAAAAAAAAAAgAAAAAAAA"
-                        + "AJP6lnuM6oOBg4IudHh0UEsFBgAAAAABAAEAPAAAAE4AAAAAAA=="))) {
-            autoDetectParser.parse(
-                    stream, new DefaultHandler(),
-                    new Metadata(), trackingContext);
-        }
-
-        assertEquals(1, tracker.filenames.size());
-        assertEquals(
-                "\u65E5\u672C\u8A9E\u30E1\u30E2.txt",
-                tracker.filenames.get(0));
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.compress.archivers.ArchiveStreamFactory;
+import org.apache.tika.Tika;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Test case for parsing zip files.
+ */
+public class ZipParserTest extends AbstractPkgTest {
+
+    @Test
+    public void testZipParsing() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = ZipParserTest.class.getResourceAsStream(
+                "/test-documents/test-documents.zip")) {
+            parser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertEquals("application/zip", metadata.get(Metadata.CONTENT_TYPE));
+        String content = handler.toString();
+        assertContains("testEXCEL.xls", content);
+        assertContains("testHTML.html", content);
+        assertContains("testOpenOffice2.odt", content);
+        assertContains("testPDF.pdf", content);
+        assertContains("testPPT.ppt", content);
+        assertContains("testRTF.rtf", content);
+        assertContains("testTXT.txt", content);
+        assertContains("testWORD.doc", content);
+        assertContains("testXML.xml", content);
+    }
+
+    /**
+     * Tests that the ParseContext parser is correctly
+     *  fired for all the embedded entries.
+     */
+    @Test
+    public void testEmbedded() throws Exception {
+       Parser parser = new AutoDetectParser(); // Should auto-detect!
+       ContentHandler handler = new BodyContentHandler();
+       Metadata metadata = new Metadata();
+
+        try (InputStream stream = ZipParserTest.class.getResourceAsStream(
+                "/test-documents/test-documents.zip")) {
+            parser.parse(stream, handler, metadata, trackingContext);
+        }
+       
+       // Should have found all 9 documents
+       assertEquals(9, tracker.filenames.size());
+       assertEquals(9, tracker.mediatypes.size());
+       assertEquals(9, tracker.modifiedAts.size());
+       
+       // Should have names and modified dates, but not content types, 
+       //  as zip doesn't store the content types
+       assertEquals("testEXCEL.xls", tracker.filenames.get(0));
+       assertEquals("testHTML.html", tracker.filenames.get(1));
+       assertEquals("testOpenOffice2.odt", tracker.filenames.get(2));
+       assertEquals("testPDF.pdf", tracker.filenames.get(3));
+       assertEquals("testPPT.ppt", tracker.filenames.get(4));
+       assertEquals("testRTF.rtf", tracker.filenames.get(5));
+       assertEquals("testTXT.txt", tracker.filenames.get(6));
+       assertEquals("testWORD.doc", tracker.filenames.get(7));
+       assertEquals("testXML.xml", tracker.filenames.get(8));
+       
+       for(String type : tracker.mediatypes) {
+          assertNull(type);
+       }
+       for(String crt : tracker.createdAts) {
+           assertNull(crt);
+       }
+       for(String mod : tracker.modifiedAts) {
+           assertNotNull(mod);
+           assertTrue("Modified at " + mod, mod.startsWith("20"));
+       }
+    }
+
+    /**
+     * Test case for the ability of the ZIP parser to extract the name of
+     * a ZIP entry even if the content of the entry is unreadable due to an
+     * unsupported compression method.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-346">TIKA-346</a>
+     */
+    @Test
+    public void testUnsupportedZipCompressionMethod() throws Exception {
+        String content = new Tika().parseToString(
+                ZipParserTest.class.getResourceAsStream(
+                        "/test-documents/moby.zip"));
+        assertContains("README", content);
+    }
+
+    private class GatherRelIDsDocumentExtractor implements EmbeddedDocumentExtractor {
+        public Set<String> allRelIDs = new HashSet<String>();
+        public boolean shouldParseEmbedded(Metadata metadata) {      
+            String relID = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID);
+            if (relID != null) {
+                allRelIDs.add(relID);
+            }
+            return false;
+        }
+
+        public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, boolean outputHtml) {
+            throw new UnsupportedOperationException("should never be called");
+        }
+    }
+
+    // TIKA-1036
+    @Test
+    public void testPlaceholders() throws Exception {
+        String xml = getXML("testEmbedded.zip").xml;
+        assertContains("<div class=\"embedded\" id=\"test1.txt\" />", xml);
+        assertContains("<div class=\"embedded\" id=\"test2.txt\" />", xml);
+
+        // Also make sure EMBEDDED_RELATIONSHIP_ID was
+        // passed when parsing the embedded docs:
+        Parser parser = new AutoDetectParser();
+        ParseContext context = new ParseContext();
+        context.set(Parser.class, parser);
+        GatherRelIDsDocumentExtractor relIDs = new GatherRelIDsDocumentExtractor();
+        context.set(EmbeddedDocumentExtractor.class, relIDs);
+        try (InputStream input = getResourceAsStream("/test-documents/testEmbedded.zip")) {
+            parser.parse(input,
+                    new BodyContentHandler(),
+                    new Metadata(),
+                    context);
+        }
+
+        assertTrue(relIDs.allRelIDs.contains("test1.txt"));
+        assertTrue(relIDs.allRelIDs.contains("test2.txt"));
+    }
+
+    @Test // TIKA-936
+    public void testCustomEncoding() throws Exception {
+        ArchiveStreamFactory factory = new ArchiveStreamFactory();
+        factory.setEntryEncoding("SJIS");
+        trackingContext.set(ArchiveStreamFactory.class, factory);
+
+        try (InputStream stream = TikaInputStream.get(Base64.decodeBase64(
+                "UEsDBBQAAAAIAI+CvUCDo3+zIgAAACgAAAAOAAAAk/qWe4zqg4GDgi50"
+                        + "eHRr2tj0qulsc2pzRHN609Gm7Y1OvFxNYLHJv6ZV97yCiQEAUEsBAh"
+                        + "QLFAAAAAgAj4K9QIOjf7MiAAAAKAAAAA4AAAAAAAAAAAAgAAAAAAAA"
+                        + "AJP6lnuM6oOBg4IudHh0UEsFBgAAAAABAAEAPAAAAE4AAAAAAA=="))) {
+            autoDetectParser.parse(
+                    stream, new DefaultHandler(),
+                    new Metadata(), trackingContext);
+        }
+
+        assertEquals(1, tracker.filenames.size());
+        assertEquals(
+                "\u65E5\u672C\u8A9E\u30E1\u30E2.txt",
+                tracker.filenames.get(0));
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-pdf-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/pom.xml b/tika-parser-modules/tika-parser-pdf-module/pom.xml
index 11f259e..568303c 100644
--- a/tika-parser-modules/tika-parser-pdf-module/pom.xml
+++ b/tika-parser-modules/tika-parser-pdf-module/pom.xml
@@ -1,126 +1,126 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-modules</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-pdf-module</artifactId>
-  <name>Apache Tika parser pdf module</name>
-  <url>http://tika.apache.org/</url>
-  
-  <properties>
-    <commons.logging.version>1.1.3</commons.logging.version>
-  </properties>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-core</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-multimedia-module</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-xmp-commons</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>commons-io</groupId>
-      <artifactId>commons-io</artifactId>
-      <version>${commons.io.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.pdfbox</groupId>
-      <artifactId>pdfbox</artifactId>
-      <version>${pdfbox.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.pdfbox</groupId>
-      <artifactId>pdfbox-tools</artifactId>
-      <version>${pdfbox.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.pdfbox</groupId>
-      <artifactId>jempbox</artifactId>
-      <version>${jempbox.version}</version>
-    </dependency>
-    <!-- TIKA-370: PDFBox declares the Bouncy Castle dependencies
-         as optional, but we prefer to have them always to avoid
-         problems with encrypted PDFs. -->
-    <dependency>
-      <groupId>org.bouncycastle</groupId>
-      <artifactId>bcmail-jdk15on</artifactId>
-      <version>${bouncycastle.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.bouncycastle</groupId>
-      <artifactId>bcprov-jdk15on</artifactId>
-      <version>${bouncycastle.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>commons-logging</groupId>
-      <artifactId>commons-logging</artifactId>
-      <version>${commons.logging.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.slf4j</groupId>
-      <artifactId>slf4j-log4j12</artifactId>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-package-module</artifactId>
-      <version>${project.version}</version>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-text-module</artifactId>
-      <version>${project.version}</version>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-office-module</artifactId>
-      <version>${project.version}</version>
-      <scope>test</scope>
-    </dependency>
-    <!-- Copied from PDFBox:
-       For legal reasons (incompatible license), jai-imageio-core is to be used
-       only in the tests and may not be distributed. See also LEGAL-195 -->
-    <dependency>
-      <groupId>com.github.jai-imageio</groupId>
-      <artifactId>jai-imageio-core</artifactId>
-      <scope>test</scope>
-    </dependency>
-  </dependencies>
-  
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-modules</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-pdf-module</artifactId>
+  <name>Apache Tika parser pdf module</name>
+  <url>http://tika.apache.org/</url>
+  
+  <properties>
+    <commons.logging.version>1.1.3</commons.logging.version>
+  </properties>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-multimedia-module</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-xmp-commons</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <version>${commons.io.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.pdfbox</groupId>
+      <artifactId>pdfbox</artifactId>
+      <version>${pdfbox.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.pdfbox</groupId>
+      <artifactId>pdfbox-tools</artifactId>
+      <version>${pdfbox.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.pdfbox</groupId>
+      <artifactId>jempbox</artifactId>
+      <version>${jempbox.version}</version>
+    </dependency>
+    <!-- TIKA-370: PDFBox declares the Bouncy Castle dependencies
+         as optional, but we prefer to have them always to avoid
+         problems with encrypted PDFs. -->
+    <dependency>
+      <groupId>org.bouncycastle</groupId>
+      <artifactId>bcmail-jdk15on</artifactId>
+      <version>${bouncycastle.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.bouncycastle</groupId>
+      <artifactId>bcprov-jdk15on</artifactId>
+      <version>${bouncycastle.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>commons-logging</groupId>
+      <artifactId>commons-logging</artifactId>
+      <version>${commons.logging.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-log4j12</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-package-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-text-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-office-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <!-- Copied from PDFBox:
+       For legal reasons (incompatible license), jai-imageio-core is to be used
+       only in the tests and may not be distributed. See also LEGAL-195 -->
+    <dependency>
+      <groupId>com.github.jai-imageio</groupId>
+      <artifactId>jai-imageio-core</artifactId>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+  
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/module/pdf/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/module/pdf/internal/Activator.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/module/pdf/internal/Activator.java
index 9860934..d38a96d 100644
--- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/module/pdf/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/module/pdf/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.pdf.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
-    @Override
-    public void start(BundleContext context) throws Exception {
-
-        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
-    }
-
-    @Override
-    public void stop(BundleContext context) throws Exception {
-
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.pdf.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+    @Override
+    public void start(BundleContext context) throws Exception {
+
+        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+    }
+
+    @Override
+    public void stop(BundleContext context) throws Exception {
+
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-scientific-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/pom.xml b/tika-parser-modules/tika-parser-scientific-module/pom.xml
index 7afe2d6..1b3eb96 100644
--- a/tika-parser-modules/tika-parser-scientific-module/pom.xml
+++ b/tika-parser-modules/tika-parser-scientific-module/pom.xml
@@ -1,136 +1,136 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-modules</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-scientific-module</artifactId>
-  <name>Apache Tika parser scientific module</name>
-  <url>http://tika.apache.org/</url>
-  
-  <properties>
-    <netcdf-java.version>4.5.5</netcdf-java.version>
-    <sis.version>0.6</sis.version>
-  </properties>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-core</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.commons</groupId>
-      <artifactId>commons-exec</artifactId>
-      <version>${commons.exec}</version>
-    </dependency>
-    <dependency>
-      <groupId>com.googlecode.json-simple</groupId>
-      <artifactId>json-simple</artifactId>
-      <version>1.1.1</version>
-      <exclusions>
-        <exclusion>
-          <groupId>junit</groupId>
-          <artifactId>junit</artifactId>
-        </exclusion>
-      </exclusions>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.sis.core</groupId>
-      <artifactId>sis-utility</artifactId>
-      <version>${sis.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.sis.storage</groupId>
-      <artifactId>sis-netcdf</artifactId>
-      <version>${sis.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.sis.core</groupId>
-      <artifactId>sis-metadata</artifactId>
-      <version>${sis.version}</version>
-    </dependency>
-    <!-- edu.ucar dependencies -->
-    <dependency>
-      <groupId>edu.ucar</groupId>
-      <artifactId>netcdf4</artifactId>
-      <version>${netcdf-java.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>edu.ucar</groupId>
-      <artifactId>grib</artifactId>
-      <version>${netcdf-java.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>edu.ucar</groupId>
-      <artifactId>cdm</artifactId>
-      <version>${netcdf-java.version}</version>
-      <exclusions>
-        <exclusion>
-          <groupId>org.slf4j</groupId>
-          <artifactId>jcl-over-slf4j</artifactId>
-        </exclusion>
-      </exclusions>
-    </dependency>
-    <dependency>
-      <groupId>edu.ucar</groupId>
-      <artifactId>httpservices</artifactId>
-      <version>${netcdf-java.version}</version>
-    </dependency>
-    <!-- Apache cTAKES -->
-    <dependency>
-      <groupId>org.apache.ctakes</groupId>
-      <artifactId>ctakes-core</artifactId>
-      <version>3.2.2</version>
-      <scope>provided</scope>
-    </dependency>
-    <dependency>
-      <groupId>commons-io</groupId>
-      <artifactId>commons-io</artifactId>
-      <version>${commons.io.version}</version>
-    </dependency>
-    <!-- Upstream parser libraries -->
-    <dependency>
-      <groupId>net.sourceforge.jmatio</groupId>
-      <artifactId>jmatio</artifactId>
-      <version>1.0</version>
-    </dependency>
-    <!-- Apache Commons CSV -->
-    <dependency>
-      <groupId>org.apache.commons</groupId>
-      <artifactId>commons-csv</artifactId>
-      <version>1.0</version>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-text-module</artifactId>
-      <version>${project.version}</version>
-      <scope>test</scope>
-    </dependency>
-  </dependencies>
-  
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-modules</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-scientific-module</artifactId>
+  <name>Apache Tika parser scientific module</name>
+  <url>http://tika.apache.org/</url>
+  
+  <properties>
+    <netcdf-java.version>4.5.5</netcdf-java.version>
+    <sis.version>0.6</sis.version>
+  </properties>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-exec</artifactId>
+      <version>${commons.exec}</version>
+    </dependency>
+    <dependency>
+      <groupId>com.googlecode.json-simple</groupId>
+      <artifactId>json-simple</artifactId>
+      <version>1.1.1</version>
+      <exclusions>
+        <exclusion>
+          <groupId>junit</groupId>
+          <artifactId>junit</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.sis.core</groupId>
+      <artifactId>sis-utility</artifactId>
+      <version>${sis.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.sis.storage</groupId>
+      <artifactId>sis-netcdf</artifactId>
+      <version>${sis.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.sis.core</groupId>
+      <artifactId>sis-metadata</artifactId>
+      <version>${sis.version}</version>
+    </dependency>
+    <!-- edu.ucar dependencies -->
+    <dependency>
+      <groupId>edu.ucar</groupId>
+      <artifactId>netcdf4</artifactId>
+      <version>${netcdf-java.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>edu.ucar</groupId>
+      <artifactId>grib</artifactId>
+      <version>${netcdf-java.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>edu.ucar</groupId>
+      <artifactId>cdm</artifactId>
+      <version>${netcdf-java.version}</version>
+      <exclusions>
+        <exclusion>
+          <groupId>org.slf4j</groupId>
+          <artifactId>jcl-over-slf4j</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>edu.ucar</groupId>
+      <artifactId>httpservices</artifactId>
+      <version>${netcdf-java.version}</version>
+    </dependency>
+    <!-- Apache cTAKES -->
+    <dependency>
+      <groupId>org.apache.ctakes</groupId>
+      <artifactId>ctakes-core</artifactId>
+      <version>3.2.2</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <version>${commons.io.version}</version>
+    </dependency>
+    <!-- Upstream parser libraries -->
+    <dependency>
+      <groupId>net.sourceforge.jmatio</groupId>
+      <artifactId>jmatio</artifactId>
+      <version>1.0</version>
+    </dependency>
+    <!-- Apache Commons CSV -->
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-csv</artifactId>
+      <version>1.0</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-text-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+  
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/module/scientific/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/module/scientific/internal/Activator.java b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/module/scientific/internal/Activator.java
index 0195b63..741b64e 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/module/scientific/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/module/scientific/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.scientific.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
-    @Override
-    public void start(BundleContext context) throws Exception {
-
-        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
-    }
-
-    @Override
-    public void stop(BundleContext context) throws Exception {
-
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.scientific.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+    @Override
+    public void start(BundleContext context) throws Exception {
+
+        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+    }
+
+    @Override
+    public void stop(BundleContext context) throws Exception {
+
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/hdf/HDFParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/hdf/HDFParser.java b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/hdf/HDFParser.java
index 0a3121b..821493b 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/hdf/HDFParser.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/hdf/HDFParser.java
@@ -1,122 +1,122 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.hdf;
-
-//JDK imports
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.netcdf.NetCDFParser;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import ucar.nc2.Attribute;
-import ucar.nc2.Group;
-import ucar.nc2.NetcdfFile;
-
-/**
- * 
- * Since the {@link NetCDFParser} depends on the <a
- * href="http://www.unidata.ucar.edu/software/netcdf-java" >NetCDF-Java</a> API,
- * we are able to use it to parse HDF files as well. See <a href=
- * "http://www.unidata.ucar.edu/software/netcdf-java/formats/FileTypes.html"
- * >this link</a> for more information.
- */
-public class HDFParser extends AbstractParser {
-
-    /** Serial version UID */
-    private static final long serialVersionUID = 1091208208003437549L;
-
-    private static final Set<MediaType> SUPPORTED_TYPES =
-        Collections.singleton(MediaType.application("x-hdf"));
-
-    /*
-     * (non-Javadoc)
-     * 
-     * @see
-     * org.apache.tika.parser.netcdf.NetCDFParser#getSupportedTypes(org.apache
-     * .tika.parser.ParseContext)
-     */
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-    /*
-     * (non-Javadoc)
-     * 
-     * @see
-     * org.apache.tika.parser.netcdf.NetCDFParser#parse(java.io.InputStream,
-     * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata,
-     * org.apache.tika.parser.ParseContext)
-     */
-    public void parse(InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context) throws IOException,
-            SAXException, TikaException {
-        ByteArrayOutputStream os = new ByteArrayOutputStream();
-        IOUtils.copy(stream, os);
-
-        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
-        if (name == null) {
-            name = "";
-        }
-        try {
-            NetcdfFile ncFile = NetcdfFile.openInMemory(name, os.toByteArray());
-            unravelStringMet(ncFile, null, metadata);
-        } catch (IOException e) {
-            throw new TikaException("HDF parse error", e);
-        }
-
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-        xhtml.startDocument();
-        xhtml.endDocument();
-    }
-
-    protected void unravelStringMet(NetcdfFile ncFile, Group group, Metadata met) {
-        if (group == null) {
-            group = ncFile.getRootGroup();
-        }
-
-        // get file type
-        met.set("File-Type-Description", ncFile.getFileTypeDescription());
-        // unravel its string attrs
-        for (Attribute attribute : group.getAttributes()) {
-            if (attribute.isString()) {
-                met.add(attribute.getFullName(), attribute.getStringValue());
-            } else {
-                // try and cast its value to a string
-                met.add(attribute.getFullName(), String.valueOf(attribute
-                        .getNumericValue()));
-            }
-        }
-
-        for (Group g : group.getGroups()) {
-            unravelStringMet(ncFile, g, met);
-        }
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.hdf;
+
+//JDK imports
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.netcdf.NetCDFParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import ucar.nc2.Attribute;
+import ucar.nc2.Group;
+import ucar.nc2.NetcdfFile;
+
+/**
+ * 
+ * Since the {@link NetCDFParser} depends on the <a
+ * href="http://www.unidata.ucar.edu/software/netcdf-java" >NetCDF-Java</a> API,
+ * we are able to use it to parse HDF files as well. See <a href=
+ * "http://www.unidata.ucar.edu/software/netcdf-java/formats/FileTypes.html"
+ * >this link</a> for more information.
+ */
+public class HDFParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 1091208208003437549L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+        Collections.singleton(MediaType.application("x-hdf"));
+
+    /*
+     * (non-Javadoc)
+     * 
+     * @see
+     * org.apache.tika.parser.netcdf.NetCDFParser#getSupportedTypes(org.apache
+     * .tika.parser.ParseContext)
+     */
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /*
+     * (non-Javadoc)
+     * 
+     * @see
+     * org.apache.tika.parser.netcdf.NetCDFParser#parse(java.io.InputStream,
+     * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata,
+     * org.apache.tika.parser.ParseContext)
+     */
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+        ByteArrayOutputStream os = new ByteArrayOutputStream();
+        IOUtils.copy(stream, os);
+
+        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+        if (name == null) {
+            name = "";
+        }
+        try {
+            NetcdfFile ncFile = NetcdfFile.openInMemory(name, os.toByteArray());
+            unravelStringMet(ncFile, null, metadata);
+        } catch (IOException e) {
+            throw new TikaException("HDF parse error", e);
+        }
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.endDocument();
+    }
+
+    protected void unravelStringMet(NetcdfFile ncFile, Group group, Metadata met) {
+        if (group == null) {
+            group = ncFile.getRootGroup();
+        }
+
+        // get file type
+        met.set("File-Type-Description", ncFile.getFileTypeDescription());
+        // unravel its string attrs
+        for (Attribute attribute : group.getAttributes()) {
+            if (attribute.isString()) {
+                met.add(attribute.getFullName(), attribute.getStringValue());
+            } else {
+                // try and cast its value to a string
+                met.add(attribute.getFullName(), String.valueOf(attribute
+                        .getNumericValue()));
+            }
+        }
+
+        for (Group g : group.getGroups()) {
+            unravelStringMet(ncFile, g, met);
+        }
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java
index 1ee4dc7..d54754b 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java
@@ -1,72 +1,72 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.hdf;
-
-//JDK imports
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-
-import org.apache.tika.TikaTest;
-import org.junit.Test;
-
-//TIKA imports
-
-/**
- * 
- * Test suite for the {@link HDFParser}.
- * 
- */
-public class HDFParserTest extends TikaTest {
-
-    @Test
-    public void testParseGlobalMetadata() throws Exception {
-        if(System.getProperty("java.version").startsWith("1.5")) {
-            return;
-        }
-        /*
-         * this is a publicly available HDF5 file from the MLS mission:
-         * 
-         * 
-         * ftp://acdisc.gsfc.nasa.gov/data/s4pa///Aura_MLS_Level2/ML2O3.002//2009
-         * /MLS-Aura_L2GP-O3_v02-23-c01_2009d122.he5
-         */
-
-        XMLResult r = getXML("test.he5", new HDFParser());
-        assertNotNull(r.metadata);
-        assertEquals("5", r.metadata.get("GranuleMonth"));
-    }
-
-    @Test
-    public void testHDF4() throws Exception {
-       if(System.getProperty("java.version").startsWith("1.5")) {
-          return;
-      }
-
-      /*
-       * this is a publicly available HDF4 file from the HD4 examples:
-       * 
-       * http://www.hdfgroup.org/training/hdf4_chunking/Chunkit/bin/input54kmdata.hdf
-       */
-      XMLResult r = getXML("test.hdf", new HDFParser());
-      assertNotNull(r.metadata);
-      assertEquals("Direct read of HDF4 file through CDM library", r.metadata.get("_History"));
-      assertEquals("Ascending", r.metadata.get("Pass"));
-      assertEquals("Hierarchical Data Format, version 4",
-      r.metadata.get("File-Type-Description"));
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.hdf;
+
+//JDK imports
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+import org.apache.tika.TikaTest;
+import org.junit.Test;
+
+//TIKA imports
+
+/**
+ * 
+ * Test suite for the {@link HDFParser}.
+ * 
+ */
+public class HDFParserTest extends TikaTest {
+
+    @Test
+    public void testParseGlobalMetadata() throws Exception {
+        if(System.getProperty("java.version").startsWith("1.5")) {
+            return;
+        }
+        /*
+         * this is a publicly available HDF5 file from the MLS mission:
+         * 
+         * 
+         * ftp://acdisc.gsfc.nasa.gov/data/s4pa///Aura_MLS_Level2/ML2O3.002//2009
+         * /MLS-Aura_L2GP-O3_v02-23-c01_2009d122.he5
+         */
+
+        XMLResult r = getXML("test.he5", new HDFParser());
+        assertNotNull(r.metadata);
+        assertEquals("5", r.metadata.get("GranuleMonth"));
+    }
+
+    @Test
+    public void testHDF4() throws Exception {
+       if(System.getProperty("java.version").startsWith("1.5")) {
+          return;
+      }
+
+      /*
+       * this is a publicly available HDF4 file from the HD4 examples:
+       * 
+       * http://www.hdfgroup.org/training/hdf4_chunking/Chunkit/bin/input54kmdata.hdf
+       */
+      XMLResult r = getXML("test.hdf", new HDFParser());
+      assertNotNull(r.metadata);
+      assertEquals("Direct read of HDF4 file through CDM library", r.metadata.get("_History"));
+      assertEquals("Ascending", r.metadata.get("Pass"));
+      assertEquals("Hierarchical Data Format, version 4",
+      r.metadata.get("File-Type-Description"));
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java
index 7d0f2e8..77a8cc8 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java
@@ -1,61 +1,61 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.netcdf;
-
-//JDK imports
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.junit.Test;
-
-//TIKA imports
-
-/**
- * Test cases to exercise the {@link NetCDFParser}.
- */
-public class NetCDFParserTest extends TikaTest {
-
-    @Test
-    public void testParseGlobalMetadata() throws Exception {
-
-        XMLResult r = getXML("sresa1b_ncar_ccsm3_0_run1_200001.nc", new NetCDFParser());
-        assertEquals(r.metadata.get(TikaCoreProperties.TITLE),
-                "model output prepared for IPCC AR4");
-        assertEquals(r.metadata.get(Metadata.CONTACT), "ccsm@ucar.edu");
-        assertEquals(r.metadata.get(Metadata.PROJECT_ID),
-                "IPCC Fourth Assessment");
-        assertEquals(r.metadata.get(Metadata.CONVENTIONS), "CF-1.0");
-        assertEquals(r.metadata.get(Metadata.REALIZATION), "1");
-        assertEquals(r.metadata.get(Metadata.EXPERIMENT_ID),
-                "720 ppm stabilization experiment (SRESA1B)");
-        assertEquals(r.metadata.get("File-Type-Description"),
-                "NetCDF-3/CDM");
-
-        assertContains("long_name = \"Surface area\"", r.xml);
-        assertContains("float area(lat=128, lon=256)", r.xml);
-        assertContains("float lat(lat=128)", r.xml);
-        assertContains("double lat_bnds(lat=128, bnds=2)", r.xml);
-        assertContains("double lon_bnds(lon=256, bnds=2)", r.xml);
-        
-
-
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.netcdf;
+
+//JDK imports
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Test;
+
+//TIKA imports
+
+/**
+ * Test cases to exercise the {@link NetCDFParser}.
+ */
+public class NetCDFParserTest extends TikaTest {
+
+    @Test
+    public void testParseGlobalMetadata() throws Exception {
+
+        XMLResult r = getXML("sresa1b_ncar_ccsm3_0_run1_200001.nc", new NetCDFParser());
+        assertEquals(r.metadata.get(TikaCoreProperties.TITLE),
+                "model output prepared for IPCC AR4");
+        assertEquals(r.metadata.get(Metadata.CONTACT), "ccsm@ucar.edu");
+        assertEquals(r.metadata.get(Metadata.PROJECT_ID),
+                "IPCC Fourth Assessment");
+        assertEquals(r.metadata.get(Metadata.CONVENTIONS), "CF-1.0");
+        assertEquals(r.metadata.get(Metadata.REALIZATION), "1");
+        assertEquals(r.metadata.get(Metadata.EXPERIMENT_ID),
+                "720 ppm stabilization experiment (SRESA1B)");
+        assertEquals(r.metadata.get("File-Type-Description"),
+                "NetCDF-3/CDM");
+
+        assertContains("long_name = \"Surface area\"", r.xml);
+        assertContains("float area(lat=128, lon=256)", r.xml);
+        assertContains("float lat(lat=128)", r.xml);
+        assertContains("double lat_bnds(lat=128, bnds=2)", r.xml);
+        assertContains("double lon_bnds(lon=256, bnds=2)", r.xml);
+        
+
+
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/pom.xml b/tika-parser-modules/tika-parser-text-module/pom.xml
index 1389d08..aca729b 100644
--- a/tika-parser-modules/tika-parser-text-module/pom.xml
+++ b/tika-parser-modules/tika-parser-text-module/pom.xml
@@ -1,67 +1,67 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-modules</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-text-module</artifactId>
-  <name>Apache Tika parser text module</name>
-  <url>http://tika.apache.org/</url>
-  
-  <properties>
-    <commons.logging.version>1.1.3</commons.logging.version>
-  </properties>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-core</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>com.googlecode.juniversalchardet</groupId>
-      <artifactId>juniversalchardet</artifactId>
-      <version>1.0.3</version>
-    </dependency>
-    <dependency>
-      <groupId>commons-io</groupId>
-      <artifactId>commons-io</artifactId>
-      <version>${commons.io.version}</version>
-    </dependency>
-        <dependency>
-      <groupId>commons-codec</groupId>
-      <artifactId>commons-codec</artifactId>
-      <version>${codec.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>commons-logging</groupId>
-      <artifactId>commons-logging</artifactId>
-      <version>${commons.logging.version}</version>
-    </dependency>
-  </dependencies>
-  
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-modules</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-text-module</artifactId>
+  <name>Apache Tika parser text module</name>
+  <url>http://tika.apache.org/</url>
+  
+  <properties>
+    <commons.logging.version>1.1.3</commons.logging.version>
+  </properties>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>com.googlecode.juniversalchardet</groupId>
+      <artifactId>juniversalchardet</artifactId>
+      <version>1.0.3</version>
+    </dependency>
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <version>${commons.io.version}</version>
+    </dependency>
+        <dependency>
+      <groupId>commons-codec</groupId>
+      <artifactId>commons-codec</artifactId>
+      <version>${codec.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>commons-logging</groupId>
+      <artifactId>commons-logging</artifactId>
+      <version>${commons.logging.version}</version>
+    </dependency>
+  </dependencies>
+  
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/module/text/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/module/text/internal/Activator.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/module/text/internal/Activator.java
index 80716d8..59836c6 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/module/text/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/module/text/internal/Activator.java
@@ -1,20 +1,20 @@
-package org.apache.tika.module.text.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
-    @Override
-    public void start(BundleContext context) throws Exception {
-
-        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
-    }
-
-    @Override
-    public void stop(BundleContext context) throws Exception {
-
-    }
-
-}
+package org.apache.tika.module.text.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+    @Override
+    public void start(BundleContext context) throws Exception {
+
+        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+    }
+
+    @Override
+    public void stop(BundleContext context) throws Exception {
+
+    }
+
+}

[19/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmBlockInfo.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmBlockInfo.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmBlockInfo.java
index 20458bc..4c2bdfd 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmBlockInfo.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmBlockInfo.java
@@ -1,125 +1,125 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertTrue;
-
-import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
-import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
-import org.apache.tika.parser.chm.accessor.ChmItspHeader;
-import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
-import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
-import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.apache.tika.parser.chm.lzx.ChmBlockInfo;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Test;
-
-/**
- * Tests major functionality of ChmBlockInfo
- * 
- */
-public class TestChmBlockInfo {
-    private byte[] data;
-    private ChmBlockInfo chmBlockInfo;
-    private ChmDirectoryListingSet chmDirListCont = null;
-    private ChmLzxcResetTable clrt = null;
-    private ChmLzxcControlData chmLzxcControlData = null;
-
-    @Before
-    public void setUp() throws Exception {
-        data = TestParameters.chmData;
-        /* Creates and parses itsf header */
-        ChmItsfHeader chmItsHeader = new ChmItsfHeader();
-        // chmItsHeader.parse(Arrays.copyOfRange(data, 0,
-        // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
-        chmItsHeader.parse(ChmCommons.copyOfRange(data, 0,
-                ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
-        /* Creates and parses itsp block */
-        ChmItspHeader chmItspHeader = new ChmItspHeader();
-        // chmItspHeader.parse(Arrays.copyOfRange( data, (int)
-        // chmItsHeader.getDirOffset(),
-        // (int) chmItsHeader.getDirOffset()
-        // + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
-        chmItspHeader.parse(ChmCommons.copyOfRange(data,
-                (int) chmItsHeader.getDirOffset(),
-                (int) chmItsHeader.getDirOffset()
-                        + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
-        /* Creating instance of ChmDirListingContainer */
-        chmDirListCont = new ChmDirectoryListingSet(data, chmItsHeader,
-                chmItspHeader);
-        int indexOfControlData = chmDirListCont.getControlDataIndex();
-
-        int indexOfResetTable = ChmCommons.indexOfResetTableBlock(data,
-                ChmConstants.LZXC.getBytes(UTF_8));
-        byte[] dir_chunk = null;
-        if (indexOfResetTable > 0) {
-            // dir_chunk = Arrays.copyOfRange( data, indexOfResetTable,
-            // indexOfResetTable
-            // +
-            // chmDirListCont.getDirectoryListingEntryList().get(indexOfControlData).getLength());
-            dir_chunk = ChmCommons.copyOfRange(data, indexOfResetTable,
-                    indexOfResetTable
-                            + chmDirListCont.getDirectoryListingEntryList()
-                                    .get(indexOfControlData).getLength());
-        }
-
-        /* Creates and parses control block */
-        chmLzxcControlData = new ChmLzxcControlData();
-        chmLzxcControlData.parse(dir_chunk, chmLzxcControlData);
-
-        int indexOfFeList = chmDirListCont.getResetTableIndex();
-        int startIndex = (int) chmDirListCont.getDataOffset()
-                + chmDirListCont.getDirectoryListingEntryList()
-                        .get(indexOfFeList).getOffset();
-        // dir_chunk = Arrays.copyOfRange(data, startIndex , startIndex +
-        // chmDirListCont.getDirectoryListingEntryList().get(indexOfFeList).getLength());
-        dir_chunk = ChmCommons.copyOfRange(data, startIndex, startIndex
-                        + chmDirListCont.getDirectoryListingEntryList().get(indexOfFeList).getLength());
-        clrt = new ChmLzxcResetTable();
-        clrt.parse(dir_chunk, clrt);
-    }
-
-    @Test
-    public void testToString() {
-        if (chmBlockInfo == null)
-            testGetChmBlockInfo();
-        assertTrue(chmBlockInfo.toString().length() > 0);
-    }
-
-    @Test
-    public void testGetChmBlockInfo() {
-        for (DirectoryListingEntry directoryListingEntry : chmDirListCont.getDirectoryListingEntryList()) {
-            chmBlockInfo = ChmBlockInfo.getChmBlockInfoInstance(
-                    directoryListingEntry, (int) clrt.getBlockLen(),
-                    chmLzxcControlData);
-            // Assert.assertTrue(!directoryListingEntry.getName().isEmpty() &&
-            // chmBlockInfo.toString() != null);
-            assertTrue(!ChmCommons.isEmpty(directoryListingEntry
-                    .getName()) && chmBlockInfo.toString() != null);
-        }
-    }
-
-    @After
-    public void tearDown() throws Exception {
-        data = null;
-        chmBlockInfo = null;
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
+import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
+import org.apache.tika.parser.chm.accessor.ChmItspHeader;
+import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
+import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.lzx.ChmBlockInfo;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Tests major functionality of ChmBlockInfo
+ * 
+ */
+public class TestChmBlockInfo {
+    private byte[] data;
+    private ChmBlockInfo chmBlockInfo;
+    private ChmDirectoryListingSet chmDirListCont = null;
+    private ChmLzxcResetTable clrt = null;
+    private ChmLzxcControlData chmLzxcControlData = null;
+
+    @Before
+    public void setUp() throws Exception {
+        data = TestParameters.chmData;
+        /* Creates and parses itsf header */
+        ChmItsfHeader chmItsHeader = new ChmItsfHeader();
+        // chmItsHeader.parse(Arrays.copyOfRange(data, 0,
+        // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
+        chmItsHeader.parse(ChmCommons.copyOfRange(data, 0,
+                ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
+        /* Creates and parses itsp block */
+        ChmItspHeader chmItspHeader = new ChmItspHeader();
+        // chmItspHeader.parse(Arrays.copyOfRange( data, (int)
+        // chmItsHeader.getDirOffset(),
+        // (int) chmItsHeader.getDirOffset()
+        // + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
+        chmItspHeader.parse(ChmCommons.copyOfRange(data,
+                (int) chmItsHeader.getDirOffset(),
+                (int) chmItsHeader.getDirOffset()
+                        + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
+        /* Creating instance of ChmDirListingContainer */
+        chmDirListCont = new ChmDirectoryListingSet(data, chmItsHeader,
+                chmItspHeader);
+        int indexOfControlData = chmDirListCont.getControlDataIndex();
+
+        int indexOfResetTable = ChmCommons.indexOfResetTableBlock(data,
+                ChmConstants.LZXC.getBytes(UTF_8));
+        byte[] dir_chunk = null;
+        if (indexOfResetTable > 0) {
+            // dir_chunk = Arrays.copyOfRange( data, indexOfResetTable,
+            // indexOfResetTable
+            // +
+            // chmDirListCont.getDirectoryListingEntryList().get(indexOfControlData).getLength());
+            dir_chunk = ChmCommons.copyOfRange(data, indexOfResetTable,
+                    indexOfResetTable
+                            + chmDirListCont.getDirectoryListingEntryList()
+                                    .get(indexOfControlData).getLength());
+        }
+
+        /* Creates and parses control block */
+        chmLzxcControlData = new ChmLzxcControlData();
+        chmLzxcControlData.parse(dir_chunk, chmLzxcControlData);
+
+        int indexOfFeList = chmDirListCont.getResetTableIndex();
+        int startIndex = (int) chmDirListCont.getDataOffset()
+                + chmDirListCont.getDirectoryListingEntryList()
+                        .get(indexOfFeList).getOffset();
+        // dir_chunk = Arrays.copyOfRange(data, startIndex , startIndex +
+        // chmDirListCont.getDirectoryListingEntryList().get(indexOfFeList).getLength());
+        dir_chunk = ChmCommons.copyOfRange(data, startIndex, startIndex
+                        + chmDirListCont.getDirectoryListingEntryList().get(indexOfFeList).getLength());
+        clrt = new ChmLzxcResetTable();
+        clrt.parse(dir_chunk, clrt);
+    }
+
+    @Test
+    public void testToString() {
+        if (chmBlockInfo == null)
+            testGetChmBlockInfo();
+        assertTrue(chmBlockInfo.toString().length() > 0);
+    }
+
+    @Test
+    public void testGetChmBlockInfo() {
+        for (DirectoryListingEntry directoryListingEntry : chmDirListCont.getDirectoryListingEntryList()) {
+            chmBlockInfo = ChmBlockInfo.getChmBlockInfoInstance(
+                    directoryListingEntry, (int) clrt.getBlockLen(),
+                    chmLzxcControlData);
+            // Assert.assertTrue(!directoryListingEntry.getName().isEmpty() &&
+            // chmBlockInfo.toString() != null);
+            assertTrue(!ChmCommons.isEmpty(directoryListingEntry
+                    .getName()) && chmBlockInfo.toString() != null);
+        }
+    }
+
+    @After
+    public void tearDown() throws Exception {
+        data = null;
+        chmBlockInfo = null;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java
index 5f53870..229277d 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java
@@ -1,212 +1,212 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm;
-
-import static java.nio.charset.StandardCharsets.ISO_8859_1;
-import static org.junit.Assert.assertTrue;
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Locale;
-import java.util.Set;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.regex.Pattern;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
-import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
-import org.apache.tika.parser.chm.core.ChmExtractor;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.SAXException;
-
-public class TestChmExtraction extends TikaTest {
-
-    private final Parser parser = new ChmParser();
-
-    private final List<String> files = Arrays.asList(
-            "/test-documents/testChm.chm",
-            "/test-documents/testChm2.chm",
-            "/test-documents/testChm3.chm");
-
-    @Test
-    public void testGetText() throws Exception {
-        BodyContentHandler handler = new BodyContentHandler();
-        new ChmParser().parse(
-                new ByteArrayInputStream(TestParameters.chmData),
-                handler, new Metadata(), new ParseContext());
-        assertTrue(handler.toString().contains(
-                "The TCard method accepts only numeric arguments"));
-    }
-
-    @Test
-    public void testChmParser() throws Exception{
-        for (String fileName : files) {
-            InputStream stream = TestChmExtraction.class.getResourceAsStream(fileName);
-            testingChm(stream);
-        }
-    }
-
-    private void testingChm(InputStream stream) throws IOException, SAXException, TikaException {
-      try {
-          BodyContentHandler handler = new BodyContentHandler(-1);
-          parser.parse(stream, handler, new Metadata(), new ParseContext());
-          assertTrue(!handler.toString().isEmpty());
-      } finally {
-          stream.close();
-      }
-    }
-
-    @Test
-    public void testExtractChmEntries() throws TikaException, IOException{
-        for (String fileName : files) {
-            try (InputStream stream = TestChmExtraction.class.getResourceAsStream(fileName)) {
-                testExtractChmEntry(stream);
-            }
-        }
-    }
-    
-    protected boolean findZero(byte[] textData) {
-        for (byte b : textData) {
-            if (b==0) {
-                return true;
-            }
-        }
-        
-        return false;
-    }
-    
-    protected boolean niceAscFileName(String name) {
-        for (char c : name.toCharArray()) {
-            if (c>=127 || c<32) {
-                //non-ascii char or control char
-                return false;
-            }
-        }
-        
-        return true;
-    }
-    
-    protected void testExtractChmEntry(InputStream stream) throws TikaException, IOException{
-        ChmExtractor chmExtractor = new ChmExtractor(stream);
-        ChmDirectoryListingSet entries = chmExtractor.getChmDirList();
-        final Pattern htmlPairP = Pattern.compile("\\Q<html\\E.+\\Q</html>\\E"
-                , Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);
-        
-        Set<String> names = new HashSet<String>();
-        
-        for (DirectoryListingEntry directoryListingEntry : entries.getDirectoryListingEntryList()) {
-            byte[] data = chmExtractor.extractChmEntry(directoryListingEntry);
-            
-            //Entry names should be nice. Disable this if the test chm do have bad looking but valid entry names.
-            if (! niceAscFileName(directoryListingEntry.getName())) {
-                throw new TikaException("Warning: File name contains a non ascii char : " + directoryListingEntry.getName());
-            }
-            
-            final String lowName = directoryListingEntry.getName().toLowerCase(Locale.ROOT);
-            
-            //check duplicate entry name which is seen before.
-            if (names.contains(lowName)) {
-                throw new TikaException("Duplicate File name detected : " + directoryListingEntry.getName());
-            }
-            names.add(lowName);
-            
-            if (lowName.endsWith(".html")
-                    || lowName.endsWith(".htm")
-                    || lowName.endsWith(".hhk")
-                    || lowName.endsWith(".hhc")
-                    //|| name.endsWith(".bmp")
-                    ) {
-                if (findZero(data)) {
-                    throw new TikaException("Xhtml/text file contains '\\0' : " + directoryListingEntry.getName());
-                }
-
-                //validate html
-                String html = new String(data, ISO_8859_1);
-                if (! htmlPairP.matcher(html).find()) {
-                    System.err.println(lowName + " is invalid.");
-                    System.err.println(html);
-                    throw new TikaException("Invalid xhtml file : " + directoryListingEntry.getName());
-                }
-//                else {
-//                    System.err.println(directoryListingEntry.getName() + " is valid.");
-//                }
-            }
-        }
-    }
-    
-
-    @Test
-    public void testMultiThreadedChmExtraction() throws InterruptedException {
-        ExecutorService executor = Executors.newFixedThreadPool(TestParameters.NTHREADS);
-        for (int i = 0; i < TestParameters.NTHREADS; i++) {
-            executor.execute(new Runnable() {
-                public void run() {
-                    for (String fileName : files) {
-                        InputStream stream = null;
-                        try {
-                            stream = TestChmExtraction.class.getResourceAsStream(fileName);
-                            BodyContentHandler handler = new BodyContentHandler(-1);
-                            parser.parse(stream, handler, new Metadata(), new ParseContext());
-                            assertTrue(!handler.toString().isEmpty());
-                        } catch (Exception e) {
-                            e.printStackTrace();
-                        } finally {
-                            try {
-                                stream.close();
-                            } catch (IOException e) {
-                                e.printStackTrace();
-                            }
-                        }
-                    }
-                }
-            });
-        }
-        executor.shutdown();
-        // Waits until all threads will have finished
-        while (!executor.isTerminated()) {
-            Thread.sleep(500);
-        }
-    }
-    
-    @Test
-    public void test_TIKA_1446() throws Exception {
-        String[] chemFiles = {
-                "admin.chm",
-                "cmak_ops.CHM",
-                "comexp.CHM",
-                "gpedit.CHM",
-                "IMJPCL.CHM",
-                "IMJPCLE.CHM",
-                "IMTCEN.CHM",
-                "tcpip.CHM",
-                "wmicontrol.CHM"
-        };
-        for (String fileName : chemFiles) {
-            testingChm(getTestDocumentAsStream("chm/"+fileName));
-        }
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.regex.Pattern;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.core.ChmExtractor;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.SAXException;
+
+public class TestChmExtraction extends TikaTest {
+
+    private final Parser parser = new ChmParser();
+
+    private final List<String> files = Arrays.asList(
+            "/test-documents/testChm.chm",
+            "/test-documents/testChm2.chm",
+            "/test-documents/testChm3.chm");
+
+    @Test
+    public void testGetText() throws Exception {
+        BodyContentHandler handler = new BodyContentHandler();
+        new ChmParser().parse(
+                new ByteArrayInputStream(TestParameters.chmData),
+                handler, new Metadata(), new ParseContext());
+        assertTrue(handler.toString().contains(
+                "The TCard method accepts only numeric arguments"));
+    }
+
+    @Test
+    public void testChmParser() throws Exception{
+        for (String fileName : files) {
+            InputStream stream = TestChmExtraction.class.getResourceAsStream(fileName);
+            testingChm(stream);
+        }
+    }
+
+    private void testingChm(InputStream stream) throws IOException, SAXException, TikaException {
+      try {
+          BodyContentHandler handler = new BodyContentHandler(-1);
+          parser.parse(stream, handler, new Metadata(), new ParseContext());
+          assertTrue(!handler.toString().isEmpty());
+      } finally {
+          stream.close();
+      }
+    }
+
+    @Test
+    public void testExtractChmEntries() throws TikaException, IOException{
+        for (String fileName : files) {
+            try (InputStream stream = TestChmExtraction.class.getResourceAsStream(fileName)) {
+                testExtractChmEntry(stream);
+            }
+        }
+    }
+    
+    protected boolean findZero(byte[] textData) {
+        for (byte b : textData) {
+            if (b==0) {
+                return true;
+            }
+        }
+        
+        return false;
+    }
+    
+    protected boolean niceAscFileName(String name) {
+        for (char c : name.toCharArray()) {
+            if (c>=127 || c<32) {
+                //non-ascii char or control char
+                return false;
+            }
+        }
+        
+        return true;
+    }
+    
+    protected void testExtractChmEntry(InputStream stream) throws TikaException, IOException{
+        ChmExtractor chmExtractor = new ChmExtractor(stream);
+        ChmDirectoryListingSet entries = chmExtractor.getChmDirList();
+        final Pattern htmlPairP = Pattern.compile("\\Q<html\\E.+\\Q</html>\\E"
+                , Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);
+        
+        Set<String> names = new HashSet<String>();
+        
+        for (DirectoryListingEntry directoryListingEntry : entries.getDirectoryListingEntryList()) {
+            byte[] data = chmExtractor.extractChmEntry(directoryListingEntry);
+            
+            //Entry names should be nice. Disable this if the test chm do have bad looking but valid entry names.
+            if (! niceAscFileName(directoryListingEntry.getName())) {
+                throw new TikaException("Warning: File name contains a non ascii char : " + directoryListingEntry.getName());
+            }
+            
+            final String lowName = directoryListingEntry.getName().toLowerCase(Locale.ROOT);
+            
+            //check duplicate entry name which is seen before.
+            if (names.contains(lowName)) {
+                throw new TikaException("Duplicate File name detected : " + directoryListingEntry.getName());
+            }
+            names.add(lowName);
+            
+            if (lowName.endsWith(".html")
+                    || lowName.endsWith(".htm")
+                    || lowName.endsWith(".hhk")
+                    || lowName.endsWith(".hhc")
+                    //|| name.endsWith(".bmp")
+                    ) {
+                if (findZero(data)) {
+                    throw new TikaException("Xhtml/text file contains '\\0' : " + directoryListingEntry.getName());
+                }
+
+                //validate html
+                String html = new String(data, ISO_8859_1);
+                if (! htmlPairP.matcher(html).find()) {
+                    System.err.println(lowName + " is invalid.");
+                    System.err.println(html);
+                    throw new TikaException("Invalid xhtml file : " + directoryListingEntry.getName());
+                }
+//                else {
+//                    System.err.println(directoryListingEntry.getName() + " is valid.");
+//                }
+            }
+        }
+    }
+    
+
+    @Test
+    public void testMultiThreadedChmExtraction() throws InterruptedException {
+        ExecutorService executor = Executors.newFixedThreadPool(TestParameters.NTHREADS);
+        for (int i = 0; i < TestParameters.NTHREADS; i++) {
+            executor.execute(new Runnable() {
+                public void run() {
+                    for (String fileName : files) {
+                        InputStream stream = null;
+                        try {
+                            stream = TestChmExtraction.class.getResourceAsStream(fileName);
+                            BodyContentHandler handler = new BodyContentHandler(-1);
+                            parser.parse(stream, handler, new Metadata(), new ParseContext());
+                            assertTrue(!handler.toString().isEmpty());
+                        } catch (Exception e) {
+                            e.printStackTrace();
+                        } finally {
+                            try {
+                                stream.close();
+                            } catch (IOException e) {
+                                e.printStackTrace();
+                            }
+                        }
+                    }
+                }
+            });
+        }
+        executor.shutdown();
+        // Waits until all threads will have finished
+        while (!executor.isTerminated()) {
+            Thread.sleep(500);
+        }
+    }
+    
+    @Test
+    public void test_TIKA_1446() throws Exception {
+        String[] chemFiles = {
+                "admin.chm",
+                "cmak_ops.CHM",
+                "comexp.CHM",
+                "gpedit.CHM",
+                "IMJPCL.CHM",
+                "IMJPCLE.CHM",
+                "IMTCEN.CHM",
+                "tcpip.CHM",
+                "wmicontrol.CHM"
+        };
+        for (String fileName : chemFiles) {
+            testingChm(getTestDocumentAsStream("chm/"+fileName));
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
index 4301240..c072db0 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
@@ -1,63 +1,63 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm;
-
-import java.io.ByteArrayInputStream;
-import java.util.List;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
-import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
-import org.apache.tika.parser.chm.core.ChmExtractor;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import org.junit.Before;
-import org.junit.Test;
-
-public class TestChmExtractor {
-    private ChmExtractor chmExtractor = null;
-
-    @Before
-    public void setUp() throws Exception {
-        chmExtractor = new ChmExtractor(
-                new ByteArrayInputStream(TestParameters.chmData));
-    }
-
-    @Test
-    public void testEnumerateChm() {
-        List<String> chmEntries = chmExtractor.enumerateChm();
-        assertEquals(TestParameters.VP_CHM_ENTITIES_NUMBER,
-                chmEntries.size());
-    }
-
-    @Test
-    public void testGetChmDirList() {
-        assertNotNull(chmExtractor.getChmDirList());
-    }
-
-    @Test
-    public void testExtractChmEntry() throws TikaException{
-        ChmDirectoryListingSet entries = chmExtractor.getChmDirList();
-        
-        int count = 0;
-        for (DirectoryListingEntry directoryListingEntry : entries.getDirectoryListingEntryList()) {
-            chmExtractor.extractChmEntry(directoryListingEntry);
-            ++count;
-        }
-        assertEquals(TestParameters.VP_CHM_ENTITIES_NUMBER, count);
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import java.io.ByteArrayInputStream;
+import java.util.List;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.core.ChmExtractor;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestChmExtractor {
+    private ChmExtractor chmExtractor = null;
+
+    @Before
+    public void setUp() throws Exception {
+        chmExtractor = new ChmExtractor(
+                new ByteArrayInputStream(TestParameters.chmData));
+    }
+
+    @Test
+    public void testEnumerateChm() {
+        List<String> chmEntries = chmExtractor.enumerateChm();
+        assertEquals(TestParameters.VP_CHM_ENTITIES_NUMBER,
+                chmEntries.size());
+    }
+
+    @Test
+    public void testGetChmDirList() {
+        assertNotNull(chmExtractor.getChmDirList());
+    }
+
+    @Test
+    public void testExtractChmEntry() throws TikaException{
+        ChmDirectoryListingSet entries = chmExtractor.getChmDirList();
+        
+        int count = 0;
+        for (DirectoryListingEntry directoryListingEntry : entries.getDirectoryListingEntryList()) {
+            chmExtractor.extractChmEntry(directoryListingEntry);
+            ++count;
+        }
+        assertEquals(TestParameters.VP_CHM_ENTITIES_NUMBER, count);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmItsfHeader.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmItsfHeader.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmItsfHeader.java
index 6bda44a..05d3820 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmItsfHeader.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmItsfHeader.java
@@ -1,122 +1,122 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-
-import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Test;
-
-/**
- * Tests all public functions of ChmItsfHeader
- * 
- */
-public class TestChmItsfHeader {
-    private ChmItsfHeader chmItsfHeader = null;
-
-    @Before
-    public void setUp() throws Exception {
-        chmItsfHeader = new ChmItsfHeader();
-        byte[] data = TestParameters.chmData;
-        // chmItsfHeader.parse(Arrays.copyOfRange(data, 0,
-        // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
-        chmItsfHeader.parse(ChmCommons.copyOfRange(data, 0,
-                ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
-    }
-
-    @Test
-    public void getDataOffset() {
-        assertEquals(TestParameters.VP_DATA_OFFSET_LENGTH,
-                chmItsfHeader.getDataOffset());
-    }
-
-    @Test
-    public void getDir_uuid() {
-        assertNotNull(chmItsfHeader.getDir_uuid());
-    }
-
-    @Test
-    public void getDirLen() {
-        assertEquals(TestParameters.VP_DIRECTORY_LENGTH,
-                chmItsfHeader.getDirLen());
-    }
-
-    @Test
-    public void getDirOffset() {
-        assertEquals(TestParameters.VP_DIRECTORY_OFFSET,
-                chmItsfHeader.getDirOffset());
-    }
-
-    @Test
-    public void getHeaderLen() {
-        assertEquals(TestParameters.VP_ITSF_HEADER_LENGTH,
-                chmItsfHeader.getHeaderLen());
-    }
-
-    @Test
-    public void getLangId() {
-        assertEquals(TestParameters.VP_LANGUAGE_ID,
-                chmItsfHeader.getLangId());
-    }
-
-    @Test
-    public void getLastModified() {
-        assertEquals(TestParameters.VP_LAST_MODIFIED,
-                chmItsfHeader.getLastModified());
-    }
-
-    @Test
-    public void getUnknown_000c() {
-        assertEquals(TestParameters.VP_UNKNOWN_000C,
-                chmItsfHeader.getUnknown_000c());
-    }
-
-    @Test
-    public void getUnknownLen() {
-        assertEquals(TestParameters.VP_UNKNOWN_LEN,
-                chmItsfHeader.getUnknownLen());
-    }
-    
-    @Test
-    public void getUnknownOffset() {
-        assertEquals(TestParameters.VP_UNKNOWN_OFFSET,
-                chmItsfHeader.getUnknownOffset());
-    }
-
-    @Test
-    public void getVersion() {
-        assertEquals(TestParameters.VP_VERSION,
-                chmItsfHeader.getVersion());
-    }
-
-    @Test
-    public void testToString() {
-        assertTrue(chmItsfHeader.toString().contains(
-                TestParameters.VP_ISTF_SIGNATURE));
-    }
-
-    @After
-    public void tearDown() throws Exception {
-        chmItsfHeader = null;
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Tests all public functions of ChmItsfHeader
+ * 
+ */
+public class TestChmItsfHeader {
+    private ChmItsfHeader chmItsfHeader = null;
+
+    @Before
+    public void setUp() throws Exception {
+        chmItsfHeader = new ChmItsfHeader();
+        byte[] data = TestParameters.chmData;
+        // chmItsfHeader.parse(Arrays.copyOfRange(data, 0,
+        // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
+        chmItsfHeader.parse(ChmCommons.copyOfRange(data, 0,
+                ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
+    }
+
+    @Test
+    public void getDataOffset() {
+        assertEquals(TestParameters.VP_DATA_OFFSET_LENGTH,
+                chmItsfHeader.getDataOffset());
+    }
+
+    @Test
+    public void getDir_uuid() {
+        assertNotNull(chmItsfHeader.getDir_uuid());
+    }
+
+    @Test
+    public void getDirLen() {
+        assertEquals(TestParameters.VP_DIRECTORY_LENGTH,
+                chmItsfHeader.getDirLen());
+    }
+
+    @Test
+    public void getDirOffset() {
+        assertEquals(TestParameters.VP_DIRECTORY_OFFSET,
+                chmItsfHeader.getDirOffset());
+    }
+
+    @Test
+    public void getHeaderLen() {
+        assertEquals(TestParameters.VP_ITSF_HEADER_LENGTH,
+                chmItsfHeader.getHeaderLen());
+    }
+
+    @Test
+    public void getLangId() {
+        assertEquals(TestParameters.VP_LANGUAGE_ID,
+                chmItsfHeader.getLangId());
+    }
+
+    @Test
+    public void getLastModified() {
+        assertEquals(TestParameters.VP_LAST_MODIFIED,
+                chmItsfHeader.getLastModified());
+    }
+
+    @Test
+    public void getUnknown_000c() {
+        assertEquals(TestParameters.VP_UNKNOWN_000C,
+                chmItsfHeader.getUnknown_000c());
+    }
+
+    @Test
+    public void getUnknownLen() {
+        assertEquals(TestParameters.VP_UNKNOWN_LEN,
+                chmItsfHeader.getUnknownLen());
+    }
+    
+    @Test
+    public void getUnknownOffset() {
+        assertEquals(TestParameters.VP_UNKNOWN_OFFSET,
+                chmItsfHeader.getUnknownOffset());
+    }
+
+    @Test
+    public void getVersion() {
+        assertEquals(TestParameters.VP_VERSION,
+                chmItsfHeader.getVersion());
+    }
+
+    @Test
+    public void testToString() {
+        assertTrue(chmItsfHeader.toString().contains(
+                TestParameters.VP_ISTF_SIGNATURE));
+    }
+
+    @After
+    public void tearDown() throws Exception {
+        chmItsfHeader = null;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmItspHeader.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmItspHeader.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmItspHeader.java
index 91e4ba6..e78e7c8 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmItspHeader.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmItspHeader.java
@@ -1,160 +1,160 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
-import org.apache.tika.parser.chm.accessor.ChmItspHeader;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Test;
-
-/**
- * Tests all public methods of the ChmItspHeader
- * 
- */
-public class TestChmItspHeader {
-    private ChmItspHeader chmItspHeader = null;
-
-    @Before
-    public void setUp() throws Exception {
-        byte[] data = TestParameters.chmData;
-
-        ChmItsfHeader chmItsfHeader = new ChmItsfHeader();
-        // chmItsfHeader.parse(Arrays.copyOfRange(data, 0,
-        // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
-        chmItsfHeader.parse(ChmCommons.copyOfRange(data, 0,
-                ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
-
-        chmItspHeader = new ChmItspHeader();
-        // chmItspHeader.parse(Arrays.copyOfRange( data, (int)
-        // chmItsfHeader.getDirOffset(),
-        // (int) chmItsfHeader.getDirOffset()
-        // + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
-        chmItspHeader.parse(ChmCommons.copyOfRange(data,
-                (int) chmItsfHeader.getDirOffset(),
-                (int) chmItsfHeader.getDirOffset()
-                        + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
-    }
-
-    @Test
-    public void testGetBlock_len() {
-        assertEquals(TestParameters.VP_BLOCK_LENGTH,
-                chmItspHeader.getBlock_len());
-    }
-
-    @Test
-    public void testGetBlockidx_intvl() {
-        assertEquals(TestParameters.VP_BLOCK_INDEX_INTERVAL,
-                chmItspHeader.getBlockidx_intvl());
-    }
-
-    @Test
-    public void testGetHeader_len() {
-        assertEquals(TestParameters.VP_ITSP_HEADER_LENGTH,
-                chmItspHeader.getHeader_len());
-    }
-
-    @Test
-    public void testGetIndex_depth() {
-        assertEquals(TestParameters.VP_INDEX_DEPTH,
-                chmItspHeader.getIndex_depth());
-    }
-
-    @Test
-    public void testGetIndex_head() {
-        assertEquals(TestParameters.VP_INDEX_HEAD,
-                chmItspHeader.getIndex_head());
-    }
-
-    @Test
-    public void testGetIndex_root() {
-        assertEquals(TestParameters.VP_INDEX_ROOT,
-                chmItspHeader.getIndex_root());
-    }
-
-    @Test
-    public void testGetLang_id() {
-        assertEquals(TestParameters.VP_LANGUAGE_ID,
-                chmItspHeader.getLang_id());
-    }
-
-    @Test
-    public void testGetNum_blocks() {
-        assertEquals(TestParameters.VP_UNKNOWN_NUM_BLOCKS,
-                chmItspHeader.getNum_blocks());
-    }
-
-    @Test
-    public void testGetUnknown_000c() {
-        assertEquals(TestParameters.VP_ITSP_UNKNOWN_000C,
-                chmItspHeader.getUnknown_000c());
-    }
-
-    @Test
-    public void testGetUnknown_0024() {
-        assertEquals(TestParameters.VP_ITSP_UNKNOWN_0024,
-                chmItspHeader.getUnknown_0024());
-    }
-
-    @Test
-    public void testGetUnknown_002() {
-        assertEquals(TestParameters.VP_ITSP_UNKNOWN_002C,
-                chmItspHeader.getUnknown_002c());
-    }
-
-    @Test
-    public void testGetUnknown_0044() {
-        assertEquals(TestParameters.VP_ITSP_BYTEARR_LEN,
-                chmItspHeader.getUnknown_0044().length);
-    }
-
-    @Test
-    public void testGetVersion() {
-        assertEquals(TestParameters.VP_ITSP_VERSION,
-                chmItspHeader.getVersion());
-    }
-
-    @Test
-    public void testGetSignature() {
-        assertEquals(TestParameters.VP_ISTP_SIGNATURE, new String(
-                chmItspHeader.getSignature(), UTF_8));
-    }
-
-    @Test
-    public void testGetSystem_uuid() {
-        assertEquals(TestParameters.VP_ITSP_BYTEARR_LEN,
-                chmItspHeader.getSystem_uuid().length);
-    }
-
-    @Test
-    public void testToString() {
-        assertTrue(chmItspHeader.toString().contains(
-                TestParameters.VP_ISTP_SIGNATURE));
-    }
-
-    @After
-    public void tearDown() throws Exception {
-        chmItspHeader = null;
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
+import org.apache.tika.parser.chm.accessor.ChmItspHeader;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Tests all public methods of the ChmItspHeader
+ * 
+ */
+public class TestChmItspHeader {
+    private ChmItspHeader chmItspHeader = null;
+
+    @Before
+    public void setUp() throws Exception {
+        byte[] data = TestParameters.chmData;
+
+        ChmItsfHeader chmItsfHeader = new ChmItsfHeader();
+        // chmItsfHeader.parse(Arrays.copyOfRange(data, 0,
+        // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
+        chmItsfHeader.parse(ChmCommons.copyOfRange(data, 0,
+                ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
+
+        chmItspHeader = new ChmItspHeader();
+        // chmItspHeader.parse(Arrays.copyOfRange( data, (int)
+        // chmItsfHeader.getDirOffset(),
+        // (int) chmItsfHeader.getDirOffset()
+        // + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
+        chmItspHeader.parse(ChmCommons.copyOfRange(data,
+                (int) chmItsfHeader.getDirOffset(),
+                (int) chmItsfHeader.getDirOffset()
+                        + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
+    }
+
+    @Test
+    public void testGetBlock_len() {
+        assertEquals(TestParameters.VP_BLOCK_LENGTH,
+                chmItspHeader.getBlock_len());
+    }
+
+    @Test
+    public void testGetBlockidx_intvl() {
+        assertEquals(TestParameters.VP_BLOCK_INDEX_INTERVAL,
+                chmItspHeader.getBlockidx_intvl());
+    }
+
+    @Test
+    public void testGetHeader_len() {
+        assertEquals(TestParameters.VP_ITSP_HEADER_LENGTH,
+                chmItspHeader.getHeader_len());
+    }
+
+    @Test
+    public void testGetIndex_depth() {
+        assertEquals(TestParameters.VP_INDEX_DEPTH,
+                chmItspHeader.getIndex_depth());
+    }
+
+    @Test
+    public void testGetIndex_head() {
+        assertEquals(TestParameters.VP_INDEX_HEAD,
+                chmItspHeader.getIndex_head());
+    }
+
+    @Test
+    public void testGetIndex_root() {
+        assertEquals(TestParameters.VP_INDEX_ROOT,
+                chmItspHeader.getIndex_root());
+    }
+
+    @Test
+    public void testGetLang_id() {
+        assertEquals(TestParameters.VP_LANGUAGE_ID,
+                chmItspHeader.getLang_id());
+    }
+
+    @Test
+    public void testGetNum_blocks() {
+        assertEquals(TestParameters.VP_UNKNOWN_NUM_BLOCKS,
+                chmItspHeader.getNum_blocks());
+    }
+
+    @Test
+    public void testGetUnknown_000c() {
+        assertEquals(TestParameters.VP_ITSP_UNKNOWN_000C,
+                chmItspHeader.getUnknown_000c());
+    }
+
+    @Test
+    public void testGetUnknown_0024() {
+        assertEquals(TestParameters.VP_ITSP_UNKNOWN_0024,
+                chmItspHeader.getUnknown_0024());
+    }
+
+    @Test
+    public void testGetUnknown_002() {
+        assertEquals(TestParameters.VP_ITSP_UNKNOWN_002C,
+                chmItspHeader.getUnknown_002c());
+    }
+
+    @Test
+    public void testGetUnknown_0044() {
+        assertEquals(TestParameters.VP_ITSP_BYTEARR_LEN,
+                chmItspHeader.getUnknown_0044().length);
+    }
+
+    @Test
+    public void testGetVersion() {
+        assertEquals(TestParameters.VP_ITSP_VERSION,
+                chmItspHeader.getVersion());
+    }
+
+    @Test
+    public void testGetSignature() {
+        assertEquals(TestParameters.VP_ISTP_SIGNATURE, new String(
+                chmItspHeader.getSignature(), UTF_8));
+    }
+
+    @Test
+    public void testGetSystem_uuid() {
+        assertEquals(TestParameters.VP_ITSP_BYTEARR_LEN,
+                chmItspHeader.getSystem_uuid().length);
+    }
+
+    @Test
+    public void testToString() {
+        assertTrue(chmItspHeader.toString().contains(
+                TestParameters.VP_ISTP_SIGNATURE));
+    }
+
+    @After
+    public void tearDown() throws Exception {
+        chmItspHeader = null;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxState.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxState.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxState.java
index d40874f..c8a8eb7 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxState.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxState.java
@@ -1,101 +1,101 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm;
-
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
-import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
-import org.apache.tika.parser.chm.accessor.ChmItspHeader;
-import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.apache.tika.parser.chm.lzx.ChmLzxState;
-import org.junit.Before;
-import org.junit.Test;
-
-public class TestChmLzxState {
-    private ChmLzxState chmLzxState;
-    private int windowSize;
-
-    @Before
-    public void setUp() throws Exception {
-        byte[] data = TestParameters.chmData;
-
-        /* Creates and parses itsf header */
-        ChmItsfHeader chmItsHeader = new ChmItsfHeader();
-        // chmItsHeader.parse(Arrays.copyOfRange(data, 0,
-        // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
-        chmItsHeader.parse(ChmCommons.copyOfRange(data, 0,
-                ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
-        /* Creates and parses itsp block */
-        ChmItspHeader chmItspHeader = new ChmItspHeader();
-        // chmItspHeader.parse(Arrays.copyOfRange( data, (int)
-        // chmItsHeader.getDirOffset(),
-        // (int) chmItsHeader.getDirOffset()
-        // + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
-        chmItspHeader.parse(ChmCommons.copyOfRange(data,
-                (int) chmItsHeader.getDirOffset(),
-                (int) chmItsHeader.getDirOffset()
-                + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
-
-        /* Creating instance of ChmDirListingContainer */
-        ChmDirectoryListingSet chmDirListCont = new ChmDirectoryListingSet(
-                data, chmItsHeader, chmItspHeader);
-        int indexOfControlData = ChmCommons.indexOf(
-                chmDirListCont.getDirectoryListingEntryList(),
-                ChmConstants.CONTROL_DATA);
-
-        int indexOfResetTable = ChmCommons.indexOfResetTableBlock(data,
-                ChmConstants.LZXC.getBytes(UTF_8));
-        byte[] dir_chunk = null;
-        if (indexOfResetTable > 0) {
-            // dir_chunk = Arrays.copyOfRange( data, indexOfResetTable,
-            // indexOfResetTable
-            // +
-            // chmDirListCont.getDirectoryListingEntryList().get(indexOfControlData).getLength());
-            dir_chunk = ChmCommons.copyOfRange(data, indexOfResetTable,
-                    indexOfResetTable
-                    + chmDirListCont.getDirectoryListingEntryList()
-                    .get(indexOfControlData).getLength());
-        }
-
-        ChmLzxcControlData clcd = new ChmLzxcControlData();
-        clcd.parse(dir_chunk, clcd);
-        windowSize = (int) clcd.getWindowSize();
-    }
-
-    @Test
-    public void testChmLzxStateConstructor() throws TikaException {
-        chmLzxState = new ChmLzxState(windowSize);
-        assertNotNull(chmLzxState);
-    }
-
-    @Test
-    public void testToString() throws TikaException {
-        if (chmLzxState == null)
-            testChmLzxStateConstructor();
-        assertTrue(chmLzxState.toString().length() > 20);
-    }
-
-    // TODO add more tests
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
+import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
+import org.apache.tika.parser.chm.accessor.ChmItspHeader;
+import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.lzx.ChmLzxState;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestChmLzxState {
+    private ChmLzxState chmLzxState;
+    private int windowSize;
+
+    @Before
+    public void setUp() throws Exception {
+        byte[] data = TestParameters.chmData;
+
+        /* Creates and parses itsf header */
+        ChmItsfHeader chmItsHeader = new ChmItsfHeader();
+        // chmItsHeader.parse(Arrays.copyOfRange(data, 0,
+        // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
+        chmItsHeader.parse(ChmCommons.copyOfRange(data, 0,
+                ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
+        /* Creates and parses itsp block */
+        ChmItspHeader chmItspHeader = new ChmItspHeader();
+        // chmItspHeader.parse(Arrays.copyOfRange( data, (int)
+        // chmItsHeader.getDirOffset(),
+        // (int) chmItsHeader.getDirOffset()
+        // + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
+        chmItspHeader.parse(ChmCommons.copyOfRange(data,
+                (int) chmItsHeader.getDirOffset(),
+                (int) chmItsHeader.getDirOffset()
+                + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
+
+        /* Creating instance of ChmDirListingContainer */
+        ChmDirectoryListingSet chmDirListCont = new ChmDirectoryListingSet(
+                data, chmItsHeader, chmItspHeader);
+        int indexOfControlData = ChmCommons.indexOf(
+                chmDirListCont.getDirectoryListingEntryList(),
+                ChmConstants.CONTROL_DATA);
+
+        int indexOfResetTable = ChmCommons.indexOfResetTableBlock(data,
+                ChmConstants.LZXC.getBytes(UTF_8));
+        byte[] dir_chunk = null;
+        if (indexOfResetTable > 0) {
+            // dir_chunk = Arrays.copyOfRange( data, indexOfResetTable,
+            // indexOfResetTable
+            // +
+            // chmDirListCont.getDirectoryListingEntryList().get(indexOfControlData).getLength());
+            dir_chunk = ChmCommons.copyOfRange(data, indexOfResetTable,
+                    indexOfResetTable
+                    + chmDirListCont.getDirectoryListingEntryList()
+                    .get(indexOfControlData).getLength());
+        }
+
+        ChmLzxcControlData clcd = new ChmLzxcControlData();
+        clcd.parse(dir_chunk, clcd);
+        windowSize = (int) clcd.getWindowSize();
+    }
+
+    @Test
+    public void testChmLzxStateConstructor() throws TikaException {
+        chmLzxState = new ChmLzxState(windowSize);
+        assertNotNull(chmLzxState);
+    }
+
+    @Test
+    public void testToString() throws TikaException {
+        if (chmLzxState == null)
+            testChmLzxStateConstructor();
+        assertTrue(chmLzxState.toString().length() > 20);
+    }
+
+    // TODO add more tests
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcControlData.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcControlData.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcControlData.java
index 4449b70..e7992bf 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcControlData.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcControlData.java
@@ -1,144 +1,144 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-
-import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
-import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
-import org.apache.tika.parser.chm.accessor.ChmItspHeader;
-import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.junit.Before;
-import org.junit.Test;
-
-/**
- * Tests all public methods of ChmLzxcControlData block
- */
-public class TestChmLzxcControlData {
-    private ChmLzxcControlData chmLzxcControlData = null;
-
-    @Before
-    public void setUp() throws Exception {
-        byte[] data = TestParameters.chmData;
-        /* Creates and parses itsf header */
-        ChmItsfHeader chmItsHeader = new ChmItsfHeader();
-        // chmItsHeader.parse(Arrays.copyOfRange(data, 0,
-        // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
-        chmItsHeader.parse(ChmCommons.copyOfRange(data, 0,
-                ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
-        /* Creates and parses itsp block */
-        ChmItspHeader chmItspHeader = new ChmItspHeader();
-        // chmItspHeader.parse(Arrays.copyOfRange( data, (int)
-        // chmItsHeader.getDirOffset(),
-        // (int) chmItsHeader.getDirOffset()
-        // + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
-        chmItspHeader.parse(ChmCommons.copyOfRange(data,
-                (int) chmItsHeader.getDirOffset(),
-                (int) chmItsHeader.getDirOffset()
-                        + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
-        /* Creating instance of ChmDirListingContainer */
-        ChmDirectoryListingSet chmDirListCont = new ChmDirectoryListingSet(
-                data, chmItsHeader, chmItspHeader);
-        int indexOfControlData = chmDirListCont.getControlDataIndex();
-
-        int indexOfResetTable = ChmCommons.indexOfResetTableBlock(data,
-                ChmConstants.LZXC.getBytes(UTF_8));
-        byte[] dir_chunk = null;
-        if (indexOfResetTable > 0) {
-            // dir_chunk = Arrays.copyOfRange( data, indexOfResetTable,
-            // indexOfResetTable
-            // +
-            // chmDirListCont.getDirectoryListingEntryList().get(indexOfControlData).getLength());
-            dir_chunk = ChmCommons.copyOfRange(data, indexOfResetTable,
-                    indexOfResetTable
-                            + chmDirListCont.getDirectoryListingEntryList()
-                                    .get(indexOfControlData).getLength());
-        }
-
-        /* Creates and parses control block */
-        chmLzxcControlData = new ChmLzxcControlData();
-        chmLzxcControlData.parse(dir_chunk, chmLzxcControlData);
-
-    }
-
-    @Test
-    public void testConstructorNotNull() {
-        assertNotNull(chmLzxcControlData);
-    }
-
-    @Test
-    public void testGetResetInterval() {
-        assertEquals(TestParameters.VP_RESET_INTERVAL,
-                chmLzxcControlData.getResetInterval());
-    }
-
-    @Test
-    public void testGetSize() {
-        assertEquals(TestParameters.VP_CONTROL_DATA_SIZE,
-                chmLzxcControlData.getSize());
-    }
-
-    @Test
-    public void testGetUnknown_18() {
-        assertEquals(TestParameters.VP_UNKNOWN_18,
-                chmLzxcControlData.getUnknown_18());
-    }
-
-    @Test
-    public void testGetVersion() {
-        assertEquals(TestParameters.VP_CONTROL_DATA_VERSION,
-                chmLzxcControlData.getVersion());
-    }
-
-    @Test
-    public void testGetWindowSize() {
-        assertEquals(TestParameters.VP_WINDOW_SIZE,
-                chmLzxcControlData.getWindowSize());
-    }
-
-    @Test
-    public void testGetWindowsPerReset() {
-        assertEquals(TestParameters.VP_WINDOWS_PER_RESET,
-                chmLzxcControlData.getWindowsPerReset());
-    }
-
-    @Test
-    public void testGetToString() {
-        assertTrue(chmLzxcControlData.toString().contains(
-                TestParameters.VP_CONTROL_DATA_SIGNATURE));
-    }
-
-    @Test
-    public void testGetSignature() {
-        assertEquals(
-                TestParameters.VP_CONTROL_DATA_SIGNATURE.getBytes(UTF_8).length,
-                chmLzxcControlData.getSignature().length);
-    }
-
-    @Test
-    public void testGetSignaure() {
-        assertEquals(
-                TestParameters.VP_CONTROL_DATA_SIGNATURE.getBytes(UTF_8).length,
-                chmLzxcControlData.getSignature().length);
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
+import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
+import org.apache.tika.parser.chm.accessor.ChmItspHeader;
+import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Tests all public methods of ChmLzxcControlData block
+ */
+public class TestChmLzxcControlData {
+    private ChmLzxcControlData chmLzxcControlData = null;
+
+    @Before
+    public void setUp() throws Exception {
+        byte[] data = TestParameters.chmData;
+        /* Creates and parses itsf header */
+        ChmItsfHeader chmItsHeader = new ChmItsfHeader();
+        // chmItsHeader.parse(Arrays.copyOfRange(data, 0,
+        // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
+        chmItsHeader.parse(ChmCommons.copyOfRange(data, 0,
+                ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
+        /* Creates and parses itsp block */
+        ChmItspHeader chmItspHeader = new ChmItspHeader();
+        // chmItspHeader.parse(Arrays.copyOfRange( data, (int)
+        // chmItsHeader.getDirOffset(),
+        // (int) chmItsHeader.getDirOffset()
+        // + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
+        chmItspHeader.parse(ChmCommons.copyOfRange(data,
+                (int) chmItsHeader.getDirOffset(),
+                (int) chmItsHeader.getDirOffset()
+                        + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
+        /* Creating instance of ChmDirListingContainer */
+        ChmDirectoryListingSet chmDirListCont = new ChmDirectoryListingSet(
+                data, chmItsHeader, chmItspHeader);
+        int indexOfControlData = chmDirListCont.getControlDataIndex();
+
+        int indexOfResetTable = ChmCommons.indexOfResetTableBlock(data,
+                ChmConstants.LZXC.getBytes(UTF_8));
+        byte[] dir_chunk = null;
+        if (indexOfResetTable > 0) {
+            // dir_chunk = Arrays.copyOfRange( data, indexOfResetTable,
+            // indexOfResetTable
+            // +
+            // chmDirListCont.getDirectoryListingEntryList().get(indexOfControlData).getLength());
+            dir_chunk = ChmCommons.copyOfRange(data, indexOfResetTable,
+                    indexOfResetTable
+                            + chmDirListCont.getDirectoryListingEntryList()
+                                    .get(indexOfControlData).getLength());
+        }
+
+        /* Creates and parses control block */
+        chmLzxcControlData = new ChmLzxcControlData();
+        chmLzxcControlData.parse(dir_chunk, chmLzxcControlData);
+
+    }
+
+    @Test
+    public void testConstructorNotNull() {
+        assertNotNull(chmLzxcControlData);
+    }
+
+    @Test
+    public void testGetResetInterval() {
+        assertEquals(TestParameters.VP_RESET_INTERVAL,
+                chmLzxcControlData.getResetInterval());
+    }
+
+    @Test
+    public void testGetSize() {
+        assertEquals(TestParameters.VP_CONTROL_DATA_SIZE,
+                chmLzxcControlData.getSize());
+    }
+
+    @Test
+    public void testGetUnknown_18() {
+        assertEquals(TestParameters.VP_UNKNOWN_18,
+                chmLzxcControlData.getUnknown_18());
+    }
+
+    @Test
+    public void testGetVersion() {
+        assertEquals(TestParameters.VP_CONTROL_DATA_VERSION,
+                chmLzxcControlData.getVersion());
+    }
+
+    @Test
+    public void testGetWindowSize() {
+        assertEquals(TestParameters.VP_WINDOW_SIZE,
+                chmLzxcControlData.getWindowSize());
+    }
+
+    @Test
+    public void testGetWindowsPerReset() {
+        assertEquals(TestParameters.VP_WINDOWS_PER_RESET,
+                chmLzxcControlData.getWindowsPerReset());
+    }
+
+    @Test
+    public void testGetToString() {
+        assertTrue(chmLzxcControlData.toString().contains(
+                TestParameters.VP_CONTROL_DATA_SIGNATURE));
+    }
+
+    @Test
+    public void testGetSignature() {
+        assertEquals(
+                TestParameters.VP_CONTROL_DATA_SIGNATURE.getBytes(UTF_8).length,
+                chmLzxcControlData.getSignature().length);
+    }
+
+    @Test
+    public void testGetSignaure() {
+        assertEquals(
+                TestParameters.VP_CONTROL_DATA_SIGNATURE.getBytes(UTF_8).length,
+                chmLzxcControlData.getSignature().length);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcResetTable.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcResetTable.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcResetTable.java
index d84f702..79c2804 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcResetTable.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcResetTable.java
@@ -1,156 +1,156 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.chm;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
-import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
-import org.apache.tika.parser.chm.accessor.ChmItspHeader;
-import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
-import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
-import org.apache.tika.parser.chm.assertion.ChmAssert;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.junit.Before;
-import org.junit.Test;
-
-public class TestChmLzxcResetTable {
-    private ChmLzxcResetTable chmLzxcResetTable = null;
-
-    @Before
-    public void setUp() throws Exception {
-        byte[] data = TestParameters.chmData;
-        /* Creates and parses itsf header */
-        ChmItsfHeader chmItsfHeader = new ChmItsfHeader();
-        // chmItsfHeader.parse(Arrays.copyOfRange(data, 0,
-        // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
-        chmItsfHeader.parse(ChmCommons.copyOfRange(data, 0,
-                ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
-        /* Creates and parses itsp block */
-        ChmItspHeader chmItspHeader = new ChmItspHeader();
-        // chmItspHeader.parse(Arrays.copyOfRange( data, (int)
-        // chmItsfHeader.getDirOffset(),
-        // (int) chmItsfHeader.getDirOffset()
-        // + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
-        chmItspHeader.parse(ChmCommons.copyOfRange(data,
-                (int) chmItsfHeader.getDirOffset(),
-                (int) chmItsfHeader.getDirOffset()
-                        + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
-        /* Creating instance of ChmDirListingContainer */
-        ChmDirectoryListingSet chmDirListCont = new ChmDirectoryListingSet(
-                data, chmItsfHeader, chmItspHeader);
-        int indexOfControlData = chmDirListCont.getControlDataIndex();
-
-        int indexOfResetTable = ChmCommons.indexOfResetTableBlock(data,
-                ChmConstants.LZXC.getBytes(UTF_8));
-        byte[] dir_chunk = null;
-        if (indexOfResetTable > 0) {
-            // dir_chunk = Arrays.copyOfRange( data, indexOfResetTable,
-            // indexOfResetTable
-            // +
-            // chmDirListCont.getDirectoryListingEntryList().get(indexOfControlData).getLength());
-            dir_chunk = ChmCommons.copyOfRange(data, indexOfResetTable,
-                    indexOfResetTable
-                            + chmDirListCont.getDirectoryListingEntryList()
-                                    .get(indexOfControlData).getLength());
-        }
-
-        /* Creates and parses control block */
-        ChmLzxcControlData chmLzxcControlData = new ChmLzxcControlData();
-        chmLzxcControlData.parse(dir_chunk, chmLzxcControlData);
-
-        indexOfResetTable = chmDirListCont.getResetTableIndex();
-        chmLzxcResetTable = new ChmLzxcResetTable();
-
-        int startIndex = (int) chmDirListCont.getDataOffset()
-                + chmDirListCont.getDirectoryListingEntryList()
-                        .get(indexOfResetTable).getOffset();
-
-        ChmAssert.assertCopyingDataIndex(startIndex, data.length);
-
-        // dir_chunk = Arrays.copyOfRange(data, startIndex, startIndex
-        // +
-        // chmDirListCont.getDirectoryListingEntryList().get(indexOfResetTable).getLength());
-        dir_chunk = ChmCommons.copyOfRange(
-                data,
-                startIndex,
-                startIndex
-                        + chmDirListCont.getDirectoryListingEntryList()
-                                .get(indexOfResetTable).getLength());
-
-        chmLzxcResetTable.parse(dir_chunk, chmLzxcResetTable);
-    }
-
-    @Test
-    public void testGetBlockAddress() {
-        assertEquals(TestParameters.VP_RESET_TABLE_BA,
-                chmLzxcResetTable.getBlockAddress().length);
-    }
-
-    @Test
-    public void testGetBlockCount() {
-        assertEquals(TestParameters.VP_RESET_TABLE_BA,
-                chmLzxcResetTable.getBlockCount());
-    }
-
-    @Test
-    public void testGetBlockLen() {
-        assertEquals(TestParameters.VP_RES_TBL_BLOCK_LENGTH,
-                chmLzxcResetTable.getBlockLen());
-    }
-
-    @Test
-    public void testGetCompressedLen() {
-        assertEquals(TestParameters.VP_RES_TBL_COMPR_LENGTH,
-                chmLzxcResetTable.getCompressedLen());
-    }
-
-    @Test
-    public void testGetTableOffset() {
-        assertEquals(TestParameters.VP_TBL_OFFSET,
-                chmLzxcResetTable.getTableOffset());
-    }
-
-    @Test
-    public void testGetUncompressedLen() {
-        assertEquals(TestParameters.VP_RES_TBL_UNCOMP_LENGTH,
-                chmLzxcResetTable.getUncompressedLen());
-    }
-
-    @Test
-    public void testGetUnknown() {
-        assertEquals(TestParameters.VP_RES_TBL_UNKNOWN,
-                chmLzxcResetTable.getUnknown());
-    }
-
-    @Test
-    public void testGetVersion() {
-        assertEquals(TestParameters.VP_RES_TBL_VERSION,
-                chmLzxcResetTable.getVersion());
-    }
-
-    @Test
-    public void testToString() {
-        assertTrue(chmLzxcResetTable.toString().length() > 0);
-    }
-
-    // TODO: add setters to be tested
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.chm;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
+import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
+import org.apache.tika.parser.chm.accessor.ChmItspHeader;
+import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
+import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestChmLzxcResetTable {
+    private ChmLzxcResetTable chmLzxcResetTable = null;
+
+    @Before
+    public void setUp() throws Exception {
+        byte[] data = TestParameters.chmData;
+        /* Creates and parses itsf header */
+        ChmItsfHeader chmItsfHeader = new ChmItsfHeader();
+        // chmItsfHeader.parse(Arrays.copyOfRange(data, 0,
+        // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
+        chmItsfHeader.parse(ChmCommons.copyOfRange(data, 0,
+                ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
+        /* Creates and parses itsp block */
+        ChmItspHeader chmItspHeader = new ChmItspHeader();
+        // chmItspHeader.parse(Arrays.copyOfRange( data, (int)
+        // chmItsfHeader.getDirOffset(),
+        // (int) chmItsfHeader.getDirOffset()
+        // + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
+        chmItspHeader.parse(ChmCommons.copyOfRange(data,
+                (int) chmItsfHeader.getDirOffset(),
+                (int) chmItsfHeader.getDirOffset()
+                        + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
+        /* Creating instance of ChmDirListingContainer */
+        ChmDirectoryListingSet chmDirListCont = new ChmDirectoryListingSet(
+                data, chmItsfHeader, chmItspHeader);
+        int indexOfControlData = chmDirListCont.getControlDataIndex();
+
+        int indexOfResetTable = ChmCommons.indexOfResetTableBlock(data,
+                ChmConstants.LZXC.getBytes(UTF_8));
+        byte[] dir_chunk = null;
+        if (indexOfResetTable > 0) {
+            // dir_chunk = Arrays.copyOfRange( data, indexOfResetTable,
+            // indexOfResetTable
+            // +
+            // chmDirListCont.getDirectoryListingEntryList().get(indexOfControlData).getLength());
+            dir_chunk = ChmCommons.copyOfRange(data, indexOfResetTable,
+                    indexOfResetTable
+                            + chmDirListCont.getDirectoryListingEntryList()
+                                    .get(indexOfControlData).getLength());
+        }
+
+        /* Creates and parses control block */
+        ChmLzxcControlData chmLzxcControlData = new ChmLzxcControlData();
+        chmLzxcControlData.parse(dir_chunk, chmLzxcControlData);
+
+        indexOfResetTable = chmDirListCont.getResetTableIndex();
+        chmLzxcResetTable = new ChmLzxcResetTable();
+
+        int startIndex = (int) chmDirListCont.getDataOffset()
+                + chmDirListCont.getDirectoryListingEntryList()
+                        .get(indexOfResetTable).getOffset();
+
+        ChmAssert.assertCopyingDataIndex(startIndex, data.length);
+
+        // dir_chunk = Arrays.copyOfRange(data, startIndex, startIndex
+        // +
+        // chmDirListCont.getDirectoryListingEntryList().get(indexOfResetTable).getLength());
+        dir_chunk = ChmCommons.copyOfRange(
+                data,
+                startIndex,
+                startIndex
+                        + chmDirListCont.getDirectoryListingEntryList()
+                                .get(indexOfResetTable).getLength());
+
+        chmLzxcResetTable.parse(dir_chunk, chmLzxcResetTable);
+    }
+
+    @Test
+    public void testGetBlockAddress() {
+        assertEquals(TestParameters.VP_RESET_TABLE_BA,
+                chmLzxcResetTable.getBlockAddress().length);
+    }
+
+    @Test
+    public void testGetBlockCount() {
+        assertEquals(TestParameters.VP_RESET_TABLE_BA,
+                chmLzxcResetTable.getBlockCount());
+    }
+
+    @Test
+    public void testGetBlockLen() {
+        assertEquals(TestParameters.VP_RES_TBL_BLOCK_LENGTH,
+                chmLzxcResetTable.getBlockLen());
+    }
+
+    @Test
+    public void testGetCompressedLen() {
+        assertEquals(TestParameters.VP_RES_TBL_COMPR_LENGTH,
+                chmLzxcResetTable.getCompressedLen());
+    }
+
+    @Test
+    public void testGetTableOffset() {
+        assertEquals(TestParameters.VP_TBL_OFFSET,
+                chmLzxcResetTable.getTableOffset());
+    }
+
+    @Test
+    public void testGetUncompressedLen() {
+        assertEquals(TestParameters.VP_RES_TBL_UNCOMP_LENGTH,
+                chmLzxcResetTable.getUncompressedLen());
+    }
+
+    @Test
+    public void testGetUnknown() {
+        assertEquals(TestParameters.VP_RES_TBL_UNKNOWN,
+                chmLzxcResetTable.getUnknown());
+    }
+
+    @Test
+    public void testGetVersion() {
+        assertEquals(TestParameters.VP_RES_TBL_VERSION,
+                chmLzxcResetTable.getVersion());
+    }
+
+    @Test
+    public void testToString() {
+        assertTrue(chmLzxcResetTable.toString().length() > 0);
+    }
+
+    // TODO: add setters to be tested
+}

[14/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
index d80842b..1b692bf 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
@@ -1,510 +1,510 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.rtf;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-
-import java.io.InputStream;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.commons.io.FilenameUtils;
-import org.apache.tika.Tika;
-import org.apache.tika.TikaTest;
-import org.apache.tika.extractor.ContainerExtractor;
-import org.apache.tika.extractor.ParserContainerExtractor;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLCore;
-import org.apache.tika.metadata.RTFMetadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.RecursiveParserWrapper;
-import org.apache.tika.sax.BasicContentHandlerFactory;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-/**
- * Junit test class for the Tika {@link RTFParser}
- */
-public class RTFParserTest extends TikaTest {
-
-    private Tika tika = new Tika();
-
-    @Test
-    public void testBasicExtraction() throws Exception {
-
-        XMLResult r = getXML("testRTF.rtf");
-        assertEquals("application/rtf", r.metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals(1, r.metadata.getValues(Metadata.CONTENT_TYPE).length);
-        assertContains("Test", r.xml);
-        assertContains("indexation Word", r.xml);
-    }
-
-    @Test
-    public void testUmlautSpacesExtraction2() throws Exception {
-        assertContains("<p>\u00DCbersicht</p>",
-                getXML("testRTFUmlautSpaces2.rtf").xml);
-    }
-
-    @Test
-    public void testUnicodeUCNControlWordCharacterDoublingExtraction() throws Exception {
-        XMLResult r = getXML("testRTFUnicodeUCNControlWordCharacterDoubling.rtf");
-
-        assertContains("\u5E74", r.xml);
-        assertContains("\u5ff5", r.xml);
-        assertContains("0 ", r.xml);
-        assertContains("abc", r.xml);
-        assertNotContained("\u5E74\u5E74", r.xml);
-    }
-
-    @Test
-    public void testHexEscapeInsideWord() throws Exception {
-        XMLResult r = getXML("testRTFHexEscapeInsideWord.rtf");
-        assertContains("ESP\u00cdRITO", r.xml);
-    }
-
-    @Test
-    public void testWindowsCodepage1250() throws Exception {
-        XMLResult r = getXML("testRTFWindowsCodepage1250.rtf");
-        assertContains("za\u017c\u00f3\u0142\u0107 g\u0119\u015bl\u0105 ja\u017a\u0144", r.xml);
-        assertContains("ZA\u017b\u00d3\u0141\u0106 G\u0118\u015aL\u0104 JA\u0179\u0143", r.xml);
-    }
-
-    @Test
-    public void testTableCellSeparation() throws Exception {
-        String content = getXML("testRTFTableCellSeparation.rtf").xml;
-        content = content.replaceAll("(\\s|<\\/?p>)+", " ");
-        assertContains("a b c d \u00E4 \u00EB \u00F6 \u00FC", content);
-    }
-
-    @Test
-    public void testTableCellSeparation2() throws Exception {
-        String content = getXML("testRTFTableCellSeparation2.rtf").xml.replaceAll("\\s+", " ");
-        // TODO: why do we insert extra whitespace...?
-        assertContains("Station</p> <p>Fax", content);
-    }
-
-    @Test
-    public void testWordPadCzechCharactersExtraction() throws Exception {
-        XMLResult r = getXML("testRTFWordPadCzechCharacters.rtf");
-        assertContains("\u010Cl\u00E1nek t\u00FDdne", r.xml);
-        assertContains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty", r.xml);
-    }
-
-    @Test
-    public void testWord2010CzechCharactersExtraction() throws Exception {
-        XMLResult r = getXML("testRTFWord2010CzechCharacters.rtf");
-        assertContains("\u010Cl\u00E1nek t\u00FDdne", r.xml);
-        assertContains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty", r.xml);
-    }
-
-    @Test
-    public void testMS932Extraction() throws Exception {
-        XMLResult r = getXML("testRTF-ms932.rtf");
-        // Hello in Japanese
-        assertContains("\u3053\u3093\u306b\u3061\u306f", r.xml);
-
-        // Verify title, since it was also encoded with MS932:
-        r = getXML("testRTF-ms932.rtf");
-        assertEquals("\u30bf\u30a4\u30c8\u30eb", r.metadata.get(TikaCoreProperties.TITLE));
-    }
-
-    @Test
-    public void testUmlautSpacesExtraction() throws Exception {
-        XMLResult r = getXML("testRTFUmlautSpaces.rtf");
-        assertContains("\u00DCbersicht", r.xml);
-    }
-
-    @Test
-    public void testGothic() throws Exception {
-        XMLResult r = getXML("testRTFUnicodeGothic.rtf");
-        assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", r.xml);
-    }
-
-    @Test
-    public void testJapaneseText() throws Exception {
-        XMLResult r = getXML("testRTFJapanese.rtf");
-
-        // Verify title -- this title uses upr escape inside
-        // title info field:
-        assertEquals("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f\u3000",
-                r.metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("VMazel", r.metadata.get(TikaCoreProperties.CREATOR));
-        assertEquals("VMazel", r.metadata.get(Metadata.AUTHOR));
-        assertEquals("StarWriter", r.metadata.get(TikaCoreProperties.COMMENTS));
-
-        // Special version of (GHQ)
-        assertContains("\uff08\uff27\uff28\uff31\uff09", r.xml);
-
-        // 6 other characters
-        assertContains("\u6771\u4eac\u90fd\u4e09\u9df9\u5e02", r.xml);
-    }
-
-    @Test
-    public void testMaxLength() throws Exception {
-        Metadata metadata = new Metadata();
-        InputStream stream = TikaInputStream.get(
-                getTestDocumentAsStream("testRTFJapanese.rtf"));
-
-        // Test w/ default limit:
-        Tika localTika = new Tika();
-        String content = localTika.parseToString(stream, metadata);
-        // parseToString closes for convenience:
-        //stream.close();
-        assertTrue(content.length() > 500);
-
-        // Test setting max length on the instance:
-        localTika.setMaxStringLength(200);
-        stream = TikaInputStream.get(getTestDocumentAsStream("testRTFJapanese.rtf"));
-        content = localTika.parseToString(stream, metadata);
-
-        // parseToString closes for convenience:
-        //stream.close();
-        assertTrue(content.length() <= 200);
-
-        // Test setting max length per-call:
-        stream = TikaInputStream.get(getTestDocumentAsStream("testRTFJapanese.rtf"));
-        content = localTika.parseToString(stream, metadata, 100);
-        // parseToString closes for convenience:
-        //stream.close();
-        assertTrue(content.length() <= 100);
-    }
-
-    @Test
-    public void testTextWithCurlyBraces() throws Exception {
-        XMLResult r = getXML("testRTFWithCurlyBraces.rtf");
-        assertContains("{ some text inside curly brackets }", r.xml);
-    }
-
-    @Test
-    public void testControls() throws Exception {
-        XMLResult r = getXML("testRTFControls.rtf");
-        String content = r.xml;
-        assertContains("Thiswordhasanem\u2014dash", content);
-        assertContains("Thiswordhasanen\u2013dash", content);
-        assertContains("Thiswordhasanon\u2011breakinghyphen", content);
-        assertContains("Thiswordhasanonbreaking\u00a0space", content);
-        assertContains("Thiswordhasanoptional\u00adhyphen", content);
-        assertContains("\u2018Single quoted text\u2019", content);
-        assertContains("\u201cDouble quoted text\u201d", content);
-        assertContains("\u201cDouble quoted text again\u201d", content);
-    }
-
-    @Test
-    public void testInvalidUnicode() throws Exception {
-        XMLResult r = getXML("testRTFInvalidUnicode.rtf");
-        String content = r.xml;
-        assertContains("Unpaired hi \ufffd here", content);
-        assertContains("Unpaired lo \ufffd here", content);
-        assertContains("Mismatched pair \ufffd\ufffd here", content);
-    }
-
-    @Test
-    public void testVarious() throws Exception {
-        XMLResult r = getXML("testRTFVarious.rtf");
-        String content = r.xml;
-        assertContains("Footnote appears here", content);
-        assertContains("This is a footnote.", content);
-        assertContains("This is the header text.", content);
-        assertContains("This is the footer text.", content);
-        assertContains("Here is a text box", content);
-        assertContains("Bold", content);
-        assertContains("italic", content);
-        assertContains("underline", content);
-        assertContains("superscript", content);
-        assertContains("subscript", content);
-        assertContains("Here is a citation:", content);
-        assertContains("Figure 1 This is a caption for Figure 1", content);
-        assertContains("(Kramer)", content);
-
-        // Table
-        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("(\\s|<\\/?p>)+", " "));
-
-        // 2-columns
-        assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("(\\s|<\\/?p>)+", " "));
-        assertContains("This is a hyperlink", content);
-        assertContains("Here is a list:", content);
-        for (int row = 1; row <= 3; row++) {
-            assertContains("Bullet " + row, content);
-        }
-        assertContains("Here is a numbered list:", content);
-        for (int row = 1; row <= 3; row++) {
-            assertContains("Number bullet " + row, content);
-        }
-
-        for (int row = 1; row <= 2; row++) {
-            for (int col = 1; col <= 3; col++) {
-                assertContains("Row " + row + " Col " + col, content);
-            }
-        }
-
-        assertContains("Keyword1 Keyword2", content);
-        assertEquals("Keyword1 Keyword2",
-                r.metadata.get(TikaCoreProperties.KEYWORDS));
-
-        assertContains("Subject is here", content);
-        assertEquals("Subject is here",
-                r.metadata.get(OfficeOpenXMLCore.SUBJECT));
-        assertEquals("Subject is here",
-                r.metadata.get(Metadata.SUBJECT));
-
-        assertContains("Suddenly some Japanese text:", content);
-        // Special version of (GHQ)
-        assertContains("\uff08\uff27\uff28\uff31\uff09", content);
-        // 6 other characters
-        assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content);
-
-        assertContains("And then some Gothic text:", content);
-        assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
-    }
-
-    @Test
-    public void testVariousStyle() throws Exception {
-        String content = getXML("testRTFVarious.rtf").xml;
-        assertContains("<b>Bold</b>", content);
-        assertContains("<i>italic</i>", content);
-    }
-
-    @Test
-    public void testBoldItalic() throws Exception {
-        String content = getXML("testRTFBoldItalic.rtf").xml;
-        assertContains("<b>bold</b>", content);
-        assertContains("<b>bold </b><b><i>italic</i></b>", content);
-        assertContains("<b><i>italic </i></b><b>bold</b>", content);
-        assertContains("<i>italic</i>", content);
-        assertContains("<b>bold then </b><b><i>italic then</i></b><i> not bold</i>", content);
-        assertContains("<i>italic then </i><b><i>bold then</i></b><b> not italic</b>", content);
-    }
-
-    @Test
-    public void testHyperlink() throws Exception {
-        String content = getXML("testRTFHyperlink.rtf").xml;
-        assertContains("our most <a href=\"http://r.office.microsoft.com/r/rlidwelcomeFAQ?clid=1033\">frequently asked questions</a>", content);
-        assertEquals(-1, content.indexOf("<p>\t\t</p>"));
-    }
-
-    @Test
-    public void testIgnoredControlWord() throws Exception {
-        assertContains("<p>The quick brown fox jumps over the lazy dog</p>", getXML("testRTFIgnoredControlWord.rtf").xml);
-    }
-
-    @Test
-    public void testFontAfterBufferedText() throws Exception {
-        assertContains("\u0423\u0432\u0430\u0436\u0430\u0435\u043c\u044b\u0439 \u043a\u043b\u0438\u0435\u043d\u0442!",
-                getXML("testFontAfterBufferedText.rtf").xml);
-    }
-
-    @Test
-    public void testListMicrosoftWord() throws Exception {
-        String content = getXML("testRTFListMicrosoftWord.rtf").xml;
-        assertContains("<ol>\t<li>one</li>", content);
-        assertContains("</ol>", content);
-        assertContains("<ul>\t<li>first</li>", content);
-        assertContains("</ul>", content);
-    }
-
-    @Test
-    public void testListLibreOffice() throws Exception {
-        String content = getXML("testRTFListLibreOffice.rtf").xml;
-        assertContains("<ol>\t<li>one</li>", content);
-        assertContains("</ol>", content);
-        assertContains("<ul>\t<li>first</li>", content);
-        assertContains("</ul>", content);
-    }
-
-    // TIKA-782
-    @Test
-    public void testBinControlWord() throws Exception {
-        ByteCopyingHandler embHandler = new ByteCopyingHandler();
-        try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testBinControlWord.rtf"))) {
-            ContainerExtractor ex = new ParserContainerExtractor();
-            assertEquals(true, ex.isSupported(tis));
-            ex.extract(tis, ex, embHandler);
-        }
-        assertEquals(1, embHandler.bytes.size());
-
-        byte[] bytes = embHandler.bytes.get(0);
-        assertEquals(10, bytes.length);
-        //}
-        assertEquals(125, (int) bytes[4]);
-        //make sure that at least the last value is correct
-        assertEquals(-1, (int) bytes[9]);
-    }
-
-    // TIKA-999
-    @Test
-    public void testMetaDataCounts() throws Exception {
-        XMLResult xml = getXML("test_embedded_package.rtf");
-        assertEquals("1", xml.metadata.get(Office.PAGE_COUNT));
-        assertEquals("7", xml.metadata.get(Office.WORD_COUNT));
-        assertEquals("36", xml.metadata.get(Office.CHARACTER_COUNT));
-        assertTrue(xml.metadata.get(Office.CREATION_DATE).startsWith("2012-09-02T"));
-    }
-
-    // TIKA-1192
-    @Test
-    public void testListOverride() throws Exception {
-        assertContains("Body", getXML("testRTFListOverride.rtf").xml);
-    }
-
-    // TIKA-1305
-    @Test
-    public void testCorruptListOverride() throws Exception {
-        assertContains("apple", getXML("testRTFCorruptListOverride.rtf").xml);
-    }
-
-    // TIKA-1010
-    @Test
-    public void testEmbeddedMonster() throws Exception {
-
-        Map<Integer, Pair> expected = new HashMap<>();
-        expected.put(2, new Pair("Hw.txt","text/plain; charset=ISO-8859-1"));
-        expected.put(3, new Pair("file_0.doc", "application/msword"));
-        expected.put(6, new Pair("file_1.xlsx",
-                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
-        expected.put(9, new Pair("text.html", "text/html; charset=windows-1252"));
-        expected.put(10, new Pair("html-within-zip.zip", "application/zip"));
-        expected.put(11, new Pair("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip", "application/zip"));
-        expected.put(14, new Pair("testHTML_utf8_\u666E\u6797\u65AF\u987F.html", "text/html; charset=UTF-8"));
-        expected.put(17, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg"));
-        expected.put(20, new Pair("file_2.xls", "application/vnd.ms-excel"));
-        expected.put(23, new Pair("testMSG_\u666E\u6797\u65AF\u987F.msg", "application/vnd.ms-outlook"));
-        expected.put(26, new Pair("file_3.pdf", "application/pdf"));
-        expected.put(29, new Pair("file_4.ppt", "application/vnd.ms-powerpoint"));
-        expected.put(33, new Pair("file_5.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"));
-        expected.put(32, new Pair("thumbnail.jpeg", "image/jpeg"));
-        expected.put(36, new Pair("file_6.doc", "application/msword"));
-        expected.put(39, new Pair("file_7.doc", "application/msword"));
-        expected.put(42, new Pair("file_8.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
-        expected.put(45, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg"));
-
-
-        List<Metadata> metadataList = getRecursiveJson("testRTFEmbeddedFiles.rtf");
-        assertEquals(48, metadataList.size());
-        for (Map.Entry<Integer, Pair> e : expected.entrySet()) {
-            Metadata metadata = metadataList.get(e.getKey());
-            Pair p = e.getValue();
-            assertNotNull(metadata.get(Metadata.RESOURCE_NAME_KEY));
-            //necessary to getName() because MSOffice extractor includes
-            //directory: _1457338524/HW.txt
-            assertEquals("filename equals ",
-                    p.fileName, FilenameUtils.getName(
-                            metadata.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)));
-
-            assertEquals(p.mimeType, metadata.get(Metadata.CONTENT_TYPE));
-        }
-        assertEquals("C:\\Users\\tallison\\AppData\\Local\\Temp\\testJPEG_\u666e\u6797\u65af\u987f.jpg",
-                metadataList.get(45).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
-    }
-
-    //TIKA-1010 test regular (not "embedded") images/picts
-    @Test
-    public void testRegularImages() throws Exception {
-        Parser base = new AutoDetectParser();
-        ParseContext ctx = new ParseContext();
-        RecursiveParserWrapper parser = new RecursiveParserWrapper(base,
-                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
-        ctx.set(org.apache.tika.parser.Parser.class, parser);
-        ContentHandler handler = new BodyContentHandler();
-        Metadata rootMetadata = new Metadata();
-        rootMetadata.add(Metadata.RESOURCE_NAME_KEY, "testRTFRegularImages.rtf");
-        try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFRegularImages.rtf"))) {
-            parser.parse(tis, handler, rootMetadata, ctx);
-        }
-        List<Metadata> metadatas = parser.getMetadata();
-
-        Metadata meta_jpg_exif = metadatas.get(1);//("testJPEG_EXIF_\u666E\u6797\u65AF\u987F.jpg");
-        Metadata meta_jpg = metadatas.get(3);//("testJPEG_\u666E\u6797\u65AF\u987F.jpg");
-
-        assertTrue(meta_jpg_exif != null);
-        assertTrue(meta_jpg != null);
-        // had to comment these out (when moving from 1.x to 2.x
-        // because AutoDetectParser within this module does not include image parsing.
-
-//        assertTrue(Arrays.asList(meta_jpg_exif.getValues("dc:subject")).contains("serbor"));
-//        assertTrue(meta_jpg.get("Comments").contains("Licensed to the Apache"));
-        //make sure old metadata doesn't linger between objects
-//        assertFalse(Arrays.asList(meta_jpg.getValues("dc:subject")).contains("serbor"));
-        assertEquals("false", meta_jpg.get(RTFMetadata.THUMBNAIL));
-        assertEquals("false", meta_jpg_exif.get(RTFMetadata.THUMBNAIL));
-
-        assertEquals(25, meta_jpg.names().length);
-        assertEquals(25, meta_jpg_exif.names().length);
-    }
-
-    @Test
-    public void testMultipleNewlines() throws Exception {
-        String content = getXML("testRTFNewlines.rtf").xml;
-        content = content.replaceAll("[\r\n]+", " ");
-        assertContains("<body><p>one</p> " +
-                "<p /> " +
-                "<p>two</p> " +
-                "<p /> " +
-                "<p /> " +
-                "<p>three</p> " +
-                "<p /> " +
-                "<p /> " +
-                "<p /> " +
-                "<p>four</p>", content);
-    }
-
-    //TIKA-1010 test linked embedded doc
-    @Test
-    public void testEmbeddedLinkedDocument() throws Exception {
-        Set<MediaType> skipTypes = new HashSet<MediaType>();
-        skipTypes.add(MediaType.parse("application/x-emf"));
-        skipTypes.add(MediaType.parse("application/x-msmetafile"));
-
-        TrackingHandler tracker = new TrackingHandler(skipTypes);
-        try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) {
-            ContainerExtractor ex = new ParserContainerExtractor();
-            assertEquals(true, ex.isSupported(tis));
-            ex.extract(tis, ex, tracker);
-        }
-        //should gracefully skip link and not throw NPE, IOEx, etc
-        assertEquals(0, tracker.filenames.size());
-
-        tracker = new TrackingHandler();
-        try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) {
-            ContainerExtractor ex = new ParserContainerExtractor();
-            assertEquals(true, ex.isSupported(tis));
-            ex.extract(tis, ex, tracker);
-        }
-        //should gracefully skip link and not throw NPE, IOEx, etc
-        assertEquals(2, tracker.filenames.size());
-    }
-
-    private static class Pair {
-        final String fileName;
-        final String mimeType;
-        Pair(String fileName, String mimeType) {
-            this.fileName = fileName;
-            this.mimeType = mimeType;
-        }
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.rtf;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.io.FilenameUtils;
+import org.apache.tika.Tika;
+import org.apache.tika.TikaTest;
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.RTFMetadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Junit test class for the Tika {@link RTFParser}
+ */
+public class RTFParserTest extends TikaTest {
+
+    private Tika tika = new Tika();
+
+    @Test
+    public void testBasicExtraction() throws Exception {
+
+        XMLResult r = getXML("testRTF.rtf");
+        assertEquals("application/rtf", r.metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(1, r.metadata.getValues(Metadata.CONTENT_TYPE).length);
+        assertContains("Test", r.xml);
+        assertContains("indexation Word", r.xml);
+    }
+
+    @Test
+    public void testUmlautSpacesExtraction2() throws Exception {
+        assertContains("<p>\u00DCbersicht</p>",
+                getXML("testRTFUmlautSpaces2.rtf").xml);
+    }
+
+    @Test
+    public void testUnicodeUCNControlWordCharacterDoublingExtraction() throws Exception {
+        XMLResult r = getXML("testRTFUnicodeUCNControlWordCharacterDoubling.rtf");
+
+        assertContains("\u5E74", r.xml);
+        assertContains("\u5ff5", r.xml);
+        assertContains("0 ", r.xml);
+        assertContains("abc", r.xml);
+        assertNotContained("\u5E74\u5E74", r.xml);
+    }
+
+    @Test
+    public void testHexEscapeInsideWord() throws Exception {
+        XMLResult r = getXML("testRTFHexEscapeInsideWord.rtf");
+        assertContains("ESP\u00cdRITO", r.xml);
+    }
+
+    @Test
+    public void testWindowsCodepage1250() throws Exception {
+        XMLResult r = getXML("testRTFWindowsCodepage1250.rtf");
+        assertContains("za\u017c\u00f3\u0142\u0107 g\u0119\u015bl\u0105 ja\u017a\u0144", r.xml);
+        assertContains("ZA\u017b\u00d3\u0141\u0106 G\u0118\u015aL\u0104 JA\u0179\u0143", r.xml);
+    }
+
+    @Test
+    public void testTableCellSeparation() throws Exception {
+        String content = getXML("testRTFTableCellSeparation.rtf").xml;
+        content = content.replaceAll("(\\s|<\\/?p>)+", " ");
+        assertContains("a b c d \u00E4 \u00EB \u00F6 \u00FC", content);
+    }
+
+    @Test
+    public void testTableCellSeparation2() throws Exception {
+        String content = getXML("testRTFTableCellSeparation2.rtf").xml.replaceAll("\\s+", " ");
+        // TODO: why do we insert extra whitespace...?
+        assertContains("Station</p> <p>Fax", content);
+    }
+
+    @Test
+    public void testWordPadCzechCharactersExtraction() throws Exception {
+        XMLResult r = getXML("testRTFWordPadCzechCharacters.rtf");
+        assertContains("\u010Cl\u00E1nek t\u00FDdne", r.xml);
+        assertContains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty", r.xml);
+    }
+
+    @Test
+    public void testWord2010CzechCharactersExtraction() throws Exception {
+        XMLResult r = getXML("testRTFWord2010CzechCharacters.rtf");
+        assertContains("\u010Cl\u00E1nek t\u00FDdne", r.xml);
+        assertContains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty", r.xml);
+    }
+
+    @Test
+    public void testMS932Extraction() throws Exception {
+        XMLResult r = getXML("testRTF-ms932.rtf");
+        // Hello in Japanese
+        assertContains("\u3053\u3093\u306b\u3061\u306f", r.xml);
+
+        // Verify title, since it was also encoded with MS932:
+        r = getXML("testRTF-ms932.rtf");
+        assertEquals("\u30bf\u30a4\u30c8\u30eb", r.metadata.get(TikaCoreProperties.TITLE));
+    }
+
+    @Test
+    public void testUmlautSpacesExtraction() throws Exception {
+        XMLResult r = getXML("testRTFUmlautSpaces.rtf");
+        assertContains("\u00DCbersicht", r.xml);
+    }
+
+    @Test
+    public void testGothic() throws Exception {
+        XMLResult r = getXML("testRTFUnicodeGothic.rtf");
+        assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", r.xml);
+    }
+
+    @Test
+    public void testJapaneseText() throws Exception {
+        XMLResult r = getXML("testRTFJapanese.rtf");
+
+        // Verify title -- this title uses upr escape inside
+        // title info field:
+        assertEquals("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f\u3000",
+                r.metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("VMazel", r.metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("VMazel", r.metadata.get(Metadata.AUTHOR));
+        assertEquals("StarWriter", r.metadata.get(TikaCoreProperties.COMMENTS));
+
+        // Special version of (GHQ)
+        assertContains("\uff08\uff27\uff28\uff31\uff09", r.xml);
+
+        // 6 other characters
+        assertContains("\u6771\u4eac\u90fd\u4e09\u9df9\u5e02", r.xml);
+    }
+
+    @Test
+    public void testMaxLength() throws Exception {
+        Metadata metadata = new Metadata();
+        InputStream stream = TikaInputStream.get(
+                getTestDocumentAsStream("testRTFJapanese.rtf"));
+
+        // Test w/ default limit:
+        Tika localTika = new Tika();
+        String content = localTika.parseToString(stream, metadata);
+        // parseToString closes for convenience:
+        //stream.close();
+        assertTrue(content.length() > 500);
+
+        // Test setting max length on the instance:
+        localTika.setMaxStringLength(200);
+        stream = TikaInputStream.get(getTestDocumentAsStream("testRTFJapanese.rtf"));
+        content = localTika.parseToString(stream, metadata);
+
+        // parseToString closes for convenience:
+        //stream.close();
+        assertTrue(content.length() <= 200);
+
+        // Test setting max length per-call:
+        stream = TikaInputStream.get(getTestDocumentAsStream("testRTFJapanese.rtf"));
+        content = localTika.parseToString(stream, metadata, 100);
+        // parseToString closes for convenience:
+        //stream.close();
+        assertTrue(content.length() <= 100);
+    }
+
+    @Test
+    public void testTextWithCurlyBraces() throws Exception {
+        XMLResult r = getXML("testRTFWithCurlyBraces.rtf");
+        assertContains("{ some text inside curly brackets }", r.xml);
+    }
+
+    @Test
+    public void testControls() throws Exception {
+        XMLResult r = getXML("testRTFControls.rtf");
+        String content = r.xml;
+        assertContains("Thiswordhasanem\u2014dash", content);
+        assertContains("Thiswordhasanen\u2013dash", content);
+        assertContains("Thiswordhasanon\u2011breakinghyphen", content);
+        assertContains("Thiswordhasanonbreaking\u00a0space", content);
+        assertContains("Thiswordhasanoptional\u00adhyphen", content);
+        assertContains("\u2018Single quoted text\u2019", content);
+        assertContains("\u201cDouble quoted text\u201d", content);
+        assertContains("\u201cDouble quoted text again\u201d", content);
+    }
+
+    @Test
+    public void testInvalidUnicode() throws Exception {
+        XMLResult r = getXML("testRTFInvalidUnicode.rtf");
+        String content = r.xml;
+        assertContains("Unpaired hi \ufffd here", content);
+        assertContains("Unpaired lo \ufffd here", content);
+        assertContains("Mismatched pair \ufffd\ufffd here", content);
+    }
+
+    @Test
+    public void testVarious() throws Exception {
+        XMLResult r = getXML("testRTFVarious.rtf");
+        String content = r.xml;
+        assertContains("Footnote appears here", content);
+        assertContains("This is a footnote.", content);
+        assertContains("This is the header text.", content);
+        assertContains("This is the footer text.", content);
+        assertContains("Here is a text box", content);
+        assertContains("Bold", content);
+        assertContains("italic", content);
+        assertContains("underline", content);
+        assertContains("superscript", content);
+        assertContains("subscript", content);
+        assertContains("Here is a citation:", content);
+        assertContains("Figure 1 This is a caption for Figure 1", content);
+        assertContains("(Kramer)", content);
+
+        // Table
+        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("(\\s|<\\/?p>)+", " "));
+
+        // 2-columns
+        assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("(\\s|<\\/?p>)+", " "));
+        assertContains("This is a hyperlink", content);
+        assertContains("Here is a list:", content);
+        for (int row = 1; row <= 3; row++) {
+            assertContains("Bullet " + row, content);
+        }
+        assertContains("Here is a numbered list:", content);
+        for (int row = 1; row <= 3; row++) {
+            assertContains("Number bullet " + row, content);
+        }
+
+        for (int row = 1; row <= 2; row++) {
+            for (int col = 1; col <= 3; col++) {
+                assertContains("Row " + row + " Col " + col, content);
+            }
+        }
+
+        assertContains("Keyword1 Keyword2", content);
+        assertEquals("Keyword1 Keyword2",
+                r.metadata.get(TikaCoreProperties.KEYWORDS));
+
+        assertContains("Subject is here", content);
+        assertEquals("Subject is here",
+                r.metadata.get(OfficeOpenXMLCore.SUBJECT));
+        assertEquals("Subject is here",
+                r.metadata.get(Metadata.SUBJECT));
+
+        assertContains("Suddenly some Japanese text:", content);
+        // Special version of (GHQ)
+        assertContains("\uff08\uff27\uff28\uff31\uff09", content);
+        // 6 other characters
+        assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content);
+
+        assertContains("And then some Gothic text:", content);
+        assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
+    }
+
+    @Test
+    public void testVariousStyle() throws Exception {
+        String content = getXML("testRTFVarious.rtf").xml;
+        assertContains("<b>Bold</b>", content);
+        assertContains("<i>italic</i>", content);
+    }
+
+    @Test
+    public void testBoldItalic() throws Exception {
+        String content = getXML("testRTFBoldItalic.rtf").xml;
+        assertContains("<b>bold</b>", content);
+        assertContains("<b>bold </b><b><i>italic</i></b>", content);
+        assertContains("<b><i>italic </i></b><b>bold</b>", content);
+        assertContains("<i>italic</i>", content);
+        assertContains("<b>bold then </b><b><i>italic then</i></b><i> not bold</i>", content);
+        assertContains("<i>italic then </i><b><i>bold then</i></b><b> not italic</b>", content);
+    }
+
+    @Test
+    public void testHyperlink() throws Exception {
+        String content = getXML("testRTFHyperlink.rtf").xml;
+        assertContains("our most <a href=\"http://r.office.microsoft.com/r/rlidwelcomeFAQ?clid=1033\">frequently asked questions</a>", content);
+        assertEquals(-1, content.indexOf("<p>\t\t</p>"));
+    }
+
+    @Test
+    public void testIgnoredControlWord() throws Exception {
+        assertContains("<p>The quick brown fox jumps over the lazy dog</p>", getXML("testRTFIgnoredControlWord.rtf").xml);
+    }
+
+    @Test
+    public void testFontAfterBufferedText() throws Exception {
+        assertContains("\u0423\u0432\u0430\u0436\u0430\u0435\u043c\u044b\u0439 \u043a\u043b\u0438\u0435\u043d\u0442!",
+                getXML("testFontAfterBufferedText.rtf").xml);
+    }
+
+    @Test
+    public void testListMicrosoftWord() throws Exception {
+        String content = getXML("testRTFListMicrosoftWord.rtf").xml;
+        assertContains("<ol>\t<li>one</li>", content);
+        assertContains("</ol>", content);
+        assertContains("<ul>\t<li>first</li>", content);
+        assertContains("</ul>", content);
+    }
+
+    @Test
+    public void testListLibreOffice() throws Exception {
+        String content = getXML("testRTFListLibreOffice.rtf").xml;
+        assertContains("<ol>\t<li>one</li>", content);
+        assertContains("</ol>", content);
+        assertContains("<ul>\t<li>first</li>", content);
+        assertContains("</ul>", content);
+    }
+
+    // TIKA-782
+    @Test
+    public void testBinControlWord() throws Exception {
+        ByteCopyingHandler embHandler = new ByteCopyingHandler();
+        try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testBinControlWord.rtf"))) {
+            ContainerExtractor ex = new ParserContainerExtractor();
+            assertEquals(true, ex.isSupported(tis));
+            ex.extract(tis, ex, embHandler);
+        }
+        assertEquals(1, embHandler.bytes.size());
+
+        byte[] bytes = embHandler.bytes.get(0);
+        assertEquals(10, bytes.length);
+        //}
+        assertEquals(125, (int) bytes[4]);
+        //make sure that at least the last value is correct
+        assertEquals(-1, (int) bytes[9]);
+    }
+
+    // TIKA-999
+    @Test
+    public void testMetaDataCounts() throws Exception {
+        XMLResult xml = getXML("test_embedded_package.rtf");
+        assertEquals("1", xml.metadata.get(Office.PAGE_COUNT));
+        assertEquals("7", xml.metadata.get(Office.WORD_COUNT));
+        assertEquals("36", xml.metadata.get(Office.CHARACTER_COUNT));
+        assertTrue(xml.metadata.get(Office.CREATION_DATE).startsWith("2012-09-02T"));
+    }
+
+    // TIKA-1192
+    @Test
+    public void testListOverride() throws Exception {
+        assertContains("Body", getXML("testRTFListOverride.rtf").xml);
+    }
+
+    // TIKA-1305
+    @Test
+    public void testCorruptListOverride() throws Exception {
+        assertContains("apple", getXML("testRTFCorruptListOverride.rtf").xml);
+    }
+
+    // TIKA-1010
+    @Test
+    public void testEmbeddedMonster() throws Exception {
+
+        Map<Integer, Pair> expected = new HashMap<>();
+        expected.put(2, new Pair("Hw.txt","text/plain; charset=ISO-8859-1"));
+        expected.put(3, new Pair("file_0.doc", "application/msword"));
+        expected.put(6, new Pair("file_1.xlsx",
+                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
+        expected.put(9, new Pair("text.html", "text/html; charset=windows-1252"));
+        expected.put(10, new Pair("html-within-zip.zip", "application/zip"));
+        expected.put(11, new Pair("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip", "application/zip"));
+        expected.put(14, new Pair("testHTML_utf8_\u666E\u6797\u65AF\u987F.html", "text/html; charset=UTF-8"));
+        expected.put(17, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg"));
+        expected.put(20, new Pair("file_2.xls", "application/vnd.ms-excel"));
+        expected.put(23, new Pair("testMSG_\u666E\u6797\u65AF\u987F.msg", "application/vnd.ms-outlook"));
+        expected.put(26, new Pair("file_3.pdf", "application/pdf"));
+        expected.put(29, new Pair("file_4.ppt", "application/vnd.ms-powerpoint"));
+        expected.put(33, new Pair("file_5.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"));
+        expected.put(32, new Pair("thumbnail.jpeg", "image/jpeg"));
+        expected.put(36, new Pair("file_6.doc", "application/msword"));
+        expected.put(39, new Pair("file_7.doc", "application/msword"));
+        expected.put(42, new Pair("file_8.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
+        expected.put(45, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg"));
+
+
+        List<Metadata> metadataList = getRecursiveJson("testRTFEmbeddedFiles.rtf");
+        assertEquals(48, metadataList.size());
+        for (Map.Entry<Integer, Pair> e : expected.entrySet()) {
+            Metadata metadata = metadataList.get(e.getKey());
+            Pair p = e.getValue();
+            assertNotNull(metadata.get(Metadata.RESOURCE_NAME_KEY));
+            //necessary to getName() because MSOffice extractor includes
+            //directory: _1457338524/HW.txt
+            assertEquals("filename equals ",
+                    p.fileName, FilenameUtils.getName(
+                            metadata.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)));
+
+            assertEquals(p.mimeType, metadata.get(Metadata.CONTENT_TYPE));
+        }
+        assertEquals("C:\\Users\\tallison\\AppData\\Local\\Temp\\testJPEG_\u666e\u6797\u65af\u987f.jpg",
+                metadataList.get(45).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
+    }
+
+    //TIKA-1010 test regular (not "embedded") images/picts
+    @Test
+    public void testRegularImages() throws Exception {
+        Parser base = new AutoDetectParser();
+        ParseContext ctx = new ParseContext();
+        RecursiveParserWrapper parser = new RecursiveParserWrapper(base,
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
+        ctx.set(org.apache.tika.parser.Parser.class, parser);
+        ContentHandler handler = new BodyContentHandler();
+        Metadata rootMetadata = new Metadata();
+        rootMetadata.add(Metadata.RESOURCE_NAME_KEY, "testRTFRegularImages.rtf");
+        try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFRegularImages.rtf"))) {
+            parser.parse(tis, handler, rootMetadata, ctx);
+        }
+        List<Metadata> metadatas = parser.getMetadata();
+
+        Metadata meta_jpg_exif = metadatas.get(1);//("testJPEG_EXIF_\u666E\u6797\u65AF\u987F.jpg");
+        Metadata meta_jpg = metadatas.get(3);//("testJPEG_\u666E\u6797\u65AF\u987F.jpg");
+
+        assertTrue(meta_jpg_exif != null);
+        assertTrue(meta_jpg != null);
+        // had to comment these out (when moving from 1.x to 2.x
+        // because AutoDetectParser within this module does not include image parsing.
+
+//        assertTrue(Arrays.asList(meta_jpg_exif.getValues("dc:subject")).contains("serbor"));
+//        assertTrue(meta_jpg.get("Comments").contains("Licensed to the Apache"));
+        //make sure old metadata doesn't linger between objects
+//        assertFalse(Arrays.asList(meta_jpg.getValues("dc:subject")).contains("serbor"));
+        assertEquals("false", meta_jpg.get(RTFMetadata.THUMBNAIL));
+        assertEquals("false", meta_jpg_exif.get(RTFMetadata.THUMBNAIL));
+
+        assertEquals(25, meta_jpg.names().length);
+        assertEquals(25, meta_jpg_exif.names().length);
+    }
+
+    @Test
+    public void testMultipleNewlines() throws Exception {
+        String content = getXML("testRTFNewlines.rtf").xml;
+        content = content.replaceAll("[\r\n]+", " ");
+        assertContains("<body><p>one</p> " +
+                "<p /> " +
+                "<p>two</p> " +
+                "<p /> " +
+                "<p /> " +
+                "<p>three</p> " +
+                "<p /> " +
+                "<p /> " +
+                "<p /> " +
+                "<p>four</p>", content);
+    }
+
+    //TIKA-1010 test linked embedded doc
+    @Test
+    public void testEmbeddedLinkedDocument() throws Exception {
+        Set<MediaType> skipTypes = new HashSet<MediaType>();
+        skipTypes.add(MediaType.parse("application/x-emf"));
+        skipTypes.add(MediaType.parse("application/x-msmetafile"));
+
+        TrackingHandler tracker = new TrackingHandler(skipTypes);
+        try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) {
+            ContainerExtractor ex = new ParserContainerExtractor();
+            assertEquals(true, ex.isSupported(tis));
+            ex.extract(tis, ex, tracker);
+        }
+        //should gracefully skip link and not throw NPE, IOEx, etc
+        assertEquals(0, tracker.filenames.size());
+
+        tracker = new TrackingHandler();
+        try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) {
+            ContainerExtractor ex = new ParserContainerExtractor();
+            assertEquals(true, ex.isSupported(tis));
+            ex.extract(tis, ex, tracker);
+        }
+        //should gracefully skip link and not throw NPE, IOEx, etc
+        assertEquals(2, tracker.filenames.size());
+    }
+
+    private static class Pair {
+        final String fileName;
+        final String mimeType;
+        Pair(String fileName, String mimeType) {
+            this.fileName = fileName;
+            this.mimeType = mimeType;
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/pom.xml b/tika-parser-modules/tika-parser-package-module/pom.xml
index 8d1238d..2feb22b 100644
--- a/tika-parser-modules/tika-parser-package-module/pom.xml
+++ b/tika-parser-modules/tika-parser-package-module/pom.xml
@@ -1,79 +1,79 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-modules</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-package-module</artifactId>
-  <name>Apache Tika parser package module</name>
-  <url>http://tika.apache.org/</url>
-
-  <properties>
-    <!-- NOTE: sync tukaani version with commons-compress -->
-    <tukaani.version>1.5</tukaani.version>
-  </properties>
-
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-core</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.tukaani</groupId>
-      <artifactId>xz</artifactId>
-      <version>${tukaani.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>com.github.junrar</groupId>
-      <artifactId>junrar</artifactId>
-      <version>0.7</version>
-    </dependency>
-    <dependency>
-      <groupId>commons-io</groupId>
-      <artifactId>commons-io</artifactId>
-      <version>${commons.io.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.commons</groupId>
-      <artifactId>commons-compress</artifactId>
-      <version>${commons.compress.version}</version>
-    </dependency>
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-modules</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-package-module</artifactId>
+  <name>Apache Tika parser package module</name>
+  <url>http://tika.apache.org/</url>
+
+  <properties>
+    <!-- NOTE: sync tukaani version with commons-compress -->
+    <tukaani.version>1.5</tukaani.version>
+  </properties>
+
+  <dependencies>
     <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-text-module</artifactId>
-      <version>${project.version}</version>
-      <scope>test</scope>
-    </dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.tukaani</groupId>
+      <artifactId>xz</artifactId>
+      <version>${tukaani.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>com.github.junrar</groupId>
+      <artifactId>junrar</artifactId>
+      <version>0.7</version>
+    </dependency>
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <version>${commons.io.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-compress</artifactId>
+      <version>${commons.compress.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-text-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
     <dependency>
-      <groupId>commons-codec</groupId>
-      <artifactId>commons-codec</artifactId>
-      <version>${codec.version}</version>
+      <groupId>commons-codec</groupId>
+      <artifactId>commons-codec</artifactId>
+      <version>${codec.version}</version>
     </dependency>
-  </dependencies>
-
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/module/pkg/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/module/pkg/internal/Activator.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/module/pkg/internal/Activator.java
index 0fb71fa..2345029 100644
--- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/module/pkg/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/module/pkg/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.pkg.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
-    @Override
-    public void start(BundleContext context) throws Exception {
-
-        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
-    }
-
-    @Override
-    public void stop(BundleContext context) throws Exception {
-
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.pkg.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+    @Override
+    public void start(BundleContext context) throws Exception {
+
+        registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+    }
+
+    @Override
+    public void stop(BundleContext context) throws Exception {
+
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java
index 32f0126..4143932 100644
--- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java
+++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java
@@ -1,112 +1,112 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.iwork;
-
-import java.util.Locale;
-
-/**
- * Utility class to allow for conversion from an integer to Roman numerals
- * or alpha-numeric symbols in line with Pages auto numbering formats.
- */
- class AutoPageNumberUtils {
-	
-	private static final String ALPHABET[] = { "A", "B", "C", "D", "E", "F", "G",
-		"H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T",
-		"U", "V", "W", "X", "Y", "Z" };
-	
-	private static final int MAX = 26; 
-
-	public static String asAlphaNumeric(int i) {
-		StringBuffer sbuff = new StringBuffer();
-		int index = i % MAX;
-		int ratio = i / MAX;
-		
-		if (index == 0) {
-			ratio--;
-			index = MAX;
-		}
-		
-		for(int j = 0; j <= ratio; j++) {
-			sbuff.append(ALPHABET[index - 1]);		}
-		return sbuff.toString();
-	}
-	
-	public static String asAlphaNumericLower(int i) {
-		return asAlphaNumeric(i).toLowerCase(Locale.ROOT);
-	}
-	
-	/*
-	 * Code copied from jena.apache.org.
-	 * @see com.hp.hpl.jena.sparql.util.RomanNumeral
-	 */
-    public static String asRomanNumerals(int i) {
-        if ( i <= 0 )
-            throw new NumberFormatException("Roman numerals are 1-3999 ("+i+")") ;
-        if ( i > 3999 )
-            throw new NumberFormatException("Roman numerals are 1-3999 ("+i+")") ;
-        StringBuffer sbuff = new StringBuffer() ;
-        
-        i = i2r(sbuff, i, "M", 1000, "CM", 900, "D", 500, "CD", 400 ) ;
-        i = i2r(sbuff, i, "C", 100,  "XC", 90,  "L", 50,  "XL", 40 ) ;
-        i = i2r(sbuff, i, "X", 10,   "IX", 9,   "V", 5,   "IV", 4) ;
-        
-        while ( i >= 1 )
-        {
-            sbuff.append("I") ;
-            i -= 1 ;
-        }
-        return sbuff.toString() ;
-            
-        
-    }
-    
-	public static String asRomanNumeralsLower(int i) {
-		return asRomanNumerals(i).toLowerCase(Locale.ROOT);
-	}
-    
-    private static int i2r(StringBuffer sbuff, int i,
-                           String tens,  int iTens, 
-                           String nines, int iNines,
-                           String fives, int iFives,
-                           String fours, int iFours)
-    {
-        while ( i >= iTens )
-        {
-            sbuff.append(tens) ;
-            i -= iTens ;
-        }
-        
-        if ( i >= iNines )
-        {
-            sbuff.append(nines) ;
-            i -= iNines;
-        }
-
-        if ( i >= iFives )
-        {
-            sbuff.append(fives) ;
-            i -= iFives ;
-        }
-        if ( i >= iFours )
-        {
-            sbuff.append(fours) ;
-            i -= iFours ;
-        }
-        return i ;
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import java.util.Locale;
+
+/**
+ * Utility class to allow for conversion from an integer to Roman numerals
+ * or alpha-numeric symbols in line with Pages auto numbering formats.
+ */
+ class AutoPageNumberUtils {
+	
+	private static final String ALPHABET[] = { "A", "B", "C", "D", "E", "F", "G",
+		"H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T",
+		"U", "V", "W", "X", "Y", "Z" };
+	
+	private static final int MAX = 26; 
+
+	public static String asAlphaNumeric(int i) {
+		StringBuffer sbuff = new StringBuffer();
+		int index = i % MAX;
+		int ratio = i / MAX;
+		
+		if (index == 0) {
+			ratio--;
+			index = MAX;
+		}
+		
+		for(int j = 0; j <= ratio; j++) {
+			sbuff.append(ALPHABET[index - 1]);		}
+		return sbuff.toString();
+	}
+	
+	public static String asAlphaNumericLower(int i) {
+		return asAlphaNumeric(i).toLowerCase(Locale.ROOT);
+	}
+	
+	/*
+	 * Code copied from jena.apache.org.
+	 * @see com.hp.hpl.jena.sparql.util.RomanNumeral
+	 */
+    public static String asRomanNumerals(int i) {
+        if ( i <= 0 )
+            throw new NumberFormatException("Roman numerals are 1-3999 ("+i+")") ;
+        if ( i > 3999 )
+            throw new NumberFormatException("Roman numerals are 1-3999 ("+i+")") ;
+        StringBuffer sbuff = new StringBuffer() ;
+        
+        i = i2r(sbuff, i, "M", 1000, "CM", 900, "D", 500, "CD", 400 ) ;
+        i = i2r(sbuff, i, "C", 100,  "XC", 90,  "L", 50,  "XL", 40 ) ;
+        i = i2r(sbuff, i, "X", 10,   "IX", 9,   "V", 5,   "IV", 4) ;
+        
+        while ( i >= 1 )
+        {
+            sbuff.append("I") ;
+            i -= 1 ;
+        }
+        return sbuff.toString() ;
+            
+        
+    }
+    
+	public static String asRomanNumeralsLower(int i) {
+		return asRomanNumerals(i).toLowerCase(Locale.ROOT);
+	}
+    
+    private static int i2r(StringBuffer sbuff, int i,
+                           String tens,  int iTens, 
+                           String nines, int iNines,
+                           String fives, int iFives,
+                           String fours, int iFours)
+    {
+        while ( i >= iTens )
+        {
+            sbuff.append(tens) ;
+            i -= iTens ;
+        }
+        
+        if ( i >= iNines )
+        {
+            sbuff.append(nines) ;
+            i -= iNines;
+        }
+
+        if ( i >= iFives )
+        {
+            sbuff.append(fives) ;
+            i -= iFives ;
+        }
+        if ( i >= iFours )
+        {
+            sbuff.append(fours) ;
+            i -= iFours ;
+        }
+        return i ;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
index 1861931..79d82e8 100644
--- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
+++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
@@ -1,219 +1,219 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.iwork;
-
-import java.io.BufferedInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Set;
-
-import javax.xml.namespace.QName;
-
-import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
-import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
-import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
-import org.apache.commons.compress.archivers.zip.ZipFile;
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.detect.XmlRootExtractor;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.OfflineContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * A parser for the IWork container files. This includes *.key, *.pages and *.numbers files.
- * This parser delegates the relevant entries to a {@link ContentHandler} that parsers the content.
- * 
- * Currently supported formats:
- * <ol>
- * <li>Keynote format version 2.x. Currently only tested with Keynote version 5.x
- * <li>Pages format version 1.x. Currently only tested with Pages version 4.0.x
- * <li>Numbers format version 1.x. Currently only tested with Numbers version 2.0.x
- * </ol>
- */
-public class IWorkPackageParser extends AbstractParser {
-
-    /** Serial version UID */
-    private static final long serialVersionUID = -2160322853809682372L;
-
-    /**
-     * Which files within an iWork file contain the actual content?
-     */
-    public final static Set<String> IWORK_CONTENT_ENTRIES = Collections.unmodifiableSet(
-            new HashSet<String>(Arrays.asList("index.apxl", "index.xml", "presentation.apxl"))
-    );
-    /**
-     * All iWork files contain one of these, so we can detect based on it
-     */
-    public final static String IWORK_COMMON_ENTRY = "buildVersionHistory.plist";
-    
-    public enum IWORKDocumentType {
-       KEYNOTE("http://developer.apple.com/namespaces/keynote2", "presentation", MediaType.application("vnd.apple.keynote")),
-       NUMBERS("http://developer.apple.com/namespaces/ls", "document", MediaType.application("vnd.apple.numbers")),
-       PAGES("http://developer.apple.com/namespaces/sl", "document", MediaType.application("vnd.apple.pages")),
-       ENCRYPTED(null, null, MediaType.application("x-tika-iworks-protected"));
-       
-       private final String namespace;
-       private final String part;
-       private final MediaType type;
-       
-       IWORKDocumentType(String namespace, String part, MediaType type) {
-          this.namespace = namespace;
-          this.part = part;
-          this.type = type;
-       }
-       
-       public String getNamespace() {
-          return namespace;
-       }
-
-       public String getPart() {
-          return part;
-       }
-
-       public MediaType getType() {
-          return type;
-       }
-
-       public static IWORKDocumentType detectType(ZipArchiveEntry entry, ZipFile zip) {
-          try {
-             if (entry == null) {
-                 return null;
-             }
-
-              try (InputStream stream = zip.getInputStream(entry)) {
-                  return detectType(stream);
-              }
-          } catch (IOException e) {
-             return null;
-          }
-       }
-       
-       public static IWORKDocumentType detectType(ZipArchiveEntry entry, ZipArchiveInputStream zip) {
-          if (entry == null) {
-              return null;
-          }
-
-          return detectType(zip);
-       }
-       
-       private static IWORKDocumentType detectType(InputStream stream) {
-          QName qname = new XmlRootExtractor().extractRootElement(stream);
-          if (qname != null) {
-             String uri = qname.getNamespaceURI();
-             String local = qname.getLocalPart();
-            
-             for (IWORKDocumentType type : values()) {
-                if(type.getNamespace().equals(uri) && 
-                   type.getPart().equals(local)) {
-                   return type;
-                }
-             }
-          } else {
-             // There was a problem with extracting the root type
-             // Password Protected iWorks files are funny, but we can usually
-             //  spot them because they encrypt part of the zip stream 
-             try {
-                stream.read();
-             } catch(UnsupportedZipFeatureException e) {
-                // Compression field was likely encrypted
-                return ENCRYPTED;
-             } catch(Exception ignored) {
-             }
-          }
-          return null;
-       }
-    }
-
-    /**
-     * This parser handles all iWorks formats.
-     */
-    private final static Set<MediaType> supportedTypes =
-         Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
-                MediaType.application("vnd.apple.iwork"),
-                IWORKDocumentType.KEYNOTE.getType(),
-                IWORKDocumentType.NUMBERS.getType(),
-                IWORKDocumentType.PAGES.getType()
-         )));
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return supportedTypes;
-    }
-
-    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-        ZipArchiveInputStream zip = new ZipArchiveInputStream(stream);
-        ZipArchiveEntry entry = zip.getNextZipEntry();
-
-        while (entry != null) {
-            if (!IWORK_CONTENT_ENTRIES.contains(entry.getName())) {
-                entry = zip.getNextZipEntry();
-                continue;
-            }
-
-            InputStream entryStream = new BufferedInputStream(zip, 4096);
-            entryStream.mark(4096);
-            IWORKDocumentType type = IWORKDocumentType.detectType(entryStream);
-            entryStream.reset();
-            
-            if(type != null) {
-               XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-               ContentHandler contentHandler;
-               
-               switch(type) {
-               case KEYNOTE:
-                  contentHandler = new KeynoteContentHandler(xhtml, metadata);
-                  break;
-               case NUMBERS:
-                  contentHandler = new NumbersContentHandler(xhtml, metadata);
-                  break;
-               case PAGES:
-                  contentHandler = new PagesContentHandler(xhtml, metadata);
-                  break;
-               case ENCRYPTED:
-                   // We can't do anything for the file right now
-                   contentHandler = null;
-                   break;
-               default:
-                  throw new TikaException("Unhandled iWorks file " + type);
-               }
-
-               metadata.add(Metadata.CONTENT_TYPE, type.getType().toString());
-               xhtml.startDocument();
-               if (contentHandler != null) {
-                  context.getSAXParser().parse(
-                          new CloseShieldInputStream(entryStream),
-                          new OfflineContentHandler(contentHandler)
-                  );
-               }
-               xhtml.endDocument();
-            }
-            
-            entry = zip.getNextZipEntry();
-        }
-        // Don't close the zip InputStream (TIKA-1117).
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import javax.xml.namespace.QName;
+
+import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.ZipFile;
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.detect.XmlRootExtractor;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * A parser for the IWork container files. This includes *.key, *.pages and *.numbers files.
+ * This parser delegates the relevant entries to a {@link ContentHandler} that parses the content.
+ * 
+ * Currently supported formats:
+ * <ol>
+ * <li>Keynote format version 2.x. Currently only tested with Keynote version 5.x
+ * <li>Pages format version 1.x. Currently only tested with Pages version 4.0.x
+ * <li>Numbers format version 1.x. Currently only tested with Numbers version 2.0.x
+ * </ol>
+ */
+public class IWorkPackageParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = -2160322853809682372L;
+
+    /**
+     * Which files within an iWork file contain the actual content?
+     */
+    public static final Set<String> IWORK_CONTENT_ENTRIES = Collections.unmodifiableSet(
+            new HashSet<String>(Arrays.asList("index.apxl", "index.xml", "presentation.apxl"))
+    );
+    /**
+     * All iWork files contain one of these, so we can detect based on it
+     */
+    public static final String IWORK_COMMON_ENTRY = "buildVersionHistory.plist";
+
+    /**
+     * The document flavours this parser can tell apart. Each constant pairs the
+     *  namespace URI and local name expected on the XML root element with the
+     *  media type to report. {@link #ENCRYPTED} has neither a namespace nor a
+     *  root element name; both are <code>null</code>.
+     */
+    public enum IWORKDocumentType {
+       KEYNOTE("http://developer.apple.com/namespaces/keynote2", "presentation", MediaType.application("vnd.apple.keynote")),
+       NUMBERS("http://developer.apple.com/namespaces/ls", "document", MediaType.application("vnd.apple.numbers")),
+       PAGES("http://developer.apple.com/namespaces/sl", "document", MediaType.application("vnd.apple.pages")),
+       ENCRYPTED(null, null, MediaType.application("x-tika-iworks-protected"));
+
+       private final String namespace;
+       private final String part;
+       private final MediaType type;
+
+       IWORKDocumentType(String namespace, String part, MediaType type) {
+          this.namespace = namespace;
+          this.part = part;
+          this.type = type;
+       }
+
+       /**
+        * @return the namespace URI of the root element, <code>null</code> for {@link #ENCRYPTED}
+        */
+       public String getNamespace() {
+          return namespace;
+       }
+
+       /**
+        * @return the local name of the root element, <code>null</code> for {@link #ENCRYPTED}
+        */
+       public String getPart() {
+          return part;
+       }
+
+       /**
+        * @return the media type reported for this kind of document
+        */
+       public MediaType getType() {
+          return type;
+       }
+
+       /**
+        * Detects the document type of an entry of a random access zip file.
+        *
+        * @param entry the entry to inspect, may be <code>null</code>
+        * @param zip the zip file the entry belongs to
+        * @return the detected type, or <code>null</code> if the entry is
+        *         <code>null</code>, its stream can't be opened or closed,
+        *         or no type matches
+        */
+       public static IWORKDocumentType detectType(ZipArchiveEntry entry, ZipFile zip) {
+          try {
+             if (entry == null) {
+                 return null;
+             }
+
+              try (InputStream stream = zip.getInputStream(entry)) {
+                  return detectType(stream);
+              }
+          } catch (IOException e) {
+             return null;
+          }
+       }
+
+       /**
+        * Detects the document type by reading directly from a zip stream.
+        *
+        * @param entry the current entry, only checked for <code>null</code>
+        * @param zip the zip stream to read from; it is consumed by the detection
+        *        and not rewound here
+        * @return the detected type, or <code>null</code> if the entry is
+        *         <code>null</code> or no type matches
+        */
+       public static IWORKDocumentType detectType(ZipArchiveEntry entry, ZipArchiveInputStream zip) {
+          if (entry == null) {
+              return null;
+          }
+
+          return detectType(zip);
+       }
+
+       /**
+        * Detects the document type from the XML root element of the given stream.
+        *
+        * @param stream the content of a candidate entry
+        * @return the constant whose namespace and root element name both match;
+        *         {@link #ENCRYPTED} if no root element could be extracted and a
+        *         further read fails with an {@link UnsupportedZipFeatureException};
+        *         otherwise <code>null</code>
+        */
+       private static IWORKDocumentType detectType(InputStream stream) {
+          QName qname = new XmlRootExtractor().extractRootElement(stream);
+          if (qname != null) {
+             String uri = qname.getNamespaceURI();
+             String local = qname.getLocalPart();
+
+             for (IWORKDocumentType type : values()) {
+                // Compare from the QName side: ENCRYPTED carries a null namespace
+                //  and part, so calling equals() on the constant's own values
+                //  would throw a NullPointerException for any unknown root element
+                if (uri.equals(type.getNamespace()) &&
+                   local.equals(type.getPart())) {
+                   return type;
+                }
+             }
+          } else {
+             // There was a problem with extracting the root type
+             // Password Protected iWorks files are funny, but we can usually
+             //  spot them because they encrypt part of the zip stream 
+             try {
+                stream.read();
+             } catch(UnsupportedZipFeatureException e) {
+                // Compression field was likely encrypted
+                return ENCRYPTED;
+             } catch(Exception ignored) {
+                // Detection is best effort, any other failure simply
+                //  means the type stays unknown
+             }
+          }
+          return null;
+       }
+    }
+
+    /**
+     * This parser handles all iWorks formats.
+     */
+    private static final Set<MediaType> supportedTypes =
+         Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                MediaType.application("vnd.apple.iwork"),
+                IWORKDocumentType.KEYNOTE.getType(),
+                IWORKDocumentType.NUMBERS.getType(),
+                IWORKDocumentType.PAGES.getType()
+         )));
+
+    /**
+     * @param context the parse context, not used here
+     * @return the unmodifiable set of media types handled by this parser
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return supportedTypes;
+    }
+
+    /**
+     * Walks the zip entries of the given stream and, for every entry named in
+     *  {@link #IWORK_CONTENT_ENTRIES}, detects the document type, adds it to the
+     *  metadata as content type and feeds the entry through the matching content
+     *  handler. Encrypted documents only produce an empty XHTML document.
+     *  The given stream is not closed (TIKA-1117).
+     *
+     * @param stream the iWork zip container
+     * @param handler receives the XHTML SAX events
+     * @param metadata receives the content type, and is handed to the content handlers
+     * @param context supplies the SAX parser
+     * @throws IOException if the zip stream can't be read or reset
+     * @throws SAXException if parsing an entry fails
+     * @throws TikaException if a detected type has no content handler
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        ZipArchiveInputStream zip = new ZipArchiveInputStream(stream);
+        ZipArchiveEntry entry = zip.getNextZipEntry();
+
+        while (entry != null) {
+            if (!IWORK_CONTENT_ENTRIES.contains(entry.getName())) {
+                entry = zip.getNextZipEntry();
+                continue;
+            }
+
+            // Buffer the start of the entry, so that it can be rewound and
+            //  parsed from the beginning once the type has been detected
+            // NOTE(review): reset() fails if detection reads more than the 4096
+            //  marked bytes - confirm the read-ahead of XmlRootExtractor
+            InputStream entryStream = new BufferedInputStream(zip, 4096);
+            entryStream.mark(4096);
+            IWORKDocumentType type = IWORKDocumentType.detectType(entryStream);
+            entryStream.reset();
+
+            if(type != null) {
+               XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+               ContentHandler contentHandler;
+
+               switch(type) {
+               case KEYNOTE:
+                  contentHandler = new KeynoteContentHandler(xhtml, metadata);
+                  break;
+               case NUMBERS:
+                  contentHandler = new NumbersContentHandler(xhtml, metadata);
+                  break;
+               case PAGES:
+                  contentHandler = new PagesContentHandler(xhtml, metadata);
+                  break;
+               case ENCRYPTED:
+                   // We can't do anything for the file right now
+                   contentHandler = null;
+                   break;
+               default:
+                  throw new TikaException("Unhandled iWorks file " + type);
+               }
+
+               metadata.add(Metadata.CONTENT_TYPE, type.getType().toString());
+               xhtml.startDocument();
+               if (contentHandler != null) {
+                  // Shield the entry stream, closing it would close the zip stream too
+                  context.getSAXParser().parse(
+                          new CloseShieldInputStream(entryStream),
+                          new OfflineContentHandler(contentHandler)
+                  );
+               }
+               xhtml.endDocument();
+            }
+
+            entry = zip.getNextZipEntry();
+        }
+        // Don't close the zip InputStream (TIKA-1117).
+    }
+
+}

[11/39] tika git commit: Convert new lines from windows to unix

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
index dcc6508..6cda282 100644
--- a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
+++ b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
@@ -1,466 +1,466 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.iwork;
-
-import static org.apache.tika.TikaTest.assertContains;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Before;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-/**
- * Tests if the IWork parser parses the content and metadata properly of the supported formats.
- */
-public class IWorkParserTest {
-
-    private IWorkPackageParser iWorkParser;
-    private ParseContext parseContext;
-
-    @Before
-    public void setUp() {
-        iWorkParser = new IWorkPackageParser();
-        parseContext = new ParseContext();
-        parseContext.set(Parser.class, new AutoDetectParser());
-    }
-
-    /**
-     * Check the given InputStream is not closed by the Parser (TIKA-1117).
-     *
-     * @throws Exception
-     */
-    @Test
-    public void testStreamNotClosed() throws Exception {
-        InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testKeynote.key");
-        Metadata metadata = new Metadata();
-        ContentHandler handler = new BodyContentHandler();
-        iWorkParser.parse(input, handler, metadata, parseContext);
-        input.read();   // Will throw an Exception if the stream was already closed.
-    }
-
-    @Test
-    public void testParseKeynote() throws Exception {
-        InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testKeynote.key");
-        Metadata metadata = new Metadata();
-        ContentHandler handler = new BodyContentHandler();
-        iWorkParser.parse(input, handler, metadata, parseContext);
-
-        // Make sure enough keys came through
-        // (Exact numbers will vary based on composites)
-        assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 6);
-        List<String> metadataKeys = Arrays.asList(metadata.names());
-        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
-        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.SLIDE_COUNT.getName()));
-//        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Office.SLIDE_COUNT.getName()));
-        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
-        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
-        
-        // Check the metadata values
-        assertEquals("application/vnd.apple.keynote", metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("3", metadata.get(Metadata.SLIDE_COUNT));
-        assertEquals("1024", metadata.get(KeynoteContentHandler.PRESENTATION_WIDTH));
-        assertEquals("768", metadata.get(KeynoteContentHandler.PRESENTATION_HEIGHT));
-        assertEquals("Tika user", metadata.get(TikaCoreProperties.CREATOR));
-        assertEquals("Apache tika", metadata.get(TikaCoreProperties.TITLE));
-
-        String content = handler.toString();
-        assertContains("A sample presentation", content);
-        assertContains("For the Apache Tika project", content);
-        assertContains("Slide 1", content);
-        assertContains("Some random text for the sake of testability.", content);
-        assertContains("A nice comment", content);
-        assertContains("A nice note", content);
-
-        // test table data
-        assertContains("Cell one", content);
-        assertContains("Cell two", content);
-        assertContains("Cell three", content);
-        assertContains("Cell four", content);
-        assertContains("Cell 5", content);
-        assertContains("Cell six", content);
-        assertContains("7", content);
-        assertContains("Cell eight", content);
-        assertContains("5/5/1985", content);
-    }
-
-    // TIKA-910
-    @Test
-    public void testKeynoteTextBoxes() throws Exception {
-        InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testTextBoxes.key");
-        Metadata metadata = new Metadata();
-        ContentHandler handler = new BodyContentHandler();
-        iWorkParser.parse(input, handler, metadata, parseContext);
-
-        String content = handler.toString();
-        assertTrue(content.replaceAll("\\s+", " ").contains("text1 text2 text3"));
-    }
-
-    // TIKA-910
-    @Test
-    public void testKeynoteBulletPoints() throws Exception {
-        InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testBulletPoints.key");
-        Metadata metadata = new Metadata();
-        ContentHandler handler = new BodyContentHandler();
-        iWorkParser.parse(input, handler, metadata, parseContext);
-
-        String content = handler.toString();
-        assertTrue(content.replaceAll("\\s+", " ").contains("bullet point 1 bullet point 2 bullet point 3"));
-    }
-
-    // TIKA-923
-    @Test
-    public void testKeynoteTables() throws Exception {
-        InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testTables.key");
-        Metadata metadata = new Metadata();
-        ContentHandler handler = new BodyContentHandler();
-        iWorkParser.parse(input, handler, metadata, parseContext);
-
-        String content = handler.toString();
-        content = content.replaceAll("\\s+", " ");
-        assertContains("row 1 row 2 row 3", content);
-    }
-
-    // TIKA-923
-    @Test
-    public void testKeynoteMasterSlideTable() throws Exception {
-        InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testMasterSlideTable.key");
-        Metadata metadata = new Metadata();
-        ContentHandler handler = new BodyContentHandler();
-        iWorkParser.parse(input, handler, metadata, parseContext);
-
-        String content = handler.toString();
-        content = content.replaceAll("\\s+", " ");
-        assertContains("master row 1", content);
-        assertContains("master row 2", content);
-        assertContains("master row 3", content);
-    }
-
-    @Test
-    public void testParsePages() throws Exception {
-        InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPages.pages");
-        Metadata metadata = new Metadata();
-        ContentHandler handler = new BodyContentHandler();
-        iWorkParser.parse(input, handler, metadata, parseContext);
-
-        // Make sure enough keys came through
-        // (Exact numbers will vary based on composites)
-        assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 50);
-        List<String> metadataKeys = Arrays.asList(metadata.names());
-        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
-        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.PAGE_COUNT.getName()));
-        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
-        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
-        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.LAST_MODIFIED.getName()));
-        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.LANGUAGE));
-        
-        // Check the metadata values
-        assertEquals("application/vnd.apple.pages", metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("Tika user", metadata.get(TikaCoreProperties.CREATOR));
-        assertEquals("Apache tika", metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("2010-05-09T21:34:38+0200", metadata.get(Metadata.CREATION_DATE));
-        assertEquals("2010-05-09T23:50:36+0200", metadata.get(Metadata.LAST_MODIFIED));
-        assertEquals("en", metadata.get(TikaCoreProperties.LANGUAGE));
-        assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
-
-        String content = handler.toString();
-
-        // text on page 1
-        assertContains("Sample pages document", content);
-        assertContains("Some plain text to parse.", content);
-        assertContains("Cell one", content);
-        assertContains("Cell two", content);
-        assertContains("Cell three", content);
-        assertContains("Cell four", content);
-        assertContains("Cell five", content);
-        assertContains("Cell six", content);
-        assertContains("Cell seven", content);
-        assertContains("Cell eight", content);
-        assertContains("Cell nine", content);
-        assertContains("Both Pages 1.x and Keynote 2.x", content); // ...
-
-        // text on page 2
-        assertContains("A second page....", content);
-        assertContains("Extensible Markup Language", content); // ...
-    }
-
-    // TIKA-904
-    @Test
-    public void testPagesLayoutMode() throws Exception {
-        InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesLayout.pages");
-        Metadata metadata = new Metadata();
-        ContentHandler handler = new BodyContentHandler();
-
-        iWorkParser.parse(input, handler, metadata, parseContext);
-
-        String content = handler.toString();
-        assertContains("text box 1 - here is some text", content);
-        assertContains("created in a text box in layout mode", content);
-        assertContains("text box 2 - more text!@!$@#", content);
-        assertContains("this is text inside of a green box", content);
-        assertContains("text inside of a green circle", content);
-    }
-
-    @Test
-    public void testParseNumbers() throws Exception {
-        InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testNumbers.numbers");
-        Metadata metadata = new Metadata();
-        ContentHandler handler = new BodyContentHandler();
-
-        iWorkParser.parse(input, handler, metadata, parseContext);
-
-        // Make sure enough keys came through
-        // (Exact numbers will vary based on composites)
-        assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 8);
-        List<String> metadataKeys = Arrays.asList(metadata.names());
-        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
-        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.PAGE_COUNT.getName()));
-        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
-        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.COMMENTS.getName()));
-        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.TITLE));
-        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
-        
-        // Check the metadata values
-        assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
-        assertEquals("Tika User", metadata.get(TikaCoreProperties.CREATOR));
-        assertEquals("Account checking", metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("a comment", metadata.get(TikaCoreProperties.COMMENTS));
-
-        String content = handler.toString();
-        assertContains("Category", content);
-        assertContains("Home", content);
-        assertContains("-226", content);
-        assertContains("-137.5", content);
-        assertContains("Checking Account: 300545668", content);
-        assertContains("4650", content);
-        assertContains("Credit Card", content);
-        assertContains("Groceries", content);
-        assertContains("-210", content);
-        assertContains("Food", content);
-        assertContains("Try adding your own account transactions to this table.", content);
-    }
-
-    // TIKA- 924
-    @Test
-    public void testParseNumbersTableNames() throws Exception {
-        InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/tableNames.numbers");
-        Metadata metadata = new Metadata();
-        ContentHandler handler = new BodyContentHandler();
-        iWorkParser.parse(input, handler, metadata, parseContext);
-        String content = handler.toString();
-        assertContains("This is the main table", content);
-    }
-        
-    @Test
-    public void testParseNumbersTableHeaders() throws Exception {
-        InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/tableHeaders.numbers");
-        Metadata metadata = new Metadata();
-        ContentHandler handler = new BodyContentHandler();
-        iWorkParser.parse(input, handler, metadata, parseContext);
-
-        String content = handler.toString();
-        for(int header=1;header<=5;header++) {
-          assertContains("header" + header, content);
-        }
-        for(int row=1;row<=3;row++) {
-          assertContains("row" + row, content);
-        }
-    }
-
-    /**
-     * We don't currently support password protected Pages files, as
-     *  we don't know how the encryption works (it's not regular Zip
-     *  Encryption). See TIKA-903 for details
-     */
-    @Test
-    public void testParsePagesPasswordProtected() throws Exception {
-       // Document password is "tika", but we can't use that yet...
-       InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesPwdProtected.pages");
-       Metadata metadata = new Metadata();
-       ContentHandler handler = new BodyContentHandler();
-
-       iWorkParser.parse(input, handler, metadata, parseContext);
-
-       // Content will be empty
-       String content = handler.toString();
-       assertEquals("", content);
-       
-       // Will have been identified as encrypted
-       assertEquals("application/x-tika-iworks-protected", metadata.get(Metadata.CONTENT_TYPE));
-    }
-    
-    /**
-     * Check we get headers, footers and footnotes from Pages
-     */
-    @Test
-    public void testParsePagesHeadersFootersFootnotes() throws Exception {
-       String footnote = "Footnote: Do a lot of people really use iWork?!?!";
-       String header = "THIS IS SOME HEADER TEXT";
-       String footer = "THIS IS SOME FOOTER TEXT\t1";
-       String footer2 = "THIS IS SOME FOOTER TEXT\t2";
-       
-       InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersFootnotes.pages");
-       Metadata metadata = new Metadata();
-       ContentHandler handler = new BodyContentHandler();
-
-       iWorkParser.parse(input, handler, metadata, parseContext);
-       String contents = handler.toString();
-
-       // Check regular text
-       assertContains("Both Pages 1.x", contents); // P1
-       assertContains("understanding the Pages document", contents); // P1
-       assertContains("should be page 2", contents); // P2
-       
-       // Check for headers, footers and footnotes
-       assertContains(header, contents);
-       assertContains(footer, contents);
-       assertContains(footer2, contents);
-       assertContains(footnote, contents);
-    }
-    
-    /**
-     * Check we get upper-case Roman numerals within the footer for AutoPageNumber.
-     */
-    @Test
-    public void testParsePagesHeadersFootersRomanUpper() throws Exception {
-       String header = "THIS IS SOME HEADER TEXT";
-       String footer = "THIS IS SOME FOOTER TEXT\tI";
-       String footer2 = "THIS IS SOME FOOTER TEXT\tII";
-       
-       InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersRomanUpper.pages");
-       ContentHandler handler = new BodyContentHandler();
-
-       iWorkParser.parse(input, handler, new Metadata(), parseContext);
-       String contents = handler.toString();
-       
-       // Check for headers, footers and footnotes
-       assertContains(header, contents);
-       assertContains(footer, contents);
-       assertContains(footer2, contents);
-    }
-    
-    /**
-     * Check we get lower-case Roman numerals within the footer for AutoPageNumber.
-     */
-    @Test
-    public void testParsePagesHeadersFootersRomanLower() throws Exception {
-       String header = "THIS IS SOME HEADER TEXT";
-       String footer = "THIS IS SOME FOOTER TEXT\ti";
-       String footer2 = "THIS IS SOME FOOTER TEXT\tii";
-       
-       InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersRomanLower.pages");
-       ContentHandler handler = new BodyContentHandler();
-
-       iWorkParser.parse(input, handler, new Metadata(), parseContext);
-       String contents = handler.toString();
-       
-       // Check for headers, footers and footnotes
-       assertContains(header, contents);
-       assertContains(footer, contents);
-       assertContains(footer2, contents);
-    }
-
-    /**
-     * Check we get upper-case alpha-numeric letters within the footer for AutoPageNumber.
-     */
-    @Test
-    public void testParsePagesHeadersAlphaUpper() throws Exception {
-       String header = "THIS IS SOME HEADER TEXT\tA";
-       String footer = "THIS IS SOME FOOTER TEXT\tA";
-       String footer2 = "THIS IS SOME FOOTER TEXT\tB";
-       
-       InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersAlphaUpper.pages");
-       ContentHandler handler = new BodyContentHandler();
-
-       iWorkParser.parse(input, handler, new Metadata(), parseContext);
-       String contents = handler.toString();
-       
-       // Check for headers, footers and footnotes
-       assertContains(header, contents);
-       assertContains(footer, contents);
-       assertContains(footer2, contents);
-    }
- 
-    /**
-     * Check we get lower-case alpha-numeric letters within the footer for AutoPageNumber.
-     */
-    @Test
-    public void testParsePagesHeadersAlphaLower() throws Exception {
-       String header = "THIS IS SOME HEADER TEXT";
-       String footer = "THIS IS SOME FOOTER TEXT\ta";
-       String footer2 = "THIS IS SOME FOOTER TEXT\tb";
-       
-       InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersAlphaLower.pages");
-       ContentHandler handler = new BodyContentHandler();
-
-       iWorkParser.parse(input, handler, new Metadata(), parseContext);
-       String contents = handler.toString();
-       
-       // Check for headers, footers and footnotes
-       assertContains(header, contents);
-       assertContains(footer, contents);
-       assertContains(footer2, contents);
-    }
-    
-    /**
-     * Check we get annotations (eg comments) from Pages
-     */
-    @Test
-    public void testParsePagesAnnotations() throws Exception {
-       String commentA = "comment about the APXL file";
-       String commentB = "comment about UIMA";
-       
-       InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesComments.pages");
-       Metadata metadata = new Metadata();
-       ContentHandler handler = new BodyContentHandler();
-
-       iWorkParser.parse(input, handler, metadata, parseContext);
-       String contents = handler.toString();
-
-       // Check regular text
-       assertContains("Both Pages 1.x", contents); // P1
-       assertContains("understanding the Pages document", contents); // P1
-       assertContains("should be page 2", contents); // P2
-       
-       // Check for comments
-       assertContains(commentA, contents);
-       assertContains(commentB, contents);
-    }
-    
-    // TIKA-918
-    @Test
-    public void testNumbersExtractChartNames() throws Exception {
-       InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testNumbersCharts.numbers");
-       Metadata metadata = new Metadata();
-       ContentHandler handler = new BodyContentHandler();
-       iWorkParser.parse(input, handler, metadata, parseContext);
-       String contents = handler.toString();
-       assertContains("Expenditure by Category", contents);
-       assertContains("Currency Chart name", contents);
-       assertContains("Chart 2", contents);
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Before;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Tests if the IWork parser parses the content and metadata properly of the supported formats.
+ */
+public class IWorkParserTest {
+
+    private IWorkPackageParser iWorkParser;
+    private ParseContext parseContext;
+
+    @Before
+    public void setUp() {
+        iWorkParser = new IWorkPackageParser();
+        parseContext = new ParseContext();
+        parseContext.set(Parser.class, new AutoDetectParser());
+    }
+
+    /**
+     * Check the given InputStream is not closed by the Parser (TIKA-1117).
+     *
+     * @throws Exception
+     */
+    @Test
+    public void testStreamNotClosed() throws Exception {
+        InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testKeynote.key");
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        iWorkParser.parse(input, handler, metadata, parseContext);
+        input.read();   // Will throw an Exception if the stream was already closed.
+    }
+
+    @Test
+    public void testParseKeynote() throws Exception {
+        InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testKeynote.key");
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        iWorkParser.parse(input, handler, metadata, parseContext);
+
+        // Make sure enough keys came through
+        // (Exact numbers will vary based on composites)
+        assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 6);
+        List<String> metadataKeys = Arrays.asList(metadata.names());
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.SLIDE_COUNT.getName()));
+//        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Office.SLIDE_COUNT.getName()));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
+        
+        // Check the metadata values
+        assertEquals("application/vnd.apple.keynote", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("3", metadata.get(Metadata.SLIDE_COUNT));
+        assertEquals("1024", metadata.get(KeynoteContentHandler.PRESENTATION_WIDTH));
+        assertEquals("768", metadata.get(KeynoteContentHandler.PRESENTATION_HEIGHT));
+        assertEquals("Tika user", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Apache tika", metadata.get(TikaCoreProperties.TITLE));
+
+        String content = handler.toString();
+        assertContains("A sample presentation", content);
+        assertContains("For the Apache Tika project", content);
+        assertContains("Slide 1", content);
+        assertContains("Some random text for the sake of testability.", content);
+        assertContains("A nice comment", content);
+        assertContains("A nice note", content);
+
+        // test table data
+        assertContains("Cell one", content);
+        assertContains("Cell two", content);
+        assertContains("Cell three", content);
+        assertContains("Cell four", content);
+        assertContains("Cell 5", content);
+        assertContains("Cell six", content);
+        assertContains("7", content);
+        assertContains("Cell eight", content);
+        assertContains("5/5/1985", content);
+    }
+
+    // TIKA-910
+    @Test
+    public void testKeynoteTextBoxes() throws Exception {
+        InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testTextBoxes.key");
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        iWorkParser.parse(input, handler, metadata, parseContext);
+
+        String content = handler.toString();
+        assertTrue(content.replaceAll("\\s+", " ").contains("text1 text2 text3"));
+    }
+
+    // TIKA-910
+    @Test
+    public void testKeynoteBulletPoints() throws Exception {
+        InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testBulletPoints.key");
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        iWorkParser.parse(input, handler, metadata, parseContext);
+
+        String content = handler.toString();
+        assertTrue(content.replaceAll("\\s+", " ").contains("bullet point 1 bullet point 2 bullet point 3"));
+    }
+
+    // TIKA-923
+    @Test
+    public void testKeynoteTables() throws Exception {
+        InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testTables.key");
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        iWorkParser.parse(input, handler, metadata, parseContext);
+
+        String content = handler.toString();
+        content = content.replaceAll("\\s+", " ");
+        assertContains("row 1 row 2 row 3", content);
+    }
+
+    // TIKA-923
+    @Test
+    public void testKeynoteMasterSlideTable() throws Exception {
+        InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testMasterSlideTable.key");
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        iWorkParser.parse(input, handler, metadata, parseContext);
+
+        String content = handler.toString();
+        content = content.replaceAll("\\s+", " ");
+        assertContains("master row 1", content);
+        assertContains("master row 2", content);
+        assertContains("master row 3", content);
+    }
+
+    @Test
+    public void testParsePages() throws Exception {
+        InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPages.pages");
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        iWorkParser.parse(input, handler, metadata, parseContext);
+
+        // Make sure enough keys came through
+        // (Exact numbers will vary based on composites)
+        assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 50);
+        List<String> metadataKeys = Arrays.asList(metadata.names());
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.PAGE_COUNT.getName()));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.LAST_MODIFIED.getName()));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.LANGUAGE));
+        
+        // Check the metadata values
+        assertEquals("application/vnd.apple.pages", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Tika user", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Apache tika", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("2010-05-09T21:34:38+0200", metadata.get(Metadata.CREATION_DATE));
+        assertEquals("2010-05-09T23:50:36+0200", metadata.get(Metadata.LAST_MODIFIED));
+        assertEquals("en", metadata.get(TikaCoreProperties.LANGUAGE));
+        assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
+
+        String content = handler.toString();
+
+        // text on page 1
+        assertContains("Sample pages document", content);
+        assertContains("Some plain text to parse.", content);
+        assertContains("Cell one", content);
+        assertContains("Cell two", content);
+        assertContains("Cell three", content);
+        assertContains("Cell four", content);
+        assertContains("Cell five", content);
+        assertContains("Cell six", content);
+        assertContains("Cell seven", content);
+        assertContains("Cell eight", content);
+        assertContains("Cell nine", content);
+        assertContains("Both Pages 1.x and Keynote 2.x", content); // ...
+
+        // text on page 2
+        assertContains("A second page....", content);
+        assertContains("Extensible Markup Language", content); // ...
+    }
+
+    // TIKA-904
+    @Test
+    public void testPagesLayoutMode() throws Exception {
+        InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesLayout.pages");
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+
+        iWorkParser.parse(input, handler, metadata, parseContext);
+
+        String content = handler.toString();
+        assertContains("text box 1 - here is some text", content);
+        assertContains("created in a text box in layout mode", content);
+        assertContains("text box 2 - more text!@!$@#", content);
+        assertContains("this is text inside of a green box", content);
+        assertContains("text inside of a green circle", content);
+    }
+
+    @Test
+    public void testParseNumbers() throws Exception {
+        InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testNumbers.numbers");
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+
+        iWorkParser.parse(input, handler, metadata, parseContext);
+
+        // Make sure enough keys came through
+        // (Exact numbers will vary based on composites)
+        assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 8);
+        List<String> metadataKeys = Arrays.asList(metadata.names());
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.PAGE_COUNT.getName()));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.COMMENTS.getName()));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.TITLE));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
+        
+        // Check the metadata values
+        assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
+        assertEquals("Tika User", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Account checking", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("a comment", metadata.get(TikaCoreProperties.COMMENTS));
+
+        String content = handler.toString();
+        assertContains("Category", content);
+        assertContains("Home", content);
+        assertContains("-226", content);
+        assertContains("-137.5", content);
+        assertContains("Checking Account: 300545668", content);
+        assertContains("4650", content);
+        assertContains("Credit Card", content);
+        assertContains("Groceries", content);
+        assertContains("-210", content);
+        assertContains("Food", content);
+        assertContains("Try adding your own account transactions to this table.", content);
+    }
+
+    // TIKA- 924
+    @Test
+    public void testParseNumbersTableNames() throws Exception {
+        InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/tableNames.numbers");
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        iWorkParser.parse(input, handler, metadata, parseContext);
+        String content = handler.toString();
+        assertContains("This is the main table", content);
+    }
+        
+    @Test
+    public void testParseNumbersTableHeaders() throws Exception {
+        InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/tableHeaders.numbers");
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        iWorkParser.parse(input, handler, metadata, parseContext);
+
+        String content = handler.toString();
+        for(int header=1;header<=5;header++) {
+          assertContains("header" + header, content);
+        }
+        for(int row=1;row<=3;row++) {
+          assertContains("row" + row, content);
+        }
+    }
+
+    /**
+     * We don't currently support password protected Pages files, as
+     *  we don't know how the encryption works (it's not regular Zip
+     *  Encryption). See TIKA-903 for details
+     */
+    @Test
+    public void testParsePagesPasswordProtected() throws Exception {
+       // Document password is "tika", but we can't use that yet...
+       InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesPwdProtected.pages");
+       Metadata metadata = new Metadata();
+       ContentHandler handler = new BodyContentHandler();
+
+       iWorkParser.parse(input, handler, metadata, parseContext);
+
+       // Content will be empty
+       String content = handler.toString();
+       assertEquals("", content);
+       
+       // Will have been identified as encrypted
+       assertEquals("application/x-tika-iworks-protected", metadata.get(Metadata.CONTENT_TYPE));
+    }
+    
+    /**
+     * Check we get headers, footers and footnotes from Pages
+     */
+    @Test
+    public void testParsePagesHeadersFootersFootnotes() throws Exception {
+       String footnote = "Footnote: Do a lot of people really use iWork?!?!";
+       String header = "THIS IS SOME HEADER TEXT";
+       String footer = "THIS IS SOME FOOTER TEXT\t1";
+       String footer2 = "THIS IS SOME FOOTER TEXT\t2";
+       
+       InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersFootnotes.pages");
+       Metadata metadata = new Metadata();
+       ContentHandler handler = new BodyContentHandler();
+
+       iWorkParser.parse(input, handler, metadata, parseContext);
+       String contents = handler.toString();
+
+       // Check regular text
+       assertContains("Both Pages 1.x", contents); // P1
+       assertContains("understanding the Pages document", contents); // P1
+       assertContains("should be page 2", contents); // P2
+       
+       // Check for headers, footers and footnotes
+       assertContains(header, contents);
+       assertContains(footer, contents);
+       assertContains(footer2, contents);
+       assertContains(footnote, contents);
+    }
+    
+    /**
+     * Check we get upper-case Roman numerals within the footer for AutoPageNumber.
+     */
+    @Test
+    public void testParsePagesHeadersFootersRomanUpper() throws Exception {
+       String header = "THIS IS SOME HEADER TEXT";
+       String footer = "THIS IS SOME FOOTER TEXT\tI";
+       String footer2 = "THIS IS SOME FOOTER TEXT\tII";
+       
+       InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersRomanUpper.pages");
+       ContentHandler handler = new BodyContentHandler();
+
+       iWorkParser.parse(input, handler, new Metadata(), parseContext);
+       String contents = handler.toString();
+       
+       // Check for headers, footers and footnotes
+       assertContains(header, contents);
+       assertContains(footer, contents);
+       assertContains(footer2, contents);
+    }
+    
+    /**
+     * Check we get lower-case Roman numerals within the footer for AutoPageNumber.
+     */
+    @Test
+    public void testParsePagesHeadersFootersRomanLower() throws Exception {
+       String header = "THIS IS SOME HEADER TEXT";
+       String footer = "THIS IS SOME FOOTER TEXT\ti";
+       String footer2 = "THIS IS SOME FOOTER TEXT\tii";
+       
+       InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersRomanLower.pages");
+       ContentHandler handler = new BodyContentHandler();
+
+       iWorkParser.parse(input, handler, new Metadata(), parseContext);
+       String contents = handler.toString();
+       
+       // Check for headers, footers and footnotes
+       assertContains(header, contents);
+       assertContains(footer, contents);
+       assertContains(footer2, contents);
+    }
+
+    /**
+     * Check we get upper-case alpha-numeric letters within the footer for AutoPageNumber.
+     */
+    @Test
+    public void testParsePagesHeadersAlphaUpper() throws Exception {
+       String header = "THIS IS SOME HEADER TEXT\tA";
+       String footer = "THIS IS SOME FOOTER TEXT\tA";
+       String footer2 = "THIS IS SOME FOOTER TEXT\tB";
+       
+       InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersAlphaUpper.pages");
+       ContentHandler handler = new BodyContentHandler();
+
+       iWorkParser.parse(input, handler, new Metadata(), parseContext);
+       String contents = handler.toString();
+       
+       // Check for headers, footers and footnotes
+       assertContains(header, contents);
+       assertContains(footer, contents);
+       assertContains(footer2, contents);
+    }
+ 
+    /**
+     * Check we get lower-case alpha-numeric letters within the footer for AutoPageNumber.
+     */
+    @Test
+    public void testParsePagesHeadersAlphaLower() throws Exception {
+       String header = "THIS IS SOME HEADER TEXT";
+       String footer = "THIS IS SOME FOOTER TEXT\ta";
+       String footer2 = "THIS IS SOME FOOTER TEXT\tb";
+       
+       InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersAlphaLower.pages");
+       ContentHandler handler = new BodyContentHandler();
+
+       iWorkParser.parse(input, handler, new Metadata(), parseContext);
+       String contents = handler.toString();
+       
+       // Check for headers, footers and footnotes
+       assertContains(header, contents);
+       assertContains(footer, contents);
+       assertContains(footer2, contents);
+    }
+    
+    /**
+     * Check we get annotations (eg comments) from Pages
+     */
+    @Test
+    public void testParsePagesAnnotations() throws Exception {
+       String commentA = "comment about the APXL file";
+       String commentB = "comment about UIMA";
+       
+       InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesComments.pages");
+       Metadata metadata = new Metadata();
+       ContentHandler handler = new BodyContentHandler();
+
+       iWorkParser.parse(input, handler, metadata, parseContext);
+       String contents = handler.toString();
+
+       // Check regular text
+       assertContains("Both Pages 1.x", contents); // P1
+       assertContains("understanding the Pages document", contents); // P1
+       assertContains("should be page 2", contents); // P2
+       
+       // Check for comments
+       assertContains(commentA, contents);
+       assertContains(commentB, contents);
+    }
+    
+    // TIKA-918
+    @Test
+    public void testNumbersExtractChartNames() throws Exception {
+       InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testNumbersCharts.numbers");
+       Metadata metadata = new Metadata();
+       ContentHandler handler = new BodyContentHandler();
+       iWorkParser.parse(input, handler, metadata, parseContext);
+       String contents = handler.toString();
+       assertContains("Expenditure by Category", contents);
+       assertContains("Currency Chart name", contents);
+       assertContains("Chart 2", contents);
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java
index 95bd87c..6fad531 100644
--- a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java
+++ b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java
@@ -1,93 +1,93 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.pkg;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.junit.Before;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Parent class for all Package based Test cases
- */
-public abstract class AbstractPkgTest extends TikaTest {
-   protected ParseContext trackingContext;
-   protected ParseContext recursingContext;
-   
-   protected Parser autoDetectParser;
-   protected EmbeddedTrackingParser tracker;
-
-   @Before
-   public void setUp() throws Exception {
-      tracker = new EmbeddedTrackingParser();
-      trackingContext = new ParseContext();
-      trackingContext.set(Parser.class, tracker);
-      
-      autoDetectParser = new AutoDetectParser();
-      recursingContext = new ParseContext();
-      recursingContext.set(Parser.class, autoDetectParser);
-   }
-
-
-   @SuppressWarnings("serial")
-   protected static class EmbeddedTrackingParser extends AbstractParser {
-      protected List<String> filenames = new ArrayList<String>();
-      protected List<String> mediatypes = new ArrayList<String>();
-      protected List<String> createdAts = new ArrayList<String>();
-      protected List<String> modifiedAts = new ArrayList<String>();
-      protected byte[] lastSeenStart;
-      
-      public void reset() {
-         filenames.clear();
-         mediatypes.clear();
-         createdAts.clear();
-         modifiedAts.clear();
-      }
-      
-      public Set<MediaType> getSupportedTypes(ParseContext context) {
-         // Cheat!
-         return (new AutoDetectParser()).getSupportedTypes(context);
-      }
-
-      public void parse(InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context) throws IOException,
-            SAXException, TikaException {
-         filenames.add(metadata.get(Metadata.RESOURCE_NAME_KEY));
-         mediatypes.add(metadata.get(Metadata.CONTENT_TYPE));
-         createdAts.add(metadata.get(TikaCoreProperties.CREATED));
-         modifiedAts.add(metadata.get(TikaCoreProperties.MODIFIED));
-         
-         lastSeenStart = new byte[32];
-         stream.read(lastSeenStart);
-      }
-
-   }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.junit.Before;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parent class for all Package based Test cases
+ */
+public abstract class AbstractPkgTest extends TikaTest {
+   protected ParseContext trackingContext;
+   protected ParseContext recursingContext;
+   
+   protected Parser autoDetectParser;
+   protected EmbeddedTrackingParser tracker;
+
+   @Before
+   public void setUp() throws Exception {
+      tracker = new EmbeddedTrackingParser();
+      trackingContext = new ParseContext();
+      trackingContext.set(Parser.class, tracker);
+      
+      autoDetectParser = new AutoDetectParser();
+      recursingContext = new ParseContext();
+      recursingContext.set(Parser.class, autoDetectParser);
+   }
+
+
+   @SuppressWarnings("serial")
+   protected static class EmbeddedTrackingParser extends AbstractParser {
+      protected List<String> filenames = new ArrayList<String>();
+      protected List<String> mediatypes = new ArrayList<String>();
+      protected List<String> createdAts = new ArrayList<String>();
+      protected List<String> modifiedAts = new ArrayList<String>();
+      protected byte[] lastSeenStart;
+      
+      public void reset() {
+         filenames.clear();
+         mediatypes.clear();
+         createdAts.clear();
+         modifiedAts.clear();
+      }
+      
+      public Set<MediaType> getSupportedTypes(ParseContext context) {
+         // Cheat!
+         return (new AutoDetectParser()).getSupportedTypes(context);
+      }
+
+      public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+         filenames.add(metadata.get(Metadata.RESOURCE_NAME_KEY));
+         mediatypes.add(metadata.get(Metadata.CONTENT_TYPE));
+         createdAts.add(metadata.get(TikaCoreProperties.CREATED));
+         modifiedAts.add(metadata.get(TikaCoreProperties.MODIFIED));
+         
+         lastSeenStart = new byte[32];
+         stream.read(lastSeenStart);
+      }
+
+   }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
index 3dc01f6..42b60da 100644
--- a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
+++ b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
@@ -1,89 +1,89 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.pkg;
-
-import static java.nio.charset.StandardCharsets.US_ASCII;
-import static org.junit.Assert.assertEquals;
-
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-/**
- * Test case for parsing bzip2 files.
- */
-public class Bzip2ParserTest extends AbstractPkgTest {
-
-    @Test
-    public void testBzip2Parsing() throws Exception {
-        Parser parser = new AutoDetectParser(); // Should auto-detect!
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = Bzip2ParserTest.class.getResourceAsStream(
-                "/test-documents/test-documents.tbz2")) {
-            parser.parse(stream, handler, metadata, recursingContext);
-        }
-
-        assertEquals("application/x-bzip2", metadata.get(Metadata.CONTENT_TYPE));
-        String content = handler.toString();
-        assertContains("test-documents/testEXCEL.xls", content);
-        assertContains("test-documents/testHTML.html", content);
-        assertContains("test-documents/testOpenOffice2.odt", content);
-        assertContains("test-documents/testPDF.pdf", content);
-        assertContains("test-documents/testPPT.ppt", content);
-        assertContains("test-documents/testRTF.rtf", content);
-        assertContains("test-documents/testTXT.txt", content);
-        assertContains("test-documents/testWORD.doc", content);
-        assertContains("test-documents/testXML.xml", content);
-    }
-
-
-    /**
-     * Tests that the ParseContext parser is correctly
-     *  fired for all the embedded entries.
-     */
-    @Test
-    public void testEmbedded() throws Exception {
-       Parser parser = new AutoDetectParser(); // Should auto-detect!
-       ContentHandler handler = new BodyContentHandler();
-       Metadata metadata = new Metadata();
-
-        try (InputStream stream = ZipParserTest.class.getResourceAsStream(
-                "/test-documents/test-documents.tbz2")) {
-            parser.parse(stream, handler, metadata, trackingContext);
-        }
-       
-       // Should find a single entry, for the (compressed) tar file
-       assertEquals(1, tracker.filenames.size());
-       assertEquals(1, tracker.mediatypes.size());
-       assertEquals(1, tracker.modifiedAts.size());
-       
-       assertEquals(null, tracker.filenames.get(0));
-       assertEquals(null, tracker.mediatypes.get(0));
-       assertEquals(null, tracker.createdAts.get(0));
-       assertEquals(null, tracker.modifiedAts.get(0));
-
-       // Tar file starts with the directory name
-       assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, US_ASCII));
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing bzip2 files.
+ */
+public class Bzip2ParserTest extends AbstractPkgTest {
+
+    /**
+     * Parses the sample tbz2 archive with the inherited recursing context
+     *  and checks the reported content type, plus the entry names that
+     *  show up in the extracted text.
+     */
+    @Test
+    public void testBzip2Parsing() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = Bzip2ParserTest.class.getResourceAsStream(
+                "/test-documents/test-documents.tbz2")) {
+            parser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertEquals("application/x-bzip2", metadata.get(Metadata.CONTENT_TYPE));
+        String content = handler.toString();
+        assertContains("test-documents/testEXCEL.xls", content);
+        assertContains("test-documents/testHTML.html", content);
+        assertContains("test-documents/testOpenOffice2.odt", content);
+        assertContains("test-documents/testPDF.pdf", content);
+        assertContains("test-documents/testPPT.ppt", content);
+        assertContains("test-documents/testRTF.rtf", content);
+        assertContains("test-documents/testTXT.txt", content);
+        assertContains("test-documents/testWORD.doc", content);
+        assertContains("test-documents/testXML.xml", content);
+    }
+
+    /**
+     * Tests that the ParseContext parser is correctly
+     *  fired for the embedded entry. The tracking parser is expected
+     *  to be called exactly once, with the tar stream as its input.
+     */
+    @Test
+    public void testEmbedded() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        // Load the fixture through this test's own class, as in
+        //  testBzip2Parsing, rather than through an unrelated test class
+        try (InputStream stream = Bzip2ParserTest.class.getResourceAsStream(
+                "/test-documents/test-documents.tbz2")) {
+            parser.parse(stream, handler, metadata, trackingContext);
+        }
+
+        // Should find a single entry, for the (compressed) tar file
+        assertEquals(1, tracker.filenames.size());
+        assertEquals(1, tracker.mediatypes.size());
+        assertEquals(1, tracker.createdAts.size());
+        assertEquals(1, tracker.modifiedAts.size());
+
+        // No name, type or dates are expected in the metadata for that entry
+        assertEquals(null, tracker.filenames.get(0));
+        assertEquals(null, tracker.mediatypes.get(0));
+        assertEquals(null, tracker.createdAts.get(0));
+        assertEquals(null, tracker.modifiedAts.get(0));
+
+        // Tar file starts with the directory name
+        assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, US_ASCII));
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
index 0439a38..378a0fc 100644
--- a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
+++ b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
@@ -1,102 +1,102 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.pkg;
-
-import static java.nio.charset.StandardCharsets.US_ASCII;
-import static org.junit.Assert.assertEquals;
-
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-/**
- * Test case for parsing gzip files.
- */
-public class GzipParserTest extends AbstractPkgTest {
-
-    @Test
-    public void testGzipParsing() throws Exception {
-        Parser parser = new AutoDetectParser(); // Should auto-detect!
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = GzipParserTest.class.getResourceAsStream(
-                "/test-documents/test-documents.tgz")) {
-            parser.parse(stream, handler, metadata, recursingContext);
-        }
-
-        assertEquals("application/gzip", metadata.get(Metadata.CONTENT_TYPE));
-        String content = handler.toString();
-        assertContains("test-documents/testEXCEL.xls", content);
-        assertContains("test-documents/testHTML.html", content);
-        assertContains("test-documents/testOpenOffice2.odt", content);
-        assertContains("test-documents/testPDF.pdf", content);
-        assertContains("test-documents/testPPT.ppt", content);
-        assertContains("test-documents/testRTF.rtf", content);
-        assertContains("test-documents/testTXT.txt", content);
-        assertContains("test-documents/testWORD.doc", content);
-        assertContains("test-documents/testXML.xml", content);
-    }
-
-    /**
-     * Tests that the ParseContext parser is correctly
-     *  fired for all the embedded entries.
-     */
-    @Test
-    public void testEmbedded() throws Exception {
-       Parser parser = new AutoDetectParser(); // Should auto-detect!
-       ContentHandler handler = new BodyContentHandler();
-       Metadata metadata = new Metadata();
-
-        try (InputStream stream = ZipParserTest.class.getResourceAsStream(
-                "/test-documents/test-documents.tgz")) {
-            parser.parse(stream, handler, metadata, trackingContext);
-        }
-       
-       // Should find a single entry, for the (compressed) tar file
-       assertEquals(1, tracker.filenames.size());
-       assertEquals(1, tracker.mediatypes.size());
-       assertEquals(1, tracker.modifiedAts.size());
-       
-       assertEquals(null, tracker.filenames.get(0));
-       assertEquals(null, tracker.mediatypes.get(0));
-       assertEquals(null, tracker.modifiedAts.get(0));
-
-       // Tar file starts with the directory name
-       assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, US_ASCII));
-    }
-    
-    @Test
-    public void testSvgzParsing() throws Exception {
-        Parser parser = new AutoDetectParser(); // Should auto-detect!
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = GzipParserTest.class.getResourceAsStream(
-                "/test-documents/testSVG.svgz")) {
-            parser.parse(stream, handler, metadata, recursingContext);
-        }
-
-        assertEquals("application/gzip", metadata.get(Metadata.CONTENT_TYPE));
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing gzip files.
+ */
+public class GzipParserTest extends AbstractPkgTest {
+
+    /**
+     * Parses the sample tgz archive with the inherited recursing context
+     *  and checks the reported content type, plus the entry names that
+     *  show up in the extracted text.
+     */
+    @Test
+    public void testGzipParsing() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = GzipParserTest.class.getResourceAsStream(
+                "/test-documents/test-documents.tgz")) {
+            parser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertEquals("application/gzip", metadata.get(Metadata.CONTENT_TYPE));
+        String content = handler.toString();
+        assertContains("test-documents/testEXCEL.xls", content);
+        assertContains("test-documents/testHTML.html", content);
+        assertContains("test-documents/testOpenOffice2.odt", content);
+        assertContains("test-documents/testPDF.pdf", content);
+        assertContains("test-documents/testPPT.ppt", content);
+        assertContains("test-documents/testRTF.rtf", content);
+        assertContains("test-documents/testTXT.txt", content);
+        assertContains("test-documents/testWORD.doc", content);
+        assertContains("test-documents/testXML.xml", content);
+    }
+
+    /**
+     * Tests that the ParseContext parser is correctly
+     *  fired for the embedded entry. The tracking parser is expected
+     *  to be called exactly once, with the tar stream as its input.
+     */
+    @Test
+    public void testEmbedded() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        // Load the fixture through this test's own class, as in the other
+        //  tests here, rather than through an unrelated test class
+        try (InputStream stream = GzipParserTest.class.getResourceAsStream(
+                "/test-documents/test-documents.tgz")) {
+            parser.parse(stream, handler, metadata, trackingContext);
+        }
+
+        // Should find a single entry, for the (compressed) tar file
+        assertEquals(1, tracker.filenames.size());
+        assertEquals(1, tracker.mediatypes.size());
+        assertEquals(1, tracker.modifiedAts.size());
+
+        // No name, type or modified date is expected for that entry
+        assertEquals(null, tracker.filenames.get(0));
+        assertEquals(null, tracker.mediatypes.get(0));
+        assertEquals(null, tracker.modifiedAts.get(0));
+
+        // Tar file starts with the directory name
+        assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, US_ASCII));
+    }
+
+    /**
+     * Checks that a gzip-compressed SVG file is reported with the
+     *  gzip content type.
+     */
+    @Test
+    public void testSvgzParsing() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = GzipParserTest.class.getResourceAsStream(
+                "/test-documents/testSVG.svgz")) {
+            parser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertEquals("application/gzip", metadata.get(Metadata.CONTENT_TYPE));
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
index 95126ed..35ab265 100644
--- a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
+++ b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
@@ -1,105 +1,105 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.pkg;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-/**
- * Test case for parsing tar files.
- */
-public class TarParserTest extends AbstractPkgTest {
-
-    @Test
-    public void testTarParsing() throws Exception {
-        Parser parser = new AutoDetectParser(); // Should auto-detect!
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = TarParserTest.class.getResourceAsStream(
-                "/test-documents/test-documents.tar")) {
-            parser.parse(stream, handler, metadata, recursingContext);
-        }
-
-        assertEquals("application/x-tar", metadata.get(Metadata.CONTENT_TYPE));
-        String content = handler.toString();
-        assertContains("test-documents/testEXCEL.xls", content);
-        assertContains("test-documents/testHTML.html", content);
-        assertContains("test-documents/testOpenOffice2.odt", content);
-        assertContains("test-documents/testPDF.pdf", content);
-        assertContains("test-documents/testPPT.ppt", content);
-        assertContains("test-documents/testRTF.rtf", content);
-        assertContains("test-documents/testTXT.txt", content);
-        assertContains("test-documents/testWORD.doc", content);
-        assertContains("test-documents/testXML.xml", content);
-    }
-
-    /**
-     * Tests that the ParseContext parser is correctly
-     *  fired for all the embedded entries.
-     */
-    @Test
-    public void testEmbedded() throws Exception {
-       Parser parser = new AutoDetectParser(); // Should auto-detect!
-       ContentHandler handler = new BodyContentHandler();
-       Metadata metadata = new Metadata();
-
-        try (InputStream stream = ZipParserTest.class.getResourceAsStream(
-                "/test-documents/test-documents.tar")) {
-            parser.parse(stream, handler, metadata, trackingContext);
-        }
-       
-       // Should have found all 9 documents, but not the directory
-       assertEquals(9, tracker.filenames.size());
-       assertEquals(9, tracker.mediatypes.size());
-       assertEquals(9, tracker.modifiedAts.size());
-       
-       // Should have names but not content types, as tar doesn't
-       //  store the content types
-       assertEquals("test-documents/testEXCEL.xls", tracker.filenames.get(0));
-       assertEquals("test-documents/testHTML.html", tracker.filenames.get(1));
-       assertEquals("test-documents/testOpenOffice2.odt", tracker.filenames.get(2));
-       assertEquals("test-documents/testPDF.pdf", tracker.filenames.get(3));
-       assertEquals("test-documents/testPPT.ppt", tracker.filenames.get(4));
-       assertEquals("test-documents/testRTF.rtf", tracker.filenames.get(5));
-       assertEquals("test-documents/testTXT.txt", tracker.filenames.get(6));
-       assertEquals("test-documents/testWORD.doc", tracker.filenames.get(7));
-       assertEquals("test-documents/testXML.xml", tracker.filenames.get(8));
-       
-       for(String type : tracker.mediatypes) {
-          assertNull(type);
-       }
-       for(String crt : tracker.createdAts) {
-           assertNull(crt);
-       }
-       for(String mod : tracker.modifiedAts) {
-           assertNotNull(mod);
-           assertTrue("Modified at " + mod, mod.startsWith("20"));
-       }
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing tar files.
+ */
+public class TarParserTest extends AbstractPkgTest {
+
+    /**
+     * Parses the sample tar archive with the inherited recursing context
+     *  and checks the reported content type, plus the entry names that
+     *  show up in the extracted text.
+     */
+    @Test
+    public void testTarParsing() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = TarParserTest.class.getResourceAsStream(
+                "/test-documents/test-documents.tar")) {
+            parser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertEquals("application/x-tar", metadata.get(Metadata.CONTENT_TYPE));
+        String content = handler.toString();
+        assertContains("test-documents/testEXCEL.xls", content);
+        assertContains("test-documents/testHTML.html", content);
+        assertContains("test-documents/testOpenOffice2.odt", content);
+        assertContains("test-documents/testPDF.pdf", content);
+        assertContains("test-documents/testPPT.ppt", content);
+        assertContains("test-documents/testRTF.rtf", content);
+        assertContains("test-documents/testTXT.txt", content);
+        assertContains("test-documents/testWORD.doc", content);
+        assertContains("test-documents/testXML.xml", content);
+    }
+
+    /**
+     * Tests that the ParseContext parser is correctly
+     *  fired for all the embedded entries, and that each entry arrives
+     *  with a name and a modified date but no content type or created date.
+     */
+    @Test
+    public void testEmbedded() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        // Load the fixture through this test's own class, as in
+        //  testTarParsing, rather than through an unrelated test class
+        try (InputStream stream = TarParserTest.class.getResourceAsStream(
+                "/test-documents/test-documents.tar")) {
+            parser.parse(stream, handler, metadata, trackingContext);
+        }
+
+        // Should have found all 9 documents, but not the directory
+        assertEquals(9, tracker.filenames.size());
+        assertEquals(9, tracker.mediatypes.size());
+        assertEquals(9, tracker.modifiedAts.size());
+
+        // Should have names but not content types, as tar doesn't
+        //  store the content types
+        assertEquals("test-documents/testEXCEL.xls", tracker.filenames.get(0));
+        assertEquals("test-documents/testHTML.html", tracker.filenames.get(1));
+        assertEquals("test-documents/testOpenOffice2.odt", tracker.filenames.get(2));
+        assertEquals("test-documents/testPDF.pdf", tracker.filenames.get(3));
+        assertEquals("test-documents/testPPT.ppt", tracker.filenames.get(4));
+        assertEquals("test-documents/testRTF.rtf", tracker.filenames.get(5));
+        assertEquals("test-documents/testTXT.txt", tracker.filenames.get(6));
+        assertEquals("test-documents/testWORD.doc", tracker.filenames.get(7));
+        assertEquals("test-documents/testXML.xml", tracker.filenames.get(8));
+
+        for (String type : tracker.mediatypes) {
+            assertNull(type);
+        }
+        for (String crt : tracker.createdAts) {
+            assertNull(crt);
+        }
+        // Every entry must carry a modified date starting with "20"
+        for (String mod : tracker.modifiedAts) {
+            assertNotNull(mod);
+            assertTrue("Modified at " + mod, mod.startsWith("20"));
+        }
+    }
+}