You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 11:11:07 UTC
[01/39] tika git commit: Convert new lines from windows to unix
Repository: tika
Updated Branches:
refs/heads/2.x dd3c2a486 -> c7a6bcac4
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
index d1e1463..ee9a98b 100644
--- a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
+++ b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
@@ -1,485 +1,485 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mail;
-
-import static java.nio.charset.StandardCharsets.US_ASCII;
-import static org.junit.Assert.*;
-import static org.junit.Assume.assumeTrue;
-import static org.mockito.Matchers.any;
-import static org.mockito.Matchers.eq;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.never;
-import static org.mockito.Mockito.times;
-import static org.mockito.Mockito.verify;
-
-import java.io.ByteArrayInputStream;
-import java.io.File;
-import java.io.InputStream;
-import java.nio.charset.StandardCharsets;
-import java.text.DateFormat;
-import java.text.DateFormatSymbols;
-import java.text.SimpleDateFormat;
-import java.util.Date;
-import java.util.Locale;
-
-import org.apache.james.mime4j.stream.MimeConfig;
-import org.apache.tika.TikaTest;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.ContainerExtractor;
-import org.apache.tika.extractor.ParserContainerExtractor;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.PasswordProvider;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.junit.Test;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.helpers.DefaultHandler;
-
-public class RFC822ParserTest extends TikaTest {
-
- private static InputStream getStream(String name) {
- InputStream stream = Thread.currentThread().getContextClassLoader()
- .getResourceAsStream(name);
- assertNotNull("Test file not found " + name, stream);
- return stream;
- }
-
- @Test
- public void testSimple() {
- Parser parser = new RFC822Parser();
- Metadata metadata = new Metadata();
- InputStream stream = getStream("test-documents/testRFC822");
- ContentHandler handler = mock(DefaultHandler.class);
-
- try {
- parser.parse(stream, handler, metadata, new ParseContext());
- verify(handler).startDocument();
- //just one body
- verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
- verify(handler).endElement(XHTMLContentHandler.XHTML, "p", "p");
- //no multi-part body parts
- verify(handler, never()).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class));
- verify(handler, never()).endElement(XHTMLContentHandler.XHTML, "div", "div");
- verify(handler).endDocument();
- //note no leading spaces, and no quotes
- assertEquals("Julien Nioche (JIRA) <ji...@apache.org>", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed",
- metadata.get(TikaCoreProperties.TITLE));
- assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed",
- metadata.get(Metadata.SUBJECT));
- } catch (Exception e) {
- fail("Exception thrown: " + e.getMessage());
- }
- }
-
- @Test
- public void testMultipart() {
- Parser parser = new RFC822Parser();
- Metadata metadata = new Metadata();
- InputStream stream = getStream("test-documents/testRFC822-multipart");
- ContentHandler handler = mock(XHTMLContentHandler.class);
-
- try {
- parser.parse(stream, handler, metadata, new ParseContext());
- verify(handler).startDocument();
- int bodyExpectedTimes = 4, multipackExpectedTimes = 5;
- verify(handler, times(multipackExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
- verify(handler, times(multipackExpectedTimes)).endElement(XHTMLContentHandler.XHTML, "p", "p");
- verify(handler).endDocument();
-
- } catch (Exception e) {
- fail("Exception thrown: " + e.getMessage());
- }
-
- //repeat, this time looking at content
- parser = new RFC822Parser();
- metadata = new Metadata();
- stream = getStream("test-documents/testRFC822-multipart");
- handler = new BodyContentHandler();
- try {
- parser.parse(stream, handler, metadata, new ParseContext());
- //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
- String bodyText = handler.toString();
- assertTrue(bodyText.contains("body 1"));
- assertTrue(bodyText.contains("body 2"));
- assertFalse(bodyText.contains("R0lGODlhNgE8AMQAA")); //part of encoded gif
- } catch (Exception e) {
- fail("Exception thrown: " + e.getMessage());
- }
- }
-
- @Test
- public void testQuotedPrintable() {
- Parser parser = new RFC822Parser();
- Metadata metadata = new Metadata();
- InputStream stream = getStream("test-documents/testRFC822_quoted");
- ContentHandler handler = new BodyContentHandler();
-
- try {
- parser.parse(stream, handler, metadata, new ParseContext());
- //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
- String bodyText = handler.toString();
- assertTrue(bodyText.contains("D\u00FCsseldorf has non-ascii."));
- assertTrue(bodyText.contains("Lines can be split like this."));
- assertTrue(bodyText.contains("Spaces at the end of a line \r\nmust be encoded.\r\n"));
- assertFalse(bodyText.contains("=")); //there should be no escape sequences
- } catch (Exception e) {
- fail("Exception thrown: " + e.getMessage());
- }
- }
-
- @Test
- public void testBase64() {
- Parser parser = new RFC822Parser();
- Metadata metadata = new Metadata();
- InputStream stream = getStream("test-documents/testRFC822_base64");
- ContentHandler handler = new BodyContentHandler();
-
- try {
- parser.parse(stream, handler, metadata, new ParseContext());
- //tests correct decoding of base64 text, including ISO-8859-1 bytes into Unicode
- assertContains("Here is some text, with international characters, voil\u00E0!", handler.toString());
- } catch (Exception e) {
- fail("Exception thrown: " + e.getMessage());
- }
- }
-
- @Test
- public void testI18NHeaders() {
- Parser parser = new RFC822Parser();
- Metadata metadata = new Metadata();
- InputStream stream = getStream("test-documents/testRFC822_i18nheaders");
- ContentHandler handler = mock(DefaultHandler.class);
-
- try {
- parser.parse(stream, handler, metadata, new ParseContext());
- //tests correct decoding of internationalized headers, both
- //quoted-printable (Q) and Base64 (B).
- assertEquals("Keld J\u00F8rn Simonsen <ke...@dkuug.dk>",
- metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("If you can read this you understand the example.",
- metadata.get(TikaCoreProperties.TITLE));
- assertEquals("If you can read this you understand the example.",
- metadata.get(Metadata.SUBJECT));
- } catch (Exception e) {
- fail("Exception thrown: " + e.getMessage());
- }
- }
-
- /**
- * The from isn't in the usual form.
- * See TIKA-618
- */
- @Test
- public void testUnusualFromAddress() throws Exception {
- Parser parser = new RFC822Parser();
- Metadata metadata = new Metadata();
- InputStream stream = getStream("test-documents/testRFC822_oddfrom");
- ContentHandler handler = mock(DefaultHandler.class);
-
- parser.parse(stream, handler, metadata, new ParseContext());
- assertEquals("Saved by Windows Internet Explorer 7",
- metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Air Permit Programs | Air & Radiation | US EPA",
- metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Air Permit Programs | Air & Radiation | US EPA",
- metadata.get(Metadata.SUBJECT));
- }
-
- /**
- * Test for TIKA-640, increase header max beyond 10k bytes
- */
- @Test
- public void testLongHeader() throws Exception {
- StringBuilder inputBuilder = new StringBuilder();
- for (int i = 0; i < 2000; ++i) {
- inputBuilder.append( //len > 50
- "really really really really really really long name ");
- }
- String name = inputBuilder.toString();
- byte[] data = ("From: " + name + "\r\n\r\n").getBytes(US_ASCII);
-
- Parser parser = new RFC822Parser();
- ContentHandler handler = new DefaultHandler();
- Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
-
- try {
- parser.parse(
- new ByteArrayInputStream(data), handler, metadata, context);
- fail();
- } catch (TikaException expected) {
- }
-
- MimeConfig config = new MimeConfig();
- config.setMaxHeaderLen(-1);
- config.setMaxLineLen(-1);
- context.set(MimeConfig.class, config);
- parser.parse(
- new ByteArrayInputStream(data), handler, metadata, context);
- assertEquals(name.trim(), metadata.get(TikaCoreProperties.CREATOR));
- }
-
- /**
- * Test for TIKA-678 - not all headers may be present
- */
- @Test
- public void testSomeMissingHeaders() throws Exception {
- Parser parser = new RFC822Parser();
- Metadata metadata = new Metadata();
- InputStream stream = getStream("test-documents/testRFC822-limitedheaders");
- ContentHandler handler = new BodyContentHandler();
-
- parser.parse(stream, handler, metadata, new ParseContext());
- assertEquals(true, metadata.isMultiValued(TikaCoreProperties.CREATOR));
- assertEquals("xyz", metadata.getValues(TikaCoreProperties.CREATOR)[0]);
- assertEquals("abc", metadata.getValues(TikaCoreProperties.CREATOR)[1]);
- assertEquals(true, metadata.isMultiValued(Metadata.MESSAGE_FROM));
- assertEquals("xyz", metadata.getValues(Metadata.MESSAGE_FROM)[0]);
- assertEquals("abc", metadata.getValues(Metadata.MESSAGE_FROM)[1]);
- assertEquals(true, metadata.isMultiValued(Metadata.MESSAGE_TO));
- assertEquals("abc", metadata.getValues(Metadata.MESSAGE_TO)[0]);
- assertEquals("def", metadata.getValues(Metadata.MESSAGE_TO)[1]);
- assertEquals("abcd", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("abcd", metadata.get(Metadata.SUBJECT));
- assertContains("bar biz bat", handler.toString());
- }
-
- /**
- * Test TIKA-1028 - If the mail contains an encrypted attachment (or
- * an attachment that others triggers an error), parsing should carry
- * on for the remainder regardless
- */
- @Test
- public void testEncryptedZipAttachment() throws Exception {
- Parser parser = new RFC822Parser();
- Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
- InputStream stream = getStream("test-documents/testRFC822_encrypted_zip");
- ContentHandler handler = new BodyContentHandler();
- parser.parse(stream, handler, metadata, context);
-
- // Check we go the metadata
- assertEquals("Juha Haaga <ju...@gmail.com>", metadata.get(Metadata.MESSAGE_FROM));
- assertEquals("Test mail for Tika", metadata.get(TikaCoreProperties.TITLE));
-
- // Check we got the message text, for both Plain Text and HTML
- assertContains("Includes encrypted zip file", handler.toString());
- assertContains("password is \"test\".", handler.toString());
- assertContains("This is the Plain Text part", handler.toString());
- assertContains("This is the HTML part", handler.toString());
-
- // We won't get the contents of the zip file, but we will get the name
- assertContains("text.txt", handler.toString());
- assertNotContained("ENCRYPTED ZIP FILES", handler.toString());
-
- // Try again, this time with the password supplied
- // Check that we also get the zip's contents as well
- context.set(PasswordProvider.class, new PasswordProvider() {
- public String getPassword(Metadata metadata) {
- return "test";
- }
- });
- stream = getStream("test-documents/testRFC822_encrypted_zip");
- handler = new BodyContentHandler();
- parser.parse(stream, handler, metadata, context);
-
- assertContains("Includes encrypted zip file", handler.toString());
- assertContains("password is \"test\".", handler.toString());
- assertContains("This is the Plain Text part", handler.toString());
- assertContains("This is the HTML part", handler.toString());
-
- // We do get the name of the file in the encrypted zip file
- assertContains("text.txt", handler.toString());
-
- // TODO Upgrade to a version of Commons Compress with Encryption
- // support, then verify we get the contents of the text file
- // held within the encrypted zip
- assumeTrue(false); // No Zip Encryption support yet
- assertContains("TEST DATA FOR TIKA.", handler.toString());
- assertContains("ENCRYPTED ZIP FILES", handler.toString());
- assertContains("TIKA-1028", handler.toString());
- }
-
- /**
- * Test TIKA-1028 - Ensure we can get the contents of an
- * un-encrypted zip file
- */
- @Test
- public void testNormalZipAttachment() throws Exception {
- Parser parser = new RFC822Parser();
- Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
- InputStream stream = getStream("test-documents/testRFC822_normal_zip");
- ContentHandler handler = new BodyContentHandler();
- parser.parse(stream, handler, metadata, context);
-
- // Check we go the metadata
- assertEquals("Juha Haaga <ju...@gmail.com>", metadata.get(Metadata.MESSAGE_FROM));
- assertEquals("Test mail for Tika", metadata.get(TikaCoreProperties.TITLE));
-
- // Check we got the message text, for both Plain Text and HTML
- assertContains("Includes a normal, unencrypted zip file", handler.toString());
- assertContains("This is the Plain Text part", handler.toString());
- assertContains("This is the HTML part", handler.toString());
-
- // We get both name and contents of the zip file's contents
- assertContains("text.txt", handler.toString());
- assertContains("TEST DATA FOR TIKA.", handler.toString());
- assertContains("This is text inside an unencrypted zip file", handler.toString());
- assertContains("TIKA-1028", handler.toString());
- }
-
- /**
- * TIKA-1222 When requested, ensure that the various attachments of
- * the mail come through properly as embedded resources
- */
- @Test
- public void testGetAttachmentsAsEmbeddedResources() throws Exception {
- TrackingHandler tracker = new TrackingHandler();
- ContainerExtractor ex = new ParserContainerExtractor();
- try (TikaInputStream tis = TikaInputStream.get(getStream("test-documents/testRFC822-multipart"))) {
- assertEquals(true, ex.isSupported(tis));
- ex.extract(tis, ex, tracker);
- }
-
- // Check we found all 3 parts
- assertEquals(3, tracker.filenames.size());
- assertEquals(3, tracker.mediaTypes.size());
-
- // No filenames available
- assertEquals(null, tracker.filenames.get(0));
- assertEquals(null, tracker.filenames.get(1));
- assertEquals(null, tracker.filenames.get(2));
- // Types are available
- assertEquals(MediaType.TEXT_PLAIN, tracker.mediaTypes.get(0));
- assertEquals(MediaType.TEXT_HTML, tracker.mediaTypes.get(1));
- assertEquals(MediaType.image("gif"), tracker.mediaTypes.get(2));
- }
-
- @Test
- public void testDetection() throws Exception {
- //test simple text file
- XMLResult r = getXML("testRFC822_date_utf8");
- assertEquals("message/rfc822", r.metadata.get(Metadata.CONTENT_TYPE));
-
- //test without extension
- r = getXML("testRFC822_eml");
- assertEquals("message/rfc822", r.metadata.get(Metadata.CONTENT_TYPE));
- }
-
- @Test
- public void testDates() throws Exception {
- //tests non-standard dates that mime4j can't parse
- XMLResult r = getXML("testRFC822_date_utf8");
- assertEquals("2016-05-16T08:30:32Z", r.metadata.get(TikaCoreProperties.CREATED));
-
- r = getXML("testRFC822_eml");
- assertEquals("2016-05-16T08:30:32Z", r.metadata.get(TikaCoreProperties.CREATED));
-
-
- String expected = "2016-05-15T01:32:00Z";
-
- for (String dateString : new String[]{
- "Sun, 15 May 2016 01:32:00 UTC", //make sure this test basically works
- "Sun, 15 May 2016 01:32:00", //no timezone
- "Sunday, May 15 2016 1:32 AM",
- "May 15 2016 1:32am",
- "May 15 2016 1:32 am",
- "2016-05-15 01:32:00",
- " Sun, 15 May 2016 3:32:00 +0200",//format correctly handled by mime4j if no leading whitespace
- " Sun, 14 May 2016 20:32:00 EST",
- }) {
- testDate(dateString, expected);
- }
-
- //now try days without times
- expected = "2016-05-15T12:00:00Z";
- for (String dateString : new String[]{
- "May 15, 2016",
- "Sun, 15 May 2016",
- "15 May 2016",
- }) {
- testDate(dateString, expected);
- }
- }
-
- @Test
- public void testTrickyDates() throws Exception {
- DateFormat df = new SimpleDateFormat("yyyy-MM-dd", new DateFormatSymbols(Locale.US));
- //make sure there are no mis-parses of e.g. 90 = year 90 A.D, not 1990
- Date date1980 = df.parse("1980-01-01");
- for (String dateString : new String[] {
- "Mon, 29 Jan 96 14:02 GMT",
- "7/20/95 1:12pm",
- "08/14/2000 12:48 AM",
- "06/24/2008, Tuesday, 11 AM",
- "11/14/08",
- "12/02/1996",
- "96/12/02",
- }) {
- Date parsedDate = getDate(dateString);
- if (parsedDate != null) {
- assertTrue("date must be after 1980:"+dateString, parsedDate.getTime() > date1980.getTime());
- }
- }
- //TODO: mime4j misparses these to pre 1980 dates
- //"Wed, 27 Dec 95 11:20:40 EST",
- //"26 Aug 00 11:14:52 EDT"
- //
- //We are still misparsing: 8/1/03 to a pre 1980 date
-
- }
-
- private void testDate(String dateString, String expected) throws Exception {
- Date parsedDate = getDate(dateString);
- assertNotNull("couldn't parse " + dateString, parsedDate);
- DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'",
- new DateFormatSymbols(Locale.US));
- String parsedDateString = df.format(parsedDate);
- assertEquals("failed to match: "+dateString, expected, parsedDateString);
- }
-
- private Date getDate(String dateString) throws Exception {
- String mail = "From: dev@tika.apache.org\n"+
- "Date: "+dateString+"\n";
- Parser p = new RFC822Parser();
- Metadata m = new Metadata();
- try (InputStream is = TikaInputStream.get(mail.getBytes(StandardCharsets.UTF_8))) {
- p.parse(is, new DefaultHandler(), m, new ParseContext());
- }
- return m.getDate(TikaCoreProperties.CREATED);
- }
-
- @Test
- public void testMultipleSubjects() throws Exception {
- //adapted from govdocs1 303710.txt
- String s = "From: Shawn Jones [chiroshawn@yahoo.com]\n" +
- "Subject: 2006N-3502\n" +
- "Subject: I Urge You to Require Notice of Mercury";
- Parser p = new RFC822Parser();
- Metadata m = new Metadata();
- p.parse(TikaInputStream.get(s.getBytes(StandardCharsets.UTF_8)), new DefaultHandler(), m, new ParseContext());
- assertEquals("I Urge You to Require Notice of Mercury", m.get(TikaCoreProperties.TITLE));
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mail;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static org.junit.Assert.*;
+import static org.junit.Assume.assumeTrue;
+import static org.mockito.Matchers.any;
+import static org.mockito.Matchers.eq;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.never;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.text.DateFormat;
+import java.text.DateFormatSymbols;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.Locale;
+
+import org.apache.james.mime4j.stream.MimeConfig;
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.junit.Test;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class RFC822ParserTest extends TikaTest {
+
+ private static InputStream getStream(String name) {
+ InputStream stream = Thread.currentThread().getContextClassLoader()
+ .getResourceAsStream(name);
+ assertNotNull("Test file not found " + name, stream);
+ return stream;
+ }
+
+ @Test
+ public void testSimple() {
+ Parser parser = new RFC822Parser();
+ Metadata metadata = new Metadata();
+ InputStream stream = getStream("test-documents/testRFC822");
+ ContentHandler handler = mock(DefaultHandler.class);
+
+ try {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ verify(handler).startDocument();
+ //just one body
+ verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
+ verify(handler).endElement(XHTMLContentHandler.XHTML, "p", "p");
+ //no multi-part body parts
+ verify(handler, never()).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class));
+ verify(handler, never()).endElement(XHTMLContentHandler.XHTML, "div", "div");
+ verify(handler).endDocument();
+ //note no leading spaces, and no quotes
+ assertEquals("Julien Nioche (JIRA) <ji...@apache.org>", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed",
+ metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed",
+ metadata.get(Metadata.SUBJECT));
+ } catch (Exception e) {
+ fail("Exception thrown: " + e.getMessage());
+ }
+ }
+
+ @Test
+ public void testMultipart() {
+ Parser parser = new RFC822Parser();
+ Metadata metadata = new Metadata();
+ InputStream stream = getStream("test-documents/testRFC822-multipart");
+ ContentHandler handler = mock(XHTMLContentHandler.class);
+
+ try {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ verify(handler).startDocument();
+ int bodyExpectedTimes = 4, multipackExpectedTimes = 5;
+ verify(handler, times(multipackExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
+ verify(handler, times(multipackExpectedTimes)).endElement(XHTMLContentHandler.XHTML, "p", "p");
+ verify(handler).endDocument();
+
+ } catch (Exception e) {
+ fail("Exception thrown: " + e.getMessage());
+ }
+
+ //repeat, this time looking at content
+ parser = new RFC822Parser();
+ metadata = new Metadata();
+ stream = getStream("test-documents/testRFC822-multipart");
+ handler = new BodyContentHandler();
+ try {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
+ String bodyText = handler.toString();
+ assertTrue(bodyText.contains("body 1"));
+ assertTrue(bodyText.contains("body 2"));
+ assertFalse(bodyText.contains("R0lGODlhNgE8AMQAA")); //part of encoded gif
+ } catch (Exception e) {
+ fail("Exception thrown: " + e.getMessage());
+ }
+ }
+
+ @Test
+ public void testQuotedPrintable() {
+ Parser parser = new RFC822Parser();
+ Metadata metadata = new Metadata();
+ InputStream stream = getStream("test-documents/testRFC822_quoted");
+ ContentHandler handler = new BodyContentHandler();
+
+ try {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
+ String bodyText = handler.toString();
+ assertTrue(bodyText.contains("D\u00FCsseldorf has non-ascii."));
+ assertTrue(bodyText.contains("Lines can be split like this."));
+ assertTrue(bodyText.contains("Spaces at the end of a line \r\nmust be encoded.\r\n"));
+ assertFalse(bodyText.contains("=")); //there should be no escape sequences
+ } catch (Exception e) {
+ fail("Exception thrown: " + e.getMessage());
+ }
+ }
+
+ @Test
+ public void testBase64() {
+ Parser parser = new RFC822Parser();
+ Metadata metadata = new Metadata();
+ InputStream stream = getStream("test-documents/testRFC822_base64");
+ ContentHandler handler = new BodyContentHandler();
+
+ try {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ //tests correct decoding of base64 text, including ISO-8859-1 bytes into Unicode
+ assertContains("Here is some text, with international characters, voil\u00E0!", handler.toString());
+ } catch (Exception e) {
+ fail("Exception thrown: " + e.getMessage());
+ }
+ }
+
+ @Test
+ public void testI18NHeaders() {
+ Parser parser = new RFC822Parser();
+ Metadata metadata = new Metadata();
+ InputStream stream = getStream("test-documents/testRFC822_i18nheaders");
+ ContentHandler handler = mock(DefaultHandler.class);
+
+ try {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ //tests correct decoding of internationalized headers, both
+ //quoted-printable (Q) and Base64 (B).
+ assertEquals("Keld J\u00F8rn Simonsen <ke...@dkuug.dk>",
+ metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("If you can read this you understand the example.",
+ metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("If you can read this you understand the example.",
+ metadata.get(Metadata.SUBJECT));
+ } catch (Exception e) {
+ fail("Exception thrown: " + e.getMessage());
+ }
+ }
+
+ /**
+ * The from isn't in the usual form.
+ * See TIKA-618
+ */
+ @Test
+ public void testUnusualFromAddress() throws Exception {
+ Parser parser = new RFC822Parser();
+ Metadata metadata = new Metadata();
+ InputStream stream = getStream("test-documents/testRFC822_oddfrom");
+ ContentHandler handler = mock(DefaultHandler.class);
+
+ parser.parse(stream, handler, metadata, new ParseContext());
+ assertEquals("Saved by Windows Internet Explorer 7",
+ metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Air Permit Programs | Air & Radiation | US EPA",
+ metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Air Permit Programs | Air & Radiation | US EPA",
+ metadata.get(Metadata.SUBJECT));
+ }
+
+ /**
+ * Test for TIKA-640, increase header max beyond 10k bytes
+ */
+ @Test
+ public void testLongHeader() throws Exception {
+ StringBuilder inputBuilder = new StringBuilder();
+ for (int i = 0; i < 2000; ++i) {
+ inputBuilder.append( //len > 50
+ "really really really really really really long name ");
+ }
+ String name = inputBuilder.toString();
+ byte[] data = ("From: " + name + "\r\n\r\n").getBytes(US_ASCII);
+
+ Parser parser = new RFC822Parser();
+ ContentHandler handler = new DefaultHandler();
+ Metadata metadata = new Metadata();
+ ParseContext context = new ParseContext();
+
+ try {
+ parser.parse(
+ new ByteArrayInputStream(data), handler, metadata, context);
+ fail();
+ } catch (TikaException expected) {
+ }
+
+ MimeConfig config = new MimeConfig();
+ config.setMaxHeaderLen(-1);
+ config.setMaxLineLen(-1);
+ context.set(MimeConfig.class, config);
+ parser.parse(
+ new ByteArrayInputStream(data), handler, metadata, context);
+ assertEquals(name.trim(), metadata.get(TikaCoreProperties.CREATOR));
+ }
+
+ /**
+ * Test for TIKA-678 - not all headers may be present
+ */
+ @Test
+ public void testSomeMissingHeaders() throws Exception {
+ Parser parser = new RFC822Parser();
+ Metadata metadata = new Metadata();
+ InputStream stream = getStream("test-documents/testRFC822-limitedheaders");
+ ContentHandler handler = new BodyContentHandler();
+
+ parser.parse(stream, handler, metadata, new ParseContext());
+ assertEquals(true, metadata.isMultiValued(TikaCoreProperties.CREATOR));
+ assertEquals("xyz", metadata.getValues(TikaCoreProperties.CREATOR)[0]);
+ assertEquals("abc", metadata.getValues(TikaCoreProperties.CREATOR)[1]);
+ assertEquals(true, metadata.isMultiValued(Metadata.MESSAGE_FROM));
+ assertEquals("xyz", metadata.getValues(Metadata.MESSAGE_FROM)[0]);
+ assertEquals("abc", metadata.getValues(Metadata.MESSAGE_FROM)[1]);
+ assertEquals(true, metadata.isMultiValued(Metadata.MESSAGE_TO));
+ assertEquals("abc", metadata.getValues(Metadata.MESSAGE_TO)[0]);
+ assertEquals("def", metadata.getValues(Metadata.MESSAGE_TO)[1]);
+ assertEquals("abcd", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("abcd", metadata.get(Metadata.SUBJECT));
+ assertContains("bar biz bat", handler.toString());
+ }
+
+    /**
+     * Test TIKA-1028 - If the mail contains an encrypted attachment (or
+     * an attachment that otherwise triggers an error), parsing should carry
+     * on for the remainder regardless.
+     * <p>
+     * The message is parsed twice: first without a password, then with a
+     * {@link PasswordProvider} registered on the {@link ParseContext}.
+     */
+    @Test
+    public void testEncryptedZipAttachment() throws Exception {
+        Parser parser = new RFC822Parser();
+        Metadata metadata = new Metadata();
+        ParseContext context = new ParseContext();
+        ContentHandler handler = new BodyContentHandler();
+        // Close the test resource even if parsing throws
+        try (InputStream stream = getStream("test-documents/testRFC822_encrypted_zip")) {
+            parser.parse(stream, handler, metadata, context);
+        }
+        String content = handler.toString();
+
+        // Check we got the metadata
+        assertEquals("Juha Haaga <ju...@gmail.com>", metadata.get(Metadata.MESSAGE_FROM));
+        assertEquals("Test mail for Tika", metadata.get(TikaCoreProperties.TITLE));
+
+        // Check we got the message text, for both Plain Text and HTML
+        assertContains("Includes encrypted zip file", content);
+        assertContains("password is \"test\".", content);
+        assertContains("This is the Plain Text part", content);
+        assertContains("This is the HTML part", content);
+
+        // We won't get the contents of the zip file, but we will get the name
+        assertContains("text.txt", content);
+        assertNotContained("ENCRYPTED ZIP FILES", content);
+
+        // Try again, this time with the password supplied
+        // Check that we also get the zip's contents as well
+        context.set(PasswordProvider.class, new PasswordProvider() {
+            @Override
+            public String getPassword(Metadata metadata) {
+                return "test";
+            }
+        });
+        // A fresh handler collects the second pass; the Metadata object is
+        // reused from the first pass and is not asserted on again below.
+        handler = new BodyContentHandler();
+        try (InputStream stream = getStream("test-documents/testRFC822_encrypted_zip")) {
+            parser.parse(stream, handler, metadata, context);
+        }
+        content = handler.toString();
+
+        assertContains("Includes encrypted zip file", content);
+        assertContains("password is \"test\".", content);
+        assertContains("This is the Plain Text part", content);
+        assertContains("This is the HTML part", content);
+
+        // We do get the name of the file in the encrypted zip file
+        assertContains("text.txt", content);
+
+        // TODO Upgrade to a version of Commons Compress with Encryption
+        //  support, then verify we get the contents of the text file
+        //  held within the encrypted zip
+        assumeTrue(false); // No Zip Encryption support yet
+        // Everything below is skipped until the assumption above is removed
+        assertContains("TEST DATA FOR TIKA.", content);
+        assertContains("ENCRYPTED ZIP FILES", content);
+        assertContains("TIKA-1028", content);
+    }
+
+    /**
+     * Test TIKA-1028 - Ensure we can get the contents of an
+     * un-encrypted zip file
+     */
+    @Test
+    public void testNormalZipAttachment() throws Exception {
+        Parser parser = new RFC822Parser();
+        Metadata metadata = new Metadata();
+        ParseContext context = new ParseContext();
+        ContentHandler handler = new BodyContentHandler();
+        // Close the test resource even if parsing throws
+        try (InputStream stream = getStream("test-documents/testRFC822_normal_zip")) {
+            parser.parse(stream, handler, metadata, context);
+        }
+        String content = handler.toString();
+
+        // Check we got the metadata
+        assertEquals("Juha Haaga <ju...@gmail.com>", metadata.get(Metadata.MESSAGE_FROM));
+        assertEquals("Test mail for Tika", metadata.get(TikaCoreProperties.TITLE));
+
+        // Check we got the message text, for both Plain Text and HTML
+        assertContains("Includes a normal, unencrypted zip file", content);
+        assertContains("This is the Plain Text part", content);
+        assertContains("This is the HTML part", content);
+
+        // We get both name and contents of the zip file's contents
+        assertContains("text.txt", content);
+        assertContains("TEST DATA FOR TIKA.", content);
+        assertContains("This is text inside an unencrypted zip file", content);
+        assertContains("TIKA-1028", content);
+    }
+
+    /**
+     * TIKA-1222 When requested, ensure that the various attachments of
+     * the mail come through properly as embedded resources
+     */
+    @Test
+    public void testGetAttachmentsAsEmbeddedResources() throws Exception {
+        TrackingHandler tracker = new TrackingHandler();
+        ContainerExtractor ex = new ParserContainerExtractor();
+        try (TikaInputStream tis = TikaInputStream.get(getStream("test-documents/testRFC822-multipart"))) {
+            assertTrue(ex.isSupported(tis));
+            // The extractor is passed as its own recursion argument
+            ex.extract(tis, ex, tracker);
+        }
+
+        // Check we found all 3 parts
+        assertEquals(3, tracker.filenames.size());
+        assertEquals(3, tracker.mediaTypes.size());
+
+        // No filenames available
+        assertNull(tracker.filenames.get(0));
+        assertNull(tracker.filenames.get(1));
+        assertNull(tracker.filenames.get(2));
+        // Types are available
+        assertEquals(MediaType.TEXT_PLAIN, tracker.mediaTypes.get(0));
+        assertEquals(MediaType.TEXT_HTML, tracker.mediaTypes.get(1));
+        assertEquals(MediaType.image("gif"), tracker.mediaTypes.get(2));
+    }
+
+    /**
+     * Both sample messages must be reported with the
+     * {@code message/rfc822} content type.
+     */
+    @Test
+    public void testDetection() throws Exception {
+        String[] resources = {
+                "testRFC822_date_utf8", // simple text file
+                "testRFC822_eml",       // file without extension
+        };
+        for (String resource : resources) {
+            XMLResult result = getXML(resource);
+            assertEquals("message/rfc822", result.metadata.get(Metadata.CONTENT_TYPE));
+        }
+    }
+
+    /**
+     * Checks the creation date extracted from two sample files and from a
+     * series of hand-written {@code Date:} header values.
+     */
+    @Test
+    public void testDates() throws Exception {
+        // Non-standard dates that mime4j can't parse
+        final String fromSampleFiles = "2016-05-16T08:30:32Z";
+        assertEquals(fromSampleFiles,
+                getXML("testRFC822_date_utf8").metadata.get(TikaCoreProperties.CREATED));
+        assertEquals(fromSampleFiles,
+                getXML("testRFC822_eml").metadata.get(TikaCoreProperties.CREATED));
+
+        // Header values that carry a time of day
+        String[] withTime = {
+                "Sun, 15 May 2016 01:32:00 UTC", //make sure this test basically works
+                "Sun, 15 May 2016 01:32:00", //no timezone
+                "Sunday, May 15 2016 1:32 AM",
+                "May 15 2016 1:32am",
+                "May 15 2016 1:32 am",
+                "2016-05-15 01:32:00",
+                " Sun, 15 May 2016 3:32:00 +0200",//format correctly handled by mime4j if no leading whitespace
+                " Sun, 14 May 2016 20:32:00 EST",
+        };
+        for (String headerValue : withTime) {
+            testDate(headerValue, "2016-05-15T01:32:00Z");
+        }
+
+        // Now try days without times
+        String[] dateOnly = {
+                "May 15, 2016",
+                "Sun, 15 May 2016",
+                "15 May 2016",
+        };
+        for (String headerValue : dateOnly) {
+            testDate(headerValue, "2016-05-15T12:00:00Z");
+        }
+    }
+
+    /**
+     * Guards against two-digit years being read literally (e.g. 90 taken as
+     * year 90 A.D. and not 1990): every date that parses must fall after
+     * 1980-01-01. Inputs that do not parse at all are tolerated.
+     */
+    @Test
+    public void testTrickyDates() throws Exception {
+        DateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd", new DateFormatSymbols(Locale.US));
+        long cutoffMillis = dayFormat.parse("1980-01-01").getTime();
+        String[] candidates = {
+                "Mon, 29 Jan 96 14:02 GMT",
+                "7/20/95 1:12pm",
+                "08/14/2000 12:48 AM",
+                "06/24/2008, Tuesday, 11 AM",
+                "11/14/08",
+                "12/02/1996",
+                "96/12/02",
+        };
+        for (String candidate : candidates) {
+            Date parsed = getDate(candidate);
+            if (parsed == null) {
+                // Nothing to check when the header could not be parsed
+                continue;
+            }
+            assertTrue("date must be after 1980:" + candidate, parsed.getTime() > cutoffMillis);
+        }
+        //TODO: mime4j misparses these to pre 1980 dates
+        //"Wed, 27 Dec 95 11:20:40 EST",
+        //"26 Aug 00 11:14:52 EDT"
+        //
+        //We are still misparsing: 8/1/03 to a pre 1980 date
+    }
+
+    /**
+     * Parses {@code dateString} as a mail {@code Date:} header and asserts that
+     * the result, rendered as {@code yyyy-MM-dd'T'HH:mm:ss'Z'}, equals
+     * {@code expected}.
+     *
+     * @param dateString raw header value to parse
+     * @param expected   expected rendering of the parsed date
+     */
+    private void testDate(String dateString, String expected) throws Exception {
+        Date actual = getDate(dateString);
+        assertNotNull("couldn't parse " + dateString, actual);
+        // NOTE(review): no time zone is set on this formatter, so it renders in
+        // the JVM default zone while appending a literal 'Z' -- confirm intended.
+        DateFormatSymbols usSymbols = new DateFormatSymbols(Locale.US);
+        DateFormat isoLike = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", usSymbols);
+        assertEquals("failed to match: " + dateString, expected, isoLike.format(actual));
+    }
+
+    /**
+     * Builds a two-header message whose {@code Date:} header is
+     * {@code dateString}, runs it through {@link RFC822Parser} and returns the
+     * {@link TikaCoreProperties#CREATED} value from the resulting metadata.
+     *
+     * @param dateString raw value to place in the {@code Date:} header
+     * @return whatever {@code Metadata.getDate} yields for the created property
+     */
+    private Date getDate(String dateString) throws Exception {
+        StringBuilder message = new StringBuilder();
+        message.append("From: dev@tika.apache.org\n");
+        message.append("Date: ").append(dateString).append("\n");
+        byte[] rawMessage = message.toString().getBytes(StandardCharsets.UTF_8);
+
+        Metadata extracted = new Metadata();
+        Parser mailParser = new RFC822Parser();
+        try (InputStream in = TikaInputStream.get(rawMessage)) {
+            // Only the metadata is of interest, so content events are discarded
+            mailParser.parse(in, new DefaultHandler(), extracted, new ParseContext());
+        }
+        return extracted.getDate(TikaCoreProperties.CREATED);
+    }
+
+    /**
+     * When a message carries two {@code Subject:} headers, the title reported
+     * in the metadata must be the value of the second one.
+     */
+    @Test
+    public void testMultipleSubjects() throws Exception {
+        //adapted from govdocs1 303710.txt
+        String s = "From: Shawn Jones [chiroshawn@yahoo.com]\n" +
+                "Subject: 2006N-3502\n" +
+                "Subject: I Urge You to Require Notice of Mercury";
+        Parser p = new RFC822Parser();
+        Metadata m = new Metadata();
+        // Close the stream once parsing is done, as getDate() does
+        try (InputStream is = TikaInputStream.get(s.getBytes(StandardCharsets.UTF_8))) {
+            p.parse(is, new DefaultHandler(), m, new ParseContext());
+        }
+        assertEquals("I Urge You to Require Notice of Mercury", m.get(TikaCoreProperties.TITLE));
+    }
+}
[24/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
index 6f13a54..9ca3595 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
@@ -1,913 +1,913 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.lzx;
-
-import java.math.BigInteger;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.core.ChmCommons.IntelState;
-import org.apache.tika.parser.chm.core.ChmCommons.LzxState;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.apache.tika.parser.chm.exception.ChmParsingException;
-
-/**
- * Decompresses a chm block. Depending on chm block type chooses most relevant
- * decompressing method. A chm block type can be as follows:</br> <li>UNDEFINED
- * - no action taken, i.e. skipping the block <li>VERBATIM <li>ALIGNED_OFFSET
- * <li>UNCOMPRESSED the most simplest In addition there are unknown types (4-7).
- * Currently relying on previous chm block these types changing according to the
- * previous chm block type. We need to invent more appropriate way to handle
- * such types.
- *
- */
-public class ChmLzxBlock {
- private int block_number;
- private long block_length;
- private ChmLzxState state;
- private byte[] content = null;
- private ChmSection chmSection = null;
- private int contentLength = 0;
-
- // trying to find solution for bad blocks ...
- private int previousBlockType = -1;
-
- public ChmLzxBlock(int blockNumber, byte[] dataSegment, long blockLength,
- ChmLzxBlock prevBlock) throws TikaException {
- try {
- if (validateConstructorParams(blockNumber, dataSegment, blockLength)) {
- setBlockNumber(blockNumber);
-
- if (prevBlock != null
- && prevBlock.getState().getBlockLength() > prevBlock
- .getState().getBlockRemaining())
- setChmSection(new ChmSection(dataSegment, prevBlock.getContent()));
- else
- setChmSection(new ChmSection(dataSegment));
-
- setBlockLength(blockLength);
-
- // ============================================
- // we need to take care of previous context
- // ============================================
- checkLzxBlock(prevBlock);
- if (prevBlock == null
- || blockLength < (int) getBlockLength()) {
- setContent((int) getBlockLength());
- }
- else {
- setContent((int) blockLength);
- }
-
- if (prevBlock != null && prevBlock.getState() != null)
- previousBlockType = prevBlock.getState().getBlockType();
-
- extractContent();
- } else
- throw new TikaException("Check your chm lzx block parameters");
- } catch (TikaException e) {
- throw e;
- }
- }
-
- protected int getContentLength() {
- return contentLength;
- }
-
- protected void setContentLength(int contentLength) {
- this.contentLength = contentLength;
- }
-
- private ChmSection getChmSection() {
- return chmSection;
- }
-
- private void setChmSection(ChmSection chmSection) {
- this.chmSection = chmSection;
- }
-
- private void assertStateNotNull() throws TikaException {
- if (getState() == null)
- throw new ChmParsingException("state is null");
- }
-
- private void extractContent() throws TikaException {
- assertStateNotNull();
- if (getChmSection().getData() != null) {
- boolean continueLoop = true;
- while (continueLoop && getContentLength() < getBlockLength()) {
- if (getState() != null && getState().getBlockRemaining() == 0) {
- if (getState().getHadStarted() == LzxState.NOT_STARTED_DECODING) {
- getState().setHadStarted(LzxState.STARTED_DECODING);
- if (getChmSection().getSyncBits(1) == 1) {
- int intelSizeTemp = (getChmSection()
- .getSyncBits(16) << 16)
- + getChmSection().getSyncBits(16);
- if (intelSizeTemp >= 0)
- getState().setIntelFileSize(intelSizeTemp);
- else
- getState().setIntelFileSize(0);
- }
- }
- getState().setBlockType(getChmSection().getSyncBits(3));
- getState().setBlockLength(
- (getChmSection().getSyncBits(16) << 8)
- + getChmSection().getSyncBits(8));
- getState().setBlockRemaining(getState().getBlockLength());
-
- // ----------------------------------------
- // Trying to handle 3 - 7 block types
- // ----------------------------------------
- if (getState().getBlockType() > 3) {
- if (previousBlockType >= 0 && previousBlockType < 3)
- getState().setBlockType(previousBlockType);
- }
-
- switch (getState().getBlockType()) {
- case ChmCommons.ALIGNED_OFFSET:
- createAlignedTreeTable();
- //fall through
- case ChmCommons.VERBATIM:
- /* Creates mainTreeTable */
- createMainTreeTable();
- createLengthTreeTable();
- if (getState().getMainTreeLengtsTable()[0xe8] != 0)
- getState().setIntelState(IntelState.STARTED);
- break;
- case ChmCommons.UNCOMPRESSED:
- getState().setIntelState(IntelState.STARTED);
- if (getChmSection().getTotal() > 16)
- getChmSection().setSwath(
- getChmSection().getSwath() - 1);
- getState().setR0(
- (new BigInteger(getChmSection()
- .reverseByteOrder(
- getChmSection().unmarshalBytes(
- 4))).longValue()));
- getState().setR1(
- (new BigInteger(getChmSection()
- .reverseByteOrder(
- getChmSection().unmarshalBytes(
- 4))).longValue()));
- getState().setR2(
- (new BigInteger(getChmSection()
- .reverseByteOrder(
- getChmSection().unmarshalBytes(
- 4))).longValue()));
- break;
- default:
- break;
- }
- } //end of if BlockRemaining == 0
-
- int tempLen;
-
- if (getContentLength() + getState().getBlockRemaining() > getBlockLength()) {
- getState().setBlockRemaining(
- getContentLength() + getState().getBlockRemaining()
- - (int) getBlockLength());
- tempLen = (int) getBlockLength();
- } else {
- tempLen = getContentLength()
- + getState().getBlockRemaining();
- getState().setBlockRemaining(0);
- }
-
- int lastLength = getContentLength();
- switch (getState().getBlockType()) {
- case ChmCommons.ALIGNED_OFFSET:
- // if(prevblock.lzxState.length>prevblock.lzxState.remaining)
- decompressAlignedBlock(tempLen, getChmSection().getPrevContent() == null ? getChmSection().getData() : getChmSection().getPrevContent());// prevcontext
- break;
- case ChmCommons.VERBATIM:
- decompressVerbatimBlock(tempLen, getChmSection().getPrevContent() == null ? getChmSection().getData() : getChmSection().getPrevContent());
- break;
- case ChmCommons.UNCOMPRESSED:
- decompressUncompressedBlock(tempLen, getChmSection().getPrevContent() == null ? getChmSection().getData() : getChmSection().getPrevContent());
- break;
- }
- getState().increaseFramesRead();
- if ((getState().getFramesRead() < 32768)
- && getState().getIntelFileSize() != 0)
- intelE8Decoding();
-
- continueLoop = getContentLength() > lastLength;
- }
- }
- }
-
- protected void intelE8Decoding() {
- if (getBlockLength() <= ChmConstants.LZX_PRETREE_TABLEBITS
- || (getState().getIntelState() == IntelState.NOT_STARTED)) {
- getState().setBlockRemaining(
- getState().getBlockRemaining() - (int) getBlockLength());
- } else {
- long curpos = getState().getBlockRemaining();
- getState().setBlockRemaining(
- getState().getBlockRemaining() - (int) getBlockLength());
- int i = 0;
- while (i < getBlockLength() - 10) {
- if (content[i] != 0xe8) {
- i++;
- continue;
- }
- byte[] b = new byte[4];
- b[0] = getContent()[i + 3];
- b[1] = getContent()[i + 2];
- b[2] = getContent()[i + 1];
- b[3] = getContent()[i + 0];
- long absoff = (new BigInteger(b)).longValue();
- if ((absoff >= -curpos)
- && (absoff < getState().getIntelFileSize())) {
- long reloff = (absoff >= 0) ? absoff - curpos : absoff
- + getState().getIntelFileSize();
- getContent()[i + 0] = (byte) reloff;
- getContent()[i + 1] = (byte) (reloff >>> 8);
- getContent()[i + 2] = (byte) (reloff >>> 16);
- getContent()[i + 3] = (byte) (reloff >>> 24);
- }
- i += 4;
- curpos += 5;
- }
- }
- }
-
- private short[] createPreLenTable() {
- short[] tmp = new short[ChmConstants.LZX_PRETREE_MAXSYMBOLS];
- for (int i = 0; i < ChmConstants.LZX_PRETREE_MAXSYMBOLS; i++) {
- tmp[i] = (short) getChmSection().getSyncBits(
- ChmConstants.LZX_PRETREE_NUM_ELEMENTS_BITS);
- }
- return tmp;
- }
-
- private void createLengthTreeTable() throws TikaException {
- //Read Pre Tree Table
- short[] prelentable = createPreLenTable();
-
- if (prelentable == null) {
- throw new ChmParsingException("pretreetable is null");
- }
-
- short[] pretreetable = createTreeTable2(prelentable,
- (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
- + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
- ChmConstants.LZX_PRETREE_TABLEBITS,
- ChmConstants.LZX_PRETREE_MAXSYMBOLS);
-
- if (pretreetable == null) {
- throw new ChmParsingException("pretreetable is null");
- }
-
- //Build Length Tree
- createLengthTreeLenTable(0, ChmConstants.LZX_NUM_SECONDARY_LENGTHS,
- pretreetable, prelentable);
-
- getState().setLengthTreeTable(
- createTreeTable2(getState().getLengthTreeLengtsTable(),
- (1 << ChmConstants.LZX_LENGTH_TABLEBITS)
- + (ChmConstants.LZX_LENGTH_MAXSYMBOLS << 1),
- ChmConstants.LZX_LENGTH_TABLEBITS,
- ChmConstants.LZX_NUM_SECONDARY_LENGTHS));
- }
-
- private void decompressUncompressedBlock(int len, byte[] prevcontent) {
- if (getContentLength() + getState().getBlockRemaining() <= getBlockLength()) {
- for (int i = getContentLength(); i < (getContentLength() + getState()
- .getBlockRemaining()); i++)
- content[i] = getChmSection().getByte();
-
- setContentLength(getContentLength()
- + getState().getBlockRemaining());
- getState().setBlockRemaining(0);
- } else {
- for (int i = getContentLength(); i < getBlockLength(); i++)
- content[i] = getChmSection().getByte();
- getState().setBlockRemaining(
- (int) getBlockLength() - getContentLength());// = blockLen -
- // contentlen;
- setContentLength((int) getBlockLength());
- }
- }
-
- private void decompressAlignedBlock(int len, byte[] prevcontent) throws TikaException {
-
- if ((getChmSection() == null) || (getState() == null)
- || (getState().getMainTreeTable() == null))
- throw new ChmParsingException("chm section is null");
-
- short s;
- int x, i, border;
- int matchlen = 0, matchfooter = 0, extra, rundest, runsrc;
- int matchoffset = 0;
- for (i = getContentLength(); i < len; i++) {
- /* new code */
- //read huffman tree from main tree
- border = getChmSection().peekBits(
- ChmConstants.LZX_MAINTREE_TABLEBITS);
- if (border >= getState().mainTreeTable.length)
- throw new ChmParsingException("error decompressing aligned block.");
- //break;
- /* end new code */
- s = getState().mainTreeTable[getChmSection().peekBits(
- ChmConstants.LZX_MAINTREE_TABLEBITS)];
- if (s >= getState().getMainTreeElements()) {
- x = ChmConstants.LZX_MAINTREE_TABLEBITS;
- do {
- x++;
- s <<= 1;
- s += getChmSection().checkBit(x);
- } while ((s = getState().mainTreeTable[s]) >= getState()
- .getMainTreeElements());
- }
- //System.out.printf("%d,", s);
- //?getChmSection().getSyncBits(getState().mainTreeTable[s]);
- getChmSection().getSyncBits(getState().getMainTreeLengtsTable()[s]);
- if (s < ChmConstants.LZX_NUM_CHARS) {
- content[i] = (byte) s;
- } else {
- s -= ChmConstants.LZX_NUM_CHARS;
- matchlen = s & ChmConstants.LZX_NUM_PRIMARY_LENGTHS;
- if (matchlen == ChmConstants.LZX_NUM_PRIMARY_LENGTHS) {
- matchfooter = getState().lengthTreeTable[getChmSection()
- .peekBits(ChmConstants.LZX_LENGTH_TABLEBITS)];//.LZX_MAINTREE_TABLEBITS)];
- if (matchfooter >= ChmConstants.LZX_LENGTH_MAXSYMBOLS/*?LZX_LENGTH_TABLEBITS*/) {
- x = ChmConstants.LZX_LENGTH_TABLEBITS;
- do {
- x++;
- matchfooter <<= 1;
- matchfooter += getChmSection().checkBit(x);
- } while ((matchfooter = getState().lengthTreeTable[matchfooter]) >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS);
- }
- getChmSection().getSyncBits(
- getState().lengthTreeLengtsTable[matchfooter]);
- matchlen += matchfooter;
- }
- matchlen += ChmConstants.LZX_MIN_MATCH;
- matchoffset = s >>> 3;
- if (matchoffset > 2) {
- extra = ChmConstants.EXTRA_BITS[matchoffset];
- matchoffset = (ChmConstants.POSITION_BASE[matchoffset] - 2);
- if (extra > 3) {
- extra -= 3;
- long verbatim_bits = getChmSection().getSyncBits(extra);
- matchoffset += (verbatim_bits << 3);
- //READ HUFF SYM in Aligned Tree
- int aligned_bits = getChmSection().peekBits(
- ChmConstants.LZX_NUM_PRIMARY_LENGTHS);
- int t = getState().getAlignedTreeTable()[aligned_bits];
- if (t >= getState().getMainTreeElements()) {
- x = ChmConstants.LZX_ALIGNED_TABLEBITS; //?LZX_MAINTREE_TABLEBITS; //?LZX_ALIGNED_TABLEBITS
- do {
- x++;
- t <<= 1;
- t += getChmSection().checkBit(x);
- } while ((t = getState().getAlignedTreeTable()[t]) >= getState()
- .getMainTreeElements());
- }
- getChmSection().getSyncBits(
- getState().getAlignedLenTable()[t]);
- matchoffset += t;
- } else if (extra == 3) {
- int g = getChmSection().peekBits(
- ChmConstants.LZX_NUM_PRIMARY_LENGTHS);
- int t = getState().getAlignedTreeTable()[g];
- if (t >= getState().getMainTreeElements()) {
- x = ChmConstants.LZX_ALIGNED_TABLEBITS; //?LZX_MAINTREE_TABLEBITS;
- do {
- x++;
- t <<= 1;
- t += getChmSection().checkBit(x);
- } while ((t = getState().getAlignedTreeTable()[t]) >= getState()
- .getMainTreeElements());
- }
- getChmSection().getSyncBits(
- getState().getAlignedLenTable()[t]);
- matchoffset += t;
- } else if (extra > 0) {
- long l = getChmSection().getSyncBits(extra);
- matchoffset += l;
- } else
- matchoffset = 1;
- getState().setR2(getState().getR1());
- getState().setR1(getState().getR0());
- getState().setR0(matchoffset);
- } else if (matchoffset == 0) {
- matchoffset = (int) getState().getR0();
- } else if (matchoffset == 1) {
- matchoffset = (int) getState().getR1();
- getState().setR1(getState().getR0());
- getState().setR0(matchoffset);
- } else /** match_offset == 2 */
- {
- matchoffset = (int) getState().getR2();
- getState().setR2(getState().getR0());
- getState().setR0(matchoffset);
- }
- rundest = i;
- runsrc = rundest - matchoffset;
- i += (matchlen - 1);
- if (i > len)
- break;
-
- if (runsrc < 0) {
- if (matchlen + runsrc <= 0) {
- runsrc = prevcontent.length + runsrc;
- while (matchlen-- > 0)
- content[rundest++] = prevcontent[runsrc++];
- } else {
- runsrc = prevcontent.length + runsrc;
- while (runsrc < prevcontent.length)
- content[rundest++] = prevcontent[runsrc++];
- matchlen = matchlen + runsrc - prevcontent.length;
- runsrc = 0;
- while (matchlen-- > 0)
- content[rundest++] = content[runsrc++];
- }
-
- } else {
- /* copies any wrappes around source data */
- while ((runsrc < 0) && (matchlen-- > 0)) {
- content[rundest++] = content[(int) (runsrc + getBlockLength())];
- runsrc++;
- }
- /* copies match data - no worries about destination wraps */
- while (matchlen-- > 0)
- content[rundest++] = content[runsrc++];
- }
- }
- }
- setContentLength(len);
- }
-
- private void assertShortArrayNotNull(short[] array) throws TikaException {
- if (array == null)
- throw new ChmParsingException("short[] is null");
- }
-
- private void decompressVerbatimBlock(int len, byte[] prevcontent) throws TikaException {
- short s;
- int x, i;
- int matchlen = 0, matchfooter = 0, extra, rundest, runsrc;
- int matchoffset = 0;
- for (i = getContentLength(); i < len; i++) {
- int f = getChmSection().peekBits(
- ChmConstants.LZX_MAINTREE_TABLEBITS);
- assertShortArrayNotNull(getState().getMainTreeTable());
- s = getState().getMainTreeTable()[f];
- if (s >= ChmConstants.LZX_MAIN_MAXSYMBOLS) {
- x = ChmConstants.LZX_MAINTREE_TABLEBITS;
- do {
- x++;
- s <<= 1;
- s += getChmSection().checkBit(x);
- } while ((s = getState().getMainTreeTable()[s]) >= ChmConstants.LZX_MAIN_MAXSYMBOLS);
- }
- getChmSection().getSyncBits(getState().getMainTreeLengtsTable()[s]);
- if (s < ChmConstants.LZX_NUM_CHARS) {
- content[i] = (byte) s;
- } else {
- s -= ChmConstants.LZX_NUM_CHARS;
- matchlen = s & ChmConstants.LZX_NUM_PRIMARY_LENGTHS;
- if (matchlen == ChmConstants.LZX_NUM_PRIMARY_LENGTHS) {
- matchfooter = getState().getLengthTreeTable()[getChmSection()
- .peekBits(ChmConstants.LZX_LENGTH_TABLEBITS)];
- if (matchfooter >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS) {
- x = ChmConstants.LZX_LENGTH_TABLEBITS;
- do {
- x++;
- matchfooter <<= 1;
- matchfooter += getChmSection().checkBit(x);
- } while ((matchfooter = getState().getLengthTreeTable()[matchfooter]) >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS);
- }
- getChmSection().getSyncBits(
- getState().getLengthTreeLengtsTable()[matchfooter]);
- matchlen += matchfooter;
- }
- matchlen += ChmConstants.LZX_MIN_MATCH;
- // shorter than 2
- matchoffset = s >>> 3;
- if (matchoffset > 2) {
- if (matchoffset != 3) { // should get other bits to retrieve
- // offset
- extra = ChmConstants.EXTRA_BITS[matchoffset];
- long l = getChmSection().getSyncBits(extra);
- matchoffset = (int) (ChmConstants.POSITION_BASE[matchoffset] - 2 + l);
- } else {
- matchoffset = 1;
- }
- getState().setR2(getState().getR1());
- getState().setR1(getState().getR0());
- getState().setR0(matchoffset);
- } else if (matchoffset == 0) {
- matchoffset = (int) getState().getR0();
- } else if (matchoffset == 1) {
- matchoffset = (int) getState().getR1();
- getState().setR1(getState().getR0());
- getState().setR0(matchoffset);
- } else /* match_offset == 2 */
- {
- matchoffset = (int) getState().getR2();
- getState().setR2(getState().getR0());
- getState().setR0(matchoffset);
- }
- rundest = i;
- runsrc = rundest - matchoffset;
- i += (matchlen - 1);
- if (i > len)
- break;
- if (runsrc < 0) {
- if (matchlen + runsrc <= 0) {
- runsrc = prevcontent.length + runsrc;
- while ((matchlen-- > 0) && (prevcontent != null)
- && ((runsrc + 1) > 0))
- if ((rundest < content.length)
- && (runsrc < content.length))
- content[rundest++] = prevcontent[runsrc++];
- } else {
- runsrc = prevcontent.length + runsrc;
- while (runsrc < prevcontent.length)
- if ((rundest < content.length)
- && (runsrc < content.length))
- content[rundest++] = prevcontent[runsrc++];
- matchlen = matchlen + runsrc - prevcontent.length;
- runsrc = 0;
- while (matchlen-- > 0)
- content[rundest++] = content[runsrc++];
- }
-
- } else {
- /* copies any wrapped source data */
- while ((runsrc < 0) && (matchlen-- > 0)) {
- content[rundest++] = content[(int) (runsrc + getBlockLength())];
- runsrc++;
- }
- /* copies match data - no worries about destination wraps */
- while (matchlen-- > 0) {
- if ((rundest < content.length)
- && (runsrc < content.length))
- content[rundest++] = content[runsrc++];
- }
- }
- }
- }
- setContentLength(len);
- }
-
- private void createLengthTreeLenTable(int offset, int tablelen,
- short[] pretreetable, short[] prelentable) throws TikaException {
- if (prelentable == null || getChmSection() == null
- || pretreetable == null || prelentable == null)
- throw new ChmParsingException("is null");
-
- int i = offset; // represents offset
- int z, y, x;// local counters
- while (i < tablelen) {
- //Read HUFF sym to z
- z = pretreetable[getChmSection().peekBits(
- ChmConstants.LZX_PRETREE_TABLEBITS)];
- if (z >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS) {// 1 bug, should be
- // 20
- x = ChmConstants.LZX_PRETREE_TABLEBITS;
- do {
- x++;
- z <<= 1;
- z += getChmSection().checkBit(x);
- } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS);
- }
- getChmSection().getSyncBits(prelentable[z]);
-
- if (z < 17) {
- z = getState().getLengthTreeLengtsTable()[i] - z;
- if (z < 0)
- z = z + 17;
- getState().getLengthTreeLengtsTable()[i] = (short) z;
- i++;
- } else if (z == 17) {
- y = getChmSection().getSyncBits(4);
- y += 4;
- for (int j = 0; j < y; j++)
- if (i < getState().getLengthTreeLengtsTable().length)
- getState().getLengthTreeLengtsTable()[i++] = 0;
- } else if (z == 18) {
- y = getChmSection().getSyncBits(5);
- y += 20;
- for (int j = 0; j < y; j++)
- //no tolerate //if (i < getState().getLengthTreeLengtsTable().length)
- getState().getLengthTreeLengtsTable()[i++] = 0;
- } else if (z == 19) {
- y = getChmSection().getSyncBits(1);
- y += 4;
- z = pretreetable[getChmSection().peekBits(
- ChmConstants.LZX_PRETREE_TABLEBITS)];
- if (z >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS) {// 20
- x = ChmConstants.LZX_PRETREE_TABLEBITS;// 6
- do {
- x++;
- z <<= 1;
- z += getChmSection().checkBit(x);
- } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS);//LZX_MAINTREE_TABLEBITS);
- }
- getChmSection().getSyncBits(prelentable[z]);
- z = getState().getLengthTreeLengtsTable()[i] - z;
- if (z < 0)
- z = z + 17;
- for (int j = 0; j < y; j++)
- getState().getLengthTreeLengtsTable()[i++] = (short) z;
- }
- }
- }
-
- private void createMainTreeTable() throws TikaException {
- //Read Pre Tree Table
- short[] prelentable = createPreLenTable();
- short[] pretreetable = createTreeTable2(prelentable,
- (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
- + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
- ChmConstants.LZX_PRETREE_TABLEBITS,
- ChmConstants.LZX_PRETREE_MAXSYMBOLS);
-
- createMainTreeLenTable(0, ChmConstants.LZX_NUM_CHARS, pretreetable,
- prelentable);
-
- //Read Pre Tree Table
- prelentable = createPreLenTable();
- pretreetable = createTreeTable2(prelentable,
- (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
- + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
- ChmConstants.LZX_PRETREE_TABLEBITS,
- ChmConstants.LZX_PRETREE_MAXSYMBOLS);
-
- createMainTreeLenTable(ChmConstants.LZX_NUM_CHARS,
- getState().mainTreeLengtsTable.length, pretreetable,
- prelentable);
-
- getState().setMainTreeTable(
- createTreeTable2(getState().mainTreeLengtsTable,
- (1 << ChmConstants.LZX_MAINTREE_TABLEBITS)
- + (ChmConstants.LZX_MAINTREE_MAXSYMBOLS << 1),
- ChmConstants.LZX_MAINTREE_TABLEBITS, getState()
- .getMainTreeElements()));
- }
-
- private void createMainTreeLenTable(int offset, int tablelen,
- short[] pretreetable, short[] prelentable) throws TikaException {
- if (pretreetable == null)
- throw new ChmParsingException("pretreetable is null");
- int i = offset;
- int z, y, x;
- while (i < tablelen) {
- int f = getChmSection().peekBits(
- ChmConstants.LZX_PRETREE_TABLEBITS);
- z = pretreetable[f];
- if (z >= ChmConstants.LZX_PRETREE_MAXSYMBOLS) {
- x = ChmConstants.LZX_PRETREE_TABLEBITS;
- do {
- x++;
- z <<= 1;
- z += getChmSection().checkBit(x);
- } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_MAXSYMBOLS);
- }
- getChmSection().getSyncBits(prelentable[z]);
- if (z < 17) {
- z = getState().getMainTreeLengtsTable()[i] - z;
- if (z < 0)
- z = z + 17;
- getState().mainTreeLengtsTable[i] = (short) z;
- i++;
- } else if (z == 17) {
- y = getChmSection().getSyncBits(4);
- y += 4;
- for (int j = 0; j < y; j++) {
- assertInRange(getState().getMainTreeLengtsTable(), i);
- getState().mainTreeLengtsTable[i++] = 0;
- }
- } else if (z == 18) {
- y = getChmSection().getSyncBits(5);
- y += 20;
- for (int j = 0; j < y; j++) {
- assertInRange(getState().getMainTreeLengtsTable(), i);
- getState().mainTreeLengtsTable[i++] = 0;
- }
- } else if (z == 19) {
- y = getChmSection().getSyncBits(1);
- y += 4;
- z = pretreetable[getChmSection().peekBits(
- ChmConstants.LZX_PRETREE_TABLEBITS)];
- if (z >= ChmConstants.LZX_PRETREE_MAXSYMBOLS) {
- x = ChmConstants.LZX_PRETREE_TABLEBITS;
- do {
- x++;
- z <<= 1;
- z += getChmSection().checkBit(x);
- } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_MAXSYMBOLS);
- }
- getChmSection().getSyncBits(prelentable[z]);
- z = getState().mainTreeLengtsTable[i] - z;
- if (z < 0)
- z = z + 17;
- for (int j = 0; j < y; j++)
- if (i < getState().getMainTreeLengtsTable().length)
- getState().mainTreeLengtsTable[i++] = (short) z;
- }
- }
- }
-
- private void assertInRange(short[] array, int index) throws ChmParsingException {
- if (index >= array.length)
- throw new ChmParsingException(index + " is bigger than "
- + array.length);
- }
-
- private short[] createAlignedLenTable() {
- int tablelen = ChmConstants.LZX_ALIGNED_NUM_ELEMENTS;//LZX_BLOCKTYPE_UNCOMPRESSED;//
- int bits = ChmConstants.LZX_BLOCKTYPE_UNCOMPRESSED;
- short[] tmp = new short[tablelen];
- for (int i = 0; i < tablelen; i++) {
- tmp[i] = (short) getChmSection().getSyncBits(bits);
- }
- return tmp;
- }
-
- private void createAlignedTreeTable() throws ChmParsingException {
- getState().setAlignedLenTable(createAlignedLenTable());
- getState().setAlignedTreeTable(//setAlignedLenTable(
- createTreeTable2(getState().getAlignedLenTable(),
- (1 << ChmConstants.LZX_NUM_PRIMARY_LENGTHS)
- + (ChmConstants.LZX_ALIGNED_MAXSYMBOLS << 1),
- ChmConstants.LZX_NUM_PRIMARY_LENGTHS,
- ChmConstants.LZX_ALIGNED_MAXSYMBOLS));
- }
-
- private short[] createTreeTable2(short[] lentable, int tablelen, int bits,
- int maxsymbol) throws ChmParsingException {
- short[] tmp = new short[tablelen];
- short sym;
- int leaf;
- int bit_num = 1;
- long fill;
- int pos = 0;
- /* the current position in the decode table */
- long table_mask = (1 << bits);
- long bit_mask = (table_mask >> 1);
- long next_symbol = bit_mask;
-
- /* fills entries for short codes for a direct mapping */
- while (bit_num <= bits) {
- for (sym = 0; sym < maxsymbol; sym++) {
- if (lentable.length > sym && lentable[sym] == bit_num) {
- leaf = pos;
-
- if ((pos += bit_mask) > table_mask) {
- /* table overflow */
- throw new ChmParsingException("Table overflow");
- }
-
- fill = bit_mask;
- while (fill-- > 0)
- tmp[leaf++] = sym;
- }
- }
- bit_mask >>= 1;
- bit_num++;
- }
-
- /* if there are any codes longer than nbits */
- if (pos != table_mask) {
- /* clears the remainder of the table */
- for (leaf = pos; leaf < table_mask; leaf++)
- tmp[leaf] = 0;
-
- /* gives ourselves room for codes to grow by up to 16 more bits */
- pos <<= 16;
- table_mask <<= 16;
- bit_mask = 1 << 15;
-
- while (bit_num <= 16) {
- for (sym = 0; sym < maxsymbol; sym++) {
- if ((lentable.length > sym) && (lentable[sym] == bit_num)) {
- leaf = pos >> 16;
- for (fill = 0; fill < bit_num - bits; fill++) {
- /*
- * if this path hasn't been taken yet, 'allocate'
- * two entries
- */
- if (tmp[leaf] == 0) {
- if (((next_symbol << 1) + 1) < tmp.length) {
- tmp[(int) (next_symbol << 1)] = 0;
- tmp[(int) (next_symbol << 1) + 1] = 0;
- tmp[leaf] = (short) next_symbol++;
- }
-
- }
- /*
- * follows the path and select either left or right
- * for next bit
- */
- leaf = tmp[leaf] << 1;
- if (((pos >> (15 - fill)) & 1) != 0)
- leaf++;
- }
- tmp[leaf] = sym;
-
- if ((pos += bit_mask) > table_mask) {
- /* table overflow */
- throw new ChmParsingException("Table overflow");
- }
- }
- }
- bit_mask >>= 1;
- bit_num++;
- }
- }
-
- /* is it full table? */
- if (pos == table_mask)
- return tmp;
-
- return tmp;
- }
-
- public byte[] getContent() {
- return content;
- }
-
- public byte[] getContent(int startOffset, int endOffset) {
- return (getContent() != null) ? ChmCommons.copyOfRange(getContent(),
- startOffset, endOffset) : new byte[1];
- }
-
- public byte[] getContent(int start) {
- return (getContent() != null) ? ChmCommons.copyOfRange(getContent(),
- start, getContent().length) : new byte[1];
- }
-
- private void setContent(int contentLength) {
- this.content = new byte[contentLength];
- }
-
- private void checkLzxBlock(ChmLzxBlock chmPrevLzxBlock) throws TikaException {
- if (chmPrevLzxBlock == null && getBlockLength() < Integer.MAX_VALUE)
- setState(new ChmLzxState((int) getBlockLength()));
- else
- //use clone to avoid changing a cached or to be cached block
- setState(chmPrevLzxBlock.getState().clone());
- }
-
- private boolean validateConstructorParams(int blockNumber,
- byte[] dataSegment, long blockLength) throws TikaException {
- int goodParameter = 0;
- if (blockNumber >= 0)
- ++goodParameter;
- else
- throw new ChmParsingException("block number should be possitive");
- if (dataSegment != null && dataSegment.length > 0)
- ++goodParameter;
- else
- throw new ChmParsingException("data segment should not be null");
- if (blockLength > 0)
- ++goodParameter;
- else
- throw new ChmParsingException(
- "block length should be more than zero");
- return (goodParameter == 3);
- }
-
- public int getBlockNumber() {
- return block_number;
- }
-
- private void setBlockNumber(int block_number) {
- this.block_number = block_number;
- }
-
- private long getBlockLength() {
- return block_length;
- }
-
- private void setBlockLength(long block_length) {
- this.block_length = block_length;
- }
-
- public ChmLzxState getState() {
- return state;
- }
-
- private void setState(ChmLzxState state) {
- this.state = state;
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.lzx;
+
+import java.math.BigInteger;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmCommons.IntelState;
+import org.apache.tika.parser.chm.core.ChmCommons.LzxState;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * Decompresses a chm block. Depending on chm block type chooses most relevant
+ * decompressing method. A chm block type can be as follows:</br> <li>UNDEFINED
+ * - no action taken, i.e. skipping the block <li>VERBATIM <li>ALIGNED_OFFSET
+ * <li>UNCOMPRESSED the most simplest In addition there are unknown types (4-7).
+ * Currently relying on previous chm block these types changing according to the
+ * previous chm block type. We need to invent more appropriate way to handle
+ * such types.
+ *
+ */
+public class ChmLzxBlock {
+ private int block_number;
+ private long block_length;
+ private ChmLzxState state;
+ private byte[] content = null;
+ private ChmSection chmSection = null;
+ private int contentLength = 0;
+
+ // trying to find solution for bad blocks ...
+ private int previousBlockType = -1;
+
+ public ChmLzxBlock(int blockNumber, byte[] dataSegment, long blockLength,
+ ChmLzxBlock prevBlock) throws TikaException {
+ try {
+ if (validateConstructorParams(blockNumber, dataSegment, blockLength)) {
+ setBlockNumber(blockNumber);
+
+ if (prevBlock != null
+ && prevBlock.getState().getBlockLength() > prevBlock
+ .getState().getBlockRemaining())
+ setChmSection(new ChmSection(dataSegment, prevBlock.getContent()));
+ else
+ setChmSection(new ChmSection(dataSegment));
+
+ setBlockLength(blockLength);
+
+ // ============================================
+ // we need to take care of previous context
+ // ============================================
+ checkLzxBlock(prevBlock);
+ if (prevBlock == null
+ || blockLength < (int) getBlockLength()) {
+ setContent((int) getBlockLength());
+ }
+ else {
+ setContent((int) blockLength);
+ }
+
+ if (prevBlock != null && prevBlock.getState() != null)
+ previousBlockType = prevBlock.getState().getBlockType();
+
+ extractContent();
+ } else
+ throw new TikaException("Check your chm lzx block parameters");
+ } catch (TikaException e) {
+ throw e;
+ }
+ }
+
+ protected int getContentLength() {
+ return contentLength;
+ }
+
+ protected void setContentLength(int contentLength) {
+ this.contentLength = contentLength;
+ }
+
+ private ChmSection getChmSection() {
+ return chmSection;
+ }
+
+ private void setChmSection(ChmSection chmSection) {
+ this.chmSection = chmSection;
+ }
+
+ private void assertStateNotNull() throws TikaException {
+ if (getState() == null)
+ throw new ChmParsingException("state is null");
+ }
+
+ private void extractContent() throws TikaException {
+ assertStateNotNull();
+ if (getChmSection().getData() != null) {
+ boolean continueLoop = true;
+ while (continueLoop && getContentLength() < getBlockLength()) {
+ if (getState() != null && getState().getBlockRemaining() == 0) {
+ if (getState().getHadStarted() == LzxState.NOT_STARTED_DECODING) {
+ getState().setHadStarted(LzxState.STARTED_DECODING);
+ if (getChmSection().getSyncBits(1) == 1) {
+ int intelSizeTemp = (getChmSection()
+ .getSyncBits(16) << 16)
+ + getChmSection().getSyncBits(16);
+ if (intelSizeTemp >= 0)
+ getState().setIntelFileSize(intelSizeTemp);
+ else
+ getState().setIntelFileSize(0);
+ }
+ }
+ getState().setBlockType(getChmSection().getSyncBits(3));
+ getState().setBlockLength(
+ (getChmSection().getSyncBits(16) << 8)
+ + getChmSection().getSyncBits(8));
+ getState().setBlockRemaining(getState().getBlockLength());
+
+ // ----------------------------------------
+ // Trying to handle 3 - 7 block types
+ // ----------------------------------------
+ if (getState().getBlockType() > 3) {
+ if (previousBlockType >= 0 && previousBlockType < 3)
+ getState().setBlockType(previousBlockType);
+ }
+
+ switch (getState().getBlockType()) {
+ case ChmCommons.ALIGNED_OFFSET:
+ createAlignedTreeTable();
+ //fall through
+ case ChmCommons.VERBATIM:
+ /* Creates mainTreeTable */
+ createMainTreeTable();
+ createLengthTreeTable();
+ if (getState().getMainTreeLengtsTable()[0xe8] != 0)
+ getState().setIntelState(IntelState.STARTED);
+ break;
+ case ChmCommons.UNCOMPRESSED:
+ getState().setIntelState(IntelState.STARTED);
+ if (getChmSection().getTotal() > 16)
+ getChmSection().setSwath(
+ getChmSection().getSwath() - 1);
+ getState().setR0(
+ (new BigInteger(getChmSection()
+ .reverseByteOrder(
+ getChmSection().unmarshalBytes(
+ 4))).longValue()));
+ getState().setR1(
+ (new BigInteger(getChmSection()
+ .reverseByteOrder(
+ getChmSection().unmarshalBytes(
+ 4))).longValue()));
+ getState().setR2(
+ (new BigInteger(getChmSection()
+ .reverseByteOrder(
+ getChmSection().unmarshalBytes(
+ 4))).longValue()));
+ break;
+ default:
+ break;
+ }
+ } //end of if BlockRemaining == 0
+
+ int tempLen;
+
+ if (getContentLength() + getState().getBlockRemaining() > getBlockLength()) {
+ getState().setBlockRemaining(
+ getContentLength() + getState().getBlockRemaining()
+ - (int) getBlockLength());
+ tempLen = (int) getBlockLength();
+ } else {
+ tempLen = getContentLength()
+ + getState().getBlockRemaining();
+ getState().setBlockRemaining(0);
+ }
+
+ int lastLength = getContentLength();
+ switch (getState().getBlockType()) {
+ case ChmCommons.ALIGNED_OFFSET:
+ // if(prevblock.lzxState.length>prevblock.lzxState.remaining)
+ decompressAlignedBlock(tempLen, getChmSection().getPrevContent() == null ? getChmSection().getData() : getChmSection().getPrevContent());// prevcontext
+ break;
+ case ChmCommons.VERBATIM:
+ decompressVerbatimBlock(tempLen, getChmSection().getPrevContent() == null ? getChmSection().getData() : getChmSection().getPrevContent());
+ break;
+ case ChmCommons.UNCOMPRESSED:
+ decompressUncompressedBlock(tempLen, getChmSection().getPrevContent() == null ? getChmSection().getData() : getChmSection().getPrevContent());
+ break;
+ }
+ getState().increaseFramesRead();
+ if ((getState().getFramesRead() < 32768)
+ && getState().getIntelFileSize() != 0)
+ intelE8Decoding();
+
+ continueLoop = getContentLength() > lastLength;
+ }
+ }
+ }
+
+ protected void intelE8Decoding() {
+ if (getBlockLength() <= ChmConstants.LZX_PRETREE_TABLEBITS
+ || (getState().getIntelState() == IntelState.NOT_STARTED)) {
+ getState().setBlockRemaining(
+ getState().getBlockRemaining() - (int) getBlockLength());
+ } else {
+ long curpos = getState().getBlockRemaining();
+ getState().setBlockRemaining(
+ getState().getBlockRemaining() - (int) getBlockLength());
+ int i = 0;
+ while (i < getBlockLength() - 10) {
+ if (content[i] != 0xe8) {
+ i++;
+ continue;
+ }
+ byte[] b = new byte[4];
+ b[0] = getContent()[i + 3];
+ b[1] = getContent()[i + 2];
+ b[2] = getContent()[i + 1];
+ b[3] = getContent()[i + 0];
+ long absoff = (new BigInteger(b)).longValue();
+ if ((absoff >= -curpos)
+ && (absoff < getState().getIntelFileSize())) {
+ long reloff = (absoff >= 0) ? absoff - curpos : absoff
+ + getState().getIntelFileSize();
+ getContent()[i + 0] = (byte) reloff;
+ getContent()[i + 1] = (byte) (reloff >>> 8);
+ getContent()[i + 2] = (byte) (reloff >>> 16);
+ getContent()[i + 3] = (byte) (reloff >>> 24);
+ }
+ i += 4;
+ curpos += 5;
+ }
+ }
+ }
+
+ private short[] createPreLenTable() {
+ short[] tmp = new short[ChmConstants.LZX_PRETREE_MAXSYMBOLS];
+ for (int i = 0; i < ChmConstants.LZX_PRETREE_MAXSYMBOLS; i++) {
+ tmp[i] = (short) getChmSection().getSyncBits(
+ ChmConstants.LZX_PRETREE_NUM_ELEMENTS_BITS);
+ }
+ return tmp;
+ }
+
+ private void createLengthTreeTable() throws TikaException {
+ //Read Pre Tree Table
+ short[] prelentable = createPreLenTable();
+
+ if (prelentable == null) {
+ throw new ChmParsingException("pretreetable is null");
+ }
+
+ short[] pretreetable = createTreeTable2(prelentable,
+ (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
+ + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
+ ChmConstants.LZX_PRETREE_TABLEBITS,
+ ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+
+ if (pretreetable == null) {
+ throw new ChmParsingException("pretreetable is null");
+ }
+
+ //Build Length Tree
+ createLengthTreeLenTable(0, ChmConstants.LZX_NUM_SECONDARY_LENGTHS,
+ pretreetable, prelentable);
+
+ getState().setLengthTreeTable(
+ createTreeTable2(getState().getLengthTreeLengtsTable(),
+ (1 << ChmConstants.LZX_LENGTH_TABLEBITS)
+ + (ChmConstants.LZX_LENGTH_MAXSYMBOLS << 1),
+ ChmConstants.LZX_LENGTH_TABLEBITS,
+ ChmConstants.LZX_NUM_SECONDARY_LENGTHS));
+ }
+
+ private void decompressUncompressedBlock(int len, byte[] prevcontent) {
+ if (getContentLength() + getState().getBlockRemaining() <= getBlockLength()) {
+ for (int i = getContentLength(); i < (getContentLength() + getState()
+ .getBlockRemaining()); i++)
+ content[i] = getChmSection().getByte();
+
+ setContentLength(getContentLength()
+ + getState().getBlockRemaining());
+ getState().setBlockRemaining(0);
+ } else {
+ for (int i = getContentLength(); i < getBlockLength(); i++)
+ content[i] = getChmSection().getByte();
+ getState().setBlockRemaining(
+ (int) getBlockLength() - getContentLength());// = blockLen -
+ // contentlen;
+ setContentLength((int) getBlockLength());
+ }
+ }
+
+ private void decompressAlignedBlock(int len, byte[] prevcontent) throws TikaException {
+
+ if ((getChmSection() == null) || (getState() == null)
+ || (getState().getMainTreeTable() == null))
+ throw new ChmParsingException("chm section is null");
+
+ short s;
+ int x, i, border;
+ int matchlen = 0, matchfooter = 0, extra, rundest, runsrc;
+ int matchoffset = 0;
+ for (i = getContentLength(); i < len; i++) {
+ /* new code */
+ //read huffman tree from main tree
+ border = getChmSection().peekBits(
+ ChmConstants.LZX_MAINTREE_TABLEBITS);
+ if (border >= getState().mainTreeTable.length)
+ throw new ChmParsingException("error decompressing aligned block.");
+ //break;
+ /* end new code */
+ s = getState().mainTreeTable[getChmSection().peekBits(
+ ChmConstants.LZX_MAINTREE_TABLEBITS)];
+ if (s >= getState().getMainTreeElements()) {
+ x = ChmConstants.LZX_MAINTREE_TABLEBITS;
+ do {
+ x++;
+ s <<= 1;
+ s += getChmSection().checkBit(x);
+ } while ((s = getState().mainTreeTable[s]) >= getState()
+ .getMainTreeElements());
+ }
+ //System.out.printf("%d,", s);
+ //?getChmSection().getSyncBits(getState().mainTreeTable[s]);
+ getChmSection().getSyncBits(getState().getMainTreeLengtsTable()[s]);
+ if (s < ChmConstants.LZX_NUM_CHARS) {
+ content[i] = (byte) s;
+ } else {
+ s -= ChmConstants.LZX_NUM_CHARS;
+ matchlen = s & ChmConstants.LZX_NUM_PRIMARY_LENGTHS;
+ if (matchlen == ChmConstants.LZX_NUM_PRIMARY_LENGTHS) {
+ matchfooter = getState().lengthTreeTable[getChmSection()
+ .peekBits(ChmConstants.LZX_LENGTH_TABLEBITS)];//.LZX_MAINTREE_TABLEBITS)];
+ if (matchfooter >= ChmConstants.LZX_LENGTH_MAXSYMBOLS/*?LZX_LENGTH_TABLEBITS*/) {
+ x = ChmConstants.LZX_LENGTH_TABLEBITS;
+ do {
+ x++;
+ matchfooter <<= 1;
+ matchfooter += getChmSection().checkBit(x);
+ } while ((matchfooter = getState().lengthTreeTable[matchfooter]) >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS);
+ }
+ getChmSection().getSyncBits(
+ getState().lengthTreeLengtsTable[matchfooter]);
+ matchlen += matchfooter;
+ }
+ matchlen += ChmConstants.LZX_MIN_MATCH;
+ matchoffset = s >>> 3;
+ if (matchoffset > 2) {
+ extra = ChmConstants.EXTRA_BITS[matchoffset];
+ matchoffset = (ChmConstants.POSITION_BASE[matchoffset] - 2);
+ if (extra > 3) {
+ extra -= 3;
+ long verbatim_bits = getChmSection().getSyncBits(extra);
+ matchoffset += (verbatim_bits << 3);
+ //READ HUFF SYM in Aligned Tree
+ int aligned_bits = getChmSection().peekBits(
+ ChmConstants.LZX_NUM_PRIMARY_LENGTHS);
+ int t = getState().getAlignedTreeTable()[aligned_bits];
+ if (t >= getState().getMainTreeElements()) {
+ x = ChmConstants.LZX_ALIGNED_TABLEBITS; //?LZX_MAINTREE_TABLEBITS; //?LZX_ALIGNED_TABLEBITS
+ do {
+ x++;
+ t <<= 1;
+ t += getChmSection().checkBit(x);
+ } while ((t = getState().getAlignedTreeTable()[t]) >= getState()
+ .getMainTreeElements());
+ }
+ getChmSection().getSyncBits(
+ getState().getAlignedLenTable()[t]);
+ matchoffset += t;
+ } else if (extra == 3) {
+ int g = getChmSection().peekBits(
+ ChmConstants.LZX_NUM_PRIMARY_LENGTHS);
+ int t = getState().getAlignedTreeTable()[g];
+ if (t >= getState().getMainTreeElements()) {
+ x = ChmConstants.LZX_ALIGNED_TABLEBITS; //?LZX_MAINTREE_TABLEBITS;
+ do {
+ x++;
+ t <<= 1;
+ t += getChmSection().checkBit(x);
+ } while ((t = getState().getAlignedTreeTable()[t]) >= getState()
+ .getMainTreeElements());
+ }
+ getChmSection().getSyncBits(
+ getState().getAlignedLenTable()[t]);
+ matchoffset += t;
+ } else if (extra > 0) {
+ long l = getChmSection().getSyncBits(extra);
+ matchoffset += l;
+ } else
+ matchoffset = 1;
+ getState().setR2(getState().getR1());
+ getState().setR1(getState().getR0());
+ getState().setR0(matchoffset);
+ } else if (matchoffset == 0) {
+ matchoffset = (int) getState().getR0();
+ } else if (matchoffset == 1) {
+ matchoffset = (int) getState().getR1();
+ getState().setR1(getState().getR0());
+ getState().setR0(matchoffset);
+ } else /** match_offset == 2 */
+ {
+ matchoffset = (int) getState().getR2();
+ getState().setR2(getState().getR0());
+ getState().setR0(matchoffset);
+ }
+ rundest = i;
+ runsrc = rundest - matchoffset;
+ i += (matchlen - 1);
+ if (i > len)
+ break;
+
+ if (runsrc < 0) {
+ if (matchlen + runsrc <= 0) {
+ runsrc = prevcontent.length + runsrc;
+ while (matchlen-- > 0)
+ content[rundest++] = prevcontent[runsrc++];
+ } else {
+ runsrc = prevcontent.length + runsrc;
+ while (runsrc < prevcontent.length)
+ content[rundest++] = prevcontent[runsrc++];
+ matchlen = matchlen + runsrc - prevcontent.length;
+ runsrc = 0;
+ while (matchlen-- > 0)
+ content[rundest++] = content[runsrc++];
+ }
+
+ } else {
+ /* copies any wrappes around source data */
+ while ((runsrc < 0) && (matchlen-- > 0)) {
+ content[rundest++] = content[(int) (runsrc + getBlockLength())];
+ runsrc++;
+ }
+ /* copies match data - no worries about destination wraps */
+ while (matchlen-- > 0)
+ content[rundest++] = content[runsrc++];
+ }
+ }
+ }
+ setContentLength(len);
+ }
+
+ private void assertShortArrayNotNull(short[] array) throws TikaException {
+ if (array == null)
+ throw new ChmParsingException("short[] is null");
+ }
+
+ private void decompressVerbatimBlock(int len, byte[] prevcontent) throws TikaException {
+ short s;
+ int x, i;
+ int matchlen = 0, matchfooter = 0, extra, rundest, runsrc;
+ int matchoffset = 0;
+ for (i = getContentLength(); i < len; i++) {
+ int f = getChmSection().peekBits(
+ ChmConstants.LZX_MAINTREE_TABLEBITS);
+ assertShortArrayNotNull(getState().getMainTreeTable());
+ s = getState().getMainTreeTable()[f];
+ if (s >= ChmConstants.LZX_MAIN_MAXSYMBOLS) {
+ x = ChmConstants.LZX_MAINTREE_TABLEBITS;
+ do {
+ x++;
+ s <<= 1;
+ s += getChmSection().checkBit(x);
+ } while ((s = getState().getMainTreeTable()[s]) >= ChmConstants.LZX_MAIN_MAXSYMBOLS);
+ }
+ getChmSection().getSyncBits(getState().getMainTreeLengtsTable()[s]);
+ if (s < ChmConstants.LZX_NUM_CHARS) {
+ content[i] = (byte) s;
+ } else {
+ s -= ChmConstants.LZX_NUM_CHARS;
+ matchlen = s & ChmConstants.LZX_NUM_PRIMARY_LENGTHS;
+ if (matchlen == ChmConstants.LZX_NUM_PRIMARY_LENGTHS) {
+ matchfooter = getState().getLengthTreeTable()[getChmSection()
+ .peekBits(ChmConstants.LZX_LENGTH_TABLEBITS)];
+ if (matchfooter >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS) {
+ x = ChmConstants.LZX_LENGTH_TABLEBITS;
+ do {
+ x++;
+ matchfooter <<= 1;
+ matchfooter += getChmSection().checkBit(x);
+ } while ((matchfooter = getState().getLengthTreeTable()[matchfooter]) >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS);
+ }
+ getChmSection().getSyncBits(
+ getState().getLengthTreeLengtsTable()[matchfooter]);
+ matchlen += matchfooter;
+ }
+ matchlen += ChmConstants.LZX_MIN_MATCH;
+ // shorter than 2
+ matchoffset = s >>> 3;
+ if (matchoffset > 2) {
+ if (matchoffset != 3) { // should get other bits to retrieve
+ // offset
+ extra = ChmConstants.EXTRA_BITS[matchoffset];
+ long l = getChmSection().getSyncBits(extra);
+ matchoffset = (int) (ChmConstants.POSITION_BASE[matchoffset] - 2 + l);
+ } else {
+ matchoffset = 1;
+ }
+ getState().setR2(getState().getR1());
+ getState().setR1(getState().getR0());
+ getState().setR0(matchoffset);
+ } else if (matchoffset == 0) {
+ matchoffset = (int) getState().getR0();
+ } else if (matchoffset == 1) {
+ matchoffset = (int) getState().getR1();
+ getState().setR1(getState().getR0());
+ getState().setR0(matchoffset);
+ } else /* match_offset == 2 */
+ {
+ matchoffset = (int) getState().getR2();
+ getState().setR2(getState().getR0());
+ getState().setR0(matchoffset);
+ }
+ rundest = i;
+ runsrc = rundest - matchoffset;
+ i += (matchlen - 1);
+ if (i > len)
+ break;
+ if (runsrc < 0) {
+ if (matchlen + runsrc <= 0) {
+ runsrc = prevcontent.length + runsrc;
+ while ((matchlen-- > 0) && (prevcontent != null)
+ && ((runsrc + 1) > 0))
+ if ((rundest < content.length)
+ && (runsrc < content.length))
+ content[rundest++] = prevcontent[runsrc++];
+ } else {
+ runsrc = prevcontent.length + runsrc;
+ while (runsrc < prevcontent.length)
+ if ((rundest < content.length)
+ && (runsrc < content.length))
+ content[rundest++] = prevcontent[runsrc++];
+ matchlen = matchlen + runsrc - prevcontent.length;
+ runsrc = 0;
+ while (matchlen-- > 0)
+ content[rundest++] = content[runsrc++];
+ }
+
+ } else {
+ /* copies any wrapped source data */
+ while ((runsrc < 0) && (matchlen-- > 0)) {
+ content[rundest++] = content[(int) (runsrc + getBlockLength())];
+ runsrc++;
+ }
+ /* copies match data - no worries about destination wraps */
+ while (matchlen-- > 0) {
+ if ((rundest < content.length)
+ && (runsrc < content.length))
+ content[rundest++] = content[runsrc++];
+ }
+ }
+ }
+ }
+ setContentLength(len);
+ }
+
+ private void createLengthTreeLenTable(int offset, int tablelen,
+ short[] pretreetable, short[] prelentable) throws TikaException {
+ if (prelentable == null || getChmSection() == null
+ || pretreetable == null || prelentable == null)
+ throw new ChmParsingException("is null");
+
+ int i = offset; // represents offset
+ int z, y, x;// local counters
+ while (i < tablelen) {
+ //Read HUFF sym to z
+ z = pretreetable[getChmSection().peekBits(
+ ChmConstants.LZX_PRETREE_TABLEBITS)];
+ if (z >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS) {// 1 bug, should be
+ // 20
+ x = ChmConstants.LZX_PRETREE_TABLEBITS;
+ do {
+ x++;
+ z <<= 1;
+ z += getChmSection().checkBit(x);
+ } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS);
+ }
+ getChmSection().getSyncBits(prelentable[z]);
+
+ if (z < 17) {
+ z = getState().getLengthTreeLengtsTable()[i] - z;
+ if (z < 0)
+ z = z + 17;
+ getState().getLengthTreeLengtsTable()[i] = (short) z;
+ i++;
+ } else if (z == 17) {
+ y = getChmSection().getSyncBits(4);
+ y += 4;
+ for (int j = 0; j < y; j++)
+ if (i < getState().getLengthTreeLengtsTable().length)
+ getState().getLengthTreeLengtsTable()[i++] = 0;
+ } else if (z == 18) {
+ y = getChmSection().getSyncBits(5);
+ y += 20;
+ for (int j = 0; j < y; j++)
+ //no tolerate //if (i < getState().getLengthTreeLengtsTable().length)
+ getState().getLengthTreeLengtsTable()[i++] = 0;
+ } else if (z == 19) {
+ y = getChmSection().getSyncBits(1);
+ y += 4;
+ z = pretreetable[getChmSection().peekBits(
+ ChmConstants.LZX_PRETREE_TABLEBITS)];
+ if (z >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS) {// 20
+ x = ChmConstants.LZX_PRETREE_TABLEBITS;// 6
+ do {
+ x++;
+ z <<= 1;
+ z += getChmSection().checkBit(x);
+ } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS);//LZX_MAINTREE_TABLEBITS);
+ }
+ getChmSection().getSyncBits(prelentable[z]);
+ z = getState().getLengthTreeLengtsTable()[i] - z;
+ if (z < 0)
+ z = z + 17;
+ for (int j = 0; j < y; j++)
+ getState().getLengthTreeLengtsTable()[i++] = (short) z;
+ }
+ }
+ }
+
+ private void createMainTreeTable() throws TikaException {
+ //Read Pre Tree Table
+ short[] prelentable = createPreLenTable();
+ short[] pretreetable = createTreeTable2(prelentable,
+ (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
+ + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
+ ChmConstants.LZX_PRETREE_TABLEBITS,
+ ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+
+ createMainTreeLenTable(0, ChmConstants.LZX_NUM_CHARS, pretreetable,
+ prelentable);
+
+ //Read Pre Tree Table
+ prelentable = createPreLenTable();
+ pretreetable = createTreeTable2(prelentable,
+ (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
+ + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
+ ChmConstants.LZX_PRETREE_TABLEBITS,
+ ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+
+ createMainTreeLenTable(ChmConstants.LZX_NUM_CHARS,
+ getState().mainTreeLengtsTable.length, pretreetable,
+ prelentable);
+
+ getState().setMainTreeTable(
+ createTreeTable2(getState().mainTreeLengtsTable,
+ (1 << ChmConstants.LZX_MAINTREE_TABLEBITS)
+ + (ChmConstants.LZX_MAINTREE_MAXSYMBOLS << 1),
+ ChmConstants.LZX_MAINTREE_TABLEBITS, getState()
+ .getMainTreeElements()));
+ }
+
+ private void createMainTreeLenTable(int offset, int tablelen,
+ short[] pretreetable, short[] prelentable) throws TikaException {
+ if (pretreetable == null)
+ throw new ChmParsingException("pretreetable is null");
+ int i = offset;
+ int z, y, x;
+ while (i < tablelen) {
+ int f = getChmSection().peekBits(
+ ChmConstants.LZX_PRETREE_TABLEBITS);
+ z = pretreetable[f];
+ if (z >= ChmConstants.LZX_PRETREE_MAXSYMBOLS) {
+ x = ChmConstants.LZX_PRETREE_TABLEBITS;
+ do {
+ x++;
+ z <<= 1;
+ z += getChmSection().checkBit(x);
+ } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+ }
+ getChmSection().getSyncBits(prelentable[z]);
+ if (z < 17) {
+ z = getState().getMainTreeLengtsTable()[i] - z;
+ if (z < 0)
+ z = z + 17;
+ getState().mainTreeLengtsTable[i] = (short) z;
+ i++;
+ } else if (z == 17) {
+ y = getChmSection().getSyncBits(4);
+ y += 4;
+ for (int j = 0; j < y; j++) {
+ assertInRange(getState().getMainTreeLengtsTable(), i);
+ getState().mainTreeLengtsTable[i++] = 0;
+ }
+ } else if (z == 18) {
+ y = getChmSection().getSyncBits(5);
+ y += 20;
+ for (int j = 0; j < y; j++) {
+ assertInRange(getState().getMainTreeLengtsTable(), i);
+ getState().mainTreeLengtsTable[i++] = 0;
+ }
+ } else if (z == 19) {
+ y = getChmSection().getSyncBits(1);
+ y += 4;
+ z = pretreetable[getChmSection().peekBits(
+ ChmConstants.LZX_PRETREE_TABLEBITS)];
+ if (z >= ChmConstants.LZX_PRETREE_MAXSYMBOLS) {
+ x = ChmConstants.LZX_PRETREE_TABLEBITS;
+ do {
+ x++;
+ z <<= 1;
+ z += getChmSection().checkBit(x);
+ } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+ }
+ getChmSection().getSyncBits(prelentable[z]);
+ z = getState().mainTreeLengtsTable[i] - z;
+ if (z < 0)
+ z = z + 17;
+ for (int j = 0; j < y; j++)
+ if (i < getState().getMainTreeLengtsTable().length)
+ getState().mainTreeLengtsTable[i++] = (short) z;
+ }
+ }
+ }
+
+ private void assertInRange(short[] array, int index) throws ChmParsingException {
+ if (index >= array.length)
+ throw new ChmParsingException(index + " is bigger than "
+ + array.length);
+ }
+
+ private short[] createAlignedLenTable() {
+ int tablelen = ChmConstants.LZX_ALIGNED_NUM_ELEMENTS;//LZX_BLOCKTYPE_UNCOMPRESSED;//
+ int bits = ChmConstants.LZX_BLOCKTYPE_UNCOMPRESSED;
+ short[] tmp = new short[tablelen];
+ for (int i = 0; i < tablelen; i++) {
+ tmp[i] = (short) getChmSection().getSyncBits(bits);
+ }
+ return tmp;
+ }
+
+ private void createAlignedTreeTable() throws ChmParsingException {
+ getState().setAlignedLenTable(createAlignedLenTable());
+ getState().setAlignedTreeTable(//setAlignedLenTable(
+ createTreeTable2(getState().getAlignedLenTable(),
+ (1 << ChmConstants.LZX_NUM_PRIMARY_LENGTHS)
+ + (ChmConstants.LZX_ALIGNED_MAXSYMBOLS << 1),
+ ChmConstants.LZX_NUM_PRIMARY_LENGTHS,
+ ChmConstants.LZX_ALIGNED_MAXSYMBOLS));
+ }
+
+ private short[] createTreeTable2(short[] lentable, int tablelen, int bits,
+ int maxsymbol) throws ChmParsingException {
+ short[] tmp = new short[tablelen];
+ short sym;
+ int leaf;
+ int bit_num = 1;
+ long fill;
+ int pos = 0;
+ /* the current position in the decode table */
+ long table_mask = (1 << bits);
+ long bit_mask = (table_mask >> 1);
+ long next_symbol = bit_mask;
+
+ /* fills entries for short codes for a direct mapping */
+ while (bit_num <= bits) {
+ for (sym = 0; sym < maxsymbol; sym++) {
+ if (lentable.length > sym && lentable[sym] == bit_num) {
+ leaf = pos;
+
+ if ((pos += bit_mask) > table_mask) {
+ /* table overflow */
+ throw new ChmParsingException("Table overflow");
+ }
+
+ fill = bit_mask;
+ while (fill-- > 0)
+ tmp[leaf++] = sym;
+ }
+ }
+ bit_mask >>= 1;
+ bit_num++;
+ }
+
+ /* if there are any codes longer than nbits */
+ if (pos != table_mask) {
+ /* clears the remainder of the table */
+ for (leaf = pos; leaf < table_mask; leaf++)
+ tmp[leaf] = 0;
+
+ /* gives ourselves room for codes to grow by up to 16 more bits */
+ pos <<= 16;
+ table_mask <<= 16;
+ bit_mask = 1 << 15;
+
+ while (bit_num <= 16) {
+ for (sym = 0; sym < maxsymbol; sym++) {
+ if ((lentable.length > sym) && (lentable[sym] == bit_num)) {
+ leaf = pos >> 16;
+ for (fill = 0; fill < bit_num - bits; fill++) {
+ /*
+ * if this path hasn't been taken yet, 'allocate'
+ * two entries
+ */
+ if (tmp[leaf] == 0) {
+ if (((next_symbol << 1) + 1) < tmp.length) {
+ tmp[(int) (next_symbol << 1)] = 0;
+ tmp[(int) (next_symbol << 1) + 1] = 0;
+ tmp[leaf] = (short) next_symbol++;
+ }
+
+ }
+ /*
+ * follows the path and select either left or right
+ * for next bit
+ */
+ leaf = tmp[leaf] << 1;
+ if (((pos >> (15 - fill)) & 1) != 0)
+ leaf++;
+ }
+ tmp[leaf] = sym;
+
+ if ((pos += bit_mask) > table_mask) {
+ /* table overflow */
+ throw new ChmParsingException("Table overflow");
+ }
+ }
+ }
+ bit_mask >>= 1;
+ bit_num++;
+ }
+ }
+
+ /* is it full table? */
+ if (pos == table_mask)
+ return tmp;
+
+ return tmp;
+ }
+
+ public byte[] getContent() {
+ return content;
+ }
+
+ public byte[] getContent(int startOffset, int endOffset) {
+ return (getContent() != null) ? ChmCommons.copyOfRange(getContent(),
+ startOffset, endOffset) : new byte[1];
+ }
+
+ public byte[] getContent(int start) {
+ return (getContent() != null) ? ChmCommons.copyOfRange(getContent(),
+ start, getContent().length) : new byte[1];
+ }
+
+ private void setContent(int contentLength) {
+ this.content = new byte[contentLength];
+ }
+
+ private void checkLzxBlock(ChmLzxBlock chmPrevLzxBlock) throws TikaException {
+ if (chmPrevLzxBlock == null && getBlockLength() < Integer.MAX_VALUE)
+ setState(new ChmLzxState((int) getBlockLength()));
+ else
+ //use clone to avoid changing a cached or to be cached block
+ setState(chmPrevLzxBlock.getState().clone());
+ }
+
+ private boolean validateConstructorParams(int blockNumber,
+ byte[] dataSegment, long blockLength) throws TikaException {
+ int goodParameter = 0;
+ if (blockNumber >= 0)
+ ++goodParameter;
+ else
+ throw new ChmParsingException("block number should be possitive");
+ if (dataSegment != null && dataSegment.length > 0)
+ ++goodParameter;
+ else
+ throw new ChmParsingException("data segment should not be null");
+ if (blockLength > 0)
+ ++goodParameter;
+ else
+ throw new ChmParsingException(
+ "block length should be more than zero");
+ return (goodParameter == 3);
+ }
+
+ public int getBlockNumber() {
+ return block_number;
+ }
+
+ private void setBlockNumber(int block_number) {
+ this.block_number = block_number;
+ }
+
+ private long getBlockLength() {
+ return block_length;
+ }
+
+ private void setBlockLength(long block_length) {
+ this.block_length = block_length;
+ }
+
+ public ChmLzxState getState() {
+ return state;
+ }
+
+ private void setState(ChmLzxState state) {
+ this.state = state;
+ }
+}
[27/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java
index 2319ec8..10b00ae 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java
@@ -1,548 +1,548 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.accessor;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.assertion.ChmAssert;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.apache.tika.parser.chm.exception.ChmParsingException;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-/**
- * Directory header The directory starts with a header; its format is as
- * follows: 0000: char[4] 'ITSP' 0004: DWORD Version number 1 0008: DWORD Length
- * of the directory header 000C: DWORD $0a (unknown) 0010: DWORD $1000 Directory
- * chunk size 0014: DWORD "Density" of quickref section, usually 2 0018: DWORD
- * Depth of the index tree - 1 there is no index, 2 if there is one level of
- * PMGI chunks 001C: DWORD Chunk number of root index chunk, -1 if there is none
- * (though at least one file has 0 despite there being no index chunk, probably
- * a bug) 0020: DWORD Chunk number of first PMGL (listing) chunk 0024: DWORD
- * Chunk number of last PMGL (listing) chunk 0028: DWORD -1 (unknown) 002C:
- * DWORD Number of directory chunks (total) 0030: DWORD Windows language ID
- * 0034: GUID {5D02926A-212E-11D0-9DF9-00A0C922E6EC} 0044: DWORD $54 (This is
- * the length again) 0048: DWORD -1 (unknown) 004C: DWORD -1 (unknown) 0050:
- * DWORD -1 (unknown)
- *
- * {@link http
- * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original
- * /?show-translation-form=1}
- *
- */
-public class ChmItspHeader implements ChmAccessor<ChmItspHeader> {
- // TODO: refactor all unmarshals
- private static final long serialVersionUID = 1962394421998181341L;
- private byte[] signature;
- private int version; /* 4 */
- private int header_len; /* 8 */
- private int unknown_000c; /* c */
- private long block_len; /* 10 */
- private int blockidx_intvl; /* 14 */
- private int index_depth; /* 18 */
- private int index_root; /* 1c */
- private int index_head; /* 20 */
- private int unknown_0024; /* 24 */
- private long num_blocks; /* 28 */
- private int unknown_002c; /* 2c */
- private long lang_id; /* 30 */
- private byte[] system_uuid = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 34 */
- private byte[] unknown_0044 = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 44 */
-
- /* local usage */
- private int dataRemained;
- private int currentPlace = 0;
-
- public ChmItspHeader() {
- signature = ChmConstants.ITSP.getBytes(UTF_8); /*
- * 0
- * (ITSP
- * )
- */
- }
-
- public String toString() {
- StringBuilder sb = new StringBuilder();
- sb.append("[ signature:=" + new String(getSignature(), UTF_8)
- + System.getProperty("line.separator"));
- sb.append("version:=\t" + getVersion()
- + System.getProperty("line.separator"));
- sb.append("header_len:=\t" + getHeader_len()
- + System.getProperty("line.separator"));
- sb.append("unknown_00c:=\t" + getUnknown_000c()
- + System.getProperty("line.separator"));
- sb.append("block_len:=\t" + getBlock_len() + " [directory chunk size]"
- + System.getProperty("line.separator"));
- sb.append("blockidx_intvl:=" + getBlockidx_intvl()
- + ", density of quickref section, usually 2"
- + System.getProperty("line.separator"));
- sb.append("index_depth:=\t"
- + getIndex_depth()
- + ", depth of the index tree - 1 there is no index, 2 if there is one level of PMGI chunk"
- + System.getProperty("line.separator"));
- sb.append("index_root:=\t" + getIndex_root()
- + ", chunk number of root index chunk, -1 if there is none"
- + System.getProperty("line.separator"));
- sb.append("index_head:=\t" + getIndex_head()
- + ", chunk number of first PMGL (listing) chunk"
- + System.getProperty("line.separator"));
- sb.append("unknown_0024:=\t" + getUnknown_0024()
- + ", chunk number of last PMGL (listing) chunk"
- + System.getProperty("line.separator"));
- sb.append("num_blocks:=\t" + getNum_blocks() + ", -1 (unknown)"
- + System.getProperty("line.separator"));
- sb.append("unknown_002c:=\t" + getUnknown_002c()
- + ", number of directory chunks (total)"
- + System.getProperty("line.separator"));
- sb.append("lang_id:=\t" + getLang_id() + " - "
- + ChmCommons.getLanguage(getLang_id())
- + System.getProperty("line.separator"));
- sb.append("system_uuid:=" + getSystem_uuid()
- + System.getProperty("line.separator"));
- sb.append("unknown_0044:=" + getUnknown_0044() + " ]");
- return sb.toString();
- }
-
- /**
- * Copies 4 bits from data[]
- *
- * @param data
- * @param chmItspHeader
- * @param count
- * @throws TikaException
- */
- private void unmarshalCharArray(byte[] data, ChmItspHeader chmItspHeader,
- int count) throws TikaException {
- ChmAssert.assertByteArrayNotNull(data);
- ChmAssert.assertChmAccessorNotNull(chmItspHeader);
- this.setDataRemained(data.length);
- System.arraycopy(data, 0, chmItspHeader.signature, 0, count);
- this.setCurrentPlace(this.getCurrentPlace() + count);
- this.setDataRemained(this.getDataRemained() - count);
- }
-
- private int unmarshalInt32(byte[] data, int dataLenght, int dest) throws TikaException {
- ChmAssert.assertByteArrayNotNull(data);
- if (4 > this.getDataRemained())
- throw new TikaException("4 > dataLenght");
- dest = (data[this.getCurrentPlace()] & 0xff)
- | (data[this.getCurrentPlace() + 1] & 0xff) << 8
- | (data[this.getCurrentPlace() + 2] & 0xff) << 16
- | (data[this.getCurrentPlace() + 3] & 0xff) << 24;
-
- this.setCurrentPlace(this.getCurrentPlace() + 4);
- this.setDataRemained(this.getDataRemained() - 4);
- return dest;
- }
-
- private long unmarshalUInt32(byte[] data, int dataLenght, long dest) throws TikaException {
- ChmAssert.assertByteArrayNotNull(data);
- if (4 > dataLenght)
- throw new TikaException("4 > dataLenght");
- dest = (data[this.getCurrentPlace()] & 0xff)
- | (data[this.getCurrentPlace() + 1] & 0xff) << 8
- | (data[this.getCurrentPlace() + 2] & 0xff) << 16
- | (data[this.getCurrentPlace() + 3] & 0xff) << 24;
-
- setDataRemained(this.getDataRemained() - 4);
- this.setCurrentPlace(this.getCurrentPlace() + 4);
- return dest;
- }
-
- private byte[] unmarshalUuid(byte[] data, int dataLenght, byte[] dest,
- int count) {
- System.arraycopy(data, this.getCurrentPlace(), dest, 0, count);
- this.setCurrentPlace(this.getCurrentPlace() + count);
- this.setDataRemained(this.getDataRemained() - count);
- return dest;
- }
-
- /**
- * Returns how many bytes remained
- *
- * @return int
- */
- private int getDataRemained() {
- return dataRemained;
- }
-
- /**
- * Sets how many bytes remained
- *
- * @param dataRemained
- */
- private void setDataRemained(int dataRemained) {
- this.dataRemained = dataRemained;
- }
-
- /**
- * Returns a place holder
- *
- * @return current place
- */
- private int getCurrentPlace() {
- return currentPlace;
- }
-
- /**
- * Sets current place
- *
- * @param currentPlace
- */
- private void setCurrentPlace(int currentPlace) {
- this.currentPlace = currentPlace;
- }
-
- /**
- * Returns a signature of the header
- *
- * @return itsp signature
- */
- public byte[] getSignature() {
- return signature;
- }
-
- /**
- * Sets itsp signature
- *
- * @param signature
- */
- protected void setSignature(byte[] signature) {
- this.signature = signature;
- }
-
- /**
- * Returns version of itsp header
- *
- * @return version
- */
- public int getVersion() {
- return version;
- }
-
- /**
- * Sets a version of itsp header
- *
- * @param version
- */
- protected void setVersion(int version) {
- this.version = version;
- }
-
- /**
- * Returns header length
- *
- * @return header length
- */
- public int getHeader_len() {
- return header_len;
- }
-
- /**
- * Sets itsp header length
- *
- * @param header_len
- */
- protected void setHeader_len(int header_len) {
- this.header_len = header_len;
- }
-
- /**
- * Returns 000c unknown bytes
- */
- public int getUnknown_000c() {
- return unknown_000c;
- }
-
- /**
- * Sets 000c unknown bytes Unknown means here that those guys who cracked
- * the chm format do not know what's it purposes for
- *
- * @param unknown_000c
- */
- protected void setUnknown_000c(int unknown_000c) {
- this.unknown_000c = unknown_000c;
- }
-
- /**
- * Returns block's length
- *
- * @return block_length
- */
- public long getBlock_len() {
- return block_len;
- }
-
- /**
- * Sets block length
- *
- * @param block_len
- */
- protected void setBlock_len(long block_len) {
- this.block_len = block_len;
- }
-
- /**
- * Returns block index interval
- *
- * @return blockidx_intvl
- */
- public int getBlockidx_intvl() {
- return blockidx_intvl;
- }
-
- /**
- * Sets block index interval
- *
- * @param blockidx_intvl
- */
- protected void setBlockidx_intvl(int blockidx_intvl) {
- this.blockidx_intvl = blockidx_intvl;
- }
-
- /**
- * Returns an index depth
- *
- * @return index_depth
- */
- public int getIndex_depth() {
- return index_depth;
- }
-
- /**
- * Sets an index depth
- *
- * @param index_depth
- */
- protected void setIndex_depth(int index_depth) {
- this.index_depth = index_depth;
- }
-
- /**
- * Returns index root
- *
- * @return index_root
- */
- public int getIndex_root() {
- return index_root;
- }
-
- /**
- * Sets an index root
- *
- * @param index_root
- */
- protected void setIndex_root(int index_root) {
- this.index_root = index_root;
- }
-
- /**
- * Returns an index head
- *
- * @return index_head
- */
- public int getIndex_head() {
- return index_head;
- }
-
- /**
- * Sets an index head
- *
- * @param index_head
- */
- protected void setIndex_head(int index_head) {
- this.index_head = index_head;
- }
-
- /**
- * Returns 0024 unknown bytes
- *
- * @return unknown_0024
- */
- public int getUnknown_0024() {
- return unknown_0024;
- }
-
- /**
- * Sets 0024 unknown bytes
- *
- * @param unknown_0024
- */
- protected void setUnknown_0024(int unknown_0024) {
- this.unknown_0024 = unknown_0024;
- }
-
- /**
- * Returns number of blocks
- *
- * @return num_blocks
- */
- public long getNum_blocks() {
- return num_blocks;
- }
-
- /**
- * Sets number of blocks containing in the chm file
- *
- * @param num_blocks
- */
- protected void setNum_blocks(long num_blocks) {
- this.num_blocks = num_blocks;
- }
-
- /**
- * Returns 002c unknown bytes
- *
- * @return unknown_002c
- */
- public int getUnknown_002c() {
- return unknown_002c;
- }
-
- /**
- * Sets 002c unknown bytes
- *
- * @param unknown_002c
- */
- protected void setUnknown_002c(int unknown_002c) {
- this.unknown_002c = unknown_002c;
- }
-
- /**
- * Returns language id
- *
- * @return lang_id
- */
- public long getLang_id() {
- return lang_id;
- }
-
- /**
- * Sets language id
- *
- * @param lang_id
- */
- protected void setLang_id(long lang_id) {
- this.lang_id = lang_id;
- }
-
- /**
- * Returns system uuid
- *
- * @return system_uuid
- */
- public byte[] getSystem_uuid() {
- return system_uuid;
- }
-
- /**
- * Sets system uuid
- *
- * @param system_uuid
- */
- protected void setSystem_uuid(byte[] system_uuid) {
- this.system_uuid = system_uuid;
- }
-
- /**
- * Returns 0044 unknown bytes
- *
- * @return unknown_0044
- */
- public byte[] getUnknown_0044() {
- return unknown_0044;
- }
-
- /**
- * Sets 0044 unknown bytes
- *
- * @param unknown_0044
- */
- protected void setUnknown_0044(byte[] unknown_0044) {
- this.unknown_0044 = unknown_0044;
- }
-
- // @Override
- public void parse(byte[] data, ChmItspHeader chmItspHeader) throws TikaException {
- /* we only know how to deal with the 0x58 and 0x60 byte structures */
- if (data.length != ChmConstants.CHM_ITSP_V1_LEN)
- throw new ChmParsingException("we only know how to deal with the 0x58 and 0x60 byte structures");
-
- /* unmarshal common fields */
- chmItspHeader.unmarshalCharArray(data, chmItspHeader, ChmConstants.CHM_SIGNATURE_LEN);
- // ChmCommons.unmarshalCharArray(data, chmItspHeader,
- // ChmConstants.CHM_SIGNATURE_LEN);
- chmItspHeader.setVersion(chmItspHeader.unmarshalInt32(data,
- chmItspHeader.getDataRemained(), chmItspHeader.getVersion()));
- chmItspHeader
- .setHeader_len(chmItspHeader.unmarshalInt32(data,
- chmItspHeader.getDataRemained(),
- chmItspHeader.getHeader_len()));
- chmItspHeader.setUnknown_000c(chmItspHeader.unmarshalInt32(data,
- chmItspHeader.getDataRemained(),
- chmItspHeader.getUnknown_000c()));
- chmItspHeader.setBlock_len(chmItspHeader.unmarshalUInt32(data,
- chmItspHeader.getDataRemained(), chmItspHeader.getBlock_len()));
- chmItspHeader.setBlockidx_intvl(chmItspHeader.unmarshalInt32(data,
- chmItspHeader.getDataRemained(),
- chmItspHeader.getBlockidx_intvl()));
- chmItspHeader
- .setIndex_depth(chmItspHeader.unmarshalInt32(data,
- chmItspHeader.getDataRemained(),
- chmItspHeader.getIndex_depth()));
- chmItspHeader
- .setIndex_root(chmItspHeader.unmarshalInt32(data,
- chmItspHeader.getDataRemained(),
- chmItspHeader.getIndex_root()));
- chmItspHeader
- .setIndex_head(chmItspHeader.unmarshalInt32(data,
- chmItspHeader.getDataRemained(),
- chmItspHeader.getIndex_head()));
- chmItspHeader.setUnknown_0024(chmItspHeader.unmarshalInt32(data,
- chmItspHeader.getDataRemained(),
- chmItspHeader.getUnknown_0024()));
- chmItspHeader
- .setNum_blocks(chmItspHeader.unmarshalUInt32(data,
- chmItspHeader.getDataRemained(),
- chmItspHeader.getNum_blocks()));
- chmItspHeader.setUnknown_002c((chmItspHeader.unmarshalInt32(data,
- chmItspHeader.getDataRemained(),
- chmItspHeader.getUnknown_002c())));
- chmItspHeader.setLang_id(chmItspHeader.unmarshalUInt32(data,
- chmItspHeader.getDataRemained(), chmItspHeader.getLang_id()));
- chmItspHeader
- .setSystem_uuid(chmItspHeader.unmarshalUuid(data,
- chmItspHeader.getDataRemained(),
- chmItspHeader.getSystem_uuid(),
- ChmConstants.BYTE_ARRAY_LENGHT));
- chmItspHeader
- .setUnknown_0044(chmItspHeader.unmarshalUuid(data,
- chmItspHeader.getDataRemained(),
- chmItspHeader.getUnknown_0044(),
- ChmConstants.BYTE_ARRAY_LENGHT));
-
- /* Checks validity of the itsp header */
- if (!new String(chmItspHeader.getSignature(), UTF_8).equals(ChmConstants.ITSP))
- throw new ChmParsingException("seems not valid signature");
-
- if (chmItspHeader.getVersion() != ChmConstants.CHM_VER_1)
- throw new ChmParsingException("!=ChmConstants.CHM_VER_1");
-
- if (chmItspHeader.getHeader_len() != ChmConstants.CHM_ITSP_V1_LEN)
- throw new ChmParsingException("!= ChmConstants.CHM_ITSP_V1_LEN");
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Directory header The directory starts with a header; its format is as
+ * follows: 0000: char[4] 'ITSP' 0004: DWORD Version number 1 0008: DWORD Length
+ * of the directory header 000C: DWORD $0a (unknown) 0010: DWORD $1000 Directory
+ * chunk size 0014: DWORD "Density" of quickref section, usually 2 0018: DWORD
+ * Depth of the index tree - 1 there is no index, 2 if there is one level of
+ * PMGI chunks 001C: DWORD Chunk number of root index chunk, -1 if there is none
+ * (though at least one file has 0 despite there being no index chunk, probably
+ * a bug) 0020: DWORD Chunk number of first PMGL (listing) chunk 0024: DWORD
+ * Chunk number of last PMGL (listing) chunk 0028: DWORD -1 (unknown) 002C:
+ * DWORD Number of directory chunks (total) 0030: DWORD Windows language ID
+ * 0034: GUID {5D02926A-212E-11D0-9DF9-00A0C922E6EC} 0044: DWORD $54 (This is
+ * the length again) 0048: DWORD -1 (unknown) 004C: DWORD -1 (unknown) 0050:
+ * DWORD -1 (unknown)
+ *
+ * See
+ * <a href="http://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original/?show-translation-form=1">
+ * Microsoft's HTML Help (.chm) format (incomplete)</a>
+ *
+ */
+public class ChmItspHeader implements ChmAccessor<ChmItspHeader> {
+ // TODO: refactor all unmarshals
+ private static final long serialVersionUID = 1962394421998181341L;
+ private byte[] signature;
+ private int version; /* 4 */
+ private int header_len; /* 8 */
+ private int unknown_000c; /* c */
+ private long block_len; /* 10 */
+ private int blockidx_intvl; /* 14 */
+ private int index_depth; /* 18 */
+ private int index_root; /* 1c */
+ private int index_head; /* 20 */
+ private int unknown_0024; /* 24 */
+ private long num_blocks; /* 28 */
+ private int unknown_002c; /* 2c */
+ private long lang_id; /* 30 */
+ private byte[] system_uuid = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 34 */
+ private byte[] unknown_0044 = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 44 */
+
+ /* local usage */
+ private int dataRemained;
+ private int currentPlace = 0;
+
+ public ChmItspHeader() {
+ signature = ChmConstants.ITSP.getBytes(UTF_8); /*
+ * 0
+ * (ITSP
+ * )
+ */
+ }
+
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("[ signature:=" + new String(getSignature(), UTF_8)
+ + System.getProperty("line.separator"));
+ sb.append("version:=\t" + getVersion()
+ + System.getProperty("line.separator"));
+ sb.append("header_len:=\t" + getHeader_len()
+ + System.getProperty("line.separator"));
+ sb.append("unknown_00c:=\t" + getUnknown_000c()
+ + System.getProperty("line.separator"));
+ sb.append("block_len:=\t" + getBlock_len() + " [directory chunk size]"
+ + System.getProperty("line.separator"));
+ sb.append("blockidx_intvl:=" + getBlockidx_intvl()
+ + ", density of quickref section, usually 2"
+ + System.getProperty("line.separator"));
+ sb.append("index_depth:=\t"
+ + getIndex_depth()
+ + ", depth of the index tree - 1 there is no index, 2 if there is one level of PMGI chunk"
+ + System.getProperty("line.separator"));
+ sb.append("index_root:=\t" + getIndex_root()
+ + ", chunk number of root index chunk, -1 if there is none"
+ + System.getProperty("line.separator"));
+ sb.append("index_head:=\t" + getIndex_head()
+ + ", chunk number of first PMGL (listing) chunk"
+ + System.getProperty("line.separator"));
+ sb.append("unknown_0024:=\t" + getUnknown_0024()
+ + ", chunk number of last PMGL (listing) chunk"
+ + System.getProperty("line.separator"));
+ sb.append("num_blocks:=\t" + getNum_blocks() + ", -1 (unknown)"
+ + System.getProperty("line.separator"));
+ sb.append("unknown_002c:=\t" + getUnknown_002c()
+ + ", number of directory chunks (total)"
+ + System.getProperty("line.separator"));
+ sb.append("lang_id:=\t" + getLang_id() + " - "
+ + ChmCommons.getLanguage(getLang_id())
+ + System.getProperty("line.separator"));
+ sb.append("system_uuid:=" + getSystem_uuid()
+ + System.getProperty("line.separator"));
+ sb.append("unknown_0044:=" + getUnknown_0044() + " ]");
+ return sb.toString();
+ }
+
+ /**
+ * Copies the first {@code count} bytes of data[] into the header's signature
+ *
+ * @param data
+ * @param chmItspHeader
+ * @param count
+ * @throws TikaException
+ */
+ private void unmarshalCharArray(byte[] data, ChmItspHeader chmItspHeader,
+ int count) throws TikaException {
+ ChmAssert.assertByteArrayNotNull(data);
+ ChmAssert.assertChmAccessorNotNull(chmItspHeader);
+ this.setDataRemained(data.length);
+ System.arraycopy(data, 0, chmItspHeader.signature, 0, count);
+ this.setCurrentPlace(this.getCurrentPlace() + count);
+ this.setDataRemained(this.getDataRemained() - count);
+ }
+
+ private int unmarshalInt32(byte[] data, int dataLenght, int dest) throws TikaException {
+ ChmAssert.assertByteArrayNotNull(data);
+ if (4 > this.getDataRemained())
+ throw new TikaException("4 > dataLenght");
+ dest = (data[this.getCurrentPlace()] & 0xff)
+ | (data[this.getCurrentPlace() + 1] & 0xff) << 8
+ | (data[this.getCurrentPlace() + 2] & 0xff) << 16
+ | (data[this.getCurrentPlace() + 3] & 0xff) << 24;
+
+ this.setCurrentPlace(this.getCurrentPlace() + 4);
+ this.setDataRemained(this.getDataRemained() - 4);
+ return dest;
+ }
+
+ private long unmarshalUInt32(byte[] data, int dataLenght, long dest) throws TikaException {
+ ChmAssert.assertByteArrayNotNull(data);
+ if (4 > dataLenght)
+ throw new TikaException("4 > dataLenght");
+ dest = (data[this.getCurrentPlace()] & 0xff)
+ | (data[this.getCurrentPlace() + 1] & 0xff) << 8
+ | (data[this.getCurrentPlace() + 2] & 0xff) << 16
+ | (data[this.getCurrentPlace() + 3] & 0xff) << 24;
+
+ setDataRemained(this.getDataRemained() - 4);
+ this.setCurrentPlace(this.getCurrentPlace() + 4);
+ return dest;
+ }
+
+ private byte[] unmarshalUuid(byte[] data, int dataLenght, byte[] dest,
+ int count) {
+ System.arraycopy(data, this.getCurrentPlace(), dest, 0, count);
+ this.setCurrentPlace(this.getCurrentPlace() + count);
+ this.setDataRemained(this.getDataRemained() - count);
+ return dest;
+ }
+
+ /**
+ * Returns how many bytes remain to be read
+ *
+ * @return int
+ */
+ private int getDataRemained() {
+ return dataRemained;
+ }
+
+ /**
+ * Sets how many bytes remain to be read
+ *
+ * @param dataRemained
+ */
+ private void setDataRemained(int dataRemained) {
+ this.dataRemained = dataRemained;
+ }
+
+ /**
+ * Returns the current read offset into the data array
+ *
+ * @return current place
+ */
+ private int getCurrentPlace() {
+ return currentPlace;
+ }
+
+ /**
+ * Sets current place
+ *
+ * @param currentPlace
+ */
+ private void setCurrentPlace(int currentPlace) {
+ this.currentPlace = currentPlace;
+ }
+
+ /**
+ * Returns a signature of the header
+ *
+ * @return itsp signature
+ */
+ public byte[] getSignature() {
+ return signature;
+ }
+
+ /**
+ * Sets itsp signature
+ *
+ * @param signature
+ */
+ protected void setSignature(byte[] signature) {
+ this.signature = signature;
+ }
+
+ /**
+ * Returns version of itsp header
+ *
+ * @return version
+ */
+ public int getVersion() {
+ return version;
+ }
+
+ /**
+ * Sets a version of itsp header
+ *
+ * @param version
+ */
+ protected void setVersion(int version) {
+ this.version = version;
+ }
+
+ /**
+ * Returns header length
+ *
+ * @return header length
+ */
+ public int getHeader_len() {
+ return header_len;
+ }
+
+ /**
+ * Sets itsp header length
+ *
+ * @param header_len
+ */
+ protected void setHeader_len(int header_len) {
+ this.header_len = header_len;
+ }
+
+ /**
+ * Returns 000c unknown bytes
+ */
+ public int getUnknown_000c() {
+ return unknown_000c;
+ }
+
+ /**
+ * Sets 000c unknown bytes. "Unknown" here means that those who
+ * reverse-engineered the chm format do not know what the field is for
+ *
+ * @param unknown_000c
+ */
+ protected void setUnknown_000c(int unknown_000c) {
+ this.unknown_000c = unknown_000c;
+ }
+
+ /**
+ * Returns block's length
+ *
+ * @return block_length
+ */
+ public long getBlock_len() {
+ return block_len;
+ }
+
+ /**
+ * Sets block length
+ *
+ * @param block_len
+ */
+ protected void setBlock_len(long block_len) {
+ this.block_len = block_len;
+ }
+
+ /**
+ * Returns block index interval
+ *
+ * @return blockidx_intvl
+ */
+ public int getBlockidx_intvl() {
+ return blockidx_intvl;
+ }
+
+ /**
+ * Sets block index interval
+ *
+ * @param blockidx_intvl
+ */
+ protected void setBlockidx_intvl(int blockidx_intvl) {
+ this.blockidx_intvl = blockidx_intvl;
+ }
+
+ /**
+ * Returns an index depth
+ *
+ * @return index_depth
+ */
+ public int getIndex_depth() {
+ return index_depth;
+ }
+
+ /**
+ * Sets an index depth
+ *
+ * @param index_depth
+ */
+ protected void setIndex_depth(int index_depth) {
+ this.index_depth = index_depth;
+ }
+
+ /**
+ * Returns index root
+ *
+ * @return index_root
+ */
+ public int getIndex_root() {
+ return index_root;
+ }
+
+ /**
+ * Sets an index root
+ *
+ * @param index_root
+ */
+ protected void setIndex_root(int index_root) {
+ this.index_root = index_root;
+ }
+
+ /**
+ * Returns an index head
+ *
+ * @return index_head
+ */
+ public int getIndex_head() {
+ return index_head;
+ }
+
+ /**
+ * Sets an index head
+ *
+ * @param index_head
+ */
+ protected void setIndex_head(int index_head) {
+ this.index_head = index_head;
+ }
+
+ /**
+ * Returns 0024 unknown bytes
+ *
+ * @return unknown_0024
+ */
+ public int getUnknown_0024() {
+ return unknown_0024;
+ }
+
+ /**
+ * Sets 0024 unknown bytes
+ *
+ * @param unknown_0024
+ */
+ protected void setUnknown_0024(int unknown_0024) {
+ this.unknown_0024 = unknown_0024;
+ }
+
+ /**
+ * Returns number of blocks
+ *
+ * @return num_blocks
+ */
+ public long getNum_blocks() {
+ return num_blocks;
+ }
+
+ /**
+ * Sets the number of blocks contained in the chm file
+ *
+ * @param num_blocks
+ */
+ protected void setNum_blocks(long num_blocks) {
+ this.num_blocks = num_blocks;
+ }
+
+ /**
+ * Returns 002c unknown bytes
+ *
+ * @return unknown_002c
+ */
+ public int getUnknown_002c() {
+ return unknown_002c;
+ }
+
+ /**
+ * Sets 002c unknown bytes
+ *
+ * @param unknown_002c
+ */
+ protected void setUnknown_002c(int unknown_002c) {
+ this.unknown_002c = unknown_002c;
+ }
+
+ /**
+ * Returns language id
+ *
+ * @return lang_id
+ */
+ public long getLang_id() {
+ return lang_id;
+ }
+
+ /**
+ * Sets language id
+ *
+ * @param lang_id
+ */
+ protected void setLang_id(long lang_id) {
+ this.lang_id = lang_id;
+ }
+
+ /**
+ * Returns system uuid
+ *
+ * @return system_uuid
+ */
+ public byte[] getSystem_uuid() {
+ return system_uuid;
+ }
+
+ /**
+ * Sets system uuid
+ *
+ * @param system_uuid
+ */
+ protected void setSystem_uuid(byte[] system_uuid) {
+ this.system_uuid = system_uuid;
+ }
+
+ /**
+ * Returns 0044 unknown bytes
+ *
+ * @return unknown_0044
+ */
+ public byte[] getUnknown_0044() {
+ return unknown_0044;
+ }
+
+ /**
+ * Sets 0044 unknown bytes
+ *
+ * @param unknown_0044
+ */
+ protected void setUnknown_0044(byte[] unknown_0044) {
+ this.unknown_0044 = unknown_0044;
+ }
+
+ // @Override
+ public void parse(byte[] data, ChmItspHeader chmItspHeader) throws TikaException {
+ /* we only accept a header of exactly ChmConstants.CHM_ITSP_V1_LEN bytes */
+ if (data.length != ChmConstants.CHM_ITSP_V1_LEN)
+ throw new ChmParsingException("we only know how to deal with the 0x58 and 0x60 byte structures");
+
+ /* unmarshal common fields */
+ chmItspHeader.unmarshalCharArray(data, chmItspHeader, ChmConstants.CHM_SIGNATURE_LEN);
+ // ChmCommons.unmarshalCharArray(data, chmItspHeader,
+ // ChmConstants.CHM_SIGNATURE_LEN);
+ chmItspHeader.setVersion(chmItspHeader.unmarshalInt32(data,
+ chmItspHeader.getDataRemained(), chmItspHeader.getVersion()));
+ chmItspHeader
+ .setHeader_len(chmItspHeader.unmarshalInt32(data,
+ chmItspHeader.getDataRemained(),
+ chmItspHeader.getHeader_len()));
+ chmItspHeader.setUnknown_000c(chmItspHeader.unmarshalInt32(data,
+ chmItspHeader.getDataRemained(),
+ chmItspHeader.getUnknown_000c()));
+ chmItspHeader.setBlock_len(chmItspHeader.unmarshalUInt32(data,
+ chmItspHeader.getDataRemained(), chmItspHeader.getBlock_len()));
+ chmItspHeader.setBlockidx_intvl(chmItspHeader.unmarshalInt32(data,
+ chmItspHeader.getDataRemained(),
+ chmItspHeader.getBlockidx_intvl()));
+ chmItspHeader
+ .setIndex_depth(chmItspHeader.unmarshalInt32(data,
+ chmItspHeader.getDataRemained(),
+ chmItspHeader.getIndex_depth()));
+ chmItspHeader
+ .setIndex_root(chmItspHeader.unmarshalInt32(data,
+ chmItspHeader.getDataRemained(),
+ chmItspHeader.getIndex_root()));
+ chmItspHeader
+ .setIndex_head(chmItspHeader.unmarshalInt32(data,
+ chmItspHeader.getDataRemained(),
+ chmItspHeader.getIndex_head()));
+ chmItspHeader.setUnknown_0024(chmItspHeader.unmarshalInt32(data,
+ chmItspHeader.getDataRemained(),
+ chmItspHeader.getUnknown_0024()));
+ chmItspHeader
+ .setNum_blocks(chmItspHeader.unmarshalUInt32(data,
+ chmItspHeader.getDataRemained(),
+ chmItspHeader.getNum_blocks()));
+ chmItspHeader.setUnknown_002c((chmItspHeader.unmarshalInt32(data,
+ chmItspHeader.getDataRemained(),
+ chmItspHeader.getUnknown_002c())));
+ chmItspHeader.setLang_id(chmItspHeader.unmarshalUInt32(data,
+ chmItspHeader.getDataRemained(), chmItspHeader.getLang_id()));
+ chmItspHeader
+ .setSystem_uuid(chmItspHeader.unmarshalUuid(data,
+ chmItspHeader.getDataRemained(),
+ chmItspHeader.getSystem_uuid(),
+ ChmConstants.BYTE_ARRAY_LENGHT));
+ chmItspHeader
+ .setUnknown_0044(chmItspHeader.unmarshalUuid(data,
+ chmItspHeader.getDataRemained(),
+ chmItspHeader.getUnknown_0044(),
+ ChmConstants.BYTE_ARRAY_LENGHT));
+
+ /* Checks validity of the itsp header */
+ if (!new String(chmItspHeader.getSignature(), UTF_8).equals(ChmConstants.ITSP))
+ throw new ChmParsingException("seems not valid signature");
+
+ if (chmItspHeader.getVersion() != ChmConstants.CHM_VER_1)
+ throw new ChmParsingException("!=ChmConstants.CHM_VER_1");
+
+ if (chmItspHeader.getHeader_len() != ChmConstants.CHM_ITSP_V1_LEN)
+ throw new ChmParsingException("!= ChmConstants.CHM_ITSP_V1_LEN");
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java
index 6054695..17a2e2f 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java
@@ -1,319 +1,319 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.accessor;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.assertion.ChmAssert;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.apache.tika.parser.chm.exception.ChmParsingException;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-/**
- *
- * ::DataSpace/Storage/<SectionName>/ControlData This file contains $20 bytes of
- * information on the compression. The information is partially known: 0000:
- * DWORD 6 (unknown) 0004: ASCII 'LZXC' Compression type identifier 0008: DWORD
- * 2 (Possibly numeric code for LZX) 000C: DWORD The Huffman reset interval in
- * $8000-byte blocks 0010: DWORD The window size in $8000-byte blocks 0014:
- * DWORD unknown (sometimes 2, sometimes 1, sometimes 0) 0018: DWORD 0 (unknown)
- * 001C: DWORD 0 (unknown)
- *
- * {@link http
- * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original
- * /?page=2 }
- *
- */
-public class ChmLzxcControlData implements ChmAccessor<ChmLzxcControlData> {
- private static final long serialVersionUID = -7897854774939631565L;
- /* class' members */
- private long size; /* 0 */
- private byte[] signature;
- private long version; /* 8 */
- private long resetInterval; /* c */
- private long windowSize; /* 10 */
- private long windowsPerReset; /* 14 */
- private long unknown_18; /* 18 */
-
- /* local usage */
- private int dataRemained;
- private int currentPlace = 0;
-
- public ChmLzxcControlData() {
- signature = ChmConstants.LZXC.getBytes(UTF_8); /*
- * 4
- * (LZXC
- * )
- */
- }
-
- /**
- * Returns a remained data
- *
- * @return dataRemained
- */
- private int getDataRemained() {
- return dataRemained;
- }
-
- /**
- * Sets a remained data
- *
- * @param dataRemained
- */
- private void setDataRemained(int dataRemained) {
- this.dataRemained = dataRemained;
- }
-
- /**
- * Returns a place holder
- *
- * @return current_place
- */
- private int getCurrentPlace() {
- return currentPlace;
- }
-
- /**
- * Sets a place holder
- *
- * @param current_place
- */
- private void setCurrentPlace(int currentPlace) {
- this.currentPlace = currentPlace;
- }
-
- /**
- * Returns a size of control data
- *
- * @return size
- */
- public long getSize() {
- return size;
- }
-
- /**
- * Sets a size of control data
- *
- * @param size
- */
- protected void setSize(long size) {
- this.size = size;
- }
-
- /**
- * Returns a signature of control data block
- *
- * @return signature
- */
- public byte[] getSignature() {
- return signature;
- }
-
- /**
- * Sets a signature of control data block
- *
- * @param signature
- */
- protected void setSignature(byte[] signature) {
- this.signature = signature;
- }
-
- /**
- * Returns a version of control data block
- *
- * @return version
- */
- public long getVersion() {
- return version;
- }
-
- /**
- * Sets version of control data block
- *
- * @param version
- */
- protected void setVersion(long version) {
- this.version = version;
- }
-
- /**
- * Returns reset interval
- *
- * @return reset_interval
- */
- public long getResetInterval() {
- return resetInterval;
- }
-
- /**
- * Sets a reset interval
- *
- * @param resetInterval
- */
- protected void setResetInterval(long resetInterval) {
- this.resetInterval = resetInterval;
- }
-
- /**
- * Returns a window size
- *
- * @return window_size
- */
- public long getWindowSize() {
- return windowSize;
- }
-
- /**
- * Sets a window size
- *
- * @param window_size
- */
- protected void setWindowSize(long windowSize) {
- this.windowSize = windowSize;
- }
-
- /**
- * Returns windows per reset
- *
- * @return
- */
- public long getWindowsPerReset() {
- return windowsPerReset;
- }
-
- /**
- * Sets windows per reset
- *
- * @param windows_per_reset
- */
- protected void setWindowsPerReset(long windowsPerReset) {
- this.windowsPerReset = windowsPerReset;
- }
-
- /**
- * Returns unknown 18 bytes
- *
- * @return unknown_18
- */
- public long getUnknown_18() {
- return unknown_18;
- }
-
- /**
- * Sets unknown 18 bytes
- *
- * @param unknown_18
- */
- protected void setUnknown_18(long unknown_18) {
- this.unknown_18 = unknown_18;
- }
-
- private long unmarshalUInt32(byte[] data, long dest) throws ChmParsingException {
- assert (data != null && data.length > 0);
- if (4 > getDataRemained())
- throw new ChmParsingException("4 > dataLenght");
- dest = data[this.getCurrentPlace()]
- | data[this.getCurrentPlace() + 1] << 8
- | data[this.getCurrentPlace() + 2] << 16
- | data[this.getCurrentPlace() + 3] << 24;
-
- setDataRemained(this.getDataRemained() - 4);
- this.setCurrentPlace(this.getCurrentPlace() + 4);
- return dest;
- }
-
- private void unmarshalCharArray(byte[] data,
- ChmLzxcControlData chmLzxcControlData, int count) throws TikaException {
- ChmAssert.assertByteArrayNotNull(data);
- ChmAssert.assertChmAccessorNotNull(chmLzxcControlData);
- ChmAssert.assertPositiveInt(count);
- System.arraycopy(data, 4, chmLzxcControlData.getSignature(), 0, count);
- this.setCurrentPlace(this.getCurrentPlace() + count);
- this.setDataRemained(this.getDataRemained() - count);
- }
-
- /**
- * Returns textual representation of ChmLzxcControlData
- */
- public String toString() {
- StringBuilder sb = new StringBuilder();
- sb.append("size(unknown):=" + this.getSize() + ", ");
- sb.append("signature(Compression type identifier):="
- + new String(this.getSignature(), UTF_8) + ", ");
- sb.append("version(Possibly numeric code for LZX):="
- + this.getVersion() + System.getProperty("line.separator"));
- sb.append("resetInterval(The Huffman reset interval):="
- + this.getResetInterval() + ", ");
- sb.append("windowSize:=" + this.getWindowSize() + ", ");
- sb.append("windowsPerReset(unknown (sometimes 2, sometimes 1, sometimes 0):="
- + this.getWindowsPerReset() + ", ");
- sb.append("unknown_18:=" + this.getUnknown_18()
- + System.getProperty("line.separator"));
- return sb.toString();
- }
-
- // @Override
- public void parse(byte[] data, ChmLzxcControlData chmLzxcControlData) throws TikaException {
- if (data == null || (data.length < ChmConstants.CHM_LZXC_MIN_LEN))
- throw new ChmParsingException("we want at least 0x18 bytes");
- chmLzxcControlData.setDataRemained(data.length);
- chmLzxcControlData.setSize(unmarshalUInt32(data, chmLzxcControlData.getSize()));
- chmLzxcControlData.unmarshalCharArray(data, chmLzxcControlData,
- ChmConstants.CHM_SIGNATURE_LEN);
- chmLzxcControlData.setVersion(unmarshalUInt32(data,
- chmLzxcControlData.getVersion()));
- chmLzxcControlData.setResetInterval(unmarshalUInt32(data,
- chmLzxcControlData.getResetInterval()));
- chmLzxcControlData.setWindowSize(unmarshalUInt32(data,
- chmLzxcControlData.getWindowSize()));
- chmLzxcControlData.setWindowsPerReset(unmarshalUInt32(data,
- chmLzxcControlData.getWindowsPerReset()));
-
- if (data.length >= ChmConstants.CHM_LZXC_V2_LEN)
- chmLzxcControlData.setUnknown_18(unmarshalUInt32(data,
- chmLzxcControlData.getUnknown_18()));
- else
- chmLzxcControlData.setUnknown_18(0);
-
- if (chmLzxcControlData.getVersion() == 2) {
- chmLzxcControlData.setWindowSize(getWindowSize()
- * ChmConstants.CHM_WINDOW_SIZE_BLOCK);
- }
-
- if (chmLzxcControlData.getWindowSize() == 0
- || chmLzxcControlData.getResetInterval() == 0)
- throw new ChmParsingException(
- "window size / resetInterval should be more than zero");
-
- if (chmLzxcControlData.getWindowSize() == 1)
- throw new ChmParsingException(
- "window size / resetInterval should be more than 1");
-
- /* checks a signature */
- if (!new String(chmLzxcControlData.getSignature(), UTF_8)
- .equals(ChmConstants.LZXC))
- throw new ChmParsingException(
- "the signature does not seem to be correct");
- }
-
- /**
- * @param args
- */
- public static void main(String[] args) {
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Accessor for the LZXC control data block of a CHM file, stored as
+ * {@code ::DataSpace/Storage/<SectionName>/ControlData}. The block contains
+ * $20 bytes of information on the compression, only partially known:
+ *
+ * <pre>
+ * 0000: DWORD 6 (unknown)
+ * 0004: ASCII 'LZXC' Compression type identifier
+ * 0008: DWORD 2 (Possibly numeric code for LZX)
+ * 000C: DWORD The Huffman reset interval in $8000-byte blocks
+ * 0010: DWORD The window size in $8000-byte blocks
+ * 0014: DWORD unknown (sometimes 2, sometimes 1, sometimes 0)
+ * 0018: DWORD 0 (unknown)
+ * 001C: DWORD 0 (unknown)
+ * </pre>
+ *
+ * @see <a href="http://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original/?page=2">
+ *      CHM format description (incomplete), page 2</a>
+ */
+public class ChmLzxcControlData implements ChmAccessor<ChmLzxcControlData> {
+    private static final long serialVersionUID = -7897854774939631565L;
+    /* class' members */
+    private long size; /* 0 */
+    private byte[] signature; /* 4 (LZXC) */
+    private long version; /* 8 */
+    private long resetInterval; /* c */
+    private long windowSize; /* 10 */
+    private long windowsPerReset; /* 14 */
+    private long unknown_18; /* 18 */
+
+    /* local usage: parsing cursor and number of bytes not yet consumed */
+    private int dataRemained;
+    private int currentPlace = 0;
+
+    /**
+     * Creates an instance whose signature buffer is pre-filled with the bytes
+     * of {@code ChmConstants.LZXC}; parsing overwrites this buffer in place.
+     */
+    public ChmLzxcControlData() {
+        signature = ChmConstants.LZXC.getBytes(UTF_8);
+    }
+
+    /**
+     * Returns the number of bytes not yet consumed by the unmarshal helpers
+     *
+     * @return dataRemained
+     */
+    private int getDataRemained() {
+        return dataRemained;
+    }
+
+    /**
+     * Sets the number of bytes not yet consumed
+     *
+     * @param dataRemained
+     */
+    private void setDataRemained(int dataRemained) {
+        this.dataRemained = dataRemained;
+    }
+
+    /**
+     * Returns the current read offset into the data being parsed
+     *
+     * @return currentPlace
+     */
+    private int getCurrentPlace() {
+        return currentPlace;
+    }
+
+    /**
+     * Sets the current read offset into the data being parsed
+     *
+     * @param currentPlace
+     */
+    private void setCurrentPlace(int currentPlace) {
+        this.currentPlace = currentPlace;
+    }
+
+    /**
+     * Returns a size of control data
+     *
+     * @return size
+     */
+    public long getSize() {
+        return size;
+    }
+
+    /**
+     * Sets a size of control data
+     *
+     * @param size
+     */
+    protected void setSize(long size) {
+        this.size = size;
+    }
+
+    /**
+     * Returns a signature of control data block. The internal array is
+     * returned, not a copy.
+     *
+     * @return signature
+     */
+    public byte[] getSignature() {
+        return signature;
+    }
+
+    /**
+     * Sets a signature of control data block
+     *
+     * @param signature
+     */
+    protected void setSignature(byte[] signature) {
+        this.signature = signature;
+    }
+
+    /**
+     * Returns a version of control data block
+     *
+     * @return version
+     */
+    public long getVersion() {
+        return version;
+    }
+
+    /**
+     * Sets version of control data block
+     *
+     * @param version
+     */
+    protected void setVersion(long version) {
+        this.version = version;
+    }
+
+    /**
+     * Returns reset interval
+     *
+     * @return resetInterval
+     */
+    public long getResetInterval() {
+        return resetInterval;
+    }
+
+    /**
+     * Sets a reset interval
+     *
+     * @param resetInterval
+     */
+    protected void setResetInterval(long resetInterval) {
+        this.resetInterval = resetInterval;
+    }
+
+    /**
+     * Returns a window size
+     *
+     * @return windowSize
+     */
+    public long getWindowSize() {
+        return windowSize;
+    }
+
+    /**
+     * Sets a window size
+     *
+     * @param windowSize
+     */
+    protected void setWindowSize(long windowSize) {
+        this.windowSize = windowSize;
+    }
+
+    /**
+     * Returns windows per reset
+     *
+     * @return windowsPerReset
+     */
+    public long getWindowsPerReset() {
+        return windowsPerReset;
+    }
+
+    /**
+     * Sets windows per reset
+     *
+     * @param windowsPerReset
+     */
+    protected void setWindowsPerReset(long windowsPerReset) {
+        this.windowsPerReset = windowsPerReset;
+    }
+
+    /**
+     * Returns the unknown DWORD stored at offset 0x18
+     *
+     * @return unknown_18
+     */
+    public long getUnknown_18() {
+        return unknown_18;
+    }
+
+    /**
+     * Sets the unknown DWORD stored at offset 0x18
+     *
+     * @param unknown_18
+     */
+    protected void setUnknown_18(long unknown_18) {
+        this.unknown_18 = unknown_18;
+    }
+
+    /**
+     * Reads four bytes at the current position as an unsigned little-endian
+     * 32-bit value, then advances the cursor and decrements the remaining
+     * byte count by four.
+     *
+     * @param data
+     *            bytes being parsed
+     * @param dest
+     *            ignored on input; overwritten with the decoded value
+     * @return the decoded value, in the range 0..0xFFFFFFFF
+     * @throws ChmParsingException
+     *             if fewer than four bytes remain
+     */
+    private long unmarshalUInt32(byte[] data, long dest) throws ChmParsingException {
+        assert (data != null && data.length > 0);
+        if (4 > getDataRemained()) {
+            throw new ChmParsingException("4 > dataLenght");
+        }
+        int place = getCurrentPlace();
+        // Mask each byte before shifting: a bare byte is sign-extended when
+        // promoted, so any byte >= 0x80 would otherwise set all higher bits.
+        dest = (data[place] & 0xffL)
+                | (data[place + 1] & 0xffL) << 8
+                | (data[place + 2] & 0xffL) << 16
+                | (data[place + 3] & 0xffL) << 24;
+
+        setDataRemained(getDataRemained() - 4);
+        setCurrentPlace(place + 4);
+        return dest;
+    }
+
+    /**
+     * Copies {@code count} bytes from the current position into the signature
+     * buffer of {@code chmLzxcControlData}, then advances this object's cursor
+     * and decrements its remaining byte count.
+     *
+     * @param data
+     *            bytes being parsed
+     * @param chmLzxcControlData
+     *            object whose signature buffer receives the bytes
+     * @param count
+     *            number of bytes to copy
+     * @throws TikaException
+     *             if an argument check fails, or if {@code count} exceeds the
+     *             remaining data or the signature buffer
+     */
+    private void unmarshalCharArray(byte[] data,
+            ChmLzxcControlData chmLzxcControlData, int count) throws TikaException {
+        ChmAssert.assertByteArrayNotNull(data);
+        ChmAssert.assertChmAccessorNotNull(chmLzxcControlData);
+        ChmAssert.assertPositiveInt(count);
+        if (count > getDataRemained()
+                || count > chmLzxcControlData.getSignature().length) {
+            throw new ChmParsingException("not enough room to read a signature of "
+                    + count + " bytes");
+        }
+        // Read from the cursor rather than a hard-coded offset; parse() calls
+        // this right after the first DWORD, so the cursor is 4 at that point.
+        System.arraycopy(data, getCurrentPlace(),
+                chmLzxcControlData.getSignature(), 0, count);
+        this.setCurrentPlace(this.getCurrentPlace() + count);
+        this.setDataRemained(this.getDataRemained() - count);
+    }
+
+    /**
+     * Returns textual representation of ChmLzxcControlData
+     */
+    @Override
+    public String toString() {
+        final String eol = System.getProperty("line.separator");
+        StringBuilder sb = new StringBuilder();
+        sb.append("size(unknown):=").append(getSize()).append(", ");
+        sb.append("signature(Compression type identifier):=")
+                .append(new String(getSignature(), UTF_8)).append(", ");
+        sb.append("version(Possibly numeric code for LZX):=")
+                .append(getVersion()).append(eol);
+        sb.append("resetInterval(The Huffman reset interval):=")
+                .append(getResetInterval()).append(", ");
+        sb.append("windowSize:=").append(getWindowSize()).append(", ");
+        sb.append("windowsPerReset(unknown (sometimes 2, sometimes 1, sometimes 0):=")
+                .append(getWindowsPerReset()).append(", ");
+        sb.append("unknown_18:=").append(getUnknown_18()).append(eol);
+        return sb.toString();
+    }
+
+    /**
+     * Parses the control data bytes into {@code chmLzxcControlData}.
+     * <p>
+     * Fields are read in file order (size, signature, version, reset
+     * interval, window size, windows per reset, and the DWORD at 0x18 when
+     * the data is at least {@code ChmConstants.CHM_LZXC_V2_LEN} long). For
+     * version 2 the window size is multiplied by
+     * {@code ChmConstants.CHM_WINDOW_SIZE_BLOCK}. All cursor state is kept on
+     * {@code chmLzxcControlData}, so the result does not depend on whether it
+     * is the same object as the receiver, and the cursor is reset so an
+     * instance can be parsed into more than once.
+     *
+     * @param data
+     *            raw control data bytes
+     * @param chmLzxcControlData
+     *            object to populate
+     * @throws TikaException
+     *             if the data is null or shorter than
+     *             {@code ChmConstants.CHM_LZXC_MIN_LEN}, if the window size is
+     *             0 or 1, if the reset interval is 0, or if the signature is
+     *             not {@code ChmConstants.LZXC}
+     */
+    // @Override
+    public void parse(byte[] data, ChmLzxcControlData chmLzxcControlData) throws TikaException {
+        if (data == null || (data.length < ChmConstants.CHM_LZXC_MIN_LEN)) {
+            throw new ChmParsingException("we want at least 0x18 bytes");
+        }
+        final ChmLzxcControlData target = chmLzxcControlData;
+        target.setCurrentPlace(0);
+        target.setDataRemained(data.length);
+
+        target.setSize(target.unmarshalUInt32(data, target.getSize()));
+        target.unmarshalCharArray(data, target, ChmConstants.CHM_SIGNATURE_LEN);
+        target.setVersion(target.unmarshalUInt32(data, target.getVersion()));
+        target.setResetInterval(target.unmarshalUInt32(data, target.getResetInterval()));
+        target.setWindowSize(target.unmarshalUInt32(data, target.getWindowSize()));
+        target.setWindowsPerReset(target.unmarshalUInt32(data, target.getWindowsPerReset()));
+
+        if (data.length >= ChmConstants.CHM_LZXC_V2_LEN) {
+            target.setUnknown_18(target.unmarshalUInt32(data, target.getUnknown_18()));
+        } else {
+            target.setUnknown_18(0);
+        }
+
+        if (target.getVersion() == 2) {
+            target.setWindowSize(target.getWindowSize()
+                    * ChmConstants.CHM_WINDOW_SIZE_BLOCK);
+        }
+
+        if (target.getWindowSize() == 0 || target.getResetInterval() == 0) {
+            throw new ChmParsingException(
+                    "window size / resetInterval should be more than zero");
+        }
+
+        if (target.getWindowSize() == 1) {
+            throw new ChmParsingException(
+                    "window size / resetInterval should be more than 1");
+        }
+
+        /* checks a signature */
+        if (!new String(target.getSignature(), UTF_8).equals(ChmConstants.LZXC)) {
+            throw new ChmParsingException(
+                    "the signature does not seem to be correct");
+        }
+    }
+
+    /**
+     * No-op entry point, kept so the public surface of the class is unchanged.
+     *
+     * @param args
+     *            ignored
+     */
+    public static void main(String[] args) {
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcResetTable.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcResetTable.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcResetTable.java
index d6b5328..5823f67 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcResetTable.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcResetTable.java
@@ -1,341 +1,341 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.accessor;
-
-import java.math.BigInteger;
-import java.util.Arrays;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.assertion.ChmAssert;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.apache.tika.parser.chm.exception.ChmParsingException;
-
-/**
- * LZXC reset table For ensuring a decompression. Reads the block named
- * "::DataSpace/Storage/<SectionName>/Transform/{7FC28940-9D31-11D0-9B27-00A0C91E9C7C}/InstanceData/ResetTable"
- * .
- *
- * {@link http
- * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original
- * /?page=2 }
- *
- */
-public class ChmLzxcResetTable implements ChmAccessor<ChmLzxcResetTable> {
- private static final long serialVersionUID = -8209574429411707460L;
- /* class members */
- private long version; // 0000: DWORD 2 unknown (possibly a version number)
- private long block_count; // 0004: DWORD Number of entries in reset table
- private long unknown; // 0008: DWORD 8 unknown
- private long table_offset; // 000C: DWORD $28 Length of table header (area
- // before table entries)
- private long uncompressed_len; // 0010: QWORD Uncompressed Length
- private long compressed_len; // 0018: QWORD Compressed Length
- private long block_len; // 0020: QWORD 0x8000 block size for locations below
- private long[] block_address;
-
- /* local usage */
- private int dataRemained;
- private int currentPlace = 0;
-
- private int getDataRemained() {
- return dataRemained;
- }
-
- private void setDataRemained(int dataRemained) {
- this.dataRemained = dataRemained;
- }
-
- /**
- * Returns block addresses
- *
- * @return block addresses
- */
- public long[] getBlockAddress() {
- return block_address;
- }
-
- /**
- * Sets block addresses
- *
- * @param block_address
- */
- public void setBlockAddress(long[] block_address) {
- this.block_address = block_address;
- }
-
- private int getCurrentPlace() {
- return currentPlace;
- }
-
- private void setCurrentPlace(int currentPlace) {
- this.currentPlace = currentPlace;
- }
-
- @Override
- public String toString() {
- StringBuilder sb = new StringBuilder();
- sb.append("version:=" + getVersion()
- + System.getProperty("line.separator"));
- sb.append("block_count:=" + getBlockCount()
- + System.getProperty("line.separator"));
- sb.append("unknown:=" + getUnknown()
- + System.getProperty("line.separator"));
- sb.append("table_offset:=" + getTableOffset()
- + System.getProperty("line.separator"));
- sb.append("uncompressed_len:=" + getUncompressedLen()
- + System.getProperty("line.separator"));
- sb.append("compressed_len:=" + getCompressedLen()
- + System.getProperty("line.separator"));
- sb.append("block_len:=" + getBlockLen()
- + System.getProperty("line.separator"));
- sb.append("block_addresses:=" + Arrays.toString(getBlockAddress()));
- return sb.toString();
- }
-
- /**
- * Enumerates chm block addresses
- *
- * @param data
- *
- * @return byte[] of addresses
- * @throws TikaException
- */
- private long[] enumerateBlockAddresses(byte[] data) throws TikaException {
- ChmAssert.assertByteArrayNotNull(data);
- /* we have limit of number of blocks to be extracted */
- if (getBlockCount() > 5000)
- setBlockCount(5000);
-
- if (getBlockCount() < 0 && (getDataRemained() / 8) > 0)
- setBlockCount(getDataRemained() / 8);
-
- long[] addresses = new long[(int) getBlockCount()];
- int rem = getDataRemained() / 8;
- for (int i = 0; i < rem; i++) {
- long num = -1;
-
- try {
- addresses[i] = unmarshalUint64(data, num);
- } catch (Exception e) {
- throw new TikaException(e.getMessage());
- }
- }
- return addresses;
- }
-
- /**
- * Validates parameters such as byte[] and chm lzxc reset table
- *
- * @param data
- * @param chmLzxcResetTable
- *
- * @return boolean
- * @throws TikaException
- */
- private boolean validateParamaters(byte[] data,
- ChmLzxcResetTable chmLzxcResetTable) throws TikaException {
- int goodParameter = 0;
- ChmAssert.assertByteArrayNotNull(data);
- ++goodParameter;
- ChmAssert.assertChmAccessorNotNull(chmLzxcResetTable);
- ++goodParameter;
- return (goodParameter == 2);
- }
-
- private long unmarshalUInt32(byte[] data, long dest) throws TikaException {
- ChmAssert.assertByteArrayNotNull(data);
- dest = (data[this.getCurrentPlace()] & 0xff)
- | (data[this.getCurrentPlace() + 1] & 0xff) << 8
- | (data[this.getCurrentPlace() + 2] & 0xff) << 16
- | (data[this.getCurrentPlace() + 3] & 0xff) << 24;
-
- setDataRemained(this.getDataRemained() - 4);
- this.setCurrentPlace(this.getCurrentPlace() + 4);
- return dest;
- }
-
- private long unmarshalUint64(byte[] data, long dest) throws TikaException {
- ChmAssert.assertByteArrayNotNull(data);
- byte[] temp = new byte[8];
- int i, j;// counters
-
- for (i = 8, j = 7; i > 0; i--) {
- if (data.length > this.getCurrentPlace()) {
- temp[j--] = data[this.getCurrentPlace()];
- this.setCurrentPlace(this.getCurrentPlace() + 1);
- } else
- throw new TikaException("data is too small to calculate address block");
- }
- dest = new BigInteger(temp).longValue();
- this.setDataRemained(this.getDataRemained() - 8);
- return dest;
- }
-
- /**
- * Returns the version
- *
- * @return - long
- */
- public long getVersion() {
- return version;
- }
-
- /**
- * Sets the version
- *
- * @param version
- * - long
- */
- public void setVersion(long version) {
- this.version = version;
- }
-
- /**
- * Gets a block count
- *
- * @return - int
- */
- public long getBlockCount() {
- return block_count;
- }
-
- /**
- * Sets a block count
- *
- * @param block_count
- * - long
- */
- public void setBlockCount(long block_count) {
- this.block_count = block_count;
- }
-
- /**
- * Gets unknown
- *
- * @return - long
- */
- public long getUnknown() {
- return unknown;
- }
-
- /**
- * Sets an unknown
- *
- * @param unknown
- * - long
- */
- public void setUnknown(long unknown) {
- this.unknown = unknown;
- }
-
- /**
- * Gets a table offset
- *
- * @return - long
- */
- public long getTableOffset() {
- return table_offset;
- }
-
- /**
- * Sets a table offset
- *
- * @param table_offset
- * - long
- */
- public void setTableOffset(long table_offset) {
- this.table_offset = table_offset;
- }
-
- /**
- * Gets uncompressed length
- *
- * @return - {@link BigInteger }
- */
- public long getUncompressedLen() {
- return uncompressed_len;
- }
-
- /**
- * Sets uncompressed length
- *
- * @param uncompressed_len
- * - {@link BigInteger}
- */
- public void setUncompressedLen(long uncompressed_len) {
- this.uncompressed_len = uncompressed_len;
- }
-
- /**
- * Gets compressed length
- *
- * @return - {@link BigInteger}
- */
- public long getCompressedLen() {
- return compressed_len;
- }
-
- /**
- * Sets compressed length
- *
- * @param compressed_len
- * - {@link BigInteger}
- */
- public void setCompressedLen(long compressed_len) {
- this.compressed_len = compressed_len;
- }
-
- /**
- * Gets a block length
- *
- * @return - {@link BigInteger}
- */
- public long getBlockLen() {
- return block_len;
- }
-
- /**
- * Sets a block length
- *
- * @param block_len
- * - {@link BigInteger}
- */
- public void setBlockLlen(long block_len) {
- this.block_len = block_len;
- }
-
- // @Override
- public void parse(byte[] data, ChmLzxcResetTable chmLzxcResetTable) throws TikaException {
- setDataRemained(data.length);
- if (validateParamaters(data, chmLzxcResetTable)) {
- /* unmarshal fields */
- chmLzxcResetTable.setVersion(unmarshalUInt32(data, chmLzxcResetTable.getVersion()));
- chmLzxcResetTable.setBlockCount(unmarshalUInt32(data, chmLzxcResetTable.getBlockCount()));
- chmLzxcResetTable.setUnknown(unmarshalUInt32(data, chmLzxcResetTable.getUnknown()));
- chmLzxcResetTable.setTableOffset(unmarshalUInt32(data, chmLzxcResetTable.getTableOffset()));
- chmLzxcResetTable.setUncompressedLen(unmarshalUint64(data, chmLzxcResetTable.getUncompressedLen()));
- chmLzxcResetTable.setCompressedLen(unmarshalUint64(data, chmLzxcResetTable.getCompressedLen()));
- chmLzxcResetTable.setBlockLlen(unmarshalUint64(data, chmLzxcResetTable.getBlockLen()));
- chmLzxcResetTable.setBlockAddress(enumerateBlockAddresses(data));
- }
-
- /* checks chmLzxcResetTable */
- if (chmLzxcResetTable.getVersion() != ChmConstants.CHM_VER_2)
- throw new ChmParsingException(
- "does not seem currect version of chmLzxcResetTable");
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.math.BigInteger;
+import java.util.Arrays;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * LZXC reset table For ensuring a decompression. Reads the block named
+ * "::DataSpace/Storage/<SectionName>/Transform/{7FC28940-9D31-11D0-9B27-00A0C91E9C7C}/InstanceData/ResetTable"
+ * .
+ *
+ * @see <a href="http://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original/?page=2">
+ * CHM format description (incomplete), page 2</a>
+ *
+ */
+public class ChmLzxcResetTable implements ChmAccessor<ChmLzxcResetTable> {
+ private static final long serialVersionUID = -8209574429411707460L;
+ /* class members */
+ private long version; // 0000: DWORD 2 unknown (possibly a version number)
+ private long block_count; // 0004: DWORD Number of entries in reset table
+ private long unknown; // 0008: DWORD 8 unknown
+ private long table_offset; // 000C: DWORD $28 Length of table header (area
+ // before table entries)
+ private long uncompressed_len; // 0010: QWORD Uncompressed Length
+ private long compressed_len; // 0018: QWORD Compressed Length
+ private long block_len; // 0020: QWORD 0x8000 block size for locations below
+ private long[] block_address;
+
+ /* local usage */
+ private int dataRemained;
+ private int currentPlace = 0;
+
+ /**
+ * Returns the count of bytes not yet consumed; the unmarshal helpers of
+ * this class decrement it as they read.
+ *
+ * @return dataRemained
+ */
+ private int getDataRemained() {
+ return dataRemained;
+ }
+
+ /**
+ * Sets the count of bytes not yet consumed.
+ *
+ * @param dataRemained
+ * new remaining byte count
+ */
+ private void setDataRemained(int dataRemained) {
+ this.dataRemained = dataRemained;
+ }
+
+ /**
+ * Returns block addresses. The internal array is returned as is, not a
+ * copy.
+ *
+ * @return block addresses
+ */
+ public long[] getBlockAddress() {
+ return block_address;
+ }
+
+ /**
+ * Sets block addresses. The array is stored by reference, not copied.
+ *
+ * @param block_address
+ * array of block addresses to hold
+ */
+ public void setBlockAddress(long[] block_address) {
+ this.block_address = block_address;
+ }
+
+ /**
+ * Returns the current read offset; the unmarshal helpers of this class
+ * index the data array with it.
+ *
+ * @return currentPlace
+ */
+ private int getCurrentPlace() {
+ return currentPlace;
+ }
+
+ /**
+ * Sets the current read offset.
+ *
+ * @param currentPlace
+ * new read offset
+ */
+ private void setCurrentPlace(int currentPlace) {
+ this.currentPlace = currentPlace;
+ }
+
+ /**
+ * Returns a textual representation of the reset table: one
+ * {@code name:=value} pair per line for the header fields, followed by the
+ * block address array.
+ */
+ @Override
+ public String toString() {
+     final String eol = System.getProperty("line.separator");
+     return new StringBuilder()
+             .append("version:=").append(getVersion()).append(eol)
+             .append("block_count:=").append(getBlockCount()).append(eol)
+             .append("unknown:=").append(getUnknown()).append(eol)
+             .append("table_offset:=").append(getTableOffset()).append(eol)
+             .append("uncompressed_len:=").append(getUncompressedLen()).append(eol)
+             .append("compressed_len:=").append(getCompressedLen()).append(eol)
+             .append("block_len:=").append(getBlockLen()).append(eol)
+             .append("block_addresses:=").append(Arrays.toString(getBlockAddress()))
+             .toString();
+ }
+
+ /**
+ * Enumerates chm block addresses: reads every remaining 8-byte entry of
+ * the data as a block address.
+ * <p>
+ * The block count is adjusted first: it is capped at 5000, and a negative
+ * count is replaced by the number of entries the remaining data holds.
+ * The returned array is sized by the adjusted block count; when the data
+ * holds fewer entries, the trailing elements stay 0.
+ *
+ * @param data
+ * bytes being parsed; the cursor must be at the first entry
+ *
+ * @return long[] of addresses
+ * @throws TikaException
+ * if the block count is negative and no entry is available, if
+ * the data holds more entries than the block count, or if
+ * reading an entry fails
+ */
+ private long[] enumerateBlockAddresses(byte[] data) throws TikaException {
+     ChmAssert.assertByteArrayNotNull(data);
+     final int available = getDataRemained() / 8;
+
+     /* we have limit of number of blocks to be extracted */
+     if (getBlockCount() > 5000) {
+         setBlockCount(5000);
+     }
+
+     if (getBlockCount() < 0 && available > 0) {
+         setBlockCount(available);
+     }
+
+     // Fail with a parse error instead of NegativeArraySizeException.
+     if (getBlockCount() < 0) {
+         throw new TikaException("negative block count " + getBlockCount()
+                 + " and no block addresses available");
+     }
+     // Previously surfaced as a wrapped ArrayIndexOutOfBoundsException.
+     if (available > getBlockCount()) {
+         throw new TikaException("reset table holds " + available
+                 + " block addresses, more than the block count of "
+                 + getBlockCount());
+     }
+
+     long[] addresses = new long[(int) getBlockCount()];
+     for (int i = 0; i < available; i++) {
+         try {
+             addresses[i] = unmarshalUint64(data, -1);
+         } catch (Exception e) {
+             // keep the original failure attached as the cause
+             throw new TikaException(e.getMessage(), e);
+         }
+     }
+     return addresses;
+ }
+
+ /**
+ * Runs the argument checks for parsing: the byte array and the target
+ * reset table are each passed to the matching {@code ChmAssert} check.
+ *
+ * @param data
+ * bytes to be parsed
+ * @param chmLzxcResetTable
+ * reset table to be populated
+ *
+ * @return {@code true} whenever both checks return normally
+ * @throws TikaException
+ * as declared by the {@code ChmAssert} checks
+ */
+ private boolean validateParamaters(byte[] data,
+         ChmLzxcResetTable chmLzxcResetTable) throws TikaException {
+     ChmAssert.assertByteArrayNotNull(data);
+     ChmAssert.assertChmAccessorNotNull(chmLzxcResetTable);
+     // Reaching this point means both checks returned without throwing.
+     return true;
+ }
+
+ /**
+ * Reads four bytes at the current position as a little-endian 32-bit
+ * value, then advances the cursor and decrements the remaining byte count
+ * by four.
+ * <p>
+ * The bytes are combined with int arithmetic, so a value with the top bit
+ * set comes back negative. This is kept deliberately:
+ * {@code enumerateBlockAddresses} tests the block count for {@code < 0}.
+ *
+ * @param data
+ * bytes being parsed
+ * @param dest
+ * ignored on input; overwritten with the decoded value
+ * @return the decoded value
+ * @throws TikaException
+ * if the byte array check fails or fewer than four bytes are
+ * left at the current position
+ */
+ private long unmarshalUInt32(byte[] data, long dest) throws TikaException {
+     ChmAssert.assertByteArrayNotNull(data);
+     final int place = this.getCurrentPlace();
+     // Report truncated data as a parse error rather than letting an
+     // unchecked ArrayIndexOutOfBoundsException escape.
+     if (place < 0 || data.length - place < 4) {
+         throw new ChmParsingException(
+                 "not enough data to read a 32-bit value at offset " + place);
+     }
+     dest = (data[place] & 0xff)
+             | (data[place + 1] & 0xff) << 8
+             | (data[place + 2] & 0xff) << 16
+             | (data[place + 3] & 0xff) << 24;
+
+     setDataRemained(this.getDataRemained() - 4);
+     this.setCurrentPlace(place + 4);
+     return dest;
+ }
+
+ /**
+  * Reads eight bytes, least significant byte first, starting at the current
+  * place. The current place moves forward by one for every byte consumed
+  * (also for the bytes consumed before a failure); the remaining data count
+  * is lowered by eight once all bytes have been read.
+  *
+  * @param data raw bytes to read from
+  * @param dest ignored; the decoded value is only returned
+  *
+  * @return the eight bytes combined into a long
+  * @throws TikaException if the data ends before eight bytes were read
+  */
+ private long unmarshalUint64(byte[] data, long dest) throws TikaException {
+     ChmAssert.assertByteArrayNotNull(data);
+     long value = 0L;
+
+     // The byte read first is the least significant one, so each following
+     // byte is shifted a further 8 bits to the left.
+     for (int shift = 0; shift < 64; shift += 8) {
+         int place = this.getCurrentPlace();
+         if (place >= data.length) {
+             throw new TikaException("data is too small to calculate address block");
+         }
+         value |= (data[place] & 0xffL) << shift;
+         this.setCurrentPlace(place + 1);
+     }
+
+     this.setDataRemained(this.getDataRemained() - 8);
+     return value;
+ }
+
+ /**
+ * Returns the version
+ *
+ * @return the version, as a long
+ */
+ public long getVersion() {
+ return version;
+ }
+
+ /**
+ * Sets the version
+ *
+ * @param version
+ * the new version value
+ */
+ public void setVersion(long version) {
+ this.version = version;
+ }
+
+ /**
+ * Gets the block count
+ *
+ * @return the block count, as a long
+ */
+ public long getBlockCount() {
+ return block_count;
+ }
+
+ /**
+ * Sets the block count
+ *
+ * @param block_count
+ * the new block count
+ */
+ public void setBlockCount(long block_count) {
+ this.block_count = block_count;
+ }
+
+ /**
+ * Gets the value of the unknown field
+ *
+ * @return the unknown field, as a long
+ */
+ public long getUnknown() {
+ return unknown;
+ }
+
+ /**
+ * Sets the value of the unknown field
+ *
+ * @param unknown
+ * the new value
+ */
+ public void setUnknown(long unknown) {
+ this.unknown = unknown;
+ }
+
+ /**
+ * Gets the table offset
+ *
+ * @return the table offset, as a long
+ */
+ public long getTableOffset() {
+ return table_offset;
+ }
+
+ /**
+ * Sets the table offset
+ *
+ * @param table_offset
+ * the new table offset
+ */
+ public void setTableOffset(long table_offset) {
+ this.table_offset = table_offset;
+ }
+
+ /**
+ * Gets the uncompressed length
+ *
+ * @return the uncompressed length, as a long
+ */
+ public long getUncompressedLen() {
+ return uncompressed_len;
+ }
+
+ /**
+ * Sets the uncompressed length
+ *
+ * @param uncompressed_len
+ * the new uncompressed length
+ */
+ public void setUncompressedLen(long uncompressed_len) {
+ this.uncompressed_len = uncompressed_len;
+ }
+
+ /**
+ * Gets the compressed length
+ *
+ * @return the compressed length, as a long
+ */
+ public long getCompressedLen() {
+ return compressed_len;
+ }
+
+ /**
+ * Sets the compressed length
+ *
+ * @param compressed_len
+ * the new compressed length
+ */
+ public void setCompressedLen(long compressed_len) {
+ this.compressed_len = compressed_len;
+ }
+
+ /**
+ * Gets the block length
+ *
+ * @return the block length, as a long
+ */
+ public long getBlockLen() {
+ return block_len;
+ }
+
+ /**
+ * Sets the block length.
+ * <p>
+ * NOTE: the method name is spelled with a double "l" (setBlockLlen); the
+ * spelling is part of the public interface.
+ *
+ * @param block_len
+ * the new block length
+ */
+ public void setBlockLlen(long block_len) {
+ this.block_len = block_len;
+ }
+
+ /**
+  * Unmarshals the reset table fields from {@code data} and stores them in
+  * {@code chmLzxcResetTable}: version, block count, unknown and table offset
+  * as 32 bit values, then uncompressed length, compressed length and block
+  * length as 64 bit values, followed by the block addresses.
+  * <p>
+  * The read position and the remaining data count are tracked on this
+  * instance, while the decoded values are set on {@code chmLzxcResetTable}.
+  *
+  * @param data raw bytes of the reset table
+  * @param chmLzxcResetTable instance that receives the decoded values
+  * @throws TikaException if the parameters fail validation, if the data is
+  *             too short, or if the decoded version is not
+  *             {@link ChmConstants#CHM_VER_2}
+  */
+ // @Override
+ public void parse(byte[] data, ChmLzxcResetTable chmLzxcResetTable) throws TikaException {
+     // Validate before touching data.length, so that a null array is reported
+     // by the validation instead of surfacing as a NullPointerException.
+     if (validateParamaters(data, chmLzxcResetTable)) {
+         setDataRemained(data.length);
+
+         /* unmarshal fields */
+         chmLzxcResetTable.setVersion(unmarshalUInt32(data, chmLzxcResetTable.getVersion()));
+         chmLzxcResetTable.setBlockCount(unmarshalUInt32(data, chmLzxcResetTable.getBlockCount()));
+         chmLzxcResetTable.setUnknown(unmarshalUInt32(data, chmLzxcResetTable.getUnknown()));
+         chmLzxcResetTable.setTableOffset(unmarshalUInt32(data, chmLzxcResetTable.getTableOffset()));
+         chmLzxcResetTable.setUncompressedLen(unmarshalUint64(data, chmLzxcResetTable.getUncompressedLen()));
+         chmLzxcResetTable.setCompressedLen(unmarshalUint64(data, chmLzxcResetTable.getCompressedLen()));
+         chmLzxcResetTable.setBlockLlen(unmarshalUint64(data, chmLzxcResetTable.getBlockLen()));
+         chmLzxcResetTable.setBlockAddress(enumerateBlockAddresses(data));
+     }
+
+     /* checks chmLzxcResetTable */
+     if (chmLzxcResetTable.getVersion() != ChmConstants.CHM_VER_2) {
+         throw new ChmParsingException(
+                 "does not seem currect version of chmLzxcResetTable");
+     }
+ }
+}
[04/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
index f43fdc0..4d5cc46 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
@@ -1,347 +1,347 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import java.io.Writer;
-import java.util.ArrayList;
-import java.util.BitSet;
-import java.util.List;
-import java.util.Locale;
-
-import de.l3s.boilerpipe.BoilerpipeExtractor;
-import de.l3s.boilerpipe.BoilerpipeProcessingException;
-import de.l3s.boilerpipe.document.TextBlock;
-import de.l3s.boilerpipe.document.TextDocument;
-import de.l3s.boilerpipe.extractors.ArticleExtractor;
-import de.l3s.boilerpipe.extractors.DefaultExtractor;
-import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.WriteOutContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-/**
- * Uses the <a href="http://code.google.com/p/boilerpipe/">boilerpipe</a>
- * library to automatically extract the main content from a web page.
- * <p/>
- * Use this as a {@link ContentHandler} object passed to
- * {@link HtmlParser#parse(java.io.InputStream, ContentHandler, Metadata, org.apache.tika.parser.ParseContext)}
- */
-public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler {
-
- /**
- * The newline character that gets inserted after block elements.
- */
- private static final char[] NL = new char[]{'\n'};
- private ContentHandler delegate;
- private BoilerpipeExtractor extractor;
- private boolean includeMarkup;
- private boolean inHeader;
- private boolean inFooter;
- private int headerCharOffset;
- private List<RecordedElement> elements;
- private TextDocument td;
- /**
- * Creates a new boilerpipe-based content extractor, using the
- * {@link DefaultExtractor} extraction rules and "delegate" as the content handler.
- *
- * @param delegate The {@link ContentHandler} object
- */
- public BoilerpipeContentHandler(ContentHandler delegate) {
- this(delegate, DefaultExtractor.INSTANCE);
- }
-
- /**
- * Creates a content handler that writes XHTML body character events to
- * the given writer.
- *
- * @param writer writer
- */
- public BoilerpipeContentHandler(Writer writer) {
- this(new WriteOutContentHandler(writer));
- }
-
- /**
- * Creates a new boilerpipe-based content extractor, using the given
- * extraction rules. The extracted main content will be passed to the
- * <delegate> content handler.
- *
- * @param delegate The {@link ContentHandler} object
- * @param extractor Extraction rules to use, e.g. {@link ArticleExtractor}
- */
- public BoilerpipeContentHandler(ContentHandler delegate, BoilerpipeExtractor extractor) {
- this.td = null;
- this.delegate = delegate;
- this.extractor = extractor;
- }
-
- public boolean isIncludeMarkup() {
- return includeMarkup;
- }
-
- public void setIncludeMarkup(boolean includeMarkup) {
- this.includeMarkup = includeMarkup;
- }
-
- /**
- * Retrieves the built TextDocument
- *
- * @return TextDocument
- */
- public TextDocument getTextDocument() {
- return td;
- }
-
- @Override
- public void startDocument() throws SAXException {
- super.startDocument();
-
- delegate.startDocument();
-
- inHeader = true;
- inFooter = false;
- headerCharOffset = 0;
-
- if (includeMarkup) {
- elements = new ArrayList<RecordedElement>();
- }
- }
-
- @Override
- public void startPrefixMapping(String prefix, String uri) throws SAXException {
- super.startPrefixMapping(prefix, uri);
- delegate.startPrefixMapping(prefix, uri);
- }
-
- ;
-
- @Override
- public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
- super.startElement(uri, localName, qName, atts);
-
- if (inHeader) {
- delegate.startElement(uri, localName, qName, atts);
- } else if (inFooter) {
- // Do nothing
- } else if (includeMarkup) {
- elements.add(new RecordedElement(uri, localName, qName, atts));
- } else {
- // This happens for the <body> element, if we're not doing markup.
- delegate.startElement(uri, localName, qName, atts);
- }
- }
-
- ;
-
- @Override
- public void characters(char[] chars, int offset, int length) throws SAXException {
- super.characters(chars, offset, length);
-
- if (inHeader) {
- delegate.characters(chars, offset, length);
- headerCharOffset++;
- } else if (inFooter) {
- // Do nothing
- } else if (includeMarkup) {
- RecordedElement element = elements.get(elements.size() - 1);
-
- char[] characters = new char[length];
- System.arraycopy(chars, offset, characters, 0, length);
- element.getCharacters().add(characters);
- }
- }
-
- ;
-
- @Override
- public void endElement(String uri, String localName, String qName) throws SAXException {
- super.endElement(uri, localName, qName);
-
- if (inHeader) {
- delegate.endElement(uri, localName, qName);
- inHeader = !localName.equals("head");
- } else if (inFooter) {
- // Do nothing
- } else if (localName.equals("body")) {
- inFooter = true;
- } else if (includeMarkup) {
- // Add the end element, and the continuation from the previous element
- elements.add(new RecordedElement(uri, localName, qName));
- elements.add(new RecordedElement());
- }
- }
-
- ;
-
- @Override
- public void endDocument() throws SAXException {
- super.endDocument();
-
- td = toTextDocument();
- try {
- extractor.process(td);
- } catch (BoilerpipeProcessingException e) {
- throw new SAXException(e);
- }
-
- Attributes emptyAttrs = new AttributesImpl();
-
- // At this point we have all the information we need to either emit N paragraphs
- // of plain text (if not including markup), or we have to replay our recorded elements
- // and only emit character runs that passed the boilerpipe filters.
- if (includeMarkup) {
- BitSet validCharacterRuns = new BitSet();
- for (TextBlock block : td.getTextBlocks()) {
- if (block.isContent()) {
- BitSet bs = block.getContainedTextElements();
- if (bs != null) {
- validCharacterRuns.or(bs);
- }
- }
- }
-
- // Now have bits set for all valid character runs. Replay our recorded elements,
- // but only emit character runs flagged as valid.
- int curCharsIndex = headerCharOffset;
-
- for (RecordedElement element : elements) {
- switch (element.getElementType()) {
- case START:
- delegate.startElement(element.getUri(), element.getLocalName(), element.getQName(), element.getAttrs());
- // Fall through
-
- case CONTINUE:
- // Now emit characters that are valid. Note that boilerpipe pre-increments the character index, so
- // we have to follow suit.
- for (char[] chars : element.getCharacters()) {
- curCharsIndex++;
-
- if (validCharacterRuns.get(curCharsIndex)) {
- delegate.characters(chars, 0, chars.length);
-
- // https://issues.apache.org/jira/browse/TIKA-961
- if (!Character.isWhitespace(chars[chars.length - 1])) {
- // Only add whitespace for certain elements
- if (XHTMLContentHandler.ENDLINE.contains(element.getLocalName())) {
- delegate.ignorableWhitespace(NL, 0, NL.length);
- }
- }
- }
- }
- break;
-
- case END:
- delegate.endElement(element.getUri(), element.getLocalName(), element.getQName());
- break;
-
- default:
- throw new RuntimeException("Unhandled element type: " + element.getElementType());
- }
-
-
- }
- } else {
- for (TextBlock block : td.getTextBlocks()) {
- if (block.isContent()) {
- delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", emptyAttrs);
- char[] chars = block.getText().toCharArray();
- delegate.characters(chars, 0, chars.length);
- delegate.endElement(XHTMLContentHandler.XHTML, "p", "p");
- delegate.ignorableWhitespace(NL, 0, NL.length);
- }
- }
- }
-
- delegate.endElement(XHTMLContentHandler.XHTML, "body", "body");
- delegate.endElement(XHTMLContentHandler.XHTML, "html", "html");
-
- // We defer ending any prefix mapping until here, which is why we don't pass this
- // through to the delegate in an overridden method.
- delegate.endPrefixMapping("");
-
- delegate.endDocument();
- }
-
- ;
-
- private static class RecordedElement {
- private String uri;
- private String localName;
- private String qName;
- private Attributes attrs;
- private List<char[]> characters;
- private ElementType elementType;
- public RecordedElement(String uri, String localName, String qName, Attributes attrs) {
- this(uri, localName, qName, attrs, ElementType.START);
- }
-
- public RecordedElement(String uri, String localName, String qName) {
- this(uri, localName, qName, null, ElementType.END);
- }
-
- public RecordedElement() {
- this(null, null, null, null, ElementType.CONTINUE);
- }
-
- protected RecordedElement(String uri, String localName, String qName, Attributes attrs, RecordedElement.ElementType elementType) {
- this.uri = uri;
- this.localName = localName;
- this.qName = qName;
- this.attrs = attrs;
- this.elementType = elementType;
- this.characters = new ArrayList<char[]>();
- }
-
- @Override
- public String toString() {
- return String.format(Locale.ROOT, "<%s> of type %s", localName, elementType);
- }
-
- public String getUri() {
- return uri;
- }
-
- public String getLocalName() {
- return localName;
- }
-
- public String getQName() {
- return qName;
- }
-
- public Attributes getAttrs() {
- return attrs;
- }
-
- public List<char[]> getCharacters() {
- return characters;
- }
-
- public RecordedElement.ElementType getElementType() {
- return elementType;
- }
-
- public enum ElementType {
- START,
- END,
- CONTINUE
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.List;
+import java.util.Locale;
+
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.extractors.ArticleExtractor;
+import de.l3s.boilerpipe.extractors.DefaultExtractor;
+import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Uses the <a href="http://code.google.com/p/boilerpipe/">boilerpipe</a>
+ * library to automatically extract the main content from a web page.
+ * <p>
+ * Use this as a {@link ContentHandler} object passed to
+ * {@link HtmlParser#parse(java.io.InputStream, ContentHandler, Metadata, org.apache.tika.parser.ParseContext)}
+ * <p>
+ * Events are forwarded to the delegate as they arrive until the element with
+ * local name "head" has ended. Everything after that is held back until
+ * {@link #endDocument()}, where the boilerpipe extractor is run and only the
+ * text it marked as content is emitted: either as plain paragraphs or, when
+ * markup is included, by replaying the recorded elements.
+ */
+public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler {
+
+ /**
+ * The newline character that gets inserted after block elements.
+ */
+ private static final char[] NL = new char[]{'\n'};
+ // Handler that receives the forwarded header events and the extracted content.
+ private ContentHandler delegate;
+ // Extraction rules applied to the text document in endDocument().
+ private BoilerpipeExtractor extractor;
+ // When true, body elements are recorded and replayed instead of emitting plain paragraphs.
+ private boolean includeMarkup;
+ // True from startDocument() until an end element with local name "head" is seen.
+ private boolean inHeader;
+ // Set once the end element with local name "body" is seen; later events are dropped.
+ private boolean inFooter;
+ // Number of characters() callbacks (not individual chars) received while in the header.
+ private int headerCharOffset;
+ // Recorded body events; only allocated in startDocument() when includeMarkup is set.
+ private List<RecordedElement> elements;
+ // Text document built in endDocument(); null before that.
+ private TextDocument td;
+ /**
+ * Creates a new boilerpipe-based content extractor, using the
+ * {@link DefaultExtractor} extraction rules and "delegate" as the content handler.
+ *
+ * @param delegate The {@link ContentHandler} object
+ */
+ public BoilerpipeContentHandler(ContentHandler delegate) {
+ this(delegate, DefaultExtractor.INSTANCE);
+ }
+
+ /**
+ * Creates a content handler that writes XHTML body character events to
+ * the given writer.
+ *
+ * @param writer writer
+ */
+ public BoilerpipeContentHandler(Writer writer) {
+ this(new WriteOutContentHandler(writer));
+ }
+
+ /**
+ * Creates a new boilerpipe-based content extractor, using the given
+ * extraction rules. The extracted main content will be passed to the
+ * {@code delegate} content handler.
+ *
+ * @param delegate The {@link ContentHandler} object
+ * @param extractor Extraction rules to use, e.g. {@link ArticleExtractor}
+ */
+ public BoilerpipeContentHandler(ContentHandler delegate, BoilerpipeExtractor extractor) {
+ this.td = null;
+ this.delegate = delegate;
+ this.extractor = extractor;
+ }
+
+ /**
+ * @return whether body markup is recorded and replayed around the extracted text
+ */
+ public boolean isIncludeMarkup() {
+ return includeMarkup;
+ }
+
+ /**
+ * Selects whether body markup is recorded and replayed around the extracted text.
+ * The recording list is only created in {@link #startDocument()}, so this is
+ * expected to be called before parsing starts.
+ *
+ * @param includeMarkup true to replay recorded elements, false to emit plain paragraphs
+ */
+ public void setIncludeMarkup(boolean includeMarkup) {
+ this.includeMarkup = includeMarkup;
+ }
+
+ /**
+ * Retrieves the built TextDocument
+ *
+ * @return TextDocument, or null if {@link #endDocument()} has not run yet
+ */
+ public TextDocument getTextDocument() {
+ return td;
+ }
+
+ /**
+ * Starts the document on both the boilerpipe handler and the delegate and
+ * resets the header/footer tracking state.
+ */
+ @Override
+ public void startDocument() throws SAXException {
+ super.startDocument();
+
+ delegate.startDocument();
+
+ inHeader = true;
+ inFooter = false;
+ headerCharOffset = 0;
+
+ if (includeMarkup) {
+ elements = new ArrayList<RecordedElement>();
+ }
+ }
+
+ /**
+ * Passes the prefix mapping to both the boilerpipe handler and the delegate.
+ */
+ @Override
+ public void startPrefixMapping(String prefix, String uri) throws SAXException {
+ super.startPrefixMapping(prefix, uri);
+ delegate.startPrefixMapping(prefix, uri);
+ }
+
+ // NOTE(review): stray empty declaration; the same pattern follows several methods below.
+ ;
+
+ /**
+ * Forwards header elements to the delegate, drops elements after the body
+ * has ended and, when markup is included, records body elements for replay.
+ */
+ @Override
+ public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+ super.startElement(uri, localName, qName, atts);
+
+ if (inHeader) {
+ delegate.startElement(uri, localName, qName, atts);
+ } else if (inFooter) {
+ // Do nothing
+ } else if (includeMarkup) {
+ elements.add(new RecordedElement(uri, localName, qName, atts));
+ } else {
+ // This happens for the <body> element, if we're not doing markup.
+ delegate.startElement(uri, localName, qName, atts);
+ }
+ }
+
+ ;
+
+ /**
+ * Forwards header text to the delegate and counts the callback. When markup
+ * is included, body text is copied and attached to the most recently
+ * recorded element. Body text is not forwarded here when markup is not included.
+ */
+ @Override
+ public void characters(char[] chars, int offset, int length) throws SAXException {
+ super.characters(chars, offset, length);
+
+ if (inHeader) {
+ delegate.characters(chars, offset, length);
+ headerCharOffset++;
+ } else if (inFooter) {
+ // Do nothing
+ } else if (includeMarkup) {
+ RecordedElement element = elements.get(elements.size() - 1);
+
+ // Copy the run, since only the given offset/length range is kept for later replay.
+ char[] characters = new char[length];
+ System.arraycopy(chars, offset, characters, 0, length);
+ element.getCharacters().add(characters);
+ }
+ }
+
+ ;
+
+ /**
+ * Forwards header end elements to the delegate (leaving header mode once
+ * "head" ends), switches to footer mode when "body" ends and, when markup
+ * is included, records the end of body elements.
+ */
+ @Override
+ public void endElement(String uri, String localName, String qName) throws SAXException {
+ super.endElement(uri, localName, qName);
+
+ if (inHeader) {
+ delegate.endElement(uri, localName, qName);
+ inHeader = !localName.equals("head");
+ } else if (inFooter) {
+ // Do nothing
+ } else if (localName.equals("body")) {
+ inFooter = true;
+ } else if (includeMarkup) {
+ // Add the end element, and the continuation from the previous element
+ elements.add(new RecordedElement(uri, localName, qName));
+ elements.add(new RecordedElement());
+ }
+ }
+
+ ;
+
+ /**
+ * Builds the text document, runs the extractor on it and emits the content
+ * to the delegate, followed by the end of "body" and "html", the end of the
+ * default prefix mapping and the end of the document.
+ *
+ * @throws SAXException if the extractor fails or the delegate throws
+ */
+ @Override
+ public void endDocument() throws SAXException {
+ super.endDocument();
+
+ td = toTextDocument();
+ try {
+ extractor.process(td);
+ } catch (BoilerpipeProcessingException e) {
+ throw new SAXException(e);
+ }
+
+ Attributes emptyAttrs = new AttributesImpl();
+
+ // At this point we have all the information we need to either emit N paragraphs
+ // of plain text (if not including markup), or we have to replay our recorded elements
+ // and only emit character runs that passed the boilerpipe filters.
+ if (includeMarkup) {
+ BitSet validCharacterRuns = new BitSet();
+ for (TextBlock block : td.getTextBlocks()) {
+ if (block.isContent()) {
+ BitSet bs = block.getContainedTextElements();
+ if (bs != null) {
+ validCharacterRuns.or(bs);
+ }
+ }
+ }
+
+ // Now have bits set for all valid character runs. Replay our recorded elements,
+ // but only emit character runs flagged as valid.
+ // Start after the runs that were already forwarded while in the header.
+ int curCharsIndex = headerCharOffset;
+
+ for (RecordedElement element : elements) {
+ switch (element.getElementType()) {
+ case START:
+ delegate.startElement(element.getUri(), element.getLocalName(), element.getQName(), element.getAttrs());
+ // Fall through
+
+ case CONTINUE:
+ // Now emit characters that are valid. Note that boilerpipe pre-increments the character index, so
+ // we have to follow suit.
+ for (char[] chars : element.getCharacters()) {
+ curCharsIndex++;
+
+ if (validCharacterRuns.get(curCharsIndex)) {
+ delegate.characters(chars, 0, chars.length);
+
+ // https://issues.apache.org/jira/browse/TIKA-961
+ // NOTE(review): this indexes chars.length - 1, which would fail for a zero-length
+ // run if one were ever flagged as valid - confirm that cannot happen.
+ if (!Character.isWhitespace(chars[chars.length - 1])) {
+ // Only add whitespace for certain elements
+ if (XHTMLContentHandler.ENDLINE.contains(element.getLocalName())) {
+ delegate.ignorableWhitespace(NL, 0, NL.length);
+ }
+ }
+ }
+ }
+ break;
+
+ case END:
+ delegate.endElement(element.getUri(), element.getLocalName(), element.getQName());
+ break;
+
+ default:
+ throw new RuntimeException("Unhandled element type: " + element.getElementType());
+ }
+
+
+ }
+ } else {
+ // No markup: each content block becomes one <p> element followed by a newline.
+ for (TextBlock block : td.getTextBlocks()) {
+ if (block.isContent()) {
+ delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", emptyAttrs);
+ char[] chars = block.getText().toCharArray();
+ delegate.characters(chars, 0, chars.length);
+ delegate.endElement(XHTMLContentHandler.XHTML, "p", "p");
+ delegate.ignorableWhitespace(NL, 0, NL.length);
+ }
+ }
+ }
+
+ delegate.endElement(XHTMLContentHandler.XHTML, "body", "body");
+ delegate.endElement(XHTMLContentHandler.XHTML, "html", "html");
+
+ // We defer ending any prefix mapping until here, which is why we don't pass this
+ // through to the delegate in an overridden method.
+ delegate.endPrefixMapping("");
+
+ delegate.endDocument();
+ }
+
+ ;
+
+ /**
+ * One recorded body event: the start of an element, the end of an element,
+ * or a continuation entry that collects the text following an end element.
+ * Each entry also holds the character runs received while it was the most
+ * recent entry.
+ */
+ private static class RecordedElement {
+ private String uri;
+ private String localName;
+ private String qName;
+ private Attributes attrs;
+ private List<char[]> characters;
+ private ElementType elementType;
+ // Records the start of an element.
+ public RecordedElement(String uri, String localName, String qName, Attributes attrs) {
+ this(uri, localName, qName, attrs, ElementType.START);
+ }
+
+ // Records the end of an element; no attributes are kept.
+ public RecordedElement(String uri, String localName, String qName) {
+ this(uri, localName, qName, null, ElementType.END);
+ }
+
+ // Records a continuation entry, which carries no element information.
+ public RecordedElement() {
+ this(null, null, null, null, ElementType.CONTINUE);
+ }
+
+ protected RecordedElement(String uri, String localName, String qName, Attributes attrs, RecordedElement.ElementType elementType) {
+ this.uri = uri;
+ this.localName = localName;
+ this.qName = qName;
+ this.attrs = attrs;
+ this.elementType = elementType;
+ this.characters = new ArrayList<char[]>();
+ }
+
+ @Override
+ public String toString() {
+ return String.format(Locale.ROOT, "<%s> of type %s", localName, elementType);
+ }
+
+ public String getUri() {
+ return uri;
+ }
+
+ public String getLocalName() {
+ return localName;
+ }
+
+ public String getQName() {
+ return qName;
+ }
+
+ public Attributes getAttrs() {
+ return attrs;
+ }
+
+ public List<char[]> getCharacters() {
+ return characters;
+ }
+
+ public RecordedElement.ElementType getElementType() {
+ return elementType;
+ }
+
+ // Kind of event a RecordedElement stands for.
+ public enum ElementType {
+ START,
+ END,
+ CONTINUE
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
index 0cef05f..4217ac5 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
@@ -1,137 +1,137 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-
-/**
- * The default HTML mapping rules in Tika.
- *
- * @since Apache Tika 0.6
- */
-@SuppressWarnings("serial")
-public class DefaultHtmlMapper implements HtmlMapper {
-
- /**
- * @since Apache Tika 0.8
- */
- public static final HtmlMapper INSTANCE = new DefaultHtmlMapper();
- // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
- private static final Map<String, String> SAFE_ELEMENTS = new HashMap<String, String>() {{
- put("H1", "h1");
- put("H2", "h2");
- put("H3", "h3");
- put("H4", "h4");
- put("H5", "h5");
- put("H6", "h6");
-
- put("P", "p");
- put("PRE", "pre");
- put("BLOCKQUOTE", "blockquote");
- put("Q", "q");
-
- put("UL", "ul");
- put("OL", "ol");
- put("MENU", "ul");
- put("LI", "li");
- put("DL", "dl");
- put("DT", "dt");
- put("DD", "dd");
-
- put("TABLE", "table");
- put("THEAD", "thead");
- put("TBODY", "tbody");
- put("TR", "tr");
- put("TH", "th");
- put("TD", "td");
-
- put("ADDRESS", "address");
-
- // TIKA-460 - add anchors
- put("A", "a");
-
- // TIKA-463 - add additional elements that contain URLs (and their sub-elements)
- put("MAP", "map");
- put("AREA", "area");
- put("IMG", "img");
- put("FRAMESET", "frameset");
- put("FRAME", "frame");
- put("IFRAME", "iframe");
- put("OBJECT", "object");
- put("PARAM", "param");
- put("INS", "ins");
- put("DEL", "del");
- }};
- private static final Set<String> DISCARDABLE_ELEMENTS = new HashSet<String>() {{
- add("STYLE");
- add("SCRIPT");
- }};
- // For information on tags & attributes, see:
- // http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#a_dtd_XHTML-1.0-Strict
- // http://www.w3schools.com/TAGS/
- private static final Map<String, Set<String>> SAFE_ATTRIBUTES = new HashMap<String, Set<String>>() {{
- put("a", attrSet("charset", "type", "name", "href", "hreflang", "rel", "rev", "shape", "coords"));
- put("img", attrSet("src", "alt", "longdesc", "height", "width", "usemap", "ismap"));
- put("frame", attrSet("longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling"));
- put("iframe", attrSet("longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width"));
- put("link", attrSet("charset", "href", "hreflang", "type", "rel", "rev", "media"));
- put("map", attrSet("id", "class", "style", "title", "name"));
- put("area", attrSet("shape", "coords", "href", "nohref", "alt"));
- put("object", attrSet("declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height",
- "width", "usemap", "name", "tabindex", "align", "border", "hspace", "vspace"));
- put("param", attrSet("id", "name", "value", "valuetype", "type"));
- put("blockquote", attrSet("cite"));
- put("ins", attrSet("cite", "datetime"));
- put("del", attrSet("cite", "datetime"));
- put("q", attrSet("cite"));
-
- // TODO - fill out this set. Include core, i18n, etc sets where appropriate.
- }};
-
- private static Set<String> attrSet(String... attrs) {
- Set<String> result = new HashSet<String>();
- for (String attr : attrs) {
- result.add(attr);
- }
- return result;
- }
-
- public String mapSafeElement(String name) {
- return SAFE_ELEMENTS.get(name);
- }
-
- /**
- * Normalizes an attribute name. Assumes that the element name
- * is valid and normalized
- */
- public String mapSafeAttribute(String elementName, String attributeName) {
- Set<String> safeAttrs = SAFE_ATTRIBUTES.get(elementName);
- if ((safeAttrs != null) && safeAttrs.contains(attributeName)) {
- return attributeName;
- } else {
- return null;
- }
- }
-
- public boolean isDiscardElement(String name) {
- return DISCARDABLE_ELEMENTS.contains(name);
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * The default HTML mapping rules in Tika.
+ *
+ * @since Apache Tika 0.6
+ */
+@SuppressWarnings("serial")
+public class DefaultHtmlMapper implements HtmlMapper {
+
+ /**
+ * @since Apache Tika 0.8
+ */
+ public static final HtmlMapper INSTANCE = new DefaultHtmlMapper();
+ // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
+ private static final Map<String, String> SAFE_ELEMENTS = new HashMap<String, String>() {{
+ put("H1", "h1");
+ put("H2", "h2");
+ put("H3", "h3");
+ put("H4", "h4");
+ put("H5", "h5");
+ put("H6", "h6");
+
+ put("P", "p");
+ put("PRE", "pre");
+ put("BLOCKQUOTE", "blockquote");
+ put("Q", "q");
+
+ put("UL", "ul");
+ put("OL", "ol");
+ put("MENU", "ul");
+ put("LI", "li");
+ put("DL", "dl");
+ put("DT", "dt");
+ put("DD", "dd");
+
+ put("TABLE", "table");
+ put("THEAD", "thead");
+ put("TBODY", "tbody");
+ put("TR", "tr");
+ put("TH", "th");
+ put("TD", "td");
+
+ put("ADDRESS", "address");
+
+ // TIKA-460 - add anchors
+ put("A", "a");
+
+ // TIKA-463 - add additional elements that contain URLs (and their sub-elements)
+ put("MAP", "map");
+ put("AREA", "area");
+ put("IMG", "img");
+ put("FRAMESET", "frameset");
+ put("FRAME", "frame");
+ put("IFRAME", "iframe");
+ put("OBJECT", "object");
+ put("PARAM", "param");
+ put("INS", "ins");
+ put("DEL", "del");
+ }};
+ private static final Set<String> DISCARDABLE_ELEMENTS = new HashSet<String>() {{
+ add("STYLE");
+ add("SCRIPT");
+ }};
+ // For information on tags & attributes, see:
+ // http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#a_dtd_XHTML-1.0-Strict
+ // http://www.w3schools.com/TAGS/
+ private static final Map<String, Set<String>> SAFE_ATTRIBUTES = new HashMap<String, Set<String>>() {{
+ put("a", attrSet("charset", "type", "name", "href", "hreflang", "rel", "rev", "shape", "coords"));
+ put("img", attrSet("src", "alt", "longdesc", "height", "width", "usemap", "ismap"));
+ put("frame", attrSet("longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling"));
+ put("iframe", attrSet("longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width"));
+ put("link", attrSet("charset", "href", "hreflang", "type", "rel", "rev", "media"));
+ put("map", attrSet("id", "class", "style", "title", "name"));
+ put("area", attrSet("shape", "coords", "href", "nohref", "alt"));
+ put("object", attrSet("declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height",
+ "width", "usemap", "name", "tabindex", "align", "border", "hspace", "vspace"));
+ put("param", attrSet("id", "name", "value", "valuetype", "type"));
+ put("blockquote", attrSet("cite"));
+ put("ins", attrSet("cite", "datetime"));
+ put("del", attrSet("cite", "datetime"));
+ put("q", attrSet("cite"));
+
+ // TODO - fill out this set. Include core, i18n, etc sets where appropriate.
+ }};
+
+ private static Set<String> attrSet(String... attrs) {
+ Set<String> result = new HashSet<String>();
+ for (String attr : attrs) {
+ result.add(attr);
+ }
+ return result;
+ }
+
+ public String mapSafeElement(String name) {
+ return SAFE_ELEMENTS.get(name);
+ }
+
+ /**
+ * Normalizes an attribute name. Assumes that the element name
+ * is valid and normalized
+ */
+ public String mapSafeAttribute(String elementName, String attributeName) {
+ Set<String> safeAttrs = SAFE_ATTRIBUTES.get(elementName);
+ if ((safeAttrs != null) && safeAttrs.contains(attributeName)) {
+ return attributeName;
+ } else {
+ return null;
+ }
+ }
+
+ public boolean isDiscardElement(String name) {
+ return DISCARDABLE_ELEMENTS.contains(name);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
index c5bbc7a..d5dfaa6 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
@@ -1,309 +1,309 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Locale;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.sax.TextContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-class HtmlHandler extends TextContentHandler {
-
- // List of attributes that need to be resolved.
- private static final Set<String> URI_ATTRIBUTES =
- new HashSet<String>(Arrays.asList("src", "href", "longdesc", "cite"));
- private static final Pattern ICBM =
- Pattern.compile("\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*");
- private final HtmlMapper mapper;
- private final XHTMLContentHandler xhtml;
- private final Metadata metadata;
- private final StringBuilder title = new StringBuilder();
- private int bodyLevel = 0;
- private int discardLevel = 0;
- private int titleLevel = 0;
- private boolean isTitleSetToMetadata = false;
-
- private HtmlHandler(
- HtmlMapper mapper, XHTMLContentHandler xhtml, Metadata metadata) {
- super(xhtml);
- this.mapper = mapper;
- this.xhtml = xhtml;
- this.metadata = metadata;
-
- // Try to determine the default base URL, if one has not been given
- if (metadata.get(Metadata.CONTENT_LOCATION) == null) {
- String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
- if (name != null) {
- name = name.trim();
- try {
- new URL(name); // test URL format
- metadata.set(Metadata.CONTENT_LOCATION, name);
- } catch (MalformedURLException e) {
- // The resource name is not a valid URL, ignore it
- }
- }
- }
- }
-
- public HtmlHandler(
- HtmlMapper mapper, ContentHandler handler, Metadata metadata) {
- this(mapper, new XHTMLContentHandler(handler, metadata), metadata);
- }
-
- @Override
- public void startElement(
- String uri, String local, String name, Attributes atts)
- throws SAXException {
- if ("TITLE".equals(name) || titleLevel > 0) {
- titleLevel++;
- }
- if ("BODY".equals(name) || ("FRAMESET".equals(name)) || bodyLevel > 0) {
- bodyLevel++;
- }
- if (mapper.isDiscardElement(name) || discardLevel > 0) {
- discardLevel++;
- }
-
- if (bodyLevel == 0 && discardLevel == 0) {
- if ("META".equals(name) && atts.getValue("content") != null) {
- // TIKA-478: For cases where we have either a name or
- // "http-equiv", assume that XHTMLContentHandler will emit
- // these in the <head>, thus passing them through safely.
- if (atts.getValue("http-equiv") != null) {
- addHtmlMetadata(
- atts.getValue("http-equiv"),
- atts.getValue("content"));
- } else if (atts.getValue("name") != null) {
- // Record the meta tag in the metadata
- addHtmlMetadata(
- atts.getValue("name"),
- atts.getValue("content"));
- } else if (atts.getValue("property") != null) {
- // TIKA-983: Handle <meta property="og:xxx" content="yyy" /> tags
- metadata.add(
- atts.getValue("property"),
- atts.getValue("content"));
- }
- } else if ("BASE".equals(name) && atts.getValue("href") != null) {
- startElementWithSafeAttributes("base", atts);
- xhtml.endElement("base");
- metadata.set(
- Metadata.CONTENT_LOCATION,
- resolve(atts.getValue("href")));
- } else if ("LINK".equals(name)) {
- startElementWithSafeAttributes("link", atts);
- xhtml.endElement("link");
- }
- }
-
- if (bodyLevel > 0 && discardLevel == 0) {
- String safe = mapper.mapSafeElement(name);
- if (safe != null) {
- startElementWithSafeAttributes(safe, atts);
- }
- }
-
- title.setLength(0);
- }
-
- /**
- * Adds a metadata setting from the HTML <head/> to the Tika metadata
- * object. The name and value are normalized where possible.
- */
- private void addHtmlMetadata(String name, String value) {
- if (name == null || value == null) {
- // ignore
- } else if (name.equalsIgnoreCase("ICBM")) {
- Matcher m = ICBM.matcher(value);
- if (m.matches()) {
- metadata.set("ICBM", m.group(1) + ", " + m.group(2));
- metadata.set(Metadata.LATITUDE, m.group(1));
- metadata.set(Metadata.LONGITUDE, m.group(2));
- } else {
- metadata.set("ICBM", value);
- }
- } else if (name.equalsIgnoreCase(Metadata.CONTENT_TYPE)) {
- //don't overwrite Metadata.CONTENT_TYPE!
- MediaType type = MediaType.parse(value);
- if (type != null) {
- metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, type.toString());
- } else {
- metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, value);
- }
- } else {
- metadata.add(name, value);
- }
- }
-
- private void startElementWithSafeAttributes(String name, Attributes atts) throws SAXException {
- if (atts.getLength() == 0) {
- xhtml.startElement(name);
- return;
- }
-
- boolean isObject = name.equals("object");
- String codebase = null;
- if (isObject) {
- codebase = atts.getValue("", "codebase");
- if (codebase != null) {
- codebase = resolve(codebase);
- } else {
- codebase = metadata.get(Metadata.CONTENT_LOCATION);
- }
- }
-
- AttributesImpl newAttributes = new AttributesImpl(atts);
- for (int att = 0; att < newAttributes.getLength(); att++) {
- String attrName = newAttributes.getLocalName(att);
- String normAttrName = mapper.mapSafeAttribute(name, attrName);
- if (normAttrName == null) {
- newAttributes.removeAttribute(att);
- att--;
- } else {
- // We have a remapped attribute name, so set it as it might have changed.
- newAttributes.setLocalName(att, normAttrName);
-
- // And resolve relative links. Eventually this should be pushed
- // into the HtmlMapper code.
- if (URI_ATTRIBUTES.contains(normAttrName)) {
- newAttributes.setValue(att, resolve(newAttributes.getValue(att)));
- } else if (isObject && "codebase".equals(normAttrName)) {
- newAttributes.setValue(att, codebase);
- } else if (isObject
- && ("data".equals(normAttrName)
- || "classid".equals(normAttrName))) {
- newAttributes.setValue(
- att,
- resolve(codebase, newAttributes.getValue(att)));
- }
- }
- }
-
- if ("img".equals(name) && newAttributes.getValue("", "alt") == null) {
- newAttributes.addAttribute("", "alt", "alt", "CDATA", "");
- }
-
- xhtml.startElement(name, newAttributes);
- }
-
- @Override
- public void endElement(
- String uri, String local, String name) throws SAXException {
- if (bodyLevel > 0 && discardLevel == 0) {
- String safe = mapper.mapSafeElement(name);
- if (safe != null) {
- xhtml.endElement(safe);
- } else if (XHTMLContentHandler.ENDLINE.contains(
- name.toLowerCase(Locale.ENGLISH))) {
- // TIKA-343: Replace closing block tags (and <br/>) with a
- // newline unless the HtmlMapper above has already mapped
- // them to something else
- xhtml.newline();
- }
- }
-
- if (titleLevel > 0) {
- titleLevel--;
- if (titleLevel == 0 && !isTitleSetToMetadata) {
- metadata.set(TikaCoreProperties.TITLE, title.toString().trim());
- isTitleSetToMetadata = true;
- }
- }
- if (bodyLevel > 0) {
- bodyLevel--;
- }
- if (discardLevel > 0) {
- discardLevel--;
- }
- }
-
- @Override
- public void characters(char[] ch, int start, int length)
- throws SAXException {
- if (titleLevel > 0 && bodyLevel == 0) {
- title.append(ch, start, length);
- }
- if (bodyLevel > 0 && discardLevel == 0) {
- super.characters(ch, start, length);
- }
- }
-
- @Override
- public void ignorableWhitespace(char[] ch, int start, int length)
- throws SAXException {
- if (bodyLevel > 0 && discardLevel == 0) {
- super.ignorableWhitespace(ch, start, length);
- }
- }
-
- private String resolve(String url) {
- return resolve(metadata.get(Metadata.CONTENT_LOCATION), url);
- }
-
- private String resolve(String base, String url) {
- url = url.trim();
-
- // Return the URL as-is if no base URL is available or if the URL
- // matches a common non-hierarchical or pseudo URI prefix
- String lower = url.toLowerCase(Locale.ENGLISH);
- if (base == null
- || lower.startsWith("urn:")
- || lower.startsWith("mailto:")
- || lower.startsWith("tel:")
- || lower.startsWith("data:")
- || lower.startsWith("javascript:")
- || lower.startsWith("about:")) {
- return url;
- }
-
- try {
- URL baseURL = new URL(base.trim());
-
- // We need to handle one special case, where the relativeUrl is
- // just a query string (like "?pid=1"), and the baseUrl doesn't
- // end with a '/'. In that case, the URL class removes the last
- // portion of the path, which we don't want.
- String path = baseURL.getPath();
- if (url.startsWith("?") && path.length() > 0 && !path.endsWith("/")) {
- return new URL(
- baseURL.getProtocol(),
- baseURL.getHost(), baseURL.getPort(),
- baseURL.getPath() + url).toExternalForm();
- } else {
- return new URL(baseURL, url).toExternalForm();
- }
- } catch (MalformedURLException e) {
- // Unknown or broken format; just return the URL as received.
- return url;
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.sax.TextContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * SAX content handler that filters HTML events through an
+ * {@link HtmlMapper} and forwards the result to an
+ * {@link XHTMLContentHandler}. While doing so it copies head information
+ * (meta tags, base href, title) into the supplied {@link Metadata} and
+ * resolves URI-valued attributes against the document's content location.
+ */
+class HtmlHandler extends TextContentHandler {
+
+ // List of attributes that need to be resolved.
+ private static final Set<String> URI_ATTRIBUTES =
+ new HashSet<String>(Arrays.asList("src", "href", "longdesc", "cite"));
+ // Matches an ICBM coordinate pair: two optionally negative decimal numbers
+ // (digits, a dot, digits) separated by commas and/or whitespace.
+ // Group 1 is stored as latitude and group 2 as longitude.
+ private static final Pattern ICBM =
+ Pattern.compile("\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*");
+ private final HtmlMapper mapper;
+ private final XHTMLContentHandler xhtml;
+ private final Metadata metadata;
+ // Buffer for title text; filled in characters() and cleared in startElement().
+ private final StringBuilder title = new StringBuilder();
+ // Nesting depth counters. Each one becomes positive when its triggering
+ // element starts, is then incremented on every nested start tag and
+ // decremented on every end tag while it is positive.
+ private int bodyLevel = 0;
+ private int discardLevel = 0;
+ private int titleLevel = 0;
+ // Guards the title metadata so that it is written at most once.
+ private boolean isTitleSetToMetadata = false;
+
+ /**
+ * Creates a handler that writes to the given XHTML handler. If the
+ * metadata has no content location yet and the resource name parses as
+ * a URL, the trimmed resource name is stored as the content location.
+ *
+ * @param mapper mapper deciding which elements and attributes are kept
+ * @param xhtml target of the filtered events
+ * @param metadata metadata object that is read and updated
+ */
+ private HtmlHandler(
+ HtmlMapper mapper, XHTMLContentHandler xhtml, Metadata metadata) {
+ super(xhtml);
+ this.mapper = mapper;
+ this.xhtml = xhtml;
+ this.metadata = metadata;
+
+ // Try to determine the default base URL, if one has not been given
+ if (metadata.get(Metadata.CONTENT_LOCATION) == null) {
+ String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+ if (name != null) {
+ name = name.trim();
+ try {
+ new URL(name); // test URL format
+ metadata.set(Metadata.CONTENT_LOCATION, name);
+ } catch (MalformedURLException e) {
+ // The resource name is not a valid URL, ignore it
+ }
+ }
+ }
+ }
+
+ /**
+ * Creates a handler that wraps the given content handler in a new
+ * {@link XHTMLContentHandler}.
+ *
+ * @param mapper mapper deciding which elements and attributes are kept
+ * @param handler downstream content handler
+ * @param metadata metadata object that is read and updated
+ */
+ public HtmlHandler(
+ HtmlMapper mapper, ContentHandler handler, Metadata metadata) {
+ this(mapper, new XHTMLContentHandler(handler, metadata), metadata);
+ }
+
+ /**
+ * Updates the nesting counters, then either processes head-level
+ * elements (META, BASE, LINK) or forwards mapped body elements.
+ * Element names are compared against upper-case literals.
+ */
+ @Override
+ public void startElement(
+ String uri, String local, String name, Attributes atts)
+ throws SAXException {
+ if ("TITLE".equals(name) || titleLevel > 0) {
+ titleLevel++;
+ }
+ if ("BODY".equals(name) || ("FRAMESET".equals(name)) || bodyLevel > 0) {
+ bodyLevel++;
+ }
+ if (mapper.isDiscardElement(name) || discardLevel > 0) {
+ discardLevel++;
+ }
+
+ // Outside the body and outside any discarded element: head processing.
+ if (bodyLevel == 0 && discardLevel == 0) {
+ if ("META".equals(name) && atts.getValue("content") != null) {
+ // TIKA-478: For cases where we have either a name or
+ // "http-equiv", assume that XHTMLContentHandler will emit
+ // these in the <head>, thus passing them through safely.
+ if (atts.getValue("http-equiv") != null) {
+ addHtmlMetadata(
+ atts.getValue("http-equiv"),
+ atts.getValue("content"));
+ } else if (atts.getValue("name") != null) {
+ // Record the meta tag in the metadata
+ addHtmlMetadata(
+ atts.getValue("name"),
+ atts.getValue("content"));
+ } else if (atts.getValue("property") != null) {
+ // TIKA-983: Handle <meta property="og:xxx" content="yyy" /> tags
+ metadata.add(
+ atts.getValue("property"),
+ atts.getValue("content"));
+ }
+ } else if ("BASE".equals(name) && atts.getValue("href") != null) {
+ // Emit the base element, then make its resolved href the new
+ // content location used for later URL resolution.
+ startElementWithSafeAttributes("base", atts);
+ xhtml.endElement("base");
+ metadata.set(
+ Metadata.CONTENT_LOCATION,
+ resolve(atts.getValue("href")));
+ } else if ("LINK".equals(name)) {
+ startElementWithSafeAttributes("link", atts);
+ xhtml.endElement("link");
+ }
+ }
+
+ // Inside the body and not discarded: forward elements the mapper accepts.
+ if (bodyLevel > 0 && discardLevel == 0) {
+ String safe = mapper.mapSafeElement(name);
+ if (safe != null) {
+ startElementWithSafeAttributes(safe, atts);
+ }
+ }
+
+ // The title buffer is cleared on every start tag, so it only holds
+ // text seen since the most recent element start.
+ title.setLength(0);
+ }
+
+ /**
+ * Adds a metadata setting from the HTML {@code <head>} to the Tika metadata
+ * object. The name and value are normalized where possible.
+ * A null name or value is ignored. "ICBM" values matching the coordinate
+ * pattern also set latitude and longitude. A content type value is stored
+ * as a content type hint instead of overwriting the content type.
+ */
+ private void addHtmlMetadata(String name, String value) {
+ if (name == null || value == null) {
+ // ignore
+ } else if (name.equalsIgnoreCase("ICBM")) {
+ Matcher m = ICBM.matcher(value);
+ if (m.matches()) {
+ metadata.set("ICBM", m.group(1) + ", " + m.group(2));
+ metadata.set(Metadata.LATITUDE, m.group(1));
+ metadata.set(Metadata.LONGITUDE, m.group(2));
+ } else {
+ metadata.set("ICBM", value);
+ }
+ } else if (name.equalsIgnoreCase(Metadata.CONTENT_TYPE)) {
+ //don't overwrite Metadata.CONTENT_TYPE!
+ MediaType type = MediaType.parse(value);
+ if (type != null) {
+ metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, type.toString());
+ } else {
+ metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, value);
+ }
+ } else {
+ metadata.add(name, value);
+ }
+ }
+
+ /**
+ * Starts the named XHTML element with only those attributes the mapper
+ * accepts. URI attributes are resolved against the content location;
+ * for "object" elements, "data" and "classid" are resolved against the
+ * element's codebase. An "img" element without "alt" gets an empty one.
+ *
+ * @param name XHTML element name to emit
+ * @param atts attributes of the incoming element
+ */
+ private void startElementWithSafeAttributes(String name, Attributes atts) throws SAXException {
+ if (atts.getLength() == 0) {
+ xhtml.startElement(name);
+ return;
+ }
+
+ // For <object>, work out the base for "data"/"classid": the resolved
+ // codebase attribute if present, otherwise the content location.
+ boolean isObject = name.equals("object");
+ String codebase = null;
+ if (isObject) {
+ codebase = atts.getValue("", "codebase");
+ if (codebase != null) {
+ codebase = resolve(codebase);
+ } else {
+ codebase = metadata.get(Metadata.CONTENT_LOCATION);
+ }
+ }
+
+ // Work on a copy so that attributes can be removed and rewritten in place.
+ AttributesImpl newAttributes = new AttributesImpl(atts);
+ for (int att = 0; att < newAttributes.getLength(); att++) {
+ String attrName = newAttributes.getLocalName(att);
+ String normAttrName = mapper.mapSafeAttribute(name, attrName);
+ if (normAttrName == null) {
+ newAttributes.removeAttribute(att);
+ // Step back so the attribute that moved into this index is visited.
+ att--;
+ } else {
+ // We have a remapped attribute name, so set it as it might have changed.
+ newAttributes.setLocalName(att, normAttrName);
+
+ // And resolve relative links. Eventually this should be pushed
+ // into the HtmlMapper code.
+ if (URI_ATTRIBUTES.contains(normAttrName)) {
+ newAttributes.setValue(att, resolve(newAttributes.getValue(att)));
+ } else if (isObject && "codebase".equals(normAttrName)) {
+ newAttributes.setValue(att, codebase);
+ } else if (isObject
+ && ("data".equals(normAttrName)
+ || "classid".equals(normAttrName))) {
+ newAttributes.setValue(
+ att,
+ resolve(codebase, newAttributes.getValue(att)));
+ }
+ }
+ }
+
+ if ("img".equals(name) && newAttributes.getValue("", "alt") == null) {
+ newAttributes.addAttribute("", "alt", "alt", "CDATA", "");
+ }
+
+ xhtml.startElement(name, newAttributes);
+ }
+
+ /**
+ * Closes mapped body elements (or emits a newline for unmapped elements
+ * listed in {@code XHTMLContentHandler.ENDLINE}), stores the title in the
+ * metadata when the title element closes for the first time, and
+ * decrements the nesting counters.
+ */
+ @Override
+ public void endElement(
+ String uri, String local, String name) throws SAXException {
+ if (bodyLevel > 0 && discardLevel == 0) {
+ String safe = mapper.mapSafeElement(name);
+ if (safe != null) {
+ xhtml.endElement(safe);
+ } else if (XHTMLContentHandler.ENDLINE.contains(
+ name.toLowerCase(Locale.ENGLISH))) {
+ // TIKA-343: Replace closing block tags (and <br/>) with a
+ // newline unless the HtmlMapper above has already mapped
+ // them to something else
+ xhtml.newline();
+ }
+ }
+
+ if (titleLevel > 0) {
+ titleLevel--;
+ // Only the first title that closes is recorded.
+ if (titleLevel == 0 && !isTitleSetToMetadata) {
+ metadata.set(TikaCoreProperties.TITLE, title.toString().trim());
+ isTitleSetToMetadata = true;
+ }
+ }
+ if (bodyLevel > 0) {
+ bodyLevel--;
+ }
+ if (discardLevel > 0) {
+ discardLevel--;
+ }
+ }
+
+ /**
+ * Appends text to the title buffer while inside a title element outside
+ * the body, and forwards text that is inside the body and not discarded.
+ */
+ @Override
+ public void characters(char[] ch, int start, int length)
+ throws SAXException {
+ if (titleLevel > 0 && bodyLevel == 0) {
+ title.append(ch, start, length);
+ }
+ if (bodyLevel > 0 && discardLevel == 0) {
+ super.characters(ch, start, length);
+ }
+ }
+
+ /**
+ * Forwards ignorable whitespace only when inside the body and not inside
+ * a discarded element.
+ */
+ @Override
+ public void ignorableWhitespace(char[] ch, int start, int length)
+ throws SAXException {
+ if (bodyLevel > 0 && discardLevel == 0) {
+ super.ignorableWhitespace(ch, start, length);
+ }
+ }
+
+ /**
+ * Resolves the URL against the content location stored in the metadata.
+ *
+ * @param url URL to resolve; must not be null
+ * @return the resolved URL, or the trimmed input if it cannot be resolved
+ */
+ private String resolve(String url) {
+ return resolve(metadata.get(Metadata.CONTENT_LOCATION), url);
+ }
+
+ /**
+ * Resolves the URL against the given base.
+ *
+ * @param base base URL, or null if none is known
+ * @param url URL to resolve; must not be null (it is trimmed first)
+ * @return the resolved URL in external form; the trimmed input when the
+ * base is null, the URL starts with one of the listed
+ * non-hierarchical prefixes, or URL construction fails
+ */
+ private String resolve(String base, String url) {
+ url = url.trim();
+
+ // Return the URL as-is if no base URL is available or if the URL
+ // matches a common non-hierarchical or pseudo URI prefix
+ String lower = url.toLowerCase(Locale.ENGLISH);
+ if (base == null
+ || lower.startsWith("urn:")
+ || lower.startsWith("mailto:")
+ || lower.startsWith("tel:")
+ || lower.startsWith("data:")
+ || lower.startsWith("javascript:")
+ || lower.startsWith("about:")) {
+ return url;
+ }
+
+ try {
+ URL baseURL = new URL(base.trim());
+
+ // We need to handle one special case, where the relativeUrl is
+ // just a query string (like "?pid=1"), and the baseUrl doesn't
+ // end with a '/'. In that case, the URL class removes the last
+ // portion of the path, which we don't want.
+ String path = baseURL.getPath();
+ if (url.startsWith("?") && path.length() > 0 && !path.endsWith("/")) {
+ return new URL(
+ baseURL.getProtocol(),
+ baseURL.getHost(), baseURL.getPort(),
+ baseURL.getPath() + url).toExternalForm();
+ } else {
+ return new URL(baseURL, url).toExternalForm();
+ }
+ } catch (MalformedURLException e) {
+ // Unknown or broken format; just return the URL as received.
+ return url;
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
index 947d26a..1ca7434 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
@@ -1,69 +1,69 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-/**
- * HTML mapper used to make incoming HTML documents easier to handle by
- * Tika clients. The {@link HtmlParser} looks up an optional HTML mapper from
- * the parse context and uses it to map parsed HTML to "safe" XHTML. A client
- * that wants to customize this mapping can place a custom HtmlMapper instance
- * into the parse context.
- *
- * @since Apache Tika 0.6
- */
-public interface HtmlMapper {
-
- /**
- * Maps "safe" HTML element names to semantic XHTML equivalents. If the
- * given element is unknown or deemed unsafe for inclusion in the parse
- * output, then this method returns <code>null</code> and the element
- * will be ignored but the content inside it is still processed. See
- * the {@link #isDiscardElement(String)} method for a way to discard
- * the entire contents of an element.
- *
- * @param name HTML element name (upper case)
- * @return XHTML element name (lower case), or
- * <code>null</code> if the element is unsafe
- */
- String mapSafeElement(String name);
-
- /**
- * Checks whether all content within the given HTML element should be
- * discarded instead of including it in the parse output.
- *
- * @param name HTML element name (upper case)
- * @return <code>true</code> if content inside the named element
- * should be ignored, <code>false</code> otherwise
- */
- boolean isDiscardElement(String name);
-
-
- /**
- * Maps "safe" HTML attribute names to semantic XHTML equivalents. If the
- * given attribute is unknown or deemed unsafe for inclusion in the parse
- * output, then this method returns <code>null</code> and the attribute
- * will be ignored. This method assumes that the element name
- * is valid and normalised.
- *
- * @param elementName HTML element name (lower case)
- * @param attributeName HTML attribute name (lower case)
- * @return XHTML attribute name (lower case), or
- * <code>null</code> if the element is unsafe
- */
- String mapSafeAttribute(String elementName, String attributeName);
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+/**
+ * HTML mapper used to make incoming HTML documents easier to handle by
+ * Tika clients. The {@link HtmlParser} looks up an optional HTML mapper from
+ * the parse context and uses it to map parsed HTML to "safe" XHTML. A client
+ * that wants to customize this mapping can place a custom HtmlMapper instance
+ * into the parse context.
+ *
+ * @since Apache Tika 0.6
+ */
+public interface HtmlMapper {
+
+ /**
+ * Maps "safe" HTML element names to semantic XHTML equivalents. If the
+ * given element is unknown or deemed unsafe for inclusion in the parse
+ * output, then this method returns <code>null</code> and the element
+ * will be ignored but the content inside it is still processed. See
+ * the {@link #isDiscardElement(String)} method for a way to discard
+ * the entire contents of an element.
+ *
+ * @param name HTML element name (upper case)
+ * @return XHTML element name (lower case), or
+ * <code>null</code> if the element is unsafe
+ */
+ String mapSafeElement(String name);
+
+ /**
+ * Checks whether all content within the given HTML element should be
+ * discarded instead of including it in the parse output.
+ *
+ * @param name HTML element name (upper case)
+ * @return <code>true</code> if content inside the named element
+ * should be ignored, <code>false</code> otherwise
+ */
+ boolean isDiscardElement(String name);
+
+
+ /**
+ * Maps "safe" HTML attribute names to semantic XHTML equivalents. If the
+ * given attribute is unknown or deemed unsafe for inclusion in the parse
+ * output, then this method returns <code>null</code> and the attribute
+ * will be ignored. This method assumes that the element name
+ * is valid and normalised.
+ *
+ * @param elementName HTML element name (lower case)
+ * @param attributeName HTML attribute name (lower case)
+ * @return XHTML attribute name (lower case), or
+ * <code>null</code> if the attribute is unknown or unsafe
+ */
+ String mapSafeAttribute(String elementName, String attributeName);
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java
index 7d6f021..a9a8aa0 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java
@@ -1,194 +1,194 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.Charset;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Set;
-
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.config.ServiceLoader;
-import org.apache.tika.detect.AutoDetectReader;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.ccil.cowan.tagsoup.HTMLSchema;
-import org.ccil.cowan.tagsoup.Schema;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * HTML parser. Uses TagSoup to turn the input document to HTML SAX events,
- * and post-processes the events to produce XHTML and metadata expected by
- * Tika clients.
- */
-public class HtmlParser extends AbstractParser {
-
- /**
- * Serial version UID
- */
- private static final long serialVersionUID = 7895315240498733128L;
-
- private static final MediaType XHTML = MediaType.application("xhtml+xml");
- private static final MediaType WAP_XHTML = MediaType.application("vnd.wap.xhtml+xml");
- private static final MediaType X_ASP = MediaType.application("x-asp");
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- MediaType.text("html"),
- XHTML,
- WAP_XHTML,
- X_ASP)));
-
- private static final ServiceLoader LOADER =
- new ServiceLoader(HtmlParser.class.getClassLoader());
-
- /**
- * HTML schema singleton used to amortise the heavy instantiation time.
- */
- private static final Schema HTML_SCHEMA = new HTMLSchema();
-
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- // Automatically detect the character encoding
- try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream),
- metadata,context.get(ServiceLoader.class, LOADER))) {
- Charset charset = reader.getCharset();
- String previous = metadata.get(Metadata.CONTENT_TYPE);
- MediaType contentType = null;
- if (previous == null || previous.startsWith("text/html")) {
- contentType = new MediaType(MediaType.TEXT_HTML, charset);
- } else if (previous.startsWith("application/xhtml+xml")) {
- contentType = new MediaType(XHTML, charset);
- } else if (previous.startsWith("application/vnd.wap.xhtml+xml")) {
- contentType = new MediaType(WAP_XHTML, charset);
- } else if (previous.startsWith("application/x-asp")) {
- contentType = new MediaType(X_ASP, charset);
- }
- if (contentType != null) {
- metadata.set(Metadata.CONTENT_TYPE, contentType.toString());
- }
- // deprecated, see TIKA-431
- metadata.set(Metadata.CONTENT_ENCODING, charset.name());
-
- // Get the HTML mapper from the parse context
- HtmlMapper mapper =
- context.get(HtmlMapper.class, new HtmlParserMapper());
-
- // Parse the HTML document
- org.ccil.cowan.tagsoup.Parser parser =
- new org.ccil.cowan.tagsoup.Parser();
-
- // Use schema from context or default
- Schema schema = context.get(Schema.class, HTML_SCHEMA);
-
- // TIKA-528: Reuse share schema to avoid heavy instantiation
- parser.setProperty(
- org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
- // TIKA-599: Shared schema is thread-safe only if bogons are ignored
- parser.setFeature(
- org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
-
- parser.setContentHandler(new XHTMLDowngradeHandler(
- new HtmlHandler(mapper, handler, metadata)));
-
- parser.parse(reader.asInputSource());
- }
- }
-
- /**
- * Maps "safe" HTML element names to semantic XHTML equivalents. If the
- * given element is unknown or deemed unsafe for inclusion in the parse
- * output, then this method returns <code>null</code> and the element
- * will be ignored but the content inside it is still processed. See
- * the {@link #isDiscardElement(String)} method for a way to discard
- * the entire contents of an element.
- * <p/>
- * Subclasses can override this method to customize the default mapping.
- *
- * @param name HTML element name (upper case)
- * @return XHTML element name (lower case), or
- * <code>null</code> if the element is unsafe
- * @since Apache Tika 0.5
- * @deprecated Use the {@link HtmlMapper} mechanism to customize
- * the HTML mapping. This method will be removed in Tika 1.0.
- */
- protected String mapSafeElement(String name) {
- return DefaultHtmlMapper.INSTANCE.mapSafeElement(name);
- }
-
- /**
- * Checks whether all content within the given HTML element should be
- * discarded instead of including it in the parse output. Subclasses
- * can override this method to customize the set of discarded elements.
- *
- * @param name HTML element name (upper case)
- * @return <code>true</code> if content inside the named element
- * should be ignored, <code>false</code> otherwise
- * @since Apache Tika 0.5
- * @deprecated Use the {@link HtmlMapper} mechanism to customize
- * the HTML mapping. This method will be removed in Tika 1.0.
- */
- protected boolean isDiscardElement(String name) {
- return DefaultHtmlMapper.INSTANCE.isDiscardElement(name);
- }
-
- /**
- * @deprecated Use the {@link HtmlMapper} mechanism to customize
- * the HTML mapping. This method will be removed in Tika 1.0.
- */
- public String mapSafeAttribute(String elementName, String attributeName) {
- return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName, attributeName);
- }
-
- /**
- * Adapter class that maintains backwards compatibility with the
- * protected HtmlParser methods. Making HtmlParser implement HtmlMapper
- * directly would require those methods to be public, which would break
- * backwards compatibility with subclasses.
- *
- * @deprecated Use the {@link HtmlMapper} mechanism to customize
- * the HTML mapping. This class will be removed in Tika 1.0.
- */
- private class HtmlParserMapper implements HtmlMapper {
- public String mapSafeElement(String name) {
- return HtmlParser.this.mapSafeElement(name);
- }
-
- public boolean isDiscardElement(String name) {
- return HtmlParser.this.isDiscardElement(name);
- }
-
- public String mapSafeAttribute(String elementName, String attributeName) {
- return HtmlParser.this.mapSafeAttribute(elementName, attributeName);
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.ccil.cowan.tagsoup.HTMLSchema;
+import org.ccil.cowan.tagsoup.Schema;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * HTML parser. Uses TagSoup to turn the input document to HTML SAX events,
+ * and post-processes the events to produce XHTML and metadata expected by
+ * Tika clients.
+ * <p>
+ * The element and attribute mapping applied to the output comes from the
+ * {@link HtmlMapper} found in the parse context; when the context has none,
+ * an adapter around the deprecated mapping methods of this class is used.
+ */
+public class HtmlParser extends AbstractParser {
+
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = 7895315240498733128L;
+
+ // Media types, in addition to text/html, that this parser declares support for.
+ private static final MediaType XHTML = MediaType.application("xhtml+xml");
+ private static final MediaType WAP_XHTML = MediaType.application("vnd.wap.xhtml+xml");
+ private static final MediaType X_ASP = MediaType.application("x-asp");
+
+ /**
+ * Unmodifiable set of the media types reported by
+ * {@link #getSupportedTypes(ParseContext)}.
+ */
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.text("html"),
+ XHTML,
+ WAP_XHTML,
+ X_ASP)));
+
+ /**
+ * Default service loader passed to the {@link AutoDetectReader} when the
+ * parse context does not supply one.
+ */
+ private static final ServiceLoader LOADER =
+ new ServiceLoader(HtmlParser.class.getClassLoader());
+
+ /**
+ * HTML schema singleton used to amortise the heavy instantiation time.
+ */
+ private static final Schema HTML_SCHEMA = new HTMLSchema();
+
+
+ /**
+ * Returns the media types handled by this parser: text/html,
+ * application/xhtml+xml, application/vnd.wap.xhtml+xml and
+ * application/x-asp.
+ *
+ * @param context parse context (not consulted)
+ * @return unmodifiable set of supported media types
+ */
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ /**
+ * Parses an HTML document. The character encoding is detected first and
+ * recorded in the metadata, then TagSoup parses the decoded text and its
+ * SAX events are passed through {@link XHTMLDowngradeHandler} and
+ * {@link HtmlHandler} to the given content handler.
+ *
+ * @param stream document stream; wrapped in a CloseShieldInputStream,
+ * presumably so that closing the reader leaves it open
+ * for the caller (confirm against commons-io)
+ * @param handler receiver of the resulting SAX events
+ * @param metadata document metadata; the content type (for HTML-family
+ * types only) and content encoding entries are updated
+ * @param context parse context, consulted for a {@link ServiceLoader},
+ * an {@link HtmlMapper} and a TagSoup {@link Schema}
+ * @throws IOException declared for failures reading the stream
+ * @throws SAXException declared for failures raised while emitting SAX events
+ * @throws TikaException declared for other parse failures
+ */
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ // Automatically detect the character encoding
+ try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream),
+ metadata,context.get(ServiceLoader.class, LOADER))) {
+ Charset charset = reader.getCharset();
+ // Re-issue the content type with the detected charset, but only when
+ // the existing value is absent or one of the supported HTML-family
+ // types; any other existing content type is left untouched.
+ String previous = metadata.get(Metadata.CONTENT_TYPE);
+ MediaType contentType = null;
+ if (previous == null || previous.startsWith("text/html")) {
+ contentType = new MediaType(MediaType.TEXT_HTML, charset);
+ } else if (previous.startsWith("application/xhtml+xml")) {
+ contentType = new MediaType(XHTML, charset);
+ } else if (previous.startsWith("application/vnd.wap.xhtml+xml")) {
+ contentType = new MediaType(WAP_XHTML, charset);
+ } else if (previous.startsWith("application/x-asp")) {
+ contentType = new MediaType(X_ASP, charset);
+ }
+ if (contentType != null) {
+ metadata.set(Metadata.CONTENT_TYPE, contentType.toString());
+ }
+ // deprecated, see TIKA-431
+ metadata.set(Metadata.CONTENT_ENCODING, charset.name());
+
+ // Get the HTML mapper from the parse context; the fallback adapter
+ // routes to this instance's (possibly overridden) mapping methods
+ HtmlMapper mapper =
+ context.get(HtmlMapper.class, new HtmlParserMapper());
+
+ // Parse the HTML document
+ org.ccil.cowan.tagsoup.Parser parser =
+ new org.ccil.cowan.tagsoup.Parser();
+
+ // Use schema from context or default
+ Schema schema = context.get(Schema.class, HTML_SCHEMA);
+
+ // TIKA-528: Reuse the shared schema to avoid heavy instantiation
+ parser.setProperty(
+ org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
+ // TIKA-599: Shared schema is thread-safe only if bogons are ignored
+ parser.setFeature(
+ org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
+
+ parser.setContentHandler(new XHTMLDowngradeHandler(
+ new HtmlHandler(mapper, handler, metadata)));
+
+ parser.parse(reader.asInputSource());
+ }
+ }
+
+ /**
+ * Maps "safe" HTML element names to semantic XHTML equivalents. If the
+ * given element is unknown or deemed unsafe for inclusion in the parse
+ * output, then this method returns <code>null</code> and the element
+ * will be ignored but the content inside it is still processed. See
+ * the {@link #isDiscardElement(String)} method for a way to discard
+ * the entire contents of an element.
+ * <p>
+ * Subclasses can override this method to customize the default mapping.
+ * The default implementation delegates to {@link DefaultHtmlMapper#INSTANCE}.
+ * <p>
+ * NOTE(review): the "removed in Tika 1.0" wording below looks stale for
+ * this branch - confirm the intended removal release.
+ *
+ * @param name HTML element name (upper case)
+ * @return XHTML element name (lower case), or
+ * <code>null</code> if the element is unsafe
+ * @since Apache Tika 0.5
+ * @deprecated Use the {@link HtmlMapper} mechanism to customize
+ * the HTML mapping. This method will be removed in Tika 1.0.
+ */
+ protected String mapSafeElement(String name) {
+ return DefaultHtmlMapper.INSTANCE.mapSafeElement(name);
+ }
+
+ /**
+ * Checks whether all content within the given HTML element should be
+ * discarded instead of including it in the parse output. Subclasses
+ * can override this method to customize the set of discarded elements.
+ * The default implementation delegates to {@link DefaultHtmlMapper#INSTANCE}.
+ *
+ * @param name HTML element name (upper case)
+ * @return <code>true</code> if content inside the named element
+ * should be ignored, <code>false</code> otherwise
+ * @since Apache Tika 0.5
+ * @deprecated Use the {@link HtmlMapper} mechanism to customize
+ * the HTML mapping. This method will be removed in Tika 1.0.
+ */
+ protected boolean isDiscardElement(String name) {
+ return DefaultHtmlMapper.INSTANCE.isDiscardElement(name);
+ }
+
+ /**
+ * Maps an HTML attribute name to its XHTML equivalent by delegating to
+ * {@link DefaultHtmlMapper#INSTANCE}.
+ *
+ * @param elementName HTML element name
+ * @param attributeName HTML attribute name
+ * @return the mapping produced by the default mapper
+ * @deprecated Use the {@link HtmlMapper} mechanism to customize
+ * the HTML mapping. This method will be removed in Tika 1.0.
+ */
+ public String mapSafeAttribute(String elementName, String attributeName) {
+ return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName, attributeName);
+ }
+
+ /**
+ * Adapter class that maintains backwards compatibility with the
+ * protected HtmlParser methods. Making HtmlParser implement HtmlMapper
+ * directly would require those methods to be public, which would break
+ * backwards compatibility with subclasses.
+ * <p>
+ * This is an inner (non-static) class because every call is forwarded
+ * to the enclosing parser instance, so subclass overrides take effect.
+ *
+ * @deprecated Use the {@link HtmlMapper} mechanism to customize
+ * the HTML mapping. This class will be removed in Tika 1.0.
+ */
+ private class HtmlParserMapper implements HtmlMapper {
+ public String mapSafeElement(String name) {
+ return HtmlParser.this.mapSafeElement(name);
+ }
+
+ public boolean isDiscardElement(String name) {
+ return HtmlParser.this.isDiscardElement(name);
+ }
+
+ public String mapSafeAttribute(String elementName, String attributeName) {
+ return HtmlParser.this.mapSafeAttribute(elementName, attributeName);
+ }
+ }
+
+}
[28/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
index 1630edd..9d0a2f0 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
@@ -1,398 +1,398 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.accessor;
-
-import java.math.BigInteger;
-import java.util.ArrayList;
-import java.util.List;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.apache.tika.parser.chm.exception.ChmParsingException;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-/**
- * Holds chm listing entries
- */
-public class ChmDirectoryListingSet {
- private List<DirectoryListingEntry> dlel;
- private byte[] data;
- private int placeHolder = -1;
- private long dataOffset = -1;
- private int controlDataIndex = -1;
- private int resetTableIndex = -1;
-
- private boolean isNotControlDataFound = true;
- private boolean isNotResetTableFound = true;
-
- /**
- * Constructs chm directory listing set
- *
- * @param data
- * byte[]
- * @param chmItsHeader
- * @param chmItspHeader
- * @throws TikaException
- */
- public ChmDirectoryListingSet(byte[] data, ChmItsfHeader chmItsHeader,
- ChmItspHeader chmItspHeader) throws TikaException {
- setDirectoryListingEntryList(new ArrayList<DirectoryListingEntry>());
- ChmCommons.assertByteArrayNotNull(data);
- setData(data);
- enumerateChmDirectoryListingList(chmItsHeader, chmItspHeader);
- }
-
- public String toString() {
- StringBuilder sb = new StringBuilder();
- sb.append("list:=" + getDirectoryListingEntryList().toString()
- + System.getProperty("line.separator"));
- sb.append("number of list items:="
- + getDirectoryListingEntryList().size());
- return sb.toString();
- }
-
- /**
- * Returns control data index that located in List
- *
- * @return control data index
- */
- public int getControlDataIndex() {
- return controlDataIndex;
- }
-
- /**
- * Sets control data index
- *
- * @param controlDataIndex
- */
- protected void setControlDataIndex(int controlDataIndex) {
- this.controlDataIndex = controlDataIndex;
- }
-
- /**
- * Return index of reset table
- *
- * @return reset table index
- */
- public int getResetTableIndex() {
- return resetTableIndex;
- }
-
- /**
- * Sets reset table index
- *
- * @param resetTableIndex
- */
- protected void setResetTableIndex(int resetTableIndex) {
- this.resetTableIndex = resetTableIndex;
- }
-
- /**
- * Sets place holder
- *
- * @param placeHolder
- */
- private void setPlaceHolder(int placeHolder) {
- this.placeHolder = placeHolder;
- }
-
- private ChmPmglHeader PMGLheader;
- /**
- * Enumerates chm directory listing entries
- *
- * @param chmItsHeader
- * chm itsf PMGLheader
- * @param chmItspHeader
- * chm itsp PMGLheader
- */
- private void enumerateChmDirectoryListingList(ChmItsfHeader chmItsHeader,
- ChmItspHeader chmItspHeader) {
- try {
- int startPmgl = chmItspHeader.getIndex_head();
- int stopPmgl = chmItspHeader.getUnknown_0024();
- int dir_offset = (int) (chmItsHeader.getDirOffset() + chmItspHeader
- .getHeader_len());
- setDataOffset(chmItsHeader.getDataOffset());
-
- /* loops over all pmgls */
- byte[] dir_chunk = null;
- for (int i = startPmgl; i>=0; ) {
- dir_chunk = new byte[(int) chmItspHeader.getBlock_len()];
- int start = i * (int) chmItspHeader.getBlock_len() + dir_offset;
- dir_chunk = ChmCommons
- .copyOfRange(getData(), start,
- start +(int) chmItspHeader.getBlock_len());
-
- PMGLheader = new ChmPmglHeader();
- PMGLheader.parse(dir_chunk, PMGLheader);
- enumerateOneSegment(dir_chunk);
-
- i=PMGLheader.getBlockNext();
- dir_chunk = null;
- }
- } catch (Exception e) {
- e.printStackTrace();
- } finally {
- setData(null);
- }
- }
-
- /**
- * Checks control data
- *
- * @param dle
- * chm directory listing entry
- */
- private void checkControlData(DirectoryListingEntry dle) {
- if (isNotControlDataFound) {
- if (dle.getName().contains(ChmConstants.CONTROL_DATA)) {
- setControlDataIndex(getDirectoryListingEntryList().size());
- isNotControlDataFound = false;
- }
- }
- }
-
- /**
- * Checks reset table
- *
- * @param dle
- * chm directory listing entry
- */
- private void checkResetTable(DirectoryListingEntry dle) {
- if (isNotResetTableFound) {
- if (dle.getName().contains(ChmConstants.RESET_TABLE)) {
- setResetTableIndex(getDirectoryListingEntryList().size());
- isNotResetTableFound = false;
- }
- }
- }
-
- public static final boolean startsWith(byte[] data, String prefix) {
- for (int i=0; i<prefix.length(); i++) {
- if (data[i]!=prefix.charAt(i)) {
- return false;
- }
- }
-
- return true;
- }
- /**
- * Enumerates chm directory listing entries in single chm segment
- *
- * @param dir_chunk
- */
- private void enumerateOneSegment(byte[] dir_chunk) throws ChmParsingException {
-// try {
- if (dir_chunk != null) {
- int header_len;
- if (startsWith(dir_chunk, ChmConstants.CHM_PMGI_MARKER)) {
- header_len = ChmConstants.CHM_PMGI_LEN;
- return; //skip PMGI
- }
- else if (startsWith(dir_chunk, ChmConstants.PMGL)) {
- header_len = ChmConstants.CHM_PMGL_LEN;
- }
- else {
- throw new ChmParsingException("Bad dir entry block.");
- }
-
- placeHolder = header_len;
- //setPlaceHolder(header_len);
- while (placeHolder > 0 && placeHolder < dir_chunk.length - PMGLheader.getFreeSpace()
- /*&& dir_chunk[placeHolder - 1] != 115*/)
- {
- //get entry name length
- int strlen = 0;// = getEncint(data);
- byte temp;
- while ((temp=dir_chunk[placeHolder++]) >= 0x80)
- {
- strlen <<= 7;
- strlen += temp & 0x7f;
- }
-
- strlen = (strlen << 7) + temp & 0x7f;
-
- if (strlen>dir_chunk.length) {
- throw new ChmParsingException("Bad data of a string length.");
- }
-
- DirectoryListingEntry dle = new DirectoryListingEntry();
- dle.setNameLength(strlen);
- dle.setName(new String(ChmCommons.copyOfRange(
- dir_chunk, placeHolder,
- (placeHolder + dle.getNameLength())), UTF_8));
-
- checkControlData(dle);
- checkResetTable(dle);
- setPlaceHolder(placeHolder
- + dle.getNameLength());
-
- /* Sets entry type */
- if (placeHolder < dir_chunk.length
- && dir_chunk[placeHolder] == 0)
- dle.setEntryType(ChmCommons.EntryType.UNCOMPRESSED);
- else
- dle.setEntryType(ChmCommons.EntryType.COMPRESSED);
-
- setPlaceHolder(placeHolder + 1);
- dle.setOffset(getEncint(dir_chunk));
- dle.setLength(getEncint(dir_chunk));
- getDirectoryListingEntryList().add(dle);
- }
-
-// int indexWorkData = ChmCommons.indexOf(dir_chunk,
-// "::".getBytes(UTF_8));
-// int indexUserData = ChmCommons.indexOf(dir_chunk,
-// "/".getBytes(UTF_8));
-//
-// if (indexUserData>=0 && indexUserData < indexWorkData)
-// setPlaceHolder(indexUserData);
-// else if (indexWorkData>=0) {
-// setPlaceHolder(indexWorkData);
-// }
-// else {
-// setPlaceHolder(indexUserData);
-// }
-//
-// if (placeHolder > 0 && placeHolder < dir_chunk.length - PMGLheader.getFreeSpace()
-// && dir_chunk[placeHolder - 1] != 115) {// #{
-// do {
-// if (dir_chunk[placeHolder - 1] > 0) {
-// DirectoryListingEntry dle = new DirectoryListingEntry();
-//
-// // two cases: 1. when dir_chunk[placeHolder -
-// // 1] == 0x73
-// // 2. when dir_chunk[placeHolder + 1] == 0x2f
-// doNameCheck(dir_chunk, dle);
-//
-// // dle.setName(new
-// // String(Arrays.copyOfRange(dir_chunk,
-// // placeHolder, (placeHolder +
-// // dle.getNameLength()))));
-// dle.setName(new String(ChmCommons.copyOfRange(
-// dir_chunk, placeHolder,
-// (placeHolder + dle.getNameLength())), UTF_8));
-// checkControlData(dle);
-// checkResetTable(dle);
-// setPlaceHolder(placeHolder
-// + dle.getNameLength());
-//
-// /* Sets entry type */
-// if (placeHolder < dir_chunk.length
-// && dir_chunk[placeHolder] == 0)
-// dle.setEntryType(ChmCommons.EntryType.UNCOMPRESSED);
-// else
-// dle.setEntryType(ChmCommons.EntryType.COMPRESSED);
-//
-// setPlaceHolder(placeHolder + 1);
-// dle.setOffset(getEncint(dir_chunk));
-// dle.setLength(getEncint(dir_chunk));
-// getDirectoryListingEntryList().add(dle);
-// } else
-// setPlaceHolder(placeHolder + 1);
-//
-// } while (nextEntry(dir_chunk));
-// }
- }
-
-// } catch (Exception e) {
-// e.printStackTrace();
-// }
- }
-
-
- /**
- * Returns encrypted integer
- *
- * @param data_chunk
- *
- * @return
- */
- private int getEncint(byte[] data_chunk) {
- byte ob;
- BigInteger bi = BigInteger.ZERO;
- byte[] nb = new byte[1];
-
- if (placeHolder < data_chunk.length) {
- while ((ob = data_chunk[placeHolder]) < 0) {
- nb[0] = (byte) ((ob & 0x7f));
- bi = bi.shiftLeft(7).add(new BigInteger(nb));
- setPlaceHolder(placeHolder + 1);
- }
- nb[0] = (byte) ((ob & 0x7f));
- bi = bi.shiftLeft(7).add(new BigInteger(nb));
- setPlaceHolder(placeHolder + 1);
- }
- return bi.intValue();
- }
-
- /**
- * Sets chm directory listing entry list
- *
- * @param dlel
- * chm directory listing entry list
- */
- public void setDirectoryListingEntryList(List<DirectoryListingEntry> dlel) {
- this.dlel = dlel;
- }
-
- /**
- * Returns chm directory listing entry list
- *
- * @return List<DirectoryListingEntry>
- */
- public List<DirectoryListingEntry> getDirectoryListingEntryList() {
- return dlel;
- }
-
- /**
- * Sets data
- *
- * @param data
- */
- private void setData(byte[] data) {
- this.data = data;
- }
-
- /**
- * Returns data
- *
- * @return
- */
- private byte[] getData() {
- return data;
- }
-
- /**
- * Sets data offset
- *
- * @param dataOffset
- */
- private void setDataOffset(long dataOffset) {
- this.dataOffset = dataOffset;
- }
-
- /**
- * Returns data offset
- *
- * @return dataOffset
- */
- public long getDataOffset() {
- return dataOffset;
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.math.BigInteger;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Holds chm listing entries
+ */
+public class ChmDirectoryListingSet {
+ private List<DirectoryListingEntry> dlel;
+ private byte[] data;
+ private int placeHolder = -1;
+ private long dataOffset = -1;
+ private int controlDataIndex = -1;
+ private int resetTableIndex = -1;
+
+ private boolean isNotControlDataFound = true;
+ private boolean isNotResetTableFound = true;
+
+ /**
+ * Constructs a chm directory listing set by walking the directory
+ * chunks found in the given data and collecting their entries.
+ * <p>
+ * Errors raised while enumerating the chunks are not propagated: the
+ * enumeration prints the stack trace and stops, so the resulting list
+ * may be empty or partial. The reference to <code>data</code> held by
+ * this object is cleared once enumeration has finished.
+ *
+ * @param data
+ * raw bytes containing the directory chunks; checked with
+ * ChmCommons.assertByteArrayNotNull before use
+ * @param chmItsHeader
+ * ITSF header, read for the directory offset and data offset
+ * @param chmItspHeader
+ * ITSP header, read for the header length, block length and
+ * index of the first chunk
+ * @throws TikaException
+ * presumably when <code>data</code> fails the not-null check
+ * (confirm against ChmCommons)
+ */
+ public ChmDirectoryListingSet(byte[] data, ChmItsfHeader chmItsHeader,
+ ChmItspHeader chmItspHeader) throws TikaException {
+ // The list must exist before enumeration starts adding entries to it
+ setDirectoryListingEntryList(new ArrayList<DirectoryListingEntry>());
+ ChmCommons.assertByteArrayNotNull(data);
+ setData(data);
+ enumerateChmDirectoryListingList(chmItsHeader, chmItspHeader);
+ }
+
+    /**
+     * Renders the entry list followed, on a new line, by the number of
+     * entries it holds.
+     *
+     * @return textual summary of the directory listing
+     */
+    public String toString() {
+        List<DirectoryListingEntry> entries = getDirectoryListingEntryList();
+        String lineBreak = System.getProperty("line.separator");
+        return "list:=" + entries.toString() + lineBreak
+                + "number of list items:=" + entries.size();
+    }
+
+ /**
+ * Returns the position, within the directory listing entry list, of the
+ * first entry whose name contains {@link ChmConstants#CONTROL_DATA}.
+ *
+ * @return control data index; the field starts at -1 and keeps that
+ * value until such an entry is recorded
+ */
+ public int getControlDataIndex() {
+ return controlDataIndex;
+ }
+
+ /**
+ * Sets the control data index. Called during enumeration with the
+ * current size of the entry list when the control data entry is found.
+ *
+ * @param controlDataIndex
+ * position of the control data entry in the entry list
+ */
+ protected void setControlDataIndex(int controlDataIndex) {
+ this.controlDataIndex = controlDataIndex;
+ }
+
+ /**
+ * Returns the position, within the directory listing entry list, of the
+ * first entry whose name contains {@link ChmConstants#RESET_TABLE}.
+ *
+ * @return reset table index; the field starts at -1 and keeps that
+ * value until such an entry is recorded
+ */
+ public int getResetTableIndex() {
+ return resetTableIndex;
+ }
+
+ /**
+ * Sets the reset table index. Called during enumeration with the
+ * current size of the entry list when the reset table entry is found.
+ *
+ * @param resetTableIndex
+ * position of the reset table entry in the entry list
+ */
+ protected void setResetTableIndex(int resetTableIndex) {
+ this.resetTableIndex = resetTableIndex;
+ }
+
+ /**
+ * Sets the place holder, i.e. the read position inside the directory
+ * chunk that is currently being enumerated.
+ *
+ * @param placeHolder
+ * new read position (byte index into the current chunk)
+ */
+ private void setPlaceHolder(int placeHolder) {
+ this.placeHolder = placeHolder;
+ }
+
+ private ChmPmglHeader PMGLheader;
+    /**
+     * Enumerates chm directory listing entries by following the chain of
+     * directory chunks: starting at the chunk named by the ITSP header,
+     * each chunk is copied out of the data, its header is parsed, its
+     * entries are collected, and the header's "next block" value selects
+     * the following chunk. A negative next-block value ends the walk.
+     * <p>
+     * The walk is capped at the number of chunks that can fit in the data,
+     * so a corrupt file whose chunks link back to each other cannot make
+     * this method loop forever.
+     * <p>
+     * Any failure is reported with printStackTrace and ends the walk; the
+     * entries collected so far are kept. NOTE(review): this best-effort
+     * behaviour is retained for compatibility - consider propagating the
+     * error to the caller instead. The data reference is always cleared
+     * when the walk ends.
+     *
+     * @param chmItsHeader
+     *            chm itsf header (directory offset and data offset)
+     * @param chmItspHeader
+     *            chm itsp header (header length, block length, first chunk)
+     */
+    private void enumerateChmDirectoryListingList(ChmItsfHeader chmItsHeader,
+            ChmItspHeader chmItspHeader) {
+        try {
+            final int blockLen = (int) chmItspHeader.getBlock_len();
+            final int dirOffset = (int) (chmItsHeader.getDirOffset() + chmItspHeader
+                    .getHeader_len());
+            setDataOffset(chmItsHeader.getDataOffset());
+
+            // A chain without repeats cannot visit more chunks than fit in the data.
+            final int maxBlocks = blockLen > 0 ? getData().length / blockLen + 1 : 1;
+            int visitedBlocks = 0;
+
+            /* loops over all pmgls */
+            for (int i = chmItspHeader.getIndex_head(); i >= 0; i = PMGLheader.getBlockNext()) {
+                if (visitedBlocks++ >= maxBlocks) {
+                    throw new ChmParsingException(
+                            "Cyclic chain of directory blocks.");
+                }
+                int start = i * blockLen + dirOffset;
+                byte[] dirChunk = ChmCommons.copyOfRange(getData(), start,
+                        start + blockLen);
+
+                PMGLheader = new ChmPmglHeader();
+                PMGLheader.parse(dirChunk, PMGLheader);
+                enumerateOneSegment(dirChunk);
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
+        } finally {
+            // Release the (potentially large) input once enumeration is over
+            setData(null);
+        }
+    }
+
+    /**
+     * Records the control data index the first time an entry whose name
+     * contains {@link ChmConstants#CONTROL_DATA} is seen. The index stored
+     * is the size of the entry list at the moment of the call.
+     *
+     * @param dle
+     *            chm directory listing entry to inspect
+     */
+    private void checkControlData(DirectoryListingEntry dle) {
+        // Only the first match is recorded; later entries are not inspected.
+        if (!isNotControlDataFound) {
+            return;
+        }
+        final boolean isControlData = dle.getName().contains(ChmConstants.CONTROL_DATA);
+        if (isControlData) {
+            setControlDataIndex(getDirectoryListingEntryList().size());
+            isNotControlDataFound = false;
+        }
+    }
+
+    /**
+     * Records the reset table index the first time an entry whose name
+     * contains {@link ChmConstants#RESET_TABLE} is seen. The index stored
+     * is the size of the entry list at the moment of the call.
+     *
+     * @param dle
+     *            chm directory listing entry to inspect
+     */
+    private void checkResetTable(DirectoryListingEntry dle) {
+        // Only the first match is recorded; later entries are not inspected.
+        if (!isNotResetTableFound) {
+            return;
+        }
+        final boolean isResetTable = dle.getName().contains(ChmConstants.RESET_TABLE);
+        if (isResetTable) {
+            setResetTableIndex(getDirectoryListingEntryList().size());
+            isNotResetTableFound = false;
+        }
+    }
+
+    /**
+     * Tests whether the leading bytes of <code>data</code> match the
+     * characters of <code>prefix</code>, comparing each byte with the
+     * character at the same position. Intended for ASCII markers: a
+     * character outside the byte value range can never match.
+     *
+     * @param data
+     *            bytes to inspect; must not be <code>null</code>
+     * @param prefix
+     *            marker expected at the start of <code>data</code>
+     * @return <code>true</code> if <code>data</code> begins with
+     *         <code>prefix</code>; <code>false</code> otherwise, including
+     *         when <code>data</code> is shorter than <code>prefix</code>
+     */
+    public static final boolean startsWith(byte[] data, String prefix) {
+        // A truncated chunk cannot carry the marker; report a mismatch
+        // instead of reading past the end of the array.
+        if (data.length < prefix.length()) {
+            return false;
+        }
+        for (int i = 0; i < prefix.length(); i++) {
+            if (data[i] != prefix.charAt(i)) {
+                return false;
+            }
+        }
+
+        return true;
+    }
+    /**
+     * Enumerates the chm directory listing entries held in a single chm
+     * segment and appends them to the directory listing entry list.
+     * <p>
+     * A {@code null} chunk and a chunk that starts with the PMGI marker are
+     * ignored. Any other chunk has to start with the PMGL marker. Starting at
+     * {@code ChmConstants.CHM_PMGL_LEN}, each entry is read as: encoded name
+     * length, name bytes (decoded as UTF-8), one byte that selects the entry
+     * type ({@code 0} means uncompressed), encoded offset, encoded length.
+     *
+     * @param dir_chunk
+     *            raw bytes of one directory segment; may be {@code null}
+     * @throws ChmParsingException
+     *             if the chunk carries neither marker, or if a name length
+     *             is truncated or larger than the chunk
+     */
+    private void enumerateOneSegment(byte[] dir_chunk) throws ChmParsingException {
+        if (dir_chunk == null) {
+            return;
+        }
+        if (startsWith(dir_chunk, ChmConstants.CHM_PMGI_MARKER)) {
+            return; //skip PMGI
+        }
+        if (!startsWith(dir_chunk, ChmConstants.PMGL)) {
+            throw new ChmParsingException("Bad dir entry block.");
+        }
+
+        placeHolder = ChmConstants.CHM_PMGL_LEN;
+        while (placeHolder > 0 && placeHolder < dir_chunk.length - PMGLheader.getFreeSpace()) {
+            // Entry name length: seven payload bits per byte, most
+            // significant group first; a set high bit announces another
+            // byte. Java bytes are signed, so "high bit set" must be tested
+            // as < 0 -- a byte can never be >= 0x80.
+            int strlen = 0;
+            byte temp;
+            while ((temp = dir_chunk[placeHolder++]) < 0) {
+                if (placeHolder >= dir_chunk.length) {
+                    throw new ChmParsingException("Bad data of a string length.");
+                }
+                strlen = (strlen << 7) + (temp & 0x7f);
+            }
+            // Parenthesised on purpose: '+' binds tighter than '&'.
+            strlen = (strlen << 7) + (temp & 0x7f);
+
+            // strlen < 0 catches int overflow from an over-long encoding.
+            if (strlen < 0 || strlen > dir_chunk.length) {
+                throw new ChmParsingException("Bad data of a string length.");
+            }
+
+            DirectoryListingEntry dle = new DirectoryListingEntry();
+            dle.setNameLength(strlen);
+            dle.setName(new String(ChmCommons.copyOfRange(
+                    dir_chunk, placeHolder,
+                    (placeHolder + dle.getNameLength())), UTF_8));
+
+            // Both checks run before the entry is added, so the list size
+            // they record is the index this entry is about to receive.
+            checkControlData(dle);
+            checkResetTable(dle);
+            setPlaceHolder(placeHolder + dle.getNameLength());
+
+            /* Sets entry type */
+            if (placeHolder < dir_chunk.length && dir_chunk[placeHolder] == 0) {
+                dle.setEntryType(ChmCommons.EntryType.UNCOMPRESSED);
+            } else {
+                dle.setEntryType(ChmCommons.EntryType.COMPRESSED);
+            }
+
+            setPlaceHolder(placeHolder + 1);
+            dle.setOffset(getEncint(dir_chunk));
+            dle.setLength(getEncint(dir_chunk));
+            getDirectoryListingEntryList().add(dle);
+        }
+    }
+
+
+    /**
+     * Decodes the variable-length encoded integer that starts at the current
+     * place holder and moves the place holder past it.
+     * <p>
+     * Every byte contributes its low seven bits, most significant group
+     * first; a byte whose high bit is set is followed by another byte. Only
+     * the low 32 bits of the decoded value are returned.
+     *
+     * @param data_chunk
+     *            bytes to read from
+     * @return the decoded value; {@code 0} if the place holder is already at
+     *         or past the end of {@code data_chunk}. If the data ends in the
+     *         middle of a value, the bits read so far are returned instead
+     *         of reading past the end of the array.
+     */
+    private int getEncint(byte[] data_chunk) {
+        int value = 0;
+        while (placeHolder < data_chunk.length) {
+            byte ob = data_chunk[placeHolder];
+            setPlaceHolder(placeHolder + 1);
+            // Plain int arithmetic keeps exactly the low 32 bits.
+            value = (value << 7) | (ob & 0x7f);
+            if (ob >= 0) {
+                // High bit clear: this was the last byte of the value.
+                break;
+            }
+        }
+        return value;
+    }
+
+ /**
+ * Replaces the chm directory listing entry list. The reference is stored as
+ * given; no copy is made.
+ *
+ * @param dlel
+ * chm directory listing entry list
+ */
+ public void setDirectoryListingEntryList(List<DirectoryListingEntry> dlel) {
+ this.dlel = dlel;
+ }
+
+ /**
+ * Returns the chm directory listing entry list. The internal list itself
+ * is returned, not a copy.
+ *
+ * @return the list of {@code DirectoryListingEntry} objects held by this
+ * object
+ */
+ public List<DirectoryListingEntry> getDirectoryListingEntryList() {
+ return dlel;
+ }
+
+ /**
+ * Sets the byte array held in the {@code data} field.
+ *
+ * @param data
+ * the new value; {@code null} is accepted (this class passes
+ * {@code null} from a {@code finally} block)
+ */
+ private void setData(byte[] data) {
+ this.data = data;
+ }
+
+ /**
+ * Returns the byte array held in the {@code data} field.
+ *
+ * @return the stored array itself (not a copy); may be {@code null}
+ */
+ private byte[] getData() {
+ return data;
+ }
+
+ /**
+ * Sets data offset
+ *
+ * @param dataOffset
+ * the value to store in the {@code dataOffset} field
+ */
+ private void setDataOffset(long dataOffset) {
+ this.dataOffset = dataOffset;
+ }
+
+ /**
+ * Returns data offset
+ *
+ * @return the value of the {@code dataOffset} field
+ */
+ public long getDataOffset() {
+ return dataOffset;
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java
index a231e14..2c4dc4e 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java
@@ -1,492 +1,492 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.accessor;
-
-import java.math.BigInteger;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.assertion.ChmAssert;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.apache.tika.parser.chm.exception.ChmParsingException;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-/**
- * The Header 0000: char[4] 'ITSF' 0004: DWORD 3 (Version number) 0008: DWORD
- * Total header length, including header section table and following data. 000C:
- * DWORD 1 (unknown) 0010: DWORD a timestamp 0014: DWORD Windows Language ID
- * 0018: GUID {7C01FD10-7BAA-11D0-9E0C-00A0-C922-E6EC} 0028: GUID
- * {7C01FD11-7BAA-11D0-9E0C-00A0-C922-E6EC} Note: a GUID is $10 bytes, arranged
- * as 1 DWORD, 2 WORDs, and 8 BYTEs. 0000: QWORD Offset of section from
- * beginning of file 0008: QWORD Length of section Following the header section
- * table is 8 bytes of additional header data. In Version 2 files, this data is
- * not there and the content section starts immediately after the directory.
- *
- * {@link http
- * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original
- * /?show-translation-form=1}
- *
- */
-/* structure of ITSF headers */
-public class ChmItsfHeader implements ChmAccessor<ChmItsfHeader> {
- private static final long serialVersionUID = 2215291838533213826L;
- private byte[] signature;
- private int version; /* 4 */
- private int header_len; /* 8 */
- private int unknown_000c; /* c */
- private long last_modified; /* 10 */
- private long lang_id; /* 14 */
- private byte[] dir_uuid = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 18 */
- private byte[] stream_uuid = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 28 */
- private long unknown_offset; /* 38 */
- private long unknown_len; /* 40 */
- private long dir_offset; /* 48 */
- private long dir_len; /* 50 */
- private long data_offset; /* 58 (Not present before V3) */
-
- /* local usage */
- private int dataRemained;
- private int currentPlace = 0;
-
- public ChmItsfHeader() {
- signature = ChmConstants.ITSF.getBytes(UTF_8); /* 0 (ITSF) */
- }
-
- /**
- * Prints the values of ChmfHeader
- */
- public String toString() {
- StringBuilder sb = new StringBuilder();
- sb.append(new String(getSignature(), UTF_8) + " ");
- sb.append(getVersion() + " ");
- sb.append(getHeaderLen() + " ");
- sb.append(getUnknown_000c() + " ");
- sb.append(getLastModified() + " ");
- sb.append(getLangId() + " ");
- sb.append(getDir_uuid() + " ");
- sb.append(getStream_uuid() + " ");
- sb.append(getUnknownOffset() + " ");
- sb.append(getUnknownLen() + " ");
- sb.append(getDirOffset() + " ");
- sb.append(getDirLen() + " ");
- sb.append(getDataOffset() + " ");
- return sb.toString();
- }
-
- /**
- * Returns a signature of itsf header
- *
- * @return itsf header
- */
- public byte[] getSignature() {
- return signature;
- }
-
- /**
- * Sets itsf header signature
- *
- * @param signature
- */
- protected void setSignature(byte[] signature) {
- this.signature = signature;
- }
-
- /**
- * Returns itsf header version
- *
- * @return itsf version
- */
- public int getVersion() {
- return version;
- }
-
- /**
- * Sets itsf version
- *
- * @param version
- */
- protected void setVersion(int version) {
- this.version = version;
- }
-
- /**
- * Returns itsf header length
- *
- * @return length
- */
- public int getHeaderLen() {
- return header_len;
- }
-
- /**
- * Sets itsf header length
- *
- * @param header_len
- */
- protected void setHeaderLen(int header_len) {
- this.header_len = header_len;
- }
-
- /**
- * Returns unknown_00c value
- *
- * @return unknown_00c
- */
- public int getUnknown_000c() {
- return unknown_000c;
- }
-
- /**
- * Sets unknown_00c
- *
- * @param unknown_000c
- */
- protected void setUnknown_000c(int unknown_000c) {
- this.unknown_000c = unknown_000c;
- }
-
- /**
- * Returns last modified date of the chm file
- *
- * @return last modified date as long
- */
- public long getLastModified() {
- return last_modified;
- }
-
- /**
- * Sets last modified date of the chm file
- *
- * @param last_modified
- */
- protected void setLastModified(long last_modified) {
- this.last_modified = last_modified;
- }
-
- /**
- * Returns language ID
- *
- * @return language_id
- */
- public long getLangId() {
- return lang_id;
- }
-
- /**
- * Sets language_id
- *
- * @param lang_id
- */
- protected void setLangId(long lang_id) {
- this.lang_id = lang_id;
- }
-
- /**
- * Returns directory uuid
- *
- * @return dir_uuid
- */
- public byte[] getDir_uuid() {
- return dir_uuid;
- }
-
- /**
- * Sets directory uuid
- *
- * @param dir_uuid
- */
- protected void setDir_uuid(byte[] dir_uuid) {
- this.dir_uuid = dir_uuid;
- }
-
- /**
- * Returns stream uuid
- *
- * @return stream_uuid
- */
- public byte[] getStream_uuid() {
- return stream_uuid;
- }
-
- /**
- * Sets stream uuid
- *
- * @param stream_uuid
- */
- protected void setStream_uuid(byte[] stream_uuid) {
- this.stream_uuid = stream_uuid;
- }
-
- /**
- * Returns unknown offset
- *
- * @return unknown_offset
- */
- public long getUnknownOffset() {
- return unknown_offset;
- }
-
- /**
- * Sets unknown offset
- *
- * @param unknown_offset
- */
- protected void setUnknownOffset(long unknown_offset) {
- this.unknown_offset = unknown_offset;
- }
-
- /**
- * Returns unknown length
- *
- * @return unknown_length
- */
- public long getUnknownLen() {
- return unknown_len;
- }
-
- /**
- * Sets unknown length
- *
- * @param unknown_len
- */
- protected void setUnknownLen(long unknown_len) {
- this.unknown_len = unknown_len;
- }
-
- /**
- * Returns directory offset
- *
- * @return directory_offset
- */
- public long getDirOffset() {
- return dir_offset;
- }
-
- /**
- * Sets directory offset
- *
- * @param dir_offset
- */
- protected void setDirOffset(long dir_offset) {
- this.dir_offset = dir_offset;
- }
-
- /**
- * Returns directory length
- *
- * @return directory_offset
- */
- public long getDirLen() {
- return dir_len;
- }
-
- /**
- * Sets directory length
- *
- * @param dir_len
- */
- protected void setDirLen(long dir_len) {
- this.dir_len = dir_len;
- }
-
- /**
- * Returns data offset
- *
- * @return data_offset
- */
- public long getDataOffset() {
- return data_offset;
- }
-
- /**
- * Sets data offset
- *
- * @param data_offset
- */
- protected void setDataOffset(long data_offset) {
- this.data_offset = data_offset;
- }
-
- /**
- * Copies 4 first bytes of the byte[]
- *
- * @param data
- * @param chmItsfHeader
- * @param count
- * @throws TikaException
- */
- private void unmarshalCharArray(byte[] data, ChmItsfHeader chmItsfHeader,
- int count) throws TikaException {
- ChmAssert.assertChmAccessorParameters(data, chmItsfHeader, count);
- System.arraycopy(data, 0, chmItsfHeader.signature, 0, count);
- this.setCurrentPlace(this.getCurrentPlace() + count);
- this.setDataRemained(this.getDataRemained() - count);
- }
-
- /**
- * Copies X bytes of source byte[] to the dest byte[]
- *
- * @param data
- * @param dest
- * @param count
- * @return
- */
- private byte[] unmarshalUuid(byte[] data, byte[] dest, int count) {
- System.arraycopy(data, this.getCurrentPlace(), dest, 0, count);
- this.setCurrentPlace(this.getCurrentPlace() + count);
- this.setDataRemained(this.getDataRemained() - count);
- return dest;
- }
-
- /**
- * Takes 8 bytes and reverses them
- *
- * @param data
- * @param dest
- * @return
- * @throws TikaException
- */
- private long unmarshalUint64(byte[] data, long dest) throws TikaException{
- byte[] temp = new byte[8];
- int i, j;
-
- if (8 > this.getDataRemained())
- throw new TikaException("8 > this.getDataRemained()");
-
- for (i = 8, j = 7; i > 0; i--) {
- temp[j--] = data[this.getCurrentPlace()];
- this.setCurrentPlace(this.getCurrentPlace() + 1);
- }
-
- dest = new BigInteger(temp).longValue();
- this.setDataRemained(this.getDataRemained() - 8);
- return dest;
- }
-
- private int unmarshalInt32(byte[] data, int dest) throws TikaException{
- ChmAssert.assertByteArrayNotNull(data);
-
- if (4 > this.getDataRemained())
- throw new TikaException("4 > dataLenght");
- dest = (data[this.getCurrentPlace()] & 0xff)
- | (data[this.getCurrentPlace() + 1] & 0xff) << 8
- | (data[this.getCurrentPlace() + 2] & 0xff) << 16
- | (data[this.getCurrentPlace() + 3] & 0xff) << 24;
-
- this.setCurrentPlace(this.getCurrentPlace() + 4);
- this.setDataRemained(this.getDataRemained() - 4);
- return dest;
- }
-
- private long unmarshalUInt32(byte[] data, long dest) throws TikaException{
- ChmAssert.assertByteArrayNotNull(data);
- if (4 > getDataRemained())
- throw new TikaException("4 > dataLenght");
- dest = data[this.getCurrentPlace()]
- | data[this.getCurrentPlace() + 1] << 8
- | data[this.getCurrentPlace() + 2] << 16
- | data[this.getCurrentPlace() + 3] << 24;
-
- setDataRemained(this.getDataRemained() - 4);
- this.setCurrentPlace(this.getCurrentPlace() + 4);
- return dest;
- }
-
- public static void main(String[] args) {
- }
-
- /**
- * Sets data remained to be processed
- *
- * @param dataRemained
- */
- private void setDataRemained(int dataRemained) {
- this.dataRemained = dataRemained;
- }
-
- /**
- * Returns data remained
- *
- * @return data_remainned
- */
- private int getDataRemained() {
- return dataRemained;
- }
-
- /**
- * Sets current place in the byte[]
- *
- * @param currentPlace
- */
- private void setCurrentPlace(int currentPlace) {
- this.currentPlace = currentPlace;
- }
-
- /**
- * Returns current place in the byte[]
- *
- * @return current place
- */
- private int getCurrentPlace() {
- return currentPlace;
- }
-
- // @Override
- public void parse(byte[] data, ChmItsfHeader chmItsfHeader) throws TikaException {
- if (data.length < ChmConstants.CHM_ITSF_V2_LEN
- || data.length > ChmConstants.CHM_ITSF_V3_LEN)
- throw new TikaException("we only know how to deal with the 0x58 and 0x60 byte structures");
-
- chmItsfHeader.setDataRemained(data.length);
- chmItsfHeader.unmarshalCharArray(data, chmItsfHeader, ChmConstants.CHM_SIGNATURE_LEN);
- chmItsfHeader.setVersion(chmItsfHeader.unmarshalInt32(data, chmItsfHeader.getVersion()));
- chmItsfHeader.setHeaderLen(chmItsfHeader.unmarshalInt32(data, chmItsfHeader.getHeaderLen()));
- chmItsfHeader.setUnknown_000c(chmItsfHeader.unmarshalInt32(data, chmItsfHeader.getUnknown_000c()));
- chmItsfHeader.setLastModified(chmItsfHeader.unmarshalUInt32(data, chmItsfHeader.getLastModified()));
- chmItsfHeader.setLangId(chmItsfHeader.unmarshalUInt32(data, chmItsfHeader.getLangId()));
- chmItsfHeader.setDir_uuid(chmItsfHeader.unmarshalUuid(data, chmItsfHeader.getDir_uuid(), 16));
- chmItsfHeader.setStream_uuid(chmItsfHeader.unmarshalUuid(data, chmItsfHeader.getStream_uuid(), 16));
- chmItsfHeader.setUnknownOffset(chmItsfHeader.unmarshalUint64(data, chmItsfHeader.getUnknownOffset()));
- chmItsfHeader.setUnknownLen(chmItsfHeader.unmarshalUint64(data, chmItsfHeader.getUnknownLen()));
- chmItsfHeader.setDirOffset(chmItsfHeader.unmarshalUint64(data, chmItsfHeader.getDirOffset()));
- chmItsfHeader.setDirLen(chmItsfHeader.unmarshalUint64(data, chmItsfHeader.getDirLen()));
- if (!new String(chmItsfHeader.getSignature(), UTF_8).equals(ChmConstants.ITSF))
- throw new TikaException("seems not valid file");
- if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_2) {
- if (chmItsfHeader.getHeaderLen() < ChmConstants.CHM_ITSF_V2_LEN)
- throw new TikaException("something wrong with header");
- } else if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_3) {
- if (chmItsfHeader.getHeaderLen() < ChmConstants.CHM_ITSF_V3_LEN)
- throw new TikaException("unknown v3 header lenght");
- } else
- throw new ChmParsingException("unsupported chm format");
-
- /*
- * now, if we have a V3 structure, unmarshal the rest, otherwise,
- * compute it
- */
- if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_3) {
- if (chmItsfHeader.getDataRemained() >= 0)
- chmItsfHeader.setDataOffset(chmItsfHeader.getDirOffset()
- + chmItsfHeader.getDirLen());
- else
- throw new TikaException("cannot set data offset, no data remained");
- } else
- chmItsfHeader.setDataOffset(chmItsfHeader.getDirOffset()
- + chmItsfHeader.getDirLen());
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.math.BigInteger;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * The Header 0000: char[4] 'ITSF' 0004: DWORD 3 (Version number) 0008: DWORD
+ * Total header length, including header section table and following data. 000C:
+ * DWORD 1 (unknown) 0010: DWORD a timestamp 0014: DWORD Windows Language ID
+ * 0018: GUID {7C01FD10-7BAA-11D0-9E0C-00A0-C922-E6EC} 0028: GUID
+ * {7C01FD11-7BAA-11D0-9E0C-00A0-C922-E6EC} Note: a GUID is $10 bytes, arranged
+ * as 1 DWORD, 2 WORDs, and 8 BYTEs. 0000: QWORD Offset of section from
+ * beginning of file 0008: QWORD Length of section Following the header section
+ * table is 8 bytes of additional header data. In Version 2 files, this data is
+ * not there and the content section starts immediately after the directory.
+ *
+ * {@link http
+ * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original
+ * /?show-translation-form=1}
+ *
+ */
+/* structure of ITSF headers */
+public class ChmItsfHeader implements ChmAccessor<ChmItsfHeader> {
+ private static final long serialVersionUID = 2215291838533213826L;
+ private byte[] signature;
+ private int version; /* 4 */
+ private int header_len; /* 8 */
+ private int unknown_000c; /* c */
+ private long last_modified; /* 10 */
+ private long lang_id; /* 14 */
+ private byte[] dir_uuid = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 18 */
+ private byte[] stream_uuid = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 28 */
+ private long unknown_offset; /* 38 */
+ private long unknown_len; /* 40 */
+ private long dir_offset; /* 48 */
+ private long dir_len; /* 50 */
+ private long data_offset; /* 58 (Not present before V3) */
+
+ /* local usage */
+ private int dataRemained;
+ private int currentPlace = 0;
+
+ public ChmItsfHeader() {
+ signature = ChmConstants.ITSF.getBytes(UTF_8); /* 0 (ITSF) */
+ }
+
+    /**
+     * Returns the header fields as one line of values, each followed by a
+     * single space, in this order: signature, version, header length,
+     * unknown_000c, last modified, language id, directory uuid, stream uuid,
+     * unknown offset, unknown length, directory offset, directory length,
+     * data offset.
+     * <p>
+     * The two uuid arrays are rendered element by element; concatenating a
+     * {@code byte[]} with a string would only print its identity hash.
+     *
+     * @return the formatted field values
+     */
+    @Override
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append(new String(getSignature(), UTF_8)).append(' ');
+        sb.append(getVersion()).append(' ');
+        sb.append(getHeaderLen()).append(' ');
+        sb.append(getUnknown_000c()).append(' ');
+        sb.append(getLastModified()).append(' ');
+        sb.append(getLangId()).append(' ');
+        // Fully qualified so that no new import is needed in this file.
+        sb.append(java.util.Arrays.toString(getDir_uuid())).append(' ');
+        sb.append(java.util.Arrays.toString(getStream_uuid())).append(' ');
+        sb.append(getUnknownOffset()).append(' ');
+        sb.append(getUnknownLen()).append(' ');
+        sb.append(getDirOffset()).append(' ');
+        sb.append(getDirLen()).append(' ');
+        sb.append(getDataOffset()).append(' ');
+        return sb.toString();
+    }
+
+ /**
+ * Returns a signature of itsf header
+ *
+ * @return itsf header
+ */
+ public byte[] getSignature() {
+ return signature;
+ }
+
+ /**
+ * Sets itsf header signature
+ *
+ * @param signature
+ */
+ protected void setSignature(byte[] signature) {
+ this.signature = signature;
+ }
+
+ /**
+ * Returns itsf header version
+ *
+ * @return itsf version
+ */
+ public int getVersion() {
+ return version;
+ }
+
+ /**
+ * Sets itsf version
+ *
+ * @param version
+ */
+ protected void setVersion(int version) {
+ this.version = version;
+ }
+
+ /**
+ * Returns itsf header length
+ *
+ * @return length
+ */
+ public int getHeaderLen() {
+ return header_len;
+ }
+
+ /**
+ * Sets itsf header length
+ *
+ * @param header_len
+ */
+ protected void setHeaderLen(int header_len) {
+ this.header_len = header_len;
+ }
+
+ /**
+ * Returns unknown_00c value
+ *
+ * @return unknown_00c
+ */
+ public int getUnknown_000c() {
+ return unknown_000c;
+ }
+
+ /**
+ * Sets unknown_00c
+ *
+ * @param unknown_000c
+ */
+ protected void setUnknown_000c(int unknown_000c) {
+ this.unknown_000c = unknown_000c;
+ }
+
+ /**
+ * Returns last modified date of the chm file
+ *
+ * @return last modified date as long
+ */
+ public long getLastModified() {
+ return last_modified;
+ }
+
+ /**
+ * Sets last modified date of the chm file
+ *
+ * @param last_modified
+ */
+ protected void setLastModified(long last_modified) {
+ this.last_modified = last_modified;
+ }
+
+ /**
+ * Returns language ID
+ *
+ * @return language_id
+ */
+ public long getLangId() {
+ return lang_id;
+ }
+
+ /**
+ * Sets language_id
+ *
+ * @param lang_id
+ */
+ protected void setLangId(long lang_id) {
+ this.lang_id = lang_id;
+ }
+
+ /**
+ * Returns directory uuid
+ *
+ * @return dir_uuid
+ */
+ public byte[] getDir_uuid() {
+ return dir_uuid;
+ }
+
+ /**
+ * Sets directory uuid
+ *
+ * @param dir_uuid
+ */
+ protected void setDir_uuid(byte[] dir_uuid) {
+ this.dir_uuid = dir_uuid;
+ }
+
+ /**
+ * Returns stream uuid
+ *
+ * @return stream_uuid
+ */
+ public byte[] getStream_uuid() {
+ return stream_uuid;
+ }
+
+ /**
+ * Sets stream uuid
+ *
+ * @param stream_uuid
+ */
+ protected void setStream_uuid(byte[] stream_uuid) {
+ this.stream_uuid = stream_uuid;
+ }
+
+ /**
+ * Returns unknown offset
+ *
+ * @return unknown_offset
+ */
+ public long getUnknownOffset() {
+ return unknown_offset;
+ }
+
+ /**
+ * Sets unknown offset
+ *
+ * @param unknown_offset
+ */
+ protected void setUnknownOffset(long unknown_offset) {
+ this.unknown_offset = unknown_offset;
+ }
+
+ /**
+ * Returns unknown length
+ *
+ * @return unknown_length
+ */
+ public long getUnknownLen() {
+ return unknown_len;
+ }
+
+ /**
+ * Sets unknown length
+ *
+ * @param unknown_len
+ */
+ protected void setUnknownLen(long unknown_len) {
+ this.unknown_len = unknown_len;
+ }
+
+ /**
+ * Returns directory offset
+ *
+ * @return directory_offset
+ */
+ public long getDirOffset() {
+ return dir_offset;
+ }
+
+ /**
+ * Sets directory offset
+ *
+ * @param dir_offset
+ */
+ protected void setDirOffset(long dir_offset) {
+ this.dir_offset = dir_offset;
+ }
+
+ /**
+ * Returns directory length
+ *
+ * @return directory length (the {@code dir_len} field)
+ */
+ public long getDirLen() {
+ return dir_len;
+ }
+
+ /**
+ * Sets directory length
+ *
+ * @param dir_len
+ */
+ protected void setDirLen(long dir_len) {
+ this.dir_len = dir_len;
+ }
+
+ /**
+ * Returns data offset
+ *
+ * @return data_offset
+ */
+ public long getDataOffset() {
+ return data_offset;
+ }
+
+ /**
+ * Sets data offset
+ *
+ * @param data_offset
+ */
+ protected void setDataOffset(long data_offset) {
+ this.data_offset = data_offset;
+ }
+
+ /**
+ * Copies 4 first bytes of the byte[]
+ *
+ * @param data
+ * @param chmItsfHeader
+ * @param count
+ * @throws TikaException
+ */
+ private void unmarshalCharArray(byte[] data, ChmItsfHeader chmItsfHeader,
+ int count) throws TikaException {
+ ChmAssert.assertChmAccessorParameters(data, chmItsfHeader, count);
+ System.arraycopy(data, 0, chmItsfHeader.signature, 0, count);
+ this.setCurrentPlace(this.getCurrentPlace() + count);
+ this.setDataRemained(this.getDataRemained() - count);
+ }
+
+ /**
+ * Copies X bytes of source byte[] to the dest byte[]
+ *
+ * @param data
+ * @param dest
+ * @param count
+ * @return
+ */
+ private byte[] unmarshalUuid(byte[] data, byte[] dest, int count) {
+ System.arraycopy(data, this.getCurrentPlace(), dest, 0, count);
+ this.setCurrentPlace(this.getCurrentPlace() + count);
+ this.setDataRemained(this.getDataRemained() - count);
+ return dest;
+ }
+
+    /**
+     * Reads the next eight bytes at the current place, reverses their order
+     * and returns them as a {@code long}. The current place moves forward by
+     * eight and the remaining-data counter drops by eight.
+     *
+     * @param data
+     *            source bytes
+     * @param dest
+     *            ignored on entry; only used to hold the result
+     * @return the value built from the eight reversed bytes
+     * @throws TikaException
+     *             if fewer than eight bytes remain
+     */
+    private long unmarshalUint64(byte[] data, long dest) throws TikaException{
+        if (this.getDataRemained() < 8) {
+            throw new TikaException("8 > this.getDataRemained()");
+        }
+
+        // Fill back to front so that the first byte read ends up last.
+        final byte[] reversed = new byte[8];
+        for (int slot = reversed.length - 1; slot >= 0; slot--) {
+            reversed[slot] = data[this.getCurrentPlace()];
+            this.setCurrentPlace(this.getCurrentPlace() + 1);
+        }
+
+        dest = new BigInteger(reversed).longValue();
+        this.setDataRemained(this.getDataRemained() - 8);
+        return dest;
+    }
+
+ /**
+ * Reads four bytes at the current place as a little-endian {@code int}
+ * (the first byte supplies the lowest eight bits), then moves the current
+ * place forward by four and lowers the remaining-data counter by four.
+ *
+ * @param data
+ * source bytes; checked by {@code ChmAssert.assertByteArrayNotNull}
+ * @param dest
+ * ignored on entry; only used to hold the result
+ * @return the assembled value
+ * @throws TikaException
+ * if fewer than four bytes remain
+ */
+ private int unmarshalInt32(byte[] data, int dest) throws TikaException{
+ ChmAssert.assertByteArrayNotNull(data);
+
+ if (4 > this.getDataRemained())
+ throw new TikaException("4 > dataLenght");
+ // Each byte is masked with 0xff so that values >= 0x80 are not sign-extended.
+ dest = (data[this.getCurrentPlace()] & 0xff)
+ | (data[this.getCurrentPlace() + 1] & 0xff) << 8
+ | (data[this.getCurrentPlace() + 2] & 0xff) << 16
+ | (data[this.getCurrentPlace() + 3] & 0xff) << 24;
+
+ this.setCurrentPlace(this.getCurrentPlace() + 4);
+ this.setDataRemained(this.getDataRemained() - 4);
+ return dest;
+ }
+
+    /**
+     * Reads four bytes at the current place as an unsigned little-endian
+     * 32-bit value (the first byte supplies the lowest eight bits), then
+     * moves the current place forward by four and lowers the remaining-data
+     * counter by four.
+     *
+     * @param data
+     *            source bytes; checked by
+     *            {@code ChmAssert.assertByteArrayNotNull}
+     * @param dest
+     *            ignored on entry; only used to hold the result
+     * @return the assembled value, always in the range 0 to 2^32 - 1
+     * @throws TikaException
+     *             if fewer than four bytes remain
+     */
+    private long unmarshalUInt32(byte[] data, long dest) throws TikaException{
+        ChmAssert.assertByteArrayNotNull(data);
+        if (4 > getDataRemained())
+            throw new TikaException("4 > dataLenght");
+        int offset = this.getCurrentPlace();
+        // Mask every byte with a long 0xff: an unmasked byte >= 0x80 is
+        // sign-extended and would flood all higher bits of the result.
+        dest = (data[offset] & 0xffL)
+                | (data[offset + 1] & 0xffL) << 8
+                | (data[offset + 2] & 0xffL) << 16
+                | (data[offset + 3] & 0xffL) << 24;
+
+        setDataRemained(this.getDataRemained() - 4);
+        this.setCurrentPlace(offset + 4);
+        return dest;
+    }
+
+ public static void main(String[] args) {
+ }
+
+ /**
+ * Sets data remained to be processed
+ *
+ * @param dataRemained
+ */
+ private void setDataRemained(int dataRemained) {
+ this.dataRemained = dataRemained;
+ }
+
+ /**
+ * Returns data remained
+ *
+ * @return data_remainned
+ */
+ private int getDataRemained() {
+ return dataRemained;
+ }
+
+ /**
+ * Sets current place in the byte[]
+ *
+ * @param currentPlace
+ */
+ private void setCurrentPlace(int currentPlace) {
+ this.currentPlace = currentPlace;
+ }
+
+ /**
+ * Returns current place in the byte[]
+ *
+ * @return current place
+ */
+ private int getCurrentPlace() {
+ return currentPlace;
+ }
+
+ /**
+ * Fills {@code chmItsfHeader} from the raw header bytes.
+ * <p>
+ * The fields are read in this order: signature, version, header length,
+ * unknown_000c, last modified, language id, directory uuid, stream uuid,
+ * unknown offset, unknown length, directory offset, directory length.
+ * Afterwards the signature, the version and the header length are
+ * validated, and the data offset is set to directory offset plus
+ * directory length. All reads and writes go through
+ * {@code chmItsfHeader}, not through {@code this}.
+ *
+ * @param data
+ * raw header bytes; the length must lie between
+ * {@code ChmConstants.CHM_ITSF_V2_LEN} and
+ * {@code ChmConstants.CHM_ITSF_V3_LEN} inclusive
+ * @param chmItsfHeader
+ * the header object to populate
+ * @throws TikaException
+ * if the length is out of range, the signature is not
+ * {@code ChmConstants.ITSF}, the header length is too small for
+ * the version, or the version is neither
+ * {@code ChmConstants.CHM_VER_2} nor
+ * {@code ChmConstants.CHM_VER_3}
+ */
+ // @Override
+ public void parse(byte[] data, ChmItsfHeader chmItsfHeader) throws TikaException {
+ if (data.length < ChmConstants.CHM_ITSF_V2_LEN
+ || data.length > ChmConstants.CHM_ITSF_V3_LEN)
+ throw new TikaException("we only know how to deal with the 0x58 and 0x60 byte structures");
+
+ chmItsfHeader.setDataRemained(data.length);
+ chmItsfHeader.unmarshalCharArray(data, chmItsfHeader, ChmConstants.CHM_SIGNATURE_LEN);
+ chmItsfHeader.setVersion(chmItsfHeader.unmarshalInt32(data, chmItsfHeader.getVersion()));
+ chmItsfHeader.setHeaderLen(chmItsfHeader.unmarshalInt32(data, chmItsfHeader.getHeaderLen()));
+ chmItsfHeader.setUnknown_000c(chmItsfHeader.unmarshalInt32(data, chmItsfHeader.getUnknown_000c()));
+ chmItsfHeader.setLastModified(chmItsfHeader.unmarshalUInt32(data, chmItsfHeader.getLastModified()));
+ chmItsfHeader.setLangId(chmItsfHeader.unmarshalUInt32(data, chmItsfHeader.getLangId()));
+ chmItsfHeader.setDir_uuid(chmItsfHeader.unmarshalUuid(data, chmItsfHeader.getDir_uuid(), 16));
+ chmItsfHeader.setStream_uuid(chmItsfHeader.unmarshalUuid(data, chmItsfHeader.getStream_uuid(), 16));
+ chmItsfHeader.setUnknownOffset(chmItsfHeader.unmarshalUint64(data, chmItsfHeader.getUnknownOffset()));
+ chmItsfHeader.setUnknownLen(chmItsfHeader.unmarshalUint64(data, chmItsfHeader.getUnknownLen()));
+ chmItsfHeader.setDirOffset(chmItsfHeader.unmarshalUint64(data, chmItsfHeader.getDirOffset()));
+ chmItsfHeader.setDirLen(chmItsfHeader.unmarshalUint64(data, chmItsfHeader.getDirLen()));
+ // The signature is only compared after all fields have been read.
+ if (!new String(chmItsfHeader.getSignature(), UTF_8).equals(ChmConstants.ITSF))
+ throw new TikaException("seems not valid file");
+ if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_2) {
+ if (chmItsfHeader.getHeaderLen() < ChmConstants.CHM_ITSF_V2_LEN)
+ throw new TikaException("something wrong with header");
+ } else if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_3) {
+ if (chmItsfHeader.getHeaderLen() < ChmConstants.CHM_ITSF_V3_LEN)
+ throw new TikaException("unknown v3 header lenght");
+ } else
+ throw new ChmParsingException("unsupported chm format");
+
+ /*
+ * now, if we have a V3 structure, unmarshal the rest, otherwise,
+ * compute it
+ */
+ // NOTE(review): despite the comment above, nothing further is unmarshalled
+ // for V3 -- both branches below store dirOffset + dirLen. Confirm whether
+ // the V3 data offset should instead be read from the remaining bytes.
+ if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_3) {
+ // NOTE(review): this accepts zero remaining bytes; verify that >= 0 is
+ // the intended threshold.
+ if (chmItsfHeader.getDataRemained() >= 0)
+ chmItsfHeader.setDataOffset(chmItsfHeader.getDirOffset()
+ + chmItsfHeader.getDirLen());
+ else
+ throw new TikaException("cannot set data offset, no data remained");
+ } else
+ chmItsfHeader.setDataOffset(chmItsfHeader.getDirOffset()
+ + chmItsfHeader.getDirLen());
+ }
+}
[38/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/test/java/org/apache/tika/utils/ConcurrentUtilsTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/utils/ConcurrentUtilsTest.java b/tika-core/src/test/java/org/apache/tika/utils/ConcurrentUtilsTest.java
index ea0d195..24deb86 100644
--- a/tika-core/src/test/java/org/apache/tika/utils/ConcurrentUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/utils/ConcurrentUtilsTest.java
@@ -1,63 +1,63 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.utils;
-
-import static org.junit.Assert.*;
-
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Future;
-
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.parser.ParseContext;
-import org.junit.Before;
-import org.junit.Test;
-
-public class ConcurrentUtilsTest {
-
- @Test
- public void testExecuteThread() throws Exception {
- ParseContext context = new ParseContext();
- Future result = ConcurrentUtils.execute(context, new Runnable() {
-
- @Override
- public void run() {
- //Do nothing
-
- }
- });
-
- assertNull(result.get());
- }
-
- @Test
- public void testExecuteExecutor() throws Exception {
- TikaConfig config = TikaConfig.getDefaultConfig();
- ParseContext context = new ParseContext();
- context.set(ExecutorService.class, config.getExecutorService());
- Future result = ConcurrentUtils.execute(context, new Runnable() {
-
- @Override
- public void run() {
- //Do nothing
-
- }
- });
-
- assertNull(result.get());
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.utils;
+
+import static org.junit.Assert.*;
+
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.parser.ParseContext;
+import org.junit.Before;
+import org.junit.Test;
+
+public class ConcurrentUtilsTest {
+
+ @Test
+ public void testExecuteThread() throws Exception {
+ ParseContext context = new ParseContext();
+ Future result = ConcurrentUtils.execute(context, new Runnable() {
+
+ @Override
+ public void run() {
+ //Do nothing
+
+ }
+ });
+
+ assertNull(result.get());
+ }
+
+ @Test
+ public void testExecuteExecutor() throws Exception {
+ TikaConfig config = TikaConfig.getDefaultConfig();
+ ParseContext context = new ParseContext();
+ context.set(ExecutorService.class, config.getExecutorService());
+ Future result = ConcurrentUtils.execute(context, new Runnable() {
+
+ @Override
+ public void run() {
+ //Do nothing
+
+ }
+ });
+
+ assertNull(result.get());
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/test/resources/META-INF/services/org.apache.tika.parser.Parser
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-core/src/test/resources/META-INF/services/org.apache.tika.parser.Parser
index a389f33..9f69aed 100644
--- a/tika-core/src/test/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-core/src/test/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -1,17 +1,17 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-org.apache.tika.parser.external.CompositeExternalParser
-org.apache.tika.parser.mock.MockParser
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+org.apache.tika.parser.external.CompositeExternalParser
+org.apache.tika.parser.mock.MockParser
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/test/resources/org/apache/tika/config/TIKA-1762-executors.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-1762-executors.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1762-executors.xml
index 2db72d1..15551f3 100644
--- a/tika-core/src/test/resources/org/apache/tika/config/TIKA-1762-executors.xml
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1762-executors.xml
@@ -1,28 +1,28 @@
-<?xml version="1.0" encoding="UTF-8"?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <service-loader dynamic="true"/>
- <parsers>
- <parser class="org.apache.tika.config.DummyParser"/>
- </parsers>
- <executor-service class="org.apache.tika.config.DummyExecutor">
- <core-threads>3</core-threads>
- <max-threads>10</max-threads>
- </executor-service>
-</properties>
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <service-loader dynamic="true"/>
+ <parsers>
+ <parser class="org.apache.tika.config.DummyParser"/>
+ </parsers>
+ <executor-service class="org.apache.tika.config.DummyExecutor">
+ <core-threads>3</core-threads>
+ <max-threads>10</max-threads>
+ </executor-service>
+</properties>
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/pom.xml b/tika-parser-bundles/pom.xml
index 4da0c98..bcaf4d1 100644
--- a/tika-parser-bundles/pom.xml
+++ b/tika-parser-bundles/pom.xml
@@ -1,176 +1,176 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parent</artifactId>
- <version>2.0-SNAPSHOT</version>
- <relativePath>../tika-parent/pom.xml</relativePath>
- </parent>
-
- <artifactId>tika-parser-bundles</artifactId>
- <packaging>pom</packaging>
- <name>Apache Tika Parser Bundles</name>
- <url>http://tika.apache.org/</url>
-
- <properties>
- <poi.version>3.13</poi.version>
- <!-- NOTE: sync codec version with POI -->
- <codec.version>1.9</codec.version>
- <pdfbox.version>1.8.10</pdfbox.version>
- </properties>
-
- <modules>
- <module>tika-parser-advanced-bundle</module>
- <module>tika-parser-cad-bundle</module>
- <module>tika-parser-code-bundle</module>
- <module>tika-parser-crypto-bundle</module>
- <module>tika-parser-database-bundle</module>
- <module>tika-parser-ebook-bundle</module>
- <module>tika-parser-journal-bundle</module>
- <module>tika-parser-multimedia-bundle</module>
- <module>tika-parser-office-bundle</module>
- <module>tika-parser-package-bundle</module>
- <module>tika-parser-pdf-bundle</module>
- <module>tika-parser-scientific-bundle</module>
- <module>tika-parser-text-bundle</module>
- <module>tika-parser-web-bundle</module>
- </modules>
-
- <dependencies>
- <!-- Optional OSGi dependencies, used only when running within OSGi -->
- <dependency>
- <groupId>org.osgi</groupId>
- <artifactId>org.osgi.core</artifactId>
- <scope>provided</scope>
- <optional>true</optional>
- </dependency>
- <dependency>
- <groupId>org.osgi</groupId>
- <artifactId>org.osgi.compendium</artifactId>
- <scope>provided</scope>
- <optional>true</optional>
- </dependency>
- <!-- Test dependencies -->
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.mockito</groupId>
- <artifactId>mockito-core</artifactId>
- <version>1.7</version>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.ops4j.pax.exam</groupId>
- <artifactId>pax-exam-junit4</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.ops4j.pax.exam</groupId>
- <artifactId>pax-exam-container-native</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.felix</groupId>
- <artifactId>org.apache.felix.framework</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.ops4j.pax.exam</groupId>
- <artifactId>pax-exam-link-assembly</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.ops4j.pax.url</groupId>
- <artifactId>pax-url-aether</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-simple</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>javax.inject</groupId>
- <artifactId>javax.inject</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-test-resources</artifactId>
- <version>${project.version}</version>
- <type>test-jar</type>
- <scope>test</scope>
- </dependency>
- </dependencies>
- <build>
- <pluginManagement>
- <plugins>
- <plugin>
- <artifactId>maven-failsafe-plugin</artifactId>
- <executions>
- <execution>
- <goals>
- <goal>integration-test</goal>
- <goal>verify</goal>
- </goals>
- </execution>
- </executions>
- <configuration>
- <systemPropertyVariables>
- <org.ops4j.pax.logging.DefaultServiceLog.level>
- WARN
- </org.ops4j.pax.logging.DefaultServiceLog.level>
- </systemPropertyVariables>
- <systemProperties>
- <property>
- <name>project.bundle.file</name>
- <value>target/${project.build.finalName}.jar</value>
- </property>
- </systemProperties>
- </configuration>
- </plugin>
- <plugin>
- <artifactId>maven-assembly-plugin</artifactId>
- <executions>
- <execution>
- <phase>pre-integration-test</phase>
- <goals>
- <goal>single</goal>
- </goals>
- <configuration>
- <descriptor>test-bundles.xml</descriptor>
- <finalName>test</finalName>
- <attach>false</attach>
- </configuration>
- </execution>
- </executions>
- </plugin>
- </plugins>
- </pluginManagement>
- </build>
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parent</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ <relativePath>../tika-parent/pom.xml</relativePath>
+ </parent>
+
+ <artifactId>tika-parser-bundles</artifactId>
+ <packaging>pom</packaging>
+ <name>Apache Tika Parser Bundles</name>
+ <url>http://tika.apache.org/</url>
+
+ <properties>
+ <poi.version>3.13</poi.version>
+ <!-- NOTE: sync codec version with POI -->
+ <codec.version>1.9</codec.version>
+ <pdfbox.version>1.8.10</pdfbox.version>
+ </properties>
+
+ <modules>
+ <module>tika-parser-advanced-bundle</module>
+ <module>tika-parser-cad-bundle</module>
+ <module>tika-parser-code-bundle</module>
+ <module>tika-parser-crypto-bundle</module>
+ <module>tika-parser-database-bundle</module>
+ <module>tika-parser-ebook-bundle</module>
+ <module>tika-parser-journal-bundle</module>
+ <module>tika-parser-multimedia-bundle</module>
+ <module>tika-parser-office-bundle</module>
+ <module>tika-parser-package-bundle</module>
+ <module>tika-parser-pdf-bundle</module>
+ <module>tika-parser-scientific-bundle</module>
+ <module>tika-parser-text-bundle</module>
+ <module>tika-parser-web-bundle</module>
+ </modules>
+
+ <dependencies>
+ <!-- Optional OSGi dependencies, used only when running within OSGi -->
+ <dependency>
+ <groupId>org.osgi</groupId>
+ <artifactId>org.osgi.core</artifactId>
+ <scope>provided</scope>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.osgi</groupId>
+ <artifactId>org.osgi.compendium</artifactId>
+ <scope>provided</scope>
+ <optional>true</optional>
+ </dependency>
+ <!-- Test dependencies -->
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.mockito</groupId>
+ <artifactId>mockito-core</artifactId>
+ <version>1.7</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.ops4j.pax.exam</groupId>
+ <artifactId>pax-exam-junit4</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.ops4j.pax.exam</groupId>
+ <artifactId>pax-exam-container-native</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>org.apache.felix.framework</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.ops4j.pax.exam</groupId>
+ <artifactId>pax-exam-link-assembly</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.ops4j.pax.url</groupId>
+ <artifactId>pax-url-aether</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-simple</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>javax.inject</groupId>
+ <artifactId>javax.inject</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-test-resources</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+ <build>
+ <pluginManagement>
+ <plugins>
+ <plugin>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ <executions>
+ <execution>
+ <goals>
+ <goal>integration-test</goal>
+ <goal>verify</goal>
+ </goals>
+ </execution>
+ </executions>
+ <configuration>
+ <systemPropertyVariables>
+ <org.ops4j.pax.logging.DefaultServiceLog.level>
+ WARN
+ </org.ops4j.pax.logging.DefaultServiceLog.level>
+ </systemPropertyVariables>
+ <systemProperties>
+ <property>
+ <name>project.bundle.file</name>
+ <value>target/${project.build.finalName}.jar</value>
+ </property>
+ </systemProperties>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <executions>
+ <execution>
+ <phase>pre-integration-test</phase>
+ <goals>
+ <goal>single</goal>
+ </goals>
+ <configuration>
+ <descriptor>test-bundles.xml</descriptor>
+ <finalName>test</finalName>
+ <attach>false</attach>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </pluginManagement>
+ </build>
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-advanced-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-advanced-bundle/pom.xml b/tika-parser-bundles/tika-parser-advanced-bundle/pom.xml
index 28713fa..9fd0c77 100644
--- a/tika-parser-bundles/tika-parser-advanced-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-advanced-bundle/pom.xml
@@ -1,82 +1,82 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-bundles</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-advanced-bundle</artifactId>
- <packaging>bundle</packaging>
- <name>Apache Tika parser advanced bundle</name>
- <url>http://tika.apache.org/</url>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-advanced-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- <plugin>
- <groupId>org.apache.felix</groupId>
- <artifactId>maven-bundle-plugin</artifactId>
- <extensions>true</extensions>
- <configuration>
- <instructions>
- <Bundle-Activator>org.apache.tika.module.advanced.internal.Activator</Bundle-Activator>
- <Embed-Dependency>
- tika-parser-advanced-module;inline=true,
- opennlp-tools;inline=true,
- opennlp-maxent;inline=true,
- commons-io;inline=true,
- jwnl;inline=true
- </Embed-Dependency>
- <Embed-Transitive>true</Embed-Transitive>
- <Export-Package>
- org.apache.tika.parser.ner.*,
- org.apache.tika.parser.ner.corenlp.*,
- org.apache.tika.parser.ner.opennlp.*,
- org.apache.tika.parser.ner.regex.*
- </Export-Package>
- <Import-Package>
- *,
- opennlp.maxent;resolution:=optional,
- opennlp.tools.namefind;resolution:=optional,
- org.json;resolution:=optional,
- org.osgi.framework;resolution:=optional,
- net.didion.jwnl;resolution:=optional
- </Import-Package>
- </instructions>
- </configuration>
- </plugin>
- <plugin>
- <artifactId>maven-failsafe-plugin</artifactId>
- </plugin>
- <plugin>
- <artifactId>maven-assembly-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-bundles</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-advanced-bundle</artifactId>
+ <packaging>bundle</packaging>
+ <name>Apache Tika parser advanced bundle</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-advanced-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <Bundle-Activator>org.apache.tika.module.advanced.internal.Activator</Bundle-Activator>
+ <Embed-Dependency>
+ tika-parser-advanced-module;inline=true,
+ opennlp-tools;inline=true,
+ opennlp-maxent;inline=true,
+ commons-io;inline=true,
+ jwnl;inline=true
+ </Embed-Dependency>
+ <Embed-Transitive>true</Embed-Transitive>
+ <Export-Package>
+ org.apache.tika.parser.ner.*,
+ org.apache.tika.parser.ner.corenlp.*,
+ org.apache.tika.parser.ner.opennlp.*,
+ org.apache.tika.parser.ner.regex.*
+ </Export-Package>
+ <Import-Package>
+ *,
+ opennlp.maxent;resolution:=optional,
+ opennlp.tools.namefind;resolution:=optional,
+ org.json;resolution:=optional,
+ org.osgi.framework;resolution:=optional,
+ net.didion.jwnl;resolution:=optional
+ </Import-Package>
+ </instructions>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-cad-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-cad-bundle/pom.xml b/tika-parser-bundles/tika-parser-cad-bundle/pom.xml
index 8570abe..3bd09cc 100644
--- a/tika-parser-bundles/tika-parser-cad-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-cad-bundle/pom.xml
@@ -1,73 +1,73 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-bundles</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-cad-bundle</artifactId>
- <packaging>bundle</packaging>
- <name>Apache Tika parser cad bundle</name>
- <url>http://tika.apache.org/</url>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-cad-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- <plugin>
- <groupId>org.apache.felix</groupId>
- <artifactId>maven-bundle-plugin</artifactId>
- <extensions>true</extensions>
- <configuration>
- <instructions>
- <Bundle-Activator>org.apache.tika.module.cad.internal.Activator</Bundle-Activator>
- <Embed-Dependency>
- tika-parser-cad-module;inline=true,
- commons-io;inline=true,
- commons-codec;inline=true
- </Embed-Dependency>
- <Embed-Transitive>true</Embed-Transitive>
- <Export-Package>
- org.apache.tika.parser.dwg.*,
- org.apache.tika.parser.prt.*
- </Export-Package>
- <Import-Package>
- *
- </Import-Package>
- </instructions>
- </configuration>
- </plugin>
- <plugin>
- <artifactId>maven-failsafe-plugin</artifactId>
- </plugin>
- <plugin>
- <artifactId>maven-assembly-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-bundles</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-cad-bundle</artifactId>
+ <packaging>bundle</packaging>
+ <name>Apache Tika parser cad bundle</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-cad-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <Bundle-Activator>org.apache.tika.module.cad.internal.Activator</Bundle-Activator>
+ <Embed-Dependency>
+ tika-parser-cad-module;inline=true,
+ commons-io;inline=true,
+ commons-codec;inline=true
+ </Embed-Dependency>
+ <Embed-Transitive>true</Embed-Transitive>
+ <Export-Package>
+ org.apache.tika.parser.dwg.*,
+ org.apache.tika.parser.prt.*
+ </Export-Package>
+ <Import-Package>
+ *
+ </Import-Package>
+ </instructions>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-code-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-code-bundle/pom.xml b/tika-parser-bundles/tika-parser-code-bundle/pom.xml
index fcf4757..efbcf9a 100644
--- a/tika-parser-bundles/tika-parser-code-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-code-bundle/pom.xml
@@ -1,75 +1,75 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-bundles</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-code-bundle</artifactId>
- <packaging>bundle</packaging>
- <name>Apache Tika parser code bundle</name>
- <url>http://tika.apache.org/</url>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-code-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.felix</groupId>
- <artifactId>maven-bundle-plugin</artifactId>
- <extensions>true</extensions>
- <configuration>
- <instructions>
- <Bundle-Activator>org.apache.tika.module.code.internal.Activator</Bundle-Activator>
- <Embed-Dependency>
- tika-parser-code-module;inline=true,
- asm;inline=true,
- tagsoup;inline=true,
- jhighlight;inline=true,
- commons-io;inline=true,
- commons-codec;inline=true
- </Embed-Dependency>
- <Embed-Transitive>true</Embed-Transitive>
- <Export-Package>
- org.apache.tika.parser.asm.*,
- org.apache.tika.parser.code.*,
- org.apache.tika.parser.executable.*
- </Export-Package>
- <Import-Package>
- *,
- javax.servlet;resolution:=optional,
- javax.servlet.http;resolution:=optional
- </Import-Package>
- </instructions>
- </configuration>
- </plugin>
- <plugin>
- <artifactId>maven-failsafe-plugin</artifactId>
- </plugin>
- <plugin>
- <artifactId>maven-assembly-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-bundles</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-code-bundle</artifactId>
+ <packaging>bundle</packaging>
+ <name>Apache Tika parser code bundle</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-code-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <Bundle-Activator>org.apache.tika.module.code.internal.Activator</Bundle-Activator>
+ <Embed-Dependency>
+ tika-parser-code-module;inline=true,
+ asm;inline=true,
+ tagsoup;inline=true,
+ jhighlight;inline=true,
+ commons-io;inline=true,
+ commons-codec;inline=true
+ </Embed-Dependency>
+ <Embed-Transitive>true</Embed-Transitive>
+ <Export-Package>
+ org.apache.tika.parser.asm.*,
+ org.apache.tika.parser.code.*,
+ org.apache.tika.parser.executable.*
+ </Export-Package>
+ <Import-Package>
+ *,
+ javax.servlet;resolution:=optional,
+ javax.servlet.http;resolution:=optional
+ </Import-Package>
+ </instructions>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-crypto-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-crypto-bundle/pom.xml b/tika-parser-bundles/tika-parser-crypto-bundle/pom.xml
index 64203fe..034b1fe 100644
--- a/tika-parser-bundles/tika-parser-crypto-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-crypto-bundle/pom.xml
@@ -1,79 +1,79 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-bundles</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-crypto-bundle</artifactId>
- <packaging>bundle</packaging>
- <name>Apache Tika parser crypto bundle</name>
- <url>http://tika.apache.org/</url>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-crypto-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.felix</groupId>
- <artifactId>maven-bundle-plugin</artifactId>
- <extensions>true</extensions>
- <configuration>
- <instructions>
- <Bundle-Activator>org.apache.tika.module.crypto.internal.Activator</Bundle-Activator>
- <Embed-Dependency>
- tika-parser-crypto-module;inline=true,
- bcmail-jdk15on;inline=true,
- bcprov-jdk15on;inline=true,
- bcpkix-jdk15on;inline=true,
- commons-io;inline=true
- </Embed-Dependency>
- <Embed-Transitive>true</Embed-Transitive>
- <Export-Package>
- org.apache.tika.parser.crypto.*,
- </Export-Package>
- <Import-Package>
- *,
- javax.mail;resolution:=optional,
- javax.mail.internet;resolution:=optional,
- org.bouncycastle.cert;resolution:=optional,
- org.bouncycastle.cert.jcajce;resolution:=optional,
- org.bouncycastle.cert.ocsp;resolution:=optional,
- org.bouncycastle.cms.bc;resolution:=optional,
- org.bouncycastle.operator;resolution:=optional,
- org.bouncycastle.operator.bc;resolution:=optional,
- org.bouncycastle.tsp;resolution:=optional
- </Import-Package>
- </instructions>
- </configuration>
- </plugin>
- <plugin>
- <artifactId>maven-failsafe-plugin</artifactId>
- </plugin>
- <plugin>
- <artifactId>maven-assembly-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-bundles</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-crypto-bundle</artifactId>
+ <packaging>bundle</packaging>
+ <name>Apache Tika parser crypto bundle</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-crypto-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <Bundle-Activator>org.apache.tika.module.crypto.internal.Activator</Bundle-Activator>
+ <Embed-Dependency>
+ tika-parser-crypto-module;inline=true,
+ bcmail-jdk15on;inline=true,
+ bcprov-jdk15on;inline=true,
+ bcpkix-jdk15on;inline=true,
+ commons-io;inline=true
+ </Embed-Dependency>
+ <Embed-Transitive>true</Embed-Transitive>
+ <Export-Package>
+ org.apache.tika.parser.crypto.*,
+ </Export-Package>
+ <Import-Package>
+ *,
+ javax.mail;resolution:=optional,
+ javax.mail.internet;resolution:=optional,
+ org.bouncycastle.cert;resolution:=optional,
+ org.bouncycastle.cert.jcajce;resolution:=optional,
+ org.bouncycastle.cert.ocsp;resolution:=optional,
+ org.bouncycastle.cms.bc;resolution:=optional,
+ org.bouncycastle.operator;resolution:=optional,
+ org.bouncycastle.operator.bc;resolution:=optional,
+ org.bouncycastle.tsp;resolution:=optional
+ </Import-Package>
+ </instructions>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-database-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-database-bundle/pom.xml b/tika-parser-bundles/tika-parser-database-bundle/pom.xml
index 972dce3..75f1dc0 100644
--- a/tika-parser-bundles/tika-parser-database-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-database-bundle/pom.xml
@@ -1,68 +1,68 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-bundles</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-database-bundle</artifactId>
- <packaging>bundle</packaging>
- <name>Apache Tika parser database bundle</name>
- <url>http://tika.apache.org/</url>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-database-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.felix</groupId>
- <artifactId>maven-bundle-plugin</artifactId>
- <extensions>true</extensions>
- <configuration>
- <instructions>
- <Bundle-Activator>org.apache.tika.module.database.internal.Activator</Bundle-Activator>
- <Embed-Dependency>
- tika-parser-database-module;inline=true,
- commons-io;inline=true,
- </Embed-Dependency>
- <Embed-Transitive>true</Embed-Transitive>
- <Export-Package>
- org.apache.tika.parser.jdbc.*
- </Export-Package>
- <Import-Package>
- *,
- org.sqlite;resolution:=optional
- </Import-Package>
- </instructions>
- </configuration>
- </plugin>
- <plugin>
- <artifactId>maven-failsafe-plugin</artifactId>
- </plugin>
- <plugin>
- <artifactId>maven-assembly-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-bundles</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-database-bundle</artifactId>
+ <packaging>bundle</packaging>
+ <name>Apache Tika parser database bundle</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-database-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <Bundle-Activator>org.apache.tika.module.database.internal.Activator</Bundle-Activator>
+ <Embed-Dependency>
+ tika-parser-database-module;inline=true,
+ commons-io;inline=true,
+ </Embed-Dependency>
+ <Embed-Transitive>true</Embed-Transitive>
+ <Export-Package>
+ org.apache.tika.parser.jdbc.*
+ </Export-Package>
+ <Import-Package>
+ *,
+ org.sqlite;resolution:=optional
+ </Import-Package>
+ </instructions>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-ebook-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-ebook-bundle/pom.xml b/tika-parser-bundles/tika-parser-ebook-bundle/pom.xml
index 742ec99..b7dfa7f 100644
--- a/tika-parser-bundles/tika-parser-ebook-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-ebook-bundle/pom.xml
@@ -1,72 +1,72 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-bundles</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-ebook-bundle</artifactId>
- <packaging>bundle</packaging>
- <name>Apache Tika parser ebook bundle</name>
- <url>http://tika.apache.org/</url>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-ebook-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-text-bundle</artifactId>
- <version>${project.version}</version>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.felix</groupId>
- <artifactId>maven-bundle-plugin</artifactId>
- <extensions>true</extensions>
- <configuration>
- <instructions>
- <Bundle-Activator>org.apache.tika.module.ebook.internal.Activator</Bundle-Activator>
- <Embed-Dependency>
- tika-parser-ebook-module;inline=true,
- commons-io;inline=true,
- </Embed-Dependency>
- <Embed-Transitive>true</Embed-Transitive>
- <Export-Package>
- org.apache.tika.parser.epub.*
- </Export-Package>
- <Import-Package>
- *
- </Import-Package>
- </instructions>
- </configuration>
- </plugin>
- <plugin>
- <artifactId>maven-failsafe-plugin</artifactId>
- </plugin>
- <plugin>
- <artifactId>maven-assembly-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-bundles</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-ebook-bundle</artifactId>
+ <packaging>bundle</packaging>
+ <name>Apache Tika parser ebook bundle</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-ebook-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-text-bundle</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <Bundle-Activator>org.apache.tika.module.ebook.internal.Activator</Bundle-Activator>
+ <Embed-Dependency>
+ tika-parser-ebook-module;inline=true,
+ commons-io;inline=true,
+ </Embed-Dependency>
+ <Embed-Transitive>true</Embed-Transitive>
+ <Export-Package>
+ org.apache.tika.parser.epub.*
+ </Export-Package>
+ <Import-Package>
+ *
+ </Import-Package>
+ </instructions>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-journal-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-journal-bundle/pom.xml b/tika-parser-bundles/tika-parser-journal-bundle/pom.xml
index c03cb4b..b918a7a 100644
--- a/tika-parser-bundles/tika-parser-journal-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-journal-bundle/pom.xml
@@ -1,80 +1,80 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-bundles</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-journal-bundle</artifactId>
- <packaging>bundle</packaging>
- <name>Apache Tika parser journal bundle</name>
- <url>http://tika.apache.org/</url>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-journal-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-pdf-bundle</artifactId>
- <version>${project.version}</version>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- <plugin>
- <groupId>org.apache.felix</groupId>
- <artifactId>maven-bundle-plugin</artifactId>
- <extensions>true</extensions>
- <configuration>
- <instructions>
- <Bundle-Activator>org.apache.tika.module.journal.internal.Activator</Bundle-Activator>
- <Embed-Dependency>
- tika-parser-journal-module;inline=true,
- commons-io;inline=true,
- </Embed-Dependency>
- <Embed-Transitive>true</Embed-Transitive>
- <Export-Package>
- org.apache.tika.parser.journal.*
- </Export-Package>
- <Import-Package>
- *,
- javax.ws.rs.core;resolution:=optional,
- org.apache.cxf.jaxrs.client;resolution:=optional,
- org.apache.cxf.jaxrs.ext.multipart;resolution:=optional,
- org.json;resolution:=optional
- </Import-Package>
- </instructions>
- </configuration>
- </plugin>
- <plugin>
- <artifactId>maven-failsafe-plugin</artifactId>
- </plugin>
- <plugin>
- <artifactId>maven-assembly-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-bundles</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-journal-bundle</artifactId>
+ <packaging>bundle</packaging>
+ <name>Apache Tika parser journal bundle</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-journal-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-pdf-bundle</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <Bundle-Activator>org.apache.tika.module.journal.internal.Activator</Bundle-Activator>
+ <Embed-Dependency>
+ tika-parser-journal-module;inline=true,
+ commons-io;inline=true,
+ </Embed-Dependency>
+ <Embed-Transitive>true</Embed-Transitive>
+ <Export-Package>
+ org.apache.tika.parser.journal.*
+ </Export-Package>
+ <Import-Package>
+ *,
+ javax.ws.rs.core;resolution:=optional,
+ org.apache.cxf.jaxrs.client;resolution:=optional,
+ org.apache.cxf.jaxrs.ext.multipart;resolution:=optional,
+ org.json;resolution:=optional
+ </Import-Package>
+ </instructions>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml b/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml
index 7b596c9..ab1d1b4 100644
--- a/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml
@@ -1,85 +1,85 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-bundles</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-multimedia-bundle</artifactId>
- <packaging>bundle</packaging>
- <name>Apache Tika parser multimedia bundle</name>
- <url>http://tika.apache.org/</url>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-multimedia-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.felix</groupId>
- <artifactId>maven-bundle-plugin</artifactId>
- <extensions>true</extensions>
- <configuration>
- <instructions>
- <Bundle-Activator>org.apache.tika.module.multimedia.internal.Activator</Bundle-Activator>
- <_runsystempackages>com.sun.xml.bind.marshaller,
- com.sun.xml.internal.bind.marshaller</_runsystempackages>
- <Embed-Dependency>
- tika-parser-multimedia-module;inline=true,
- tika-parser-xmp-commons;inline=true,
- metadata-extractor;inline=true,
- xmpcore;inline=true,
- commons-codec;inline=true,
- commons-io;inline=true,
- commons-exec;inline=true,
- jempbox;inline=true,
- fontbox;inline=true,
- isoparser;inline=true,
- </Embed-Dependency>
- <Embed-Transitive>true</Embed-Transitive>
- <Export-Package>
- org.apache.tika.parser.image.*,
- org.apache.tika.parser.jpeg.*,
- org.apache.tika.parser.audio.*,
- org.apache.tika.parser.video.*,
- org.apache.tika.parser.mp3.*,
- org.apache.tika.parser.mp4.*
- </Export-Package>
- <Import-Package>
- *,
- com.adobe.xmp;resolution:=optional,
- com.adobe.xmp.properties;resolution:=optional,
- android.util;resolution:=optional
- </Import-Package>
- </instructions>
- </configuration>
- </plugin>
- <plugin>
- <artifactId>maven-failsafe-plugin</artifactId>
- </plugin>
- <plugin>
- <artifactId>maven-assembly-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-bundles</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-multimedia-bundle</artifactId>
+ <packaging>bundle</packaging>
+ <name>Apache Tika parser multimedia bundle</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-multimedia-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <Bundle-Activator>org.apache.tika.module.multimedia.internal.Activator</Bundle-Activator>
+ <_runsystempackages>com.sun.xml.bind.marshaller,
+ com.sun.xml.internal.bind.marshaller</_runsystempackages>
+ <Embed-Dependency>
+ tika-parser-multimedia-module;inline=true,
+ tika-parser-xmp-commons;inline=true,
+ metadata-extractor;inline=true,
+ xmpcore;inline=true,
+ commons-codec;inline=true,
+ commons-io;inline=true,
+ commons-exec;inline=true,
+ jempbox;inline=true,
+ fontbox;inline=true,
+ isoparser;inline=true,
+ </Embed-Dependency>
+ <Embed-Transitive>true</Embed-Transitive>
+ <Export-Package>
+ org.apache.tika.parser.image.*,
+ org.apache.tika.parser.jpeg.*,
+ org.apache.tika.parser.audio.*,
+ org.apache.tika.parser.video.*,
+ org.apache.tika.parser.mp3.*,
+ org.apache.tika.parser.mp4.*
+ </Export-Package>
+ <Import-Package>
+ *,
+ com.adobe.xmp;resolution:=optional,
+ com.adobe.xmp.properties;resolution:=optional,
+ android.util;resolution:=optional
+ </Import-Package>
+ </instructions>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
[32/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java
index 98278e2..8d94c0b 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java
@@ -1,159 +1,159 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mp3;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
-import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
-import org.xml.sax.SAXException;
-
-/**
- * This is used to parse ID3 Version 2.2 Tag information from an MP3 file,
- * if available.
- *
- * @see <a href="http://id3lib.sourceforge.net/id3/id3v2-00.txt">MP3 ID3 Version 2.2 specification</a>
- */
-public class ID3v22Handler implements ID3Tags {
- private String title;
- private String artist;
- private String album;
- private String year;
- private String composer;
- private String genre;
- private String trackNumber;
- private String albumArtist;
- private String disc;
- private List<ID3Comment> comments = new ArrayList<ID3Comment>();
-
- public ID3v22Handler(ID3v2Frame frame)
- throws IOException, SAXException, TikaException {
- RawTagIterator tags = new RawV22TagIterator(frame);
- while (tags.hasNext()) {
- RawTag tag = tags.next();
- if (tag.name.equals("TT2")) {
- title = getTagString(tag.data, 0, tag.data.length);
- } else if (tag.name.equals("TP1")) {
- artist = getTagString(tag.data, 0, tag.data.length);
- } else if (tag.name.equals("TP2")) {
- albumArtist = getTagString(tag.data, 0, tag.data.length);
- } else if (tag.name.equals("TAL")) {
- album = getTagString(tag.data, 0, tag.data.length);
- } else if (tag.name.equals("TYE")) {
- year = getTagString(tag.data, 0, tag.data.length);
- } else if (tag.name.equals("TCM")) {
- composer = getTagString(tag.data, 0, tag.data.length);
- } else if (tag.name.equals("COM")) {
- comments.add( getComment(tag.data, 0, tag.data.length) );
- } else if (tag.name.equals("TRK")) {
- trackNumber = getTagString(tag.data, 0, tag.data.length);
- } else if (tag.name.equals("TPA")) {
- disc = getTagString(tag.data, 0, tag.data.length);
- } else if (tag.name.equals("TCO")) {
- genre = extractGenre( getTagString(tag.data, 0, tag.data.length) );
- }
- }
- }
-
- private String getTagString(byte[] data, int offset, int length) {
- return ID3v2Frame.getTagString(data, offset, length);
- }
- private ID3Comment getComment(byte[] data, int offset, int length) {
- return ID3v2Frame.getComment(data, offset, length);
- }
-
- protected static String extractGenre(String rawGenre) {
- int open = rawGenre.indexOf("(");
- int close = rawGenre.indexOf(")");
- if (open == -1 && close == -1) {
- return rawGenre;
- } else if (open < close) {
- String genreStr = rawGenre.substring(0, open).trim();
- try {
- int genreID = Integer.parseInt(rawGenre.substring(open+1, close));
- return ID3Tags.GENRES[genreID];
- } catch(ArrayIndexOutOfBoundsException invalidNum) {
- return genreStr;
- } catch(NumberFormatException notANum) {
- return genreStr;
- }
- } else {
- return null;
- }
- }
-
- public boolean getTagsPresent() {
- return true;
- }
-
- public String getTitle() {
- return title;
- }
-
- public String getArtist() {
- return artist;
- }
-
- public String getAlbum() {
- return album;
- }
-
- public String getYear() {
- return year;
- }
-
- public String getComposer() {
- return composer;
- }
-
- public List<ID3Comment> getComments() {
- return comments;
- }
-
- public String getGenre() {
- return genre;
- }
-
- public String getTrackNumber() {
- return trackNumber;
- }
-
- public String getAlbumArtist() {
- return albumArtist;
- }
-
- public String getDisc() {
- return disc;
- }
-
- /**
- * ID3v22 doesn't have compilations,
- * so returns null;
- */
- public String getCompilation() {
- return null;
- }
-
- private class RawV22TagIterator extends RawTagIterator {
- private RawV22TagIterator(ID3v2Frame frame) {
- frame.super(3, 3, 1, 0);
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
+import org.xml.sax.SAXException;
+
+/**
+ * This is used to parse ID3 Version 2.2 Tag information from an MP3 file,
+ * if available.
+ *
+ * @see <a href="http://id3lib.sourceforge.net/id3/id3v2-00.txt">MP3 ID3 Version 2.2 specification</a>
+ */
+public class ID3v22Handler implements ID3Tags {
+ private String title;
+ private String artist;
+ private String album;
+ private String year;
+ private String composer;
+ private String genre;
+ private String trackNumber;
+ private String albumArtist;
+ private String disc;
+ private List<ID3Comment> comments = new ArrayList<ID3Comment>();
+
+ public ID3v22Handler(ID3v2Frame frame)
+ throws IOException, SAXException, TikaException {
+ RawTagIterator tags = new RawV22TagIterator(frame);
+ while (tags.hasNext()) {
+ RawTag tag = tags.next();
+ if (tag.name.equals("TT2")) {
+ title = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TP1")) {
+ artist = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TP2")) {
+ albumArtist = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TAL")) {
+ album = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TYE")) {
+ year = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TCM")) {
+ composer = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("COM")) {
+ comments.add( getComment(tag.data, 0, tag.data.length) );
+ } else if (tag.name.equals("TRK")) {
+ trackNumber = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TPA")) {
+ disc = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TCO")) {
+ genre = extractGenre( getTagString(tag.data, 0, tag.data.length) );
+ }
+ }
+ }
+
+ private String getTagString(byte[] data, int offset, int length) {
+ return ID3v2Frame.getTagString(data, offset, length);
+ }
+ private ID3Comment getComment(byte[] data, int offset, int length) {
+ return ID3v2Frame.getComment(data, offset, length);
+ }
+
+ protected static String extractGenre(String rawGenre) {
+ int open = rawGenre.indexOf("(");
+ int close = rawGenre.indexOf(")");
+ if (open == -1 && close == -1) {
+ return rawGenre;
+ } else if (open < close) {
+ String genreStr = rawGenre.substring(0, open).trim();
+ try {
+ int genreID = Integer.parseInt(rawGenre.substring(open+1, close));
+ return ID3Tags.GENRES[genreID];
+ } catch(ArrayIndexOutOfBoundsException invalidNum) {
+ return genreStr;
+ } catch(NumberFormatException notANum) {
+ return genreStr;
+ }
+ } else {
+ return null;
+ }
+ }
+
+ public boolean getTagsPresent() {
+ return true;
+ }
+
+ public String getTitle() {
+ return title;
+ }
+
+ public String getArtist() {
+ return artist;
+ }
+
+ public String getAlbum() {
+ return album;
+ }
+
+ public String getYear() {
+ return year;
+ }
+
+ public String getComposer() {
+ return composer;
+ }
+
+ public List<ID3Comment> getComments() {
+ return comments;
+ }
+
+ public String getGenre() {
+ return genre;
+ }
+
+ public String getTrackNumber() {
+ return trackNumber;
+ }
+
+ public String getAlbumArtist() {
+ return albumArtist;
+ }
+
+ public String getDisc() {
+ return disc;
+ }
+
+ /**
+ * ID3v22 doesn't have compilations,
+ * so returns null;
+ */
+ public String getCompilation() {
+ return null;
+ }
+
+ private class RawV22TagIterator extends RawTagIterator {
+ private RawV22TagIterator(ID3v2Frame frame) {
+ frame.super(3, 3, 1, 0);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java
index 8c5386d..4b67eda 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java
@@ -1,138 +1,138 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mp3;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
-import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
-import org.xml.sax.SAXException;
-
-/**
- * This is used to parse ID3 Version 2.3 Tag information from an MP3 file,
- * if available.
- *
- * @see <a href="http://id3lib.sourceforge.net/id3/id3v2.3.0.html">MP3 ID3 Version 2.3 specification</a>
- */
-public class ID3v23Handler implements ID3Tags {
- private String title;
- private String artist;
- private String album;
- private String year;
- private String composer;
- private String genre;
- private String trackNumber;
- private String albumArtist;
- private String disc;
- private String compilation;
- private List<ID3Comment> comments = new ArrayList<ID3Comment>();
-
- public ID3v23Handler(ID3v2Frame frame)
- throws IOException, SAXException, TikaException {
- RawTagIterator tags = new RawV23TagIterator(frame);
- while (tags.hasNext()) {
- RawTag tag = tags.next();
- if (tag.name.equals("TIT2")) {
- title = getTagString(tag.data, 0, tag.data.length);
- } else if (tag.name.equals("TPE1")) {
- artist = getTagString(tag.data, 0, tag.data.length);
- } else if (tag.name.equals("TPE2")) {
- albumArtist = getTagString(tag.data, 0, tag.data.length);
- } else if (tag.name.equals("TALB")) {
- album = getTagString(tag.data, 0, tag.data.length);
- } else if (tag.name.equals("TYER")) {
- year = getTagString(tag.data, 0, tag.data.length);
- } else if (tag.name.equals("TCOM")) {
- composer = getTagString(tag.data, 0, tag.data.length);
- } else if (tag.name.equals("COMM")) {
- comments.add( getComment(tag.data, 0, tag.data.length) );
- } else if (tag.name.equals("TRCK")) {
- trackNumber = getTagString(tag.data, 0, tag.data.length);
- } else if (tag.name.equals("TPOS")) {
- disc = getTagString(tag.data, 0, tag.data.length);
- } else if (tag.name.equals("TCMP")) {
- compilation = getTagString(tag.data, 0, tag.data.length);
- } else if (tag.name.equals("TCON")) {
- genre = ID3v22Handler.extractGenre( getTagString(tag.data, 0, tag.data.length) );
- }
- }
- }
-
- private String getTagString(byte[] data, int offset, int length) {
- return ID3v2Frame.getTagString(data, offset, length);
- }
- private ID3Comment getComment(byte[] data, int offset, int length) {
- return ID3v2Frame.getComment(data, offset, length);
- }
-
- public boolean getTagsPresent() {
- return true;
- }
-
- public String getTitle() {
- return title;
- }
-
- public String getArtist() {
- return artist;
- }
-
- public String getAlbum() {
- return album;
- }
-
- public String getYear() {
- return year;
- }
-
- public String getComposer() {
- return composer;
- }
-
- public List<ID3Comment> getComments() {
- return comments;
- }
-
- public String getGenre() {
- return genre;
- }
-
- public String getTrackNumber() {
- return trackNumber;
- }
-
- public String getAlbumArtist() {
- return albumArtist;
- }
-
- public String getDisc() {
- return disc;
- }
-
- public String getCompilation() {
- return compilation;
- }
-
- private class RawV23TagIterator extends RawTagIterator {
- private RawV23TagIterator(ID3v2Frame frame) {
- frame.super(4, 4, 1, 2);
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
+import org.xml.sax.SAXException;
+
+/**
+ * This is used to parse ID3 Version 2.3 Tag information from an MP3 file,
+ * if available.
+ *
+ * @see <a href="http://id3lib.sourceforge.net/id3/id3v2.3.0.html">MP3 ID3 Version 2.3 specification</a>
+ */
+public class ID3v23Handler implements ID3Tags {
+ private String title;
+ private String artist;
+ private String album;
+ private String year;
+ private String composer;
+ private String genre;
+ private String trackNumber;
+ private String albumArtist;
+ private String disc;
+ private String compilation;
+ private List<ID3Comment> comments = new ArrayList<ID3Comment>();
+
+ public ID3v23Handler(ID3v2Frame frame)
+ throws IOException, SAXException, TikaException {
+ RawTagIterator tags = new RawV23TagIterator(frame);
+ while (tags.hasNext()) {
+ RawTag tag = tags.next();
+ if (tag.name.equals("TIT2")) {
+ title = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TPE1")) {
+ artist = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TPE2")) {
+ albumArtist = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TALB")) {
+ album = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TYER")) {
+ year = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TCOM")) {
+ composer = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("COMM")) {
+ comments.add( getComment(tag.data, 0, tag.data.length) );
+ } else if (tag.name.equals("TRCK")) {
+ trackNumber = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TPOS")) {
+ disc = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TCMP")) {
+ compilation = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TCON")) {
+ genre = ID3v22Handler.extractGenre( getTagString(tag.data, 0, tag.data.length) );
+ }
+ }
+ }
+
+ private String getTagString(byte[] data, int offset, int length) {
+ return ID3v2Frame.getTagString(data, offset, length);
+ }
+ private ID3Comment getComment(byte[] data, int offset, int length) {
+ return ID3v2Frame.getComment(data, offset, length);
+ }
+
+ public boolean getTagsPresent() {
+ return true;
+ }
+
+ public String getTitle() {
+ return title;
+ }
+
+ public String getArtist() {
+ return artist;
+ }
+
+ public String getAlbum() {
+ return album;
+ }
+
+ public String getYear() {
+ return year;
+ }
+
+ public String getComposer() {
+ return composer;
+ }
+
+ public List<ID3Comment> getComments() {
+ return comments;
+ }
+
+ public String getGenre() {
+ return genre;
+ }
+
+ public String getTrackNumber() {
+ return trackNumber;
+ }
+
+ public String getAlbumArtist() {
+ return albumArtist;
+ }
+
+ public String getDisc() {
+ return disc;
+ }
+
+ public String getCompilation() {
+ return compilation;
+ }
+
+ private class RawV23TagIterator extends RawTagIterator {
+ private RawV23TagIterator(ID3v2Frame frame) {
+ frame.super(4, 4, 1, 2);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java
index 5c16937..caba928 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java
@@ -1,143 +1,143 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mp3;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
-import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
-import org.xml.sax.SAXException;
-
-/**
- * This is used to parse ID3 Version 2.4 Tag information from an MP3 file,
- * if available.
- *
- * @see <a href="http://www.id3.org/id3v2.4.0-structure">MP3 ID3 Version 2.4 specification</a>
- * @see <a href="http://www.id3.org/id3v2.4.0-frames">MP3 ID3 Version 2.4 frames/tags</a>
- */
-public class ID3v24Handler implements ID3Tags {
- private String title;
- private String artist;
- private String album;
- private String year;
- private String composer;
- private String genre;
- private String trackNumber;
- private String albumArtist;
- private String disc;
- private String compilation;
- private List<ID3Comment> comments = new ArrayList<ID3Comment>();
-
- public ID3v24Handler(ID3v2Frame frame)
- throws IOException, SAXException, TikaException {
- RawTagIterator tags = new RawV24TagIterator(frame);
- while (tags.hasNext()) {
- RawTag tag = tags.next();
- if (tag.name.equals("TIT2")) {
- title = getTagString(tag.data, 0, tag.data.length);
- } else if (tag.name.equals("TPE1")) {
- artist = getTagString(tag.data, 0, tag.data.length);
- } else if (tag.name.equals("TPE2")) {
- albumArtist = getTagString(tag.data, 0, tag.data.length);
- } else if (tag.name.equals("TALB")) {
- album = getTagString(tag.data, 0, tag.data.length);
- } else if (tag.name.equals("TYER")) {
- year = getTagString(tag.data, 0, tag.data.length);
- } else if (tag.name.equals("TDRC")) {
- if(year == null) {
- year = getTagString(tag.data, 0, tag.data.length);
- }
- } else if (tag.name.equals("TCOM")) {
- composer = getTagString(tag.data, 0, tag.data.length);
- } else if (tag.name.equals("COMM")) {
- comments.add( getComment(tag.data, 0, tag.data.length) );
- } else if (tag.name.equals("TRCK")) {
- trackNumber = getTagString(tag.data, 0, tag.data.length);
- } else if (tag.name.equals("TPOS")) {
- disc = getTagString(tag.data, 0, tag.data.length);
- } else if (tag.name.equals("TCMP")) {
- compilation = getTagString(tag.data, 0, tag.data.length);
- } else if (tag.name.equals("TCON")) {
- genre = ID3v22Handler.extractGenre( getTagString(tag.data, 0, tag.data.length) );
- }
- }
- }
-
- private String getTagString(byte[] data, int offset, int length) {
- return ID3v2Frame.getTagString(data, offset, length);
- }
- private ID3Comment getComment(byte[] data, int offset, int length) {
- return ID3v2Frame.getComment(data, offset, length);
- }
-
- public boolean getTagsPresent() {
- return true;
- }
-
- public String getTitle() {
- return title;
- }
-
- public String getArtist() {
- return artist;
- }
-
- public String getAlbum() {
- return album;
- }
-
- public String getYear() {
- return year;
- }
-
- public String getComposer() {
- return composer;
- }
-
- public List<ID3Comment> getComments() {
- return comments;
- }
-
- public String getGenre() {
- return genre;
- }
-
- public String getTrackNumber() {
- return trackNumber;
- }
-
- public String getAlbumArtist() {
- return albumArtist;
- }
-
- public String getDisc() {
- return disc;
- }
-
- public String getCompilation() {
- return compilation;
- }
-
- private class RawV24TagIterator extends RawTagIterator {
- private RawV24TagIterator(ID3v2Frame frame) {
- frame.super(4, 4, 1, 2);
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
+import org.xml.sax.SAXException;
+
+/**
+ * This is used to parse ID3 Version 2.4 Tag information from an MP3 file,
+ * if available.
+ *
+ * @see <a href="http://www.id3.org/id3v2.4.0-structure">MP3 ID3 Version 2.4 specification</a>
+ * @see <a href="http://www.id3.org/id3v2.4.0-frames">MP3 ID3 Version 2.4 frames/tags</a>
+ */
+public class ID3v24Handler implements ID3Tags {
+ private String title;
+ private String artist;
+ private String album;
+ private String year;
+ private String composer;
+ private String genre;
+ private String trackNumber;
+ private String albumArtist;
+ private String disc;
+ private String compilation;
+ private List<ID3Comment> comments = new ArrayList<ID3Comment>();
+
+ public ID3v24Handler(ID3v2Frame frame)
+ throws IOException, SAXException, TikaException {
+ RawTagIterator tags = new RawV24TagIterator(frame);
+ while (tags.hasNext()) {
+ RawTag tag = tags.next();
+ if (tag.name.equals("TIT2")) {
+ title = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TPE1")) {
+ artist = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TPE2")) {
+ albumArtist = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TALB")) {
+ album = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TYER")) {
+ year = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TDRC")) {
+ if(year == null) {
+ year = getTagString(tag.data, 0, tag.data.length);
+ }
+ } else if (tag.name.equals("TCOM")) {
+ composer = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("COMM")) {
+ comments.add( getComment(tag.data, 0, tag.data.length) );
+ } else if (tag.name.equals("TRCK")) {
+ trackNumber = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TPOS")) {
+ disc = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TCMP")) {
+ compilation = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TCON")) {
+ genre = ID3v22Handler.extractGenre( getTagString(tag.data, 0, tag.data.length) );
+ }
+ }
+ }
+
+ private String getTagString(byte[] data, int offset, int length) {
+ return ID3v2Frame.getTagString(data, offset, length);
+ }
+ private ID3Comment getComment(byte[] data, int offset, int length) {
+ return ID3v2Frame.getComment(data, offset, length);
+ }
+
+ public boolean getTagsPresent() {
+ return true;
+ }
+
+ public String getTitle() {
+ return title;
+ }
+
+ public String getArtist() {
+ return artist;
+ }
+
+ public String getAlbum() {
+ return album;
+ }
+
+ public String getYear() {
+ return year;
+ }
+
+ public String getComposer() {
+ return composer;
+ }
+
+ public List<ID3Comment> getComments() {
+ return comments;
+ }
+
+ public String getGenre() {
+ return genre;
+ }
+
+ public String getTrackNumber() {
+ return trackNumber;
+ }
+
+ public String getAlbumArtist() {
+ return albumArtist;
+ }
+
+ public String getDisc() {
+ return disc;
+ }
+
+ public String getCompilation() {
+ return compilation;
+ }
+
+ private class RawV24TagIterator extends RawTagIterator {
+ private RawV24TagIterator(ID3v2Frame frame) {
+ frame.super(4, 4, 1, 2);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
index 458c5e2..41298dd 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
@@ -1,424 +1,424 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mp3;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.PushbackInputStream;
-import java.io.UnsupportedEncodingException;
-import java.util.Iterator;
-
-import org.apache.tika.parser.mp3.ID3Tags.ID3Comment;
-
-import static java.nio.charset.StandardCharsets.ISO_8859_1;
-
-/**
- * A frame of ID3v2 data, which is then passed to a handler to
- * be turned into useful data.
- */
-public class ID3v2Frame implements MP3Frame {
- private int majorVersion;
- private int minorVersion;
- private int flags;
- private int length;
- /** Excludes the header size part */
- private byte[] extendedHeader;
- private byte[] data;
-
- public int getMajorVersion() {
- return majorVersion;
- }
-
- public int getMinorVersion() {
- return minorVersion;
- }
-
- public int getFlags() {
- return flags;
- }
-
- public int getLength() {
- return length;
- }
-
- public byte[] getExtendedHeader() {
- return extendedHeader;
- }
-
- public byte[] getData() {
- return data;
- }
-
- /**
- * Returns the next ID3v2 Frame in
- * the file, or null if the next batch of data
- * doesn't correspond to either an ID3v2 header.
- * If no ID3v2 frame could be detected and the passed in input stream is a
- * {@code PushbackInputStream}, the bytes read so far are pushed back so
- * that they can be read again.
- * ID3v2 Frames should come before all Audio ones.
- */
- public static MP3Frame createFrameIfPresent(InputStream inp)
- throws IOException {
- int h1 = inp.read();
- int h2 = inp.read();
- int h3 = inp.read();
-
- // Is it an ID3v2 Frame?
- if (h1 == (int)'I' && h2 == (int)'D' && h3 == (int)'3') {
- int majorVersion = inp.read();
- int minorVersion = inp.read();
- if (majorVersion == -1 || minorVersion == -1) {
- pushBack(inp, h1, h2, h3, majorVersion, minorVersion);
- return null;
- }
- return new ID3v2Frame(majorVersion, minorVersion, inp);
- }
-
- // Not a frame header
- pushBack(inp, h1, h2, h3);
- return null;
- }
-
- /**
- * Pushes bytes back into the stream if possible. This method is called if
- * no ID3v2 header could be found at the current stream position.
- *
- * @param inp the input stream
- * @param bytes the bytes to be pushed back
- * @throws IOException if an error occurs
- */
- private static void pushBack(InputStream inp, int... bytes)
- throws IOException
- {
- if (inp instanceof PushbackInputStream)
- {
- byte[] buf = new byte[bytes.length];
- for (int i = 0; i < bytes.length; i++)
- {
- buf[i] = (byte) bytes[i];
- }
- ((PushbackInputStream) inp).unread(buf);
- }
- }
-
- private ID3v2Frame(int majorVersion, int minorVersion, InputStream inp)
- throws IOException {
- this.majorVersion = majorVersion;
- this.minorVersion = minorVersion;
-
- // Get the flags and the length
- flags = inp.read();
- length = get7BitsInt(readFully(inp, 4), 0);
-
- // Do we have an extended header?
- if ((flags & 0x02) == 0x02) {
- int size = getInt(readFully(inp, 4));
- extendedHeader = readFully(inp, size);
- }
-
- // Get the frame's data, or at least as much
- // of it as we could do
- data = readFully(inp, length, false);
- }
-
- protected static int getInt(byte[] data) {
- return getInt(data, 0);
- }
-
- protected static int getInt(byte[] data, int offset) {
- int b0 = data[offset+0] & 0xFF;
- int b1 = data[offset+1] & 0xFF;
- int b2 = data[offset+2] & 0xFF;
- int b3 = data[offset+3] & 0xFF;
- return (b0 << 24) + (b1 << 16) + (b2 << 8) + (b3 << 0);
- }
-
- protected static int getInt3(byte[] data, int offset) {
- int b0 = data[offset+0] & 0xFF;
- int b1 = data[offset+1] & 0xFF;
- int b2 = data[offset+2] & 0xFF;
- return (b0 << 16) + (b1 << 8) + (b2 << 0);
- }
-
- protected static int getInt2(byte[] data, int offset) {
- int b0 = data[offset+0] & 0xFF;
- int b1 = data[offset+1] & 0xFF;
- return (b0 << 8) + (b1 << 0);
- }
-
- /**
- * AKA a Synchsafe integer.
- * 4 bytes hold a 28 bit number. The highest
- * bit in each byte is always 0 and always ignored.
- */
- protected static int get7BitsInt(byte[] data, int offset) {
- int b0 = data[offset+0] & 0x7F;
- int b1 = data[offset+1] & 0x7F;
- int b2 = data[offset+2] & 0x7F;
- int b3 = data[offset+3] & 0x7F;
- return (b0 << 21) + (b1 << 14) + (b2 << 7) + (b3 << 0);
- }
-
- protected static byte[] readFully(InputStream inp, int length)
- throws IOException {
- return readFully(inp, length, true);
- }
- protected static byte[] readFully(InputStream inp, int length, boolean shortDataIsFatal)
- throws IOException {
- byte[] b = new byte[length];
-
- int pos = 0;
- int read;
- while (pos < length) {
- read = inp.read(b, pos, length-pos);
- if (read == -1) {
- if(shortDataIsFatal) {
- throw new IOException("Tried to read " + length + " bytes, but only " + pos + " bytes present");
- } else {
- // Give them what we found
- // TODO Log the short read
- return b;
- }
- }
- pos += read;
- }
-
- return b;
- }
-
- protected static class TextEncoding {
- public final boolean doubleByte;
- public final String encoding;
- private TextEncoding(String encoding, boolean doubleByte) {
- this.doubleByte = doubleByte;
- this.encoding = encoding;
- }
- }
- protected static final TextEncoding[] encodings = new TextEncoding[] {
- new TextEncoding("ISO-8859-1", false),
- new TextEncoding("UTF-16", true), // With BOM
- new TextEncoding("UTF-16BE", true), // Without BOM
- new TextEncoding("UTF-8", false)
- };
-
- /**
- * Returns the (possibly null padded) String at the given offset and
- * length. String encoding is held in the first byte;
- */
- protected static String getTagString(byte[] data, int offset, int length) {
- int actualLength = length;
- if (actualLength == 0) {
- return "";
- }
- if (actualLength == 1 && data[offset] == 0) {
- return "";
- }
-
- // Does it have an encoding flag?
- // Detect by the first byte being sub 0x20
- TextEncoding encoding = encodings[0];
- byte maybeEncodingFlag = data[offset];
- if (maybeEncodingFlag >= 0 && maybeEncodingFlag < encodings.length) {
- offset++;
- actualLength--;
- encoding = encodings[maybeEncodingFlag];
- }
-
- // Trim off null termination / padding (as present)
- while (encoding.doubleByte && actualLength >= 2 && data[offset+actualLength-1] == 0 && data[offset+actualLength-2] == 0) {
- actualLength -= 2;
- }
- while (!encoding.doubleByte && actualLength >= 1 && data[offset+actualLength-1] == 0) {
- actualLength--;
- }
- if (actualLength == 0) {
- return "";
- }
-
- // TIKA-1024: If it's UTF-16 (with BOM) and all we
- // have is a naked BOM then short-circuit here
- // (return empty string), because new String(..)
- // gives different results on different JVMs
- if (encoding.encoding.equals("UTF-16") && actualLength == 2 &&
- ((data[offset] == (byte) 0xff && data[offset+1] == (byte) 0xfe) ||
- (data[offset] == (byte) 0xfe && data[offset+1] == (byte) 0xff))) {
- return "";
- }
-
- try {
- // Build the base string
- return new String(data, offset, actualLength, encoding.encoding);
- } catch (UnsupportedEncodingException e) {
- throw new RuntimeException(
- "Core encoding " + encoding.encoding + " is not available", e);
- }
- }
- /**
- * Builds up the ID3 comment, by parsing and extracting
- * the comment string parts from the given data.
- */
- protected static ID3Comment getComment(byte[] data, int offset, int length) {
- // Comments must have an encoding
- int encodingFlag = data[offset];
- if (encodingFlag >= 0 && encodingFlag < encodings.length) {
- // Good, valid flag
- } else {
- // Invalid string
- return null;
- }
-
- TextEncoding encoding = encodings[encodingFlag];
-
- // First is a 3 byte language
- String lang = getString(data, offset+1, 3);
-
- // After that we have [Desc]\0(\0)[Text]
- int descStart = offset+4;
- int textStart = -1;
- String description = null;
- String text = null;
-
- // Find where the description ends
- try {
- for (int i=descStart; i<offset+length; i++) {
- if (encoding.doubleByte && data[i]==0 && data[i+1] == 0) {
- // Handle LE vs BE on low byte text
- if (i+2 < offset+length && data[i+1] == 0 && data[i+2] == 0) {
- i++;
- }
- textStart = i+2;
- description = new String(data, descStart, i-descStart, encoding.encoding);
- break;
- }
- if (!encoding.doubleByte && data[i]==0) {
- textStart = i+1;
- description = new String(data, descStart, i-descStart, encoding.encoding);
- break;
- }
- }
-
- // Did we find the end?
- if (textStart > -1) {
- text = new String(data, textStart, offset+length-textStart, encoding.encoding);
- } else {
- // Assume everything is the text
- text = new String(data, descStart, offset+length-descStart, encoding.encoding);
- }
-
- // Return
- return new ID3Comment(lang, description, text);
- } catch (UnsupportedEncodingException e) {
- throw new RuntimeException(
- "Core encoding " + encoding.encoding + " is not available", e);
- }
- }
-
- /**
- * Returns the String at the given
- * offset and length. Strings are ISO-8859-1
- */
- protected static String getString(byte[] data, int offset, int length) {
- return new String(data, offset, length, ISO_8859_1);
- }
-
-
- /**
- * Iterates over id3v2 raw tags.
- * Create an instance of this that configures the
- * various length and multipliers.
- */
- protected class RawTagIterator implements Iterator<RawTag> {
- private int nameLength;
- private int sizeLength;
- private int sizeMultiplier;
- private int flagLength;
-
- private int offset = 0;
-
- protected RawTagIterator(
- int nameLength, int sizeLength, int sizeMultiplier,
- int flagLength) {
- this.nameLength = nameLength;
- this.sizeLength = sizeLength;
- this.sizeMultiplier = sizeMultiplier;
- this.flagLength = flagLength;
- }
-
- public boolean hasNext() {
- // Check for padding at the end
- return offset < data.length && data[offset] != 0;
- }
-
- public RawTag next() {
- RawTag tag = new RawTag(nameLength, sizeLength, sizeMultiplier,
- flagLength, data, offset);
- offset += tag.getSize();
- return tag;
- }
-
- public void remove() {
- }
-
- }
-
- protected static class RawTag {
- private int headerSize;
- protected String name;
- protected int flag;
- protected byte[] data;
-
- private RawTag(
- int nameLength, int sizeLength, int sizeMultiplier,
- int flagLength, byte[] frameData, int offset) {
- headerSize = nameLength + sizeLength + flagLength;
-
- // Name, normally 3 or 4 bytes
- name = getString(frameData, offset, nameLength);
-
- // Size
- int rawSize;
- if (sizeLength == 3) {
- rawSize = getInt3(frameData, offset+nameLength);
- } else {
- rawSize = getInt(frameData, offset+nameLength);
- }
- int size = rawSize * sizeMultiplier;
-
- // Flag
- if (flagLength > 0) {
- if (flagLength == 1) {
- flag = (int)frameData[offset+nameLength+sizeLength];
- } else {
- flag = getInt2(frameData, offset+nameLength+sizeLength);
- }
- }
-
- // Now data
- int copyFrom = offset+nameLength+sizeLength+flagLength;
- size = Math.max(0, Math.min(size, frameData.length-copyFrom)); // TIKA-1218, prevent negative size for malformed files.
- data = new byte[size];
- System.arraycopy(frameData, copyFrom, data, 0, size);
- }
-
- protected int getSize() {
- return headerSize + data.length;
- }
-
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Iterator;
+
+import org.apache.tika.parser.mp3.ID3Tags.ID3Comment;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+
+/**
+ * A frame of ID3v2 data, which is then passed to a handler to
+ * be turned into useful data.
+ */
+public class ID3v2Frame implements MP3Frame {
+ private int majorVersion;
+ private int minorVersion;
+ private int flags;
+ private int length;
+ /** Excludes the header size part */
+ private byte[] extendedHeader;
+ private byte[] data;
+
+ public int getMajorVersion() {
+ return majorVersion;
+ }
+
+ public int getMinorVersion() {
+ return minorVersion;
+ }
+
+ public int getFlags() {
+ return flags;
+ }
+
+ public int getLength() {
+ return length;
+ }
+
+ public byte[] getExtendedHeader() {
+ return extendedHeader;
+ }
+
+ public byte[] getData() {
+ return data;
+ }
+
+ /**
+ * Returns the next ID3v2 Frame in
+ * the file, or null if the next batch of data
+ * doesn't correspond to an ID3v2 header.
+ * If no ID3v2 frame could be detected and the passed in input stream is a
+ * {@code PushbackInputStream}, the bytes read so far are pushed back so
+ * that they can be read again.
+ * ID3v2 Frames should come before all Audio ones.
+ */
+ public static MP3Frame createFrameIfPresent(InputStream inp)
+ throws IOException {
+ int h1 = inp.read();
+ int h2 = inp.read();
+ int h3 = inp.read();
+
+ // Is it an ID3v2 Frame?
+ if (h1 == (int)'I' && h2 == (int)'D' && h3 == (int)'3') {
+ int majorVersion = inp.read();
+ int minorVersion = inp.read();
+ if (majorVersion == -1 || minorVersion == -1) {
+ pushBack(inp, h1, h2, h3, majorVersion, minorVersion);
+ return null;
+ }
+ return new ID3v2Frame(majorVersion, minorVersion, inp);
+ }
+
+ // Not a frame header
+ pushBack(inp, h1, h2, h3);
+ return null;
+ }
+
+ /**
+ * Pushes bytes back into the stream if possible. This method is called if
+ * no ID3v2 header could be found at the current stream position.
+ *
+ * @param inp the input stream
+ * @param bytes the bytes to be pushed back
+ * @throws IOException if an error occurs
+ */
+ private static void pushBack(InputStream inp, int... bytes)
+ throws IOException
+ {
+ if (inp instanceof PushbackInputStream)
+ {
+ byte[] buf = new byte[bytes.length];
+ for (int i = 0; i < bytes.length; i++)
+ {
+ buf[i] = (byte) bytes[i];
+ }
+ ((PushbackInputStream) inp).unread(buf);
+ }
+ }
+
+ private ID3v2Frame(int majorVersion, int minorVersion, InputStream inp)
+ throws IOException {
+ this.majorVersion = majorVersion;
+ this.minorVersion = minorVersion;
+
+ // Get the flags and the length
+ flags = inp.read();
+ length = get7BitsInt(readFully(inp, 4), 0);
+
+ // Do we have an extended header?
+ if ((flags & 0x02) == 0x02) {
+ int size = getInt(readFully(inp, 4));
+ extendedHeader = readFully(inp, size);
+ }
+
+ // Get the frame's data, or at least as much
+ // of it as is available
+ data = readFully(inp, length, false);
+ }
+
+ protected static int getInt(byte[] data) {
+ return getInt(data, 0);
+ }
+
+ protected static int getInt(byte[] data, int offset) {
+ int b0 = data[offset+0] & 0xFF;
+ int b1 = data[offset+1] & 0xFF;
+ int b2 = data[offset+2] & 0xFF;
+ int b3 = data[offset+3] & 0xFF;
+ return (b0 << 24) + (b1 << 16) + (b2 << 8) + (b3 << 0);
+ }
+
+ protected static int getInt3(byte[] data, int offset) {
+ int b0 = data[offset+0] & 0xFF;
+ int b1 = data[offset+1] & 0xFF;
+ int b2 = data[offset+2] & 0xFF;
+ return (b0 << 16) + (b1 << 8) + (b2 << 0);
+ }
+
+ protected static int getInt2(byte[] data, int offset) {
+ int b0 = data[offset+0] & 0xFF;
+ int b1 = data[offset+1] & 0xFF;
+ return (b0 << 8) + (b1 << 0);
+ }
+
+ /**
+ * AKA a Synchsafe integer.
+ * 4 bytes hold a 28 bit number. The highest
+ * bit in each byte is always 0 and always ignored.
+ */
+ protected static int get7BitsInt(byte[] data, int offset) {
+ int b0 = data[offset+0] & 0x7F;
+ int b1 = data[offset+1] & 0x7F;
+ int b2 = data[offset+2] & 0x7F;
+ int b3 = data[offset+3] & 0x7F;
+ return (b0 << 21) + (b1 << 14) + (b2 << 7) + (b3 << 0);
+ }
+
+ protected static byte[] readFully(InputStream inp, int length)
+ throws IOException {
+ return readFully(inp, length, true);
+ }
+ protected static byte[] readFully(InputStream inp, int length, boolean shortDataIsFatal)
+ throws IOException {
+ byte[] b = new byte[length];
+
+ int pos = 0;
+ int read;
+ while (pos < length) {
+ read = inp.read(b, pos, length-pos);
+ if (read == -1) {
+ if(shortDataIsFatal) {
+ throw new IOException("Tried to read " + length + " bytes, but only " + pos + " bytes present");
+ } else {
+ // Give them what we found
+ // TODO Log the short read
+ return b;
+ }
+ }
+ pos += read;
+ }
+
+ return b;
+ }
+
+ protected static class TextEncoding {
+ public final boolean doubleByte;
+ public final String encoding;
+ private TextEncoding(String encoding, boolean doubleByte) {
+ this.doubleByte = doubleByte;
+ this.encoding = encoding;
+ }
+ }
+ protected static final TextEncoding[] encodings = new TextEncoding[] {
+ new TextEncoding("ISO-8859-1", false),
+ new TextEncoding("UTF-16", true), // With BOM
+ new TextEncoding("UTF-16BE", true), // Without BOM
+ new TextEncoding("UTF-8", false)
+ };
+
+ /**
+ * Returns the (possibly null padded) String at the given offset and
+ * length. The string encoding flag, if present, is held in the first byte.
+ */
+ protected static String getTagString(byte[] data, int offset, int length) {
+ int actualLength = length;
+ if (actualLength == 0) {
+ return "";
+ }
+ if (actualLength == 1 && data[offset] == 0) {
+ return "";
+ }
+
+ // Does it have an encoding flag?
+ // Detect by the first byte being a valid index into encodings (0-3)
+ TextEncoding encoding = encodings[0];
+ byte maybeEncodingFlag = data[offset];
+ if (maybeEncodingFlag >= 0 && maybeEncodingFlag < encodings.length) {
+ offset++;
+ actualLength--;
+ encoding = encodings[maybeEncodingFlag];
+ }
+
+ // Trim off null termination / padding (as present)
+ while (encoding.doubleByte && actualLength >= 2 && data[offset+actualLength-1] == 0 && data[offset+actualLength-2] == 0) {
+ actualLength -= 2;
+ }
+ while (!encoding.doubleByte && actualLength >= 1 && data[offset+actualLength-1] == 0) {
+ actualLength--;
+ }
+ if (actualLength == 0) {
+ return "";
+ }
+
+ // TIKA-1024: If it's UTF-16 (with BOM) and all we
+ // have is a naked BOM then short-circuit here
+ // (return empty string), because new String(..)
+ // gives different results on different JVMs
+ if (encoding.encoding.equals("UTF-16") && actualLength == 2 &&
+ ((data[offset] == (byte) 0xff && data[offset+1] == (byte) 0xfe) ||
+ (data[offset] == (byte) 0xfe && data[offset+1] == (byte) 0xff))) {
+ return "";
+ }
+
+ try {
+ // Build the base string
+ return new String(data, offset, actualLength, encoding.encoding);
+ } catch (UnsupportedEncodingException e) {
+ throw new RuntimeException(
+ "Core encoding " + encoding.encoding + " is not available", e);
+ }
+ }
+ /**
+ * Builds up the ID3 comment, by parsing and extracting
+ * the comment string parts from the given data.
+ */
+ protected static ID3Comment getComment(byte[] data, int offset, int length) {
+ // Comments must have an encoding
+ int encodingFlag = data[offset];
+ if (encodingFlag >= 0 && encodingFlag < encodings.length) {
+ // Good, valid flag
+ } else {
+ // Invalid string
+ return null;
+ }
+
+ TextEncoding encoding = encodings[encodingFlag];
+
+ // First is a 3 byte language
+ String lang = getString(data, offset+1, 3);
+
+ // After that we have [Desc]\0(\0)[Text]
+ int descStart = offset+4;
+ int textStart = -1;
+ String description = null;
+ String text = null;
+
+ // Find where the description ends
+ try {
+ for (int i=descStart; i<offset+length; i++) {
+ if (encoding.doubleByte && data[i]==0 && data[i+1] == 0) {
+ // Handle LE vs BE on low byte text
+ if (i+2 < offset+length && data[i+1] == 0 && data[i+2] == 0) {
+ i++;
+ }
+ textStart = i+2;
+ description = new String(data, descStart, i-descStart, encoding.encoding);
+ break;
+ }
+ if (!encoding.doubleByte && data[i]==0) {
+ textStart = i+1;
+ description = new String(data, descStart, i-descStart, encoding.encoding);
+ break;
+ }
+ }
+
+ // Did we find the end?
+ if (textStart > -1) {
+ text = new String(data, textStart, offset+length-textStart, encoding.encoding);
+ } else {
+ // Assume everything is the text
+ text = new String(data, descStart, offset+length-descStart, encoding.encoding);
+ }
+
+ // Return
+ return new ID3Comment(lang, description, text);
+ } catch (UnsupportedEncodingException e) {
+ throw new RuntimeException(
+ "Core encoding " + encoding.encoding + " is not available", e);
+ }
+ }
+
+ /**
+ * Returns the String at the given
+ * offset and length. Strings are ISO-8859-1
+ */
+ protected static String getString(byte[] data, int offset, int length) {
+ return new String(data, offset, length, ISO_8859_1);
+ }
+
+
+ /**
+ * Iterates over id3v2 raw tags.
+ * Create an instance of this that configures the
+ * various length and multipliers.
+ */
+ protected class RawTagIterator implements Iterator<RawTag> {
+ private int nameLength;
+ private int sizeLength;
+ private int sizeMultiplier;
+ private int flagLength;
+
+ private int offset = 0;
+
+ protected RawTagIterator(
+ int nameLength, int sizeLength, int sizeMultiplier,
+ int flagLength) {
+ this.nameLength = nameLength;
+ this.sizeLength = sizeLength;
+ this.sizeMultiplier = sizeMultiplier;
+ this.flagLength = flagLength;
+ }
+
+ public boolean hasNext() {
+ // Check for padding at the end
+ return offset < data.length && data[offset] != 0;
+ }
+
+ public RawTag next() {
+ RawTag tag = new RawTag(nameLength, sizeLength, sizeMultiplier,
+ flagLength, data, offset);
+ offset += tag.getSize();
+ return tag;
+ }
+
+ public void remove() {
+ }
+
+ }
+
+ protected static class RawTag {
+ private int headerSize;
+ protected String name;
+ protected int flag;
+ protected byte[] data;
+
+ private RawTag(
+ int nameLength, int sizeLength, int sizeMultiplier,
+ int flagLength, byte[] frameData, int offset) {
+ headerSize = nameLength + sizeLength + flagLength;
+
+ // Name, normally 3 or 4 bytes
+ name = getString(frameData, offset, nameLength);
+
+ // Size
+ int rawSize;
+ if (sizeLength == 3) {
+ rawSize = getInt3(frameData, offset+nameLength);
+ } else {
+ rawSize = getInt(frameData, offset+nameLength);
+ }
+ int size = rawSize * sizeMultiplier;
+
+ // Flag
+ if (flagLength > 0) {
+ if (flagLength == 1) {
+ flag = (int)frameData[offset+nameLength+sizeLength];
+ } else {
+ flag = getInt2(frameData, offset+nameLength+sizeLength);
+ }
+ }
+
+ // Now data
+ int copyFrom = offset+nameLength+sizeLength+flagLength;
+ size = Math.max(0, Math.min(size, frameData.length-copyFrom)); // TIKA-1218, prevent negative size for malformed files.
+ data = new byte[size];
+ System.arraycopy(frameData, copyFrom, data, 0, size);
+ }
+
+ protected int getSize() {
+ return headerSize + data.length;
+ }
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java
index 54b9ae9..12d0f2d 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java
@@ -1,156 +1,156 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mp3;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.tika.exception.TikaException;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import static java.nio.charset.StandardCharsets.US_ASCII;
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-/**
- * This is used to parse Lyrics3 tag information
- * from an MP3 file, if available.
- * Handles lyrics tags of up to 10kb in size.
- * Will process any ID3v1 tag data if present.
- * Ignores extended ID3v1 data in the lyrics block
- *
- * @see <a href="http://www.id3.org/Lyrics3v2">Lyrics3 v2.0 specification</a>
- */
-public class LyricsHandler {
- boolean foundLyrics = false;
- String lyricsText = null;
- ID3v1Handler id3v1 = null;
-
- public LyricsHandler(InputStream stream, ContentHandler handler)
- throws IOException, SAXException, TikaException {
- this(getSuffix(stream, 10240+128));
- }
-
- /**
- * Looks for the Lyrics data, which will be
- * just before the ID3v1 data (if present),
- * and process it.
- * Also sets things up for the ID3v1
- * processing if required.
- * Creates from the last 128 bytes of a stream.
- */
- protected LyricsHandler(byte[] tagData)
- throws IOException, SAXException, TikaException {
- if(tagData.length < 128) {
- return;
- }
-
- // Is there ID3v1 data?
- byte[] last128 = new byte[128];
- System.arraycopy(tagData, tagData.length-128, last128, 0, 128);
- id3v1 = new ID3v1Handler(last128);
-
- if(tagData.length < 137) {
- return;
- }
-
- // Are there lyrics? Look for the closing Lyrics tag
- // at the end to decide if there is any
- int lookat = tagData.length - 9;
- if(id3v1.found) {
- lookat -= 128;
- }
- if(tagData[lookat+0] == 'L' && tagData[lookat+1] == 'Y' &&
- tagData[lookat+2] == 'R' && tagData[lookat+3] == 'I' &&
- tagData[lookat+4] == 'C' && tagData[lookat+5] == 'S' &&
- tagData[lookat+6] == '2' && tagData[lookat+7] == '0' &&
- tagData[lookat+8] == '0') {
- foundLyrics = true;
-
- // The length (6 bytes) comes just before LYRICS200, and is the
- // size including the LYRICSBEGIN but excluding the
- // length+LYRICS200 at the end.
- int length = Integer.parseInt(
- new String(tagData, lookat-6, 6, UTF_8)
- );
-
- String lyrics = new String(
- tagData, lookat-length+5, length-11,
- US_ASCII
- );
-
- // Tags are a 3 letter code, 5 digit length, then data
- int pos = 0;
- while(pos < lyrics.length()-8) {
- String tagName = lyrics.substring(pos, pos+3);
- int tagLen = Integer.parseInt(
- lyrics.substring(pos+3, pos+8)
- );
- int startPos = pos + 8;
- int endPos = startPos + tagLen;
-
- if(tagName.equals("LYR")) {
- lyricsText = lyrics.substring(startPos, endPos);
- }
-
- pos = endPos;
- }
- }
- }
-
- public boolean hasID3v1() {
- if(id3v1 == null || id3v1.found == false) {
- return false;
- }
- return true;
- }
- public boolean hasLyrics() {
- return lyricsText != null && lyricsText.length() > 0;
- }
-
- /**
- * Reads and returns the last <code>length</code> bytes from the
- * given stream.
- * @param stream input stream
- * @param length number of bytes from the end to read and return
- * @return stream the <code>InputStream</code> to read from.
- * @throws IOException if the stream could not be read from.
- */
- protected static byte[] getSuffix(InputStream stream, int length)
- throws IOException {
- byte[] buffer = new byte[2 * length];
- int bytesInBuffer = 0;
-
- int n = stream.read(buffer);
- while (n != -1) {
- bytesInBuffer += n;
- if (bytesInBuffer == buffer.length) {
- System.arraycopy(buffer, bytesInBuffer - length, buffer, 0, length);
- bytesInBuffer = length;
- }
- n = stream.read(buffer, bytesInBuffer, buffer.length - bytesInBuffer);
- }
-
- if (bytesInBuffer < length) {
- length = bytesInBuffer;
- }
-
- byte[] result = new byte[length];
- System.arraycopy(buffer, bytesInBuffer - length, result, 0, length);
- return result;
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * This is used to parse Lyrics3 tag information
+ * from an MP3 file, if available.
+ * Handles lyrics tags of up to 10kb in size.
+ * Will process any ID3v1 tag data if present.
+ * Ignores extended ID3v1 data in the lyrics block
+ *
+ * @see <a href="http://www.id3.org/Lyrics3v2">Lyrics3 v2.0 specification</a>
+ */
+public class LyricsHandler {
+ boolean foundLyrics = false;
+ String lyricsText = null;
+ ID3v1Handler id3v1 = null;
+
+ public LyricsHandler(InputStream stream, ContentHandler handler)
+ throws IOException, SAXException, TikaException {
+ this(getSuffix(stream, 10240+128));
+ }
+
+ /**
+ * Looks for the Lyrics data, which will be
+ * just before the ID3v1 data (if present),
+ * and process it.
+ * Also sets things up for the ID3v1
+ * processing if required.
+ * Creates from the trailing bytes of a stream; the last 128 hold any ID3v1 data.
+ */
+ protected LyricsHandler(byte[] tagData)
+ throws IOException, SAXException, TikaException {
+ if(tagData.length < 128) {
+ return;
+ }
+
+ // Is there ID3v1 data?
+ byte[] last128 = new byte[128];
+ System.arraycopy(tagData, tagData.length-128, last128, 0, 128);
+ id3v1 = new ID3v1Handler(last128);
+
+ if(tagData.length < 137) {
+ return;
+ }
+
+ // Are there lyrics? Look for the closing Lyrics tag
+ // at the end to decide if there are any
+ int lookat = tagData.length - 9;
+ if(id3v1.found) {
+ lookat -= 128;
+ }
+ if(tagData[lookat+0] == 'L' && tagData[lookat+1] == 'Y' &&
+ tagData[lookat+2] == 'R' && tagData[lookat+3] == 'I' &&
+ tagData[lookat+4] == 'C' && tagData[lookat+5] == 'S' &&
+ tagData[lookat+6] == '2' && tagData[lookat+7] == '0' &&
+ tagData[lookat+8] == '0') {
+ foundLyrics = true;
+
+ // The length (6 bytes) comes just before LYRICS200, and is the
+ // size including the LYRICSBEGIN but excluding the
+ // length+LYRICS200 at the end.
+ int length = Integer.parseInt(
+ new String(tagData, lookat-6, 6, UTF_8)
+ );
+
+ String lyrics = new String(
+ tagData, lookat-length+5, length-11,
+ US_ASCII
+ );
+
+ // Tags are a 3 letter code, 5 digit length, then data
+ int pos = 0;
+ while(pos < lyrics.length()-8) {
+ String tagName = lyrics.substring(pos, pos+3);
+ int tagLen = Integer.parseInt(
+ lyrics.substring(pos+3, pos+8)
+ );
+ int startPos = pos + 8;
+ int endPos = startPos + tagLen;
+
+ if(tagName.equals("LYR")) {
+ lyricsText = lyrics.substring(startPos, endPos);
+ }
+
+ pos = endPos;
+ }
+ }
+ }
+
+ public boolean hasID3v1() {
+ if(id3v1 == null || id3v1.found == false) {
+ return false;
+ }
+ return true;
+ }
+ public boolean hasLyrics() {
+ return lyricsText != null && lyricsText.length() > 0;
+ }
+
+ /**
+ * Reads and returns the last <code>length</code> bytes from the
+ * given stream.
+ * @param stream input stream
+ * @param length number of bytes from the end to read and return
+ * @return the last <code>length</code> bytes of the stream, or fewer if it is shorter
+ * @throws IOException if the stream could not be read from.
+ */
+ protected static byte[] getSuffix(InputStream stream, int length)
+ throws IOException {
+ byte[] buffer = new byte[2 * length];
+ int bytesInBuffer = 0;
+
+ int n = stream.read(buffer);
+ while (n != -1) {
+ bytesInBuffer += n;
+ if (bytesInBuffer == buffer.length) {
+ System.arraycopy(buffer, bytesInBuffer - length, buffer, 0, length);
+ bytesInBuffer = length;
+ }
+ n = stream.read(buffer, bytesInBuffer, buffer.length - bytesInBuffer);
+ }
+
+ if (bytesInBuffer < length) {
+ length = bytesInBuffer;
+ }
+
+ byte[] result = new byte[length];
+ System.arraycopy(buffer, bytesInBuffer - length, result, 0, length);
+ return result;
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java
index a88265f..923be8a 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java
@@ -1,25 +1,25 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mp3;
-
-
-/**
- * A frame in an MP3 file, such as ID3v2 Tags or some
- * audio.
- */
-public interface MP3Frame {
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+
+/**
+ * A frame in an MP3 file, such as ID3v2 Tags or some
+ * audio.
+ */
+public interface MP3Frame {
+}
[26/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java
index 226a447..97eaf46 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java
@@ -1,176 +1,176 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.accessor;
-
-import java.util.Arrays;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.assertion.ChmAssert;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.apache.tika.parser.chm.exception.ChmParsingException;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-/**
- * Description Note: not always exists An index chunk has the following format:
- * 0000: char[4] 'PMGI' 0004: DWORD Length of quickref/free area at end of
- * directory chunk 0008: Directory index entries (to quickref/free area) The
- * quickref area in an PMGI is the same as in an PMGL The format of a directory
- * index entry is as follows: BYTE: length of name BYTEs: name (UTF-8 encoded)
- * ENCINT: directory listing chunk which starts with name Encoded Integers aka
- * ENCINT An ENCINT is a variable-length integer. The high bit of each byte
- * indicates "continued to the next byte". Bytes are stored most significant to
- * least significant. So, for example, $EA $15 is (((0xEA&0x7F)<<7)|0x15) =
- * 0x3515.
- *
- * <p>
- * Note: This class is not in use
- *
- * {@link http://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original/?show-translation-form=1 }
- *
- *
- */
-public class ChmPmgiHeader implements ChmAccessor<ChmPmgiHeader> {
- private static final long serialVersionUID = -2092282339894303701L;
- private byte[] signature;
- private long free_space; /* 4 */
-
- /* local usage */
- private int dataRemained;
- private int currentPlace = 0;
-
- public ChmPmgiHeader() {
- signature = ChmConstants.CHM_PMGI_MARKER.getBytes(UTF_8); /* 0 (PMGI) */
- }
-
- private int getDataRemained() {
- return dataRemained;
- }
-
- private void setDataRemained(int dataRemained) {
- this.dataRemained = dataRemained;
- }
-
- private int getCurrentPlace() {
- return currentPlace;
- }
-
- private void setCurrentPlace(int currentPlace) {
- this.currentPlace = currentPlace;
- }
-
- private void unmarshalCharArray(byte[] data, ChmPmgiHeader chmPmgiHeader,
- int count) throws ChmParsingException {
- int index = -1;
- ChmAssert.assertByteArrayNotNull(data);
- ChmAssert.assertChmAccessorNotNull(chmPmgiHeader);
- ChmAssert.assertPositiveInt(count);
- this.setDataRemained(data.length);
- index = ChmCommons.indexOf(data,
- ChmConstants.CHM_PMGI_MARKER.getBytes(UTF_8));
-
- if (index >= 0)
- System.arraycopy(data, index, chmPmgiHeader.getSignature(), 0, count);
- else{
- //Some chm documents (actually most of them) do not contain
- //PMGI header, in this case, we just notice about it.
- }
- this.setCurrentPlace(this.getCurrentPlace() + count);
- this.setDataRemained(this.getDataRemained() - count);
- }
-
- private long unmarshalUInt32(byte[] data, long dest) throws ChmParsingException {
- ChmAssert.assertByteArrayNotNull(data);
-
- if (4 > getDataRemained())
- throw new ChmParsingException("4 > dataLenght");
- dest = (data[this.getCurrentPlace()] & 0xff)
- | (data[this.getCurrentPlace() + 1] & 0xff) << 8
- | (data[this.getCurrentPlace() + 2] & 0xff) << 16
- | (data[this.getCurrentPlace() + 3] & 0xff) << 24;
-
- setDataRemained(this.getDataRemained() - 4);
- this.setCurrentPlace(this.getCurrentPlace() + 4);
- return dest;
- }
-
- /**
- * Returns pmgi signature if exists
- *
- * @return signature
- */
- public byte[] getSignature() {
- return signature;
- }
-
- /**
- * Sets pmgi signature
- *
- * @param signature
- */
- protected void setSignature(byte[] signature) {
- this.signature = signature;
- }
-
- /**
- * Returns pmgi free space
- *
- * @return free_space
- */
- public long getFreeSpace() {
- return free_space;
- }
-
- /**
- * Sets pmgi free space
- *
- * @param free_space
- */
- protected void setFreeSpace(long free_space) {
- this.free_space = free_space;
- }
-
- /**
- * Returns textual representation of the pmgi header
- */
- public String toString() {
- StringBuilder sb = new StringBuilder();
- sb.append("signature:=" + new String(getSignature(), UTF_8) + ", ");
- sb.append("free space:=" + getFreeSpace()
- + System.getProperty("line.separator"));
- return sb.toString();
- }
-
- // @Override
- public void parse(byte[] data, ChmPmgiHeader chmPmgiHeader) throws TikaException {
- /* we only know how to deal with a 0x8 byte structures */
- if (data.length < ChmConstants.CHM_PMGI_LEN)
- throw new TikaException("we only know how to deal with a 0x8 byte structures");
-
- /* unmarshal fields */
- chmPmgiHeader.unmarshalCharArray(data, chmPmgiHeader, ChmConstants.CHM_SIGNATURE_LEN);
- chmPmgiHeader.setFreeSpace(chmPmgiHeader.unmarshalUInt32(data, chmPmgiHeader.getFreeSpace()));
-
- /* check structure */
- if (!Arrays.equals(chmPmgiHeader.getSignature(),
- ChmConstants.CHM_PMGI_MARKER.getBytes(UTF_8)))
- throw new TikaException(
- "it does not seem to be valid a PMGI signature, check ChmItsp index_root if it was -1, means no PMGI, use PMGL insted");
-
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.util.Arrays;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Describes the PMGI index chunk header. Note: this chunk does not always
+ * exist. An index chunk has the following format: 0000: char[4] 'PMGI'; 0004:
+ * DWORD length of quickref/free area at end of directory chunk; 0008: directory
+ * index entries (up to the quickref/free area). The quickref area in a PMGI is
+ * the same as in a PMGL. The format of a directory index entry is as follows:
+ * BYTE: length of name; BYTEs: name (UTF-8 encoded); ENCINT: directory listing
+ * chunk which starts with name. Encoded integers (ENCINT): an ENCINT is a
+ * variable-length integer. The high bit of each byte indicates "continued to
+ * the next byte". Bytes are stored most significant to least significant. So,
+ * for example, $EA $15 is (((0xEA&0x7F)<<7)|0x15) = 0x3515.
+ *
+ * <p>
+ * Note: This class is not in use
+ *
+ * See <a href="http://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original/?show-translation-form=1">the CHM format description</a>.
+ *
+ *
+ */
+public class ChmPmgiHeader implements ChmAccessor<ChmPmgiHeader> {
+ private static final long serialVersionUID = -2092282339894303701L;
+ private byte[] signature;
+ private long free_space; /* 4 */
+
+ /* local usage */
+ private int dataRemained;
+ private int currentPlace = 0;
+
+ public ChmPmgiHeader() {
+ signature = ChmConstants.CHM_PMGI_MARKER.getBytes(UTF_8); /* 0 (PMGI) */
+ }
+
+ private int getDataRemained() {
+ return dataRemained;
+ }
+
+ private void setDataRemained(int dataRemained) {
+ this.dataRemained = dataRemained;
+ }
+
+ private int getCurrentPlace() {
+ return currentPlace;
+ }
+
+ private void setCurrentPlace(int currentPlace) {
+ this.currentPlace = currentPlace;
+ }
+
+ private void unmarshalCharArray(byte[] data, ChmPmgiHeader chmPmgiHeader,
+ int count) throws ChmParsingException {
+ int index = -1;
+ ChmAssert.assertByteArrayNotNull(data);
+ ChmAssert.assertChmAccessorNotNull(chmPmgiHeader);
+ ChmAssert.assertPositiveInt(count);
+ this.setDataRemained(data.length);
+ index = ChmCommons.indexOf(data,
+ ChmConstants.CHM_PMGI_MARKER.getBytes(UTF_8));
+
+ if (index >= 0)
+ System.arraycopy(data, index, chmPmgiHeader.getSignature(), 0, count);
+ else{
+ //Some chm documents (actually most of them) do not contain
+ //PMGI header, in this case, we just notice about it.
+ }
+ this.setCurrentPlace(this.getCurrentPlace() + count);
+ this.setDataRemained(this.getDataRemained() - count);
+ }
+
+ private long unmarshalUInt32(byte[] data, long dest) throws ChmParsingException {
+ ChmAssert.assertByteArrayNotNull(data);
+
+ if (4 > getDataRemained())
+ throw new ChmParsingException("4 > dataLenght");
+ dest = (data[this.getCurrentPlace()] & 0xff)
+ | (data[this.getCurrentPlace() + 1] & 0xff) << 8
+ | (data[this.getCurrentPlace() + 2] & 0xff) << 16
+ | (data[this.getCurrentPlace() + 3] & 0xff) << 24;
+
+ setDataRemained(this.getDataRemained() - 4);
+ this.setCurrentPlace(this.getCurrentPlace() + 4);
+ return dest;
+ }
+
+ /**
+ * Returns pmgi signature if exists
+ *
+ * @return signature
+ */
+ public byte[] getSignature() {
+ return signature;
+ }
+
+ /**
+ * Sets pmgi signature
+ *
+ * @param signature
+ */
+ protected void setSignature(byte[] signature) {
+ this.signature = signature;
+ }
+
+ /**
+ * Returns pmgi free space
+ *
+ * @return free_space
+ */
+ public long getFreeSpace() {
+ return free_space;
+ }
+
+ /**
+ * Sets pmgi free space
+ *
+ * @param free_space
+ */
+ protected void setFreeSpace(long free_space) {
+ this.free_space = free_space;
+ }
+
+ /**
+ * Returns textual representation of the pmgi header
+ */
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("signature:=" + new String(getSignature(), UTF_8) + ", ");
+ sb.append("free space:=" + getFreeSpace()
+ + System.getProperty("line.separator"));
+ return sb.toString();
+ }
+
+ // @Override
+ public void parse(byte[] data, ChmPmgiHeader chmPmgiHeader) throws TikaException {
+ /* we only know how to deal with a 0x8 byte structures */
+ if (data.length < ChmConstants.CHM_PMGI_LEN)
+ throw new TikaException("we only know how to deal with a 0x8 byte structures");
+
+ /* unmarshal fields */
+ chmPmgiHeader.unmarshalCharArray(data, chmPmgiHeader, ChmConstants.CHM_SIGNATURE_LEN);
+ chmPmgiHeader.setFreeSpace(chmPmgiHeader.unmarshalUInt32(data, chmPmgiHeader.getFreeSpace()));
+
+ /* check structure */
+ if (!Arrays.equals(chmPmgiHeader.getSignature(),
+ ChmConstants.CHM_PMGI_MARKER.getBytes(UTF_8)))
+ throw new TikaException(
+ "it does not seem to be valid a PMGI signature, check ChmItsp index_root if it was -1, means no PMGI, use PMGL insted");
+
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java
index 7c8a5cd..abb7175 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java
@@ -1,206 +1,206 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.accessor;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.assertion.ChmAssert;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.apache.tika.parser.chm.exception.ChmParsingException;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-/**
- * Description There are two types of directory chunks -- index chunks, and
- * listing chunks. The index chunk will be omitted if there is only one listing
- * chunk. A listing chunk has the following format: 0000: char[4] 'PMGL' 0004:
- * DWORD Length of free space and/or quickref area at end of directory chunk
- * 0008: DWORD Always 0 000C: DWORD Chunk number of previous listing chunk when
- * reading directory in sequence (-1 if this is the first listing chunk) 0010:
- * DWORD Chunk number of next listing chunk when reading directory in sequence
- * (-1 if this is the last listing chunk) 0014: Directory listing entries (to
- * quickref area) Sorted by filename; the sort is case-insensitive The quickref
- * area is written backwards from the end of the chunk. One quickref entry
- * exists for every n entries in the file, where n is calculated as 1 + (1 <<
- * quickref density). So for density = 2, n = 5 Chunklen-0002: WORD Number of
- * entries in the chunk Chunklen-0004: WORD Offset of entry n from entry 0
- * Chunklen-0008: WORD Offset of entry 2n from entry 0 Chunklen-000C: WORD
- * Offset of entry 3n from entry 0 ... The format of a directory listing entry
- * is as follows BYTE: length of name BYTEs: name (UTF-8 encoded) ENCINT:
- * content section ENCINT: offset ENCINT: length The offset is from the
- * beginning of the content section the file is in, after the section has been
- * decompressed (if appropriate). The length also refers to length of the file
- * in the section after decompression. There are two kinds of file represented
- * in the directory: user data and format related files. The files which are
- * format-related have names which begin with '::', the user data files have
- * names which begin with "/".
- *
- * {@link http
- * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original
- * /?show-translation-form=1 }
- *
- * @author olegt
- *
- */
-public class ChmPmglHeader implements ChmAccessor<ChmPmglHeader> {
- private static final long serialVersionUID = -6139486487475923593L;
- private byte[] signature;
- private long free_space; /* 4 */
- private long unknown_0008; /* 8 */
- private int block_prev; /* c */
- private int block_next; /* 10 */
-
- /* local usage */
- private int dataRemained;
- private int currentPlace = 0;
-
- public ChmPmglHeader() {
- signature = ChmConstants.PMGL.getBytes(UTF_8); /*
- * 0
- * (PMGL
- * )
- */
- }
-
- private int getDataRemained() {
- return dataRemained;
- }
-
- private void setDataRemained(int dataRemained) {
- this.dataRemained = dataRemained;
- }
-
- private int getCurrentPlace() {
- return currentPlace;
- }
-
- private void setCurrentPlace(int currentPlace) {
- this.currentPlace = currentPlace;
- }
-
- public long getFreeSpace() {
- return free_space;
- }
-
- public void setFreeSpace(long free_space) throws TikaException {
- if (free_space < 0) {
- throw new TikaException("Bad PMGLheader.FreeSpace="+free_space);
- }
- this.free_space = free_space;
- }
-
- public String toString() {
- StringBuilder sb = new StringBuilder();
- sb.append("signatute:=" + new String(getSignature(), UTF_8) + ", ");
- sb.append("free space:=" + getFreeSpace() + ", ");
- sb.append("unknown0008:=" + getUnknown0008() + ", ");
- sb.append("prev block:=" + getBlockPrev() + ", ");
- sb.append("next block:=" + getBlockNext()
- + System.getProperty("line.separator"));
- return sb.toString();
- }
-
- protected void unmarshalCharArray(byte[] data, ChmPmglHeader chmPmglHeader,
- int count) throws TikaException {
- ChmAssert.assertByteArrayNotNull(data);
- this.setDataRemained(data.length);
- System.arraycopy(data, 0, chmPmglHeader.signature, 0, count);
- this.setCurrentPlace(this.getCurrentPlace() + count);
- this.setDataRemained(this.getDataRemained() - count);
- }
-
- private int unmarshalInt32(byte[] data) throws TikaException {
- ChmAssert.assertByteArrayNotNull(data);
- int dest;
- if (4 > this.getDataRemained())
- throw new TikaException("4 > dataLenght");
- dest = (data[this.getCurrentPlace()] & 0xff)
- | (data[this.getCurrentPlace() + 1] & 0xff) << 8
- | (data[this.getCurrentPlace() + 2] & 0xff) << 16
- | (data[this.getCurrentPlace() + 3] & 0xff) << 24;
-
- this.setCurrentPlace(this.getCurrentPlace() + 4);
- this.setDataRemained(this.getDataRemained() - 4);
- return dest;
- }
-
- private long unmarshalUInt32(byte[] data) throws ChmParsingException {
- ChmAssert.assertByteArrayNotNull(data);
- long dest;
- if (4 > getDataRemained())
- throw new ChmParsingException("4 > dataLenght");
- dest = (data[this.getCurrentPlace()] & 0xff)
- | (data[this.getCurrentPlace() + 1] & 0xff) << 8
- | (data[this.getCurrentPlace() + 2] & 0xff) << 16
- | (data[this.getCurrentPlace() + 3] & 0xff) << 24;
-
- setDataRemained(this.getDataRemained() - 4);
- this.setCurrentPlace(this.getCurrentPlace() + 4);
- return dest;
- }
-
- // @Override
- public void parse(byte[] data, ChmPmglHeader chmPmglHeader) throws TikaException {
- if (data.length < ChmConstants.CHM_PMGL_LEN)
- throw new TikaException(ChmPmglHeader.class.getName()
- + " we only know how to deal with a 0x14 byte structures");
-
- /* unmarshal fields */
- chmPmglHeader.unmarshalCharArray(data, chmPmglHeader,
- ChmConstants.CHM_SIGNATURE_LEN);
- chmPmglHeader.setFreeSpace(chmPmglHeader.unmarshalUInt32(data));
- chmPmglHeader.setUnknown0008(chmPmglHeader.unmarshalUInt32(data));
- chmPmglHeader.setBlockPrev(chmPmglHeader.unmarshalInt32(data));
- chmPmglHeader.setBlockNext(chmPmglHeader.unmarshalInt32(data));
-
- /* check structure */
- if (!new String(chmPmglHeader.getSignature(), UTF_8).equals(ChmConstants.PMGL))
- throw new ChmParsingException(ChmPmglHeader.class.getName()
- + " pmgl != pmgl.signature");
- }
-
- public byte[] getSignature() {
- return signature;
- }
-
- protected void setSignature(byte[] signature) {
- this.signature = signature;
- }
-
- public long getUnknown0008() {
- return unknown_0008;
- }
-
- protected void setUnknown0008(long unknown_0008) {
- this.unknown_0008 = unknown_0008;
- }
-
- public int getBlockPrev() {
- return block_prev;
- }
-
- protected void setBlockPrev(int block_prev) {
- this.block_prev = block_prev;
- }
-
- public int getBlockNext() {
- return block_next;
- }
-
- protected void setBlockNext(int block_next) {
- this.block_next = block_next;
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Description There are two types of directory chunks -- index chunks, and
+ * listing chunks. The index chunk will be omitted if there is only one listing
+ * chunk. A listing chunk has the following format: 0000: char[4] 'PMGL' 0004:
+ * DWORD Length of free space and/or quickref area at end of directory chunk
+ * 0008: DWORD Always 0 000C: DWORD Chunk number of previous listing chunk when
+ * reading directory in sequence (-1 if this is the first listing chunk) 0010:
+ * DWORD Chunk number of next listing chunk when reading directory in sequence
+ * (-1 if this is the last listing chunk) 0014: Directory listing entries (to
+ * quickref area) Sorted by filename; the sort is case-insensitive The quickref
+ * area is written backwards from the end of the chunk. One quickref entry
+ * exists for every n entries in the file, where n is calculated as 1 + (1 <<
+ * quickref density). So for density = 2, n = 5 Chunklen-0002: WORD Number of
+ * entries in the chunk Chunklen-0004: WORD Offset of entry n from entry 0
+ * Chunklen-0008: WORD Offset of entry 2n from entry 0 Chunklen-000C: WORD
+ * Offset of entry 3n from entry 0 ... The format of a directory listing entry
+ * is as follows BYTE: length of name BYTEs: name (UTF-8 encoded) ENCINT:
+ * content section ENCINT: offset ENCINT: length The offset is from the
+ * beginning of the content section the file is in, after the section has been
+ * decompressed (if appropriate). The length also refers to length of the file
+ * in the section after decompression. There are two kinds of file represented
+ * in the directory: user data and format related files. The files which are
+ * format-related have names which begin with '::', the user data files have
+ * names which begin with "/".
+ *
+ * See <a
+ * href="http://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original/?show-translation-form=1">the
+ * CHM format description</a>.
+ *
+ * @author olegt
+ *
+ */
+public class ChmPmglHeader implements ChmAccessor<ChmPmglHeader> {
+ private static final long serialVersionUID = -6139486487475923593L;
+ private byte[] signature;
+ private long free_space; /* 4 */
+ private long unknown_0008; /* 8 */
+ private int block_prev; /* c */
+ private int block_next; /* 10 */
+
+ /* local usage */
+ private int dataRemained;
+ private int currentPlace = 0;
+
+ public ChmPmglHeader() {
+ signature = ChmConstants.PMGL.getBytes(UTF_8); /*
+ * 0
+ * (PMGL
+ * )
+ */
+ }
+
+ private int getDataRemained() {
+ return dataRemained;
+ }
+
+ private void setDataRemained(int dataRemained) {
+ this.dataRemained = dataRemained;
+ }
+
+ private int getCurrentPlace() {
+ return currentPlace;
+ }
+
+ private void setCurrentPlace(int currentPlace) {
+ this.currentPlace = currentPlace;
+ }
+
+ public long getFreeSpace() {
+ return free_space;
+ }
+
+ public void setFreeSpace(long free_space) throws TikaException {
+ if (free_space < 0) {
+ throw new TikaException("Bad PMGLheader.FreeSpace="+free_space);
+ }
+ this.free_space = free_space;
+ }
+
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("signatute:=" + new String(getSignature(), UTF_8) + ", ");
+ sb.append("free space:=" + getFreeSpace() + ", ");
+ sb.append("unknown0008:=" + getUnknown0008() + ", ");
+ sb.append("prev block:=" + getBlockPrev() + ", ");
+ sb.append("next block:=" + getBlockNext()
+ + System.getProperty("line.separator"));
+ return sb.toString();
+ }
+
+ protected void unmarshalCharArray(byte[] data, ChmPmglHeader chmPmglHeader,
+ int count) throws TikaException {
+ ChmAssert.assertByteArrayNotNull(data);
+ this.setDataRemained(data.length);
+ System.arraycopy(data, 0, chmPmglHeader.signature, 0, count);
+ this.setCurrentPlace(this.getCurrentPlace() + count);
+ this.setDataRemained(this.getDataRemained() - count);
+ }
+
+ private int unmarshalInt32(byte[] data) throws TikaException {
+ ChmAssert.assertByteArrayNotNull(data);
+ int dest;
+ if (4 > this.getDataRemained())
+ throw new TikaException("4 > dataLenght");
+ dest = (data[this.getCurrentPlace()] & 0xff)
+ | (data[this.getCurrentPlace() + 1] & 0xff) << 8
+ | (data[this.getCurrentPlace() + 2] & 0xff) << 16
+ | (data[this.getCurrentPlace() + 3] & 0xff) << 24;
+
+ this.setCurrentPlace(this.getCurrentPlace() + 4);
+ this.setDataRemained(this.getDataRemained() - 4);
+ return dest;
+ }
+
+ private long unmarshalUInt32(byte[] data) throws ChmParsingException {
+ ChmAssert.assertByteArrayNotNull(data);
+ long dest;
+ if (4 > getDataRemained())
+ throw new ChmParsingException("4 > dataLenght");
+ dest = (data[this.getCurrentPlace()] & 0xff)
+ | (data[this.getCurrentPlace() + 1] & 0xff) << 8
+ | (data[this.getCurrentPlace() + 2] & 0xff) << 16
+ | (data[this.getCurrentPlace() + 3] & 0xff) << 24;
+
+ setDataRemained(this.getDataRemained() - 4);
+ this.setCurrentPlace(this.getCurrentPlace() + 4);
+ return dest;
+ }
+
+ // @Override
+ public void parse(byte[] data, ChmPmglHeader chmPmglHeader) throws TikaException {
+ if (data.length < ChmConstants.CHM_PMGL_LEN)
+ throw new TikaException(ChmPmglHeader.class.getName()
+ + " we only know how to deal with a 0x14 byte structures");
+
+ /* unmarshal fields */
+ chmPmglHeader.unmarshalCharArray(data, chmPmglHeader,
+ ChmConstants.CHM_SIGNATURE_LEN);
+ chmPmglHeader.setFreeSpace(chmPmglHeader.unmarshalUInt32(data));
+ chmPmglHeader.setUnknown0008(chmPmglHeader.unmarshalUInt32(data));
+ chmPmglHeader.setBlockPrev(chmPmglHeader.unmarshalInt32(data));
+ chmPmglHeader.setBlockNext(chmPmglHeader.unmarshalInt32(data));
+
+ /* check structure */
+ if (!new String(chmPmglHeader.getSignature(), UTF_8).equals(ChmConstants.PMGL))
+ throw new ChmParsingException(ChmPmglHeader.class.getName()
+ + " pmgl != pmgl.signature");
+ }
+
+ public byte[] getSignature() {
+ return signature;
+ }
+
+ protected void setSignature(byte[] signature) {
+ this.signature = signature;
+ }
+
+ public long getUnknown0008() {
+ return unknown_0008;
+ }
+
+ protected void setUnknown0008(long unknown_0008) {
+ this.unknown_0008 = unknown_0008;
+ }
+
+ public int getBlockPrev() {
+ return block_prev;
+ }
+
+ protected void setBlockPrev(int block_prev) {
+ this.block_prev = block_prev;
+ }
+
+ public int getBlockNext() {
+ return block_next;
+ }
+
+ protected void setBlockNext(int block_next) {
+ this.block_next = block_next;
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/DirectoryListingEntry.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/DirectoryListingEntry.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/DirectoryListingEntry.java
index 05aa411..c413e07 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/DirectoryListingEntry.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/DirectoryListingEntry.java
@@ -1,151 +1,151 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.accessor;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.assertion.ChmAssert;
-import org.apache.tika.parser.chm.core.ChmCommons;
-
-/**
- * The format of a directory listing entry is as follows: BYTE: length of name
- * BYTEs: name (UTF-8 encoded) ENCINT: content section ENCINT: offset ENCINT:
- * length The offset is from the beginning of the content section the file is
- * in, after the section has been decompressed (if appropriate). The length also
- * refers to length of the file in the section after decompression. There are
- * two kinds of file represented in the directory: user data and format related
- * files. The files which are format-related have names which begin with '::',
- * the user data files have names which begin with "/".
- *
- */
-public class DirectoryListingEntry {
- /* Length of the entry name */
- private int name_length;
- /* Entry name or directory name */
- private String name;
- /* Entry type */
- private ChmCommons.EntryType entryType;
- /* Entry offset */
- private int offset;
- /* Entry size */
- private int length;
-
- public DirectoryListingEntry() {
-
- }
-
- /**
- * Constructs directoryListingEntry
- *
- * @param name_length
- * int
- * @param name
- * String
- * @param isCompressed
- * ChmCommons.EntryType
- * @param offset
- * int
- * @param length
- * int
- * @throws TikaException
- */
- public DirectoryListingEntry(int name_length, String name,
- ChmCommons.EntryType isCompressed, int offset, int length) throws TikaException {
- ChmAssert.assertDirectoryListingEntry(name_length, name, isCompressed, offset, length);
- setNameLength(name_length);
- setName(name);
- setEntryType(isCompressed);
- setOffset(offset);
- setLength(length);
- }
-
- public String toString() {
- StringBuilder sb = new StringBuilder();
- sb.append("name_length:=" + getNameLength() + System.getProperty("line.separator"));
- sb.append("name:=" + getName() + System.getProperty("line.separator"));
- sb.append("entryType:=" + getEntryType() + System.getProperty("line.separator"));
- sb.append("offset:=" + getOffset() + System.getProperty("line.separator"));
- sb.append("length:=" + getLength());
- return sb.toString();
- }
-
- /**
- * Returns an entry name length
- *
- * @return int
- */
- public int getNameLength() {
- return name_length;
- }
-
- /**
- * Sets an entry name length
- *
- * @param name_length
- * int
- */
- protected void setNameLength(int name_length) {
- this.name_length = name_length;
- }
-
- /**
- * Returns an entry name
- *
- * @return String
- */
- public String getName() {
- return name;
- }
-
- /**
- * Sets entry name
- *
- * @param name
- * String
- */
- protected void setName(String name) {
- this.name = name;
- }
-
- /**
- * Returns ChmCommons.EntryType (COMPRESSED or UNCOMPRESSED)
- *
- * @return ChmCommons.EntryType
- */
- public ChmCommons.EntryType getEntryType() {
- return entryType;
- }
-
- protected void setEntryType(ChmCommons.EntryType entryType) {
- this.entryType = entryType;
- }
-
- public int getOffset() {
- return offset;
- }
-
- protected void setOffset(int offset) {
- this.offset = offset;
- }
-
- public int getLength() {
- return length;
- }
-
- protected void setLength(int length) {
- this.length = length;
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmCommons;
+
+/**
+ * The format of a directory listing entry is as follows: BYTE: length of name
+ * BYTEs: name (UTF-8 encoded) ENCINT: content section ENCINT: offset ENCINT:
+ * length The offset is from the beginning of the content section the file is
+ * in, after the section has been decompressed (if appropriate). The length also
+ * refers to length of the file in the section after decompression. There are
+ * two kinds of file represented in the directory: user data and format related
+ * files. The files which are format-related have names which begin with '::',
+ * the user data files have names which begin with "/".
+ *
+ */
+public class DirectoryListingEntry {
+ /* Length of the entry name */
+ private int name_length;
+ /* Entry name or directory name */
+ private String name;
+ /* Entry type */
+ private ChmCommons.EntryType entryType;
+ /* Entry offset */
+ private int offset;
+ /* Entry size */
+ private int length;
+
+ public DirectoryListingEntry() {
+
+ }
+
+ /**
+ * Constructs directoryListingEntry
+ *
+ * @param name_length
+ * int
+ * @param name
+ * String
+ * @param isCompressed
+ * ChmCommons.EntryType
+ * @param offset
+ * int
+ * @param length
+ * int
+ * @throws TikaException
+ */
+ public DirectoryListingEntry(int name_length, String name,
+ ChmCommons.EntryType isCompressed, int offset, int length) throws TikaException {
+ ChmAssert.assertDirectoryListingEntry(name_length, name, isCompressed, offset, length);
+ setNameLength(name_length);
+ setName(name);
+ setEntryType(isCompressed);
+ setOffset(offset);
+ setLength(length);
+ }
+
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("name_length:=" + getNameLength() + System.getProperty("line.separator"));
+ sb.append("name:=" + getName() + System.getProperty("line.separator"));
+ sb.append("entryType:=" + getEntryType() + System.getProperty("line.separator"));
+ sb.append("offset:=" + getOffset() + System.getProperty("line.separator"));
+ sb.append("length:=" + getLength());
+ return sb.toString();
+ }
+
+ /**
+ * Returns an entry name length
+ *
+ * @return int
+ */
+ public int getNameLength() {
+ return name_length;
+ }
+
+ /**
+ * Sets an entry name length
+ *
+ * @param name_length
+ * int
+ */
+ protected void setNameLength(int name_length) {
+ this.name_length = name_length;
+ }
+
+ /**
+ * Returns an entry name
+ *
+ * @return String
+ */
+ public String getName() {
+ return name;
+ }
+
+ /**
+ * Sets entry name
+ *
+ * @param name
+ * String
+ */
+ protected void setName(String name) {
+ this.name = name;
+ }
+
+ /**
+ * Returns ChmCommons.EntryType (COMPRESSED or UNCOMPRESSED)
+ *
+ * @return ChmCommons.EntryType
+ */
+ public ChmCommons.EntryType getEntryType() {
+ return entryType;
+ }
+
+ protected void setEntryType(ChmCommons.EntryType entryType) {
+ this.entryType = entryType;
+ }
+
+ public int getOffset() {
+ return offset;
+ }
+
+ protected void setOffset(int offset) {
+ this.offset = offset;
+ }
+
+ public int getLength() {
+ return length;
+ }
+
+ protected void setLength(int length) {
+ this.length = length;
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/assertion/ChmAssert.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/assertion/ChmAssert.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/assertion/ChmAssert.java
index a332690..cdedc3e 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/assertion/ChmAssert.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/assertion/ChmAssert.java
@@ -1,169 +1,169 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.assertion;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.accessor.ChmAccessor;
-import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.exception.ChmParsingException;
-
-/**
- * Contains chm extractor assertions
- */
-public class ChmAssert {
- /**
- * Checks a validity of the chmBlockSegment parameters
- *
- * @param data
- * byte[]
- * @param resetTable
- * ChmLzxcResetTable
- * @param blockNumber
- * int
- * @param lzxcBlockOffset
- * int
- * @param lzxcBlockLength
- * int
- * @throws TikaException
- */
- public static final void assertChmBlockSegment(byte[] data,
- ChmLzxcResetTable resetTable, int blockNumber, int lzxcBlockOffset,
- int lzxcBlockLength) throws TikaException {
- if ((data == null))
- throw new TikaException("data[] is null");
-
- if ((data.length <= 0))
- throw new TikaException("data[] length should be greater than zero");
-
- if (resetTable == null)
- throw new TikaException("resetTable is null");
-
- if (resetTable.getBlockAddress().length <= 1)
- throw new TikaException("resetTable.getBlockAddress().length should be greater than zero");
-
- if (blockNumber < 0)
- throw new TikaException("blockNumber should be positive number");
-
- if (lzxcBlockOffset < 0)
- throw new TikaException("lzxcBlockOffset should be positive number");
-
- if (lzxcBlockLength < 0)
- throw new TikaException("lzxcBlockLength should be positive number");
- }
-
- /**
- * Checks if InputStream is not null
- *
- * @param is
- * InputStream
- * @throws ChmParsingException
- * @throws IOException
- */
- public static final void assertInputStreamNotNull(InputStream is) throws IOException {
- if (is == null)
- throw new IOException("input sream is null");
- }
-
- /**
- * Checks validity of ChmAccessor parameters
- *
- * @param data
- * @param chmItsfHeader
- * @param count
- * @throws ChmParsingException
- */
- public static final void assertChmAccessorParameters(byte[] data,
- ChmAccessor<?> chmAccessor, int count) throws ChmParsingException {
- assertByteArrayNotNull(data);
- assertChmAccessorNotNull(chmAccessor);
- }
-
- /**
- * Checks if byte[] is not null
- *
- * @param data
- * @throws ChmParsingException
- */
- public static final void assertByteArrayNotNull(byte[] data) throws ChmParsingException {
- if (data == null)
- throw new ChmParsingException("byte[] data is null");
- }
-
- /**
- * Checks if ChmAccessor is not null In case of null throws exception
- *
- * @param ChmAccessor
- * @throws ChmParsingException
- */
- public static final void assertChmAccessorNotNull(ChmAccessor<?> chmAccessor) throws ChmParsingException {
- if (chmAccessor == null)
- throw new ChmParsingException("chm header is null");
- }
-
- /**
- * Checks validity of the DirectoryListingEntry's parameters In case of
- * invalid parameter(s) throws an exception
- *
- * @param name_length
- * length of the chm entry name
- * @param name
- * chm entry name
- * @param entryType
- * EntryType
- * @param offset
- * @param length
- * @throws ChmParsingException
- */
- public static final void assertDirectoryListingEntry(int name_length,
- String name, ChmCommons.EntryType entryType, int offset, int length) throws ChmParsingException {
- if (name_length < 0)
- throw new ChmParsingException("invalid name length");
- if (name == null)
- throw new ChmParsingException("invalid name");
-
- if ((entryType != ChmCommons.EntryType.COMPRESSED)
- && (entryType != ChmCommons.EntryType.UNCOMPRESSED))
- throw new ChmParsingException("invalid compressed type, should be EntryType.COMPRESSED | EntryType.UNCOMPRESSED");
-
- if (offset < 0)
- throw new ChmParsingException("invalid offset");
-
- if (length < 0)
- throw new ChmParsingException("invalid length");
- }
-
- public static void assertCopyingDataIndex(int index, int dataLength) throws ChmParsingException {
- if (index >= dataLength)
- throw new ChmParsingException("cannot parse chm file index > data.length");
- }
-
- /**
- * Checks if int param is greater than zero In case param <=0 throws an
- * exception
- *
- * @param param
- * @throws ChmParsingException
- */
- public static void assertPositiveInt(int param) throws ChmParsingException {
- if (param <= 0)
- throw new ChmParsingException("resetTable.getBlockAddress().length should be greater than zero");
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.assertion;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.accessor.ChmAccessor;
+import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * Contains chm extractor assertions
+ */
+public class ChmAssert {
+ /**
+ * Checks a validity of the chmBlockSegment parameters
+ *
+ * @param data
+ * byte[]
+ * @param resetTable
+ * ChmLzxcResetTable
+ * @param blockNumber
+ * int
+ * @param lzxcBlockOffset
+ * int
+ * @param lzxcBlockLength
+ * int
+ * @throws TikaException
+ */
+ public static final void assertChmBlockSegment(byte[] data,
+ ChmLzxcResetTable resetTable, int blockNumber, int lzxcBlockOffset,
+ int lzxcBlockLength) throws TikaException {
+ if ((data == null))
+ throw new TikaException("data[] is null");
+
+ if ((data.length <= 0))
+ throw new TikaException("data[] length should be greater than zero");
+
+ if (resetTable == null)
+ throw new TikaException("resetTable is null");
+
+ if (resetTable.getBlockAddress().length <= 1)
+ throw new TikaException("resetTable.getBlockAddress().length should be greater than zero");
+
+ if (blockNumber < 0)
+ throw new TikaException("blockNumber should be positive number");
+
+ if (lzxcBlockOffset < 0)
+ throw new TikaException("lzxcBlockOffset should be positive number");
+
+ if (lzxcBlockLength < 0)
+ throw new TikaException("lzxcBlockLength should be positive number");
+ }
+
+ /**
+ * Checks if InputStream is not null
+ *
+ * @param is
+ * InputStream
+ * @throws ChmParsingException
+ * @throws IOException
+ */
+ public static final void assertInputStreamNotNull(InputStream is) throws IOException {
+ if (is == null)
+ throw new IOException("input sream is null");
+ }
+
+ /**
+ * Checks validity of ChmAccessor parameters
+ *
+ * @param data
+ * @param chmItsfHeader
+ * @param count
+ * @throws ChmParsingException
+ */
+ public static final void assertChmAccessorParameters(byte[] data,
+ ChmAccessor<?> chmAccessor, int count) throws ChmParsingException {
+ assertByteArrayNotNull(data);
+ assertChmAccessorNotNull(chmAccessor);
+ }
+
+ /**
+ * Checks if byte[] is not null
+ *
+ * @param data
+ * @throws ChmParsingException
+ */
+ public static final void assertByteArrayNotNull(byte[] data) throws ChmParsingException {
+ if (data == null)
+ throw new ChmParsingException("byte[] data is null");
+ }
+
+ /**
+ * Checks if ChmAccessor is not null In case of null throws exception
+ *
+ * @param ChmAccessor
+ * @throws ChmParsingException
+ */
+ public static final void assertChmAccessorNotNull(ChmAccessor<?> chmAccessor) throws ChmParsingException {
+ if (chmAccessor == null)
+ throw new ChmParsingException("chm header is null");
+ }
+
+ /**
+ * Checks validity of the DirectoryListingEntry's parameters In case of
+ * invalid parameter(s) throws an exception
+ *
+ * @param name_length
+ * length of the chm entry name
+ * @param name
+ * chm entry name
+ * @param entryType
+ * EntryType
+ * @param offset
+ * @param length
+ * @throws ChmParsingException
+ */
+ public static final void assertDirectoryListingEntry(int name_length,
+ String name, ChmCommons.EntryType entryType, int offset, int length) throws ChmParsingException {
+ if (name_length < 0)
+ throw new ChmParsingException("invalid name length");
+ if (name == null)
+ throw new ChmParsingException("invalid name");
+
+ if ((entryType != ChmCommons.EntryType.COMPRESSED)
+ && (entryType != ChmCommons.EntryType.UNCOMPRESSED))
+ throw new ChmParsingException("invalid compressed type, should be EntryType.COMPRESSED | EntryType.UNCOMPRESSED");
+
+ if (offset < 0)
+ throw new ChmParsingException("invalid offset");
+
+ if (length < 0)
+ throw new ChmParsingException("invalid length");
+ }
+
+ public static void assertCopyingDataIndex(int index, int dataLength) throws ChmParsingException {
+ if (index >= dataLength)
+ throw new ChmParsingException("cannot parse chm file index > data.length");
+ }
+
+ /**
+ * Checks if int param is greater than zero In case param <=0 throws an
+ * exception
+ *
+ * @param param
+ * @throws ChmParsingException
+ */
+ public static void assertPositiveInt(int param) throws ChmParsingException {
+ if (param <= 0)
+ throw new ChmParsingException("resetTable.getBlockAddress().length should be greater than zero");
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java
index a7fdf60..cded7f2 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java
@@ -1,361 +1,361 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.core;
-
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.util.List;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
-import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
-import org.apache.tika.parser.chm.assertion.ChmAssert;
-import org.apache.tika.parser.chm.exception.ChmParsingException;
-
-public class ChmCommons {
- /* Prevents initialization */
- private ChmCommons() {
- }
-
- public static void assertByteArrayNotNull(byte[] data) throws TikaException {
- if (data == null)
- throw new TikaException("byte[] is null");
- }
-
- /**
- * Represents entry types: uncompressed, compressed
- */
- public enum EntryType {
- UNCOMPRESSED, COMPRESSED
- }
-
- /**
- * Represents lzx states: started decoding, not started decoding
- */
- public enum LzxState {
- STARTED_DECODING, NOT_STARTED_DECODING
- }
-
- /**
- * Represents intel file states during decompression
- */
- public enum IntelState {
- STARTED, NOT_STARTED
- }
-
- /**
- * Represents lzx block types in order to decompress differently
- */
- public final static int UNDEFINED = 0;
- public final static int VERBATIM = 1;
- public final static int ALIGNED_OFFSET = 2;
- public final static int UNCOMPRESSED = 3;
-
- /**
- * LZX supports window sizes of 2^15 (32Kb) through 2^21 (2Mb) Returns X,
- * i.e 2^X
- *
- * @param window
- * chmLzxControlData.getWindowSize()
- *
- * @return window size
- */
- public static int getWindowSize(int window) {
- int win = 0;
- while (window > 1) {
- window >>>= 1;
- win++;
- }
- return win;
- }
-
- public static byte[] getChmBlockSegment(byte[] data,
- ChmLzxcResetTable resetTable, int blockNumber, int lzxcBlockOffset,
- int lzxcBlockLength) throws TikaException {
- ChmAssert.assertChmBlockSegment(data, resetTable, blockNumber,
- lzxcBlockOffset, lzxcBlockLength);
- int blockLength = -1;
- // TODO add int_max_value checking
- if (blockNumber < (resetTable.getBlockAddress().length - 1)) {
- blockLength = (int) (resetTable.getBlockAddress()[blockNumber + 1] - resetTable
- .getBlockAddress()[blockNumber]);
- } else {
- /* new code */
- if (blockNumber >= resetTable.getBlockAddress().length)
- blockLength = 0;
- else
- /* end new code */
- blockLength = (int) (lzxcBlockLength - resetTable
- .getBlockAddress()[blockNumber]);
- }
- byte[] t = ChmCommons
- .copyOfRange(
- data,
- (int) (lzxcBlockOffset + resetTable.getBlockAddress()[blockNumber]),
- (int) (lzxcBlockOffset
- + resetTable.getBlockAddress()[blockNumber] + blockLength));
- return (t != null) ? t : new byte[1];
- }
-
- /**
- * Returns textual representation of LangID
- *
- * @param langID
- *
- * @return language name
- */
- public static String getLanguage(long langID) {
- /* Potential problem with casting */
- switch ((int) langID) {
- case 1025:
- return "Arabic";
- case 1069:
- return "Basque";
- case 1027:
- return "Catalan";
- case 2052:
- return "Chinese (Simplified)";
- case 1028:
- return "Chinese (Traditional)";
- case 1029:
- return "Czech";
- case 1030:
- return "Danish";
- case 1043:
- return "Dutch";
- case 1033:
- return "English (United States)";
- case 1035:
- return "Finnish";
- case 1036:
- return "French";
- case 1031:
- return "German";
- case 1032:
- return "Greek";
- case 1037:
- return "Hebrew";
- case 1038:
- return "Hungarian";
- case 1040:
- return "Italian";
- case 1041:
- return "Japanese";
- case 1042:
- return "Korean";
- case 1044:
- return "Norwegian";
- case 1045:
- return "Polish";
- case 2070:
- return "Portuguese";
- case 1046:
- return "Portuguese (Brazil)";
- case 1049:
- return "Russian";
- case 1051:
- return "Slovakian";
- case 1060:
- return "Slovenian";
- case 3082:
- return "Spanish";
- case 1053:
- return "Swedish";
- case 1055:
- return "Turkish";
- default:
- return "unknown - http://msdn.microsoft.com/en-us/library/bb165625%28VS.80%29.aspx";
- }
- }
-
- /**
- * Checks skippable patterns
- *
- * @param directoryListingEntry
- *
- * @return boolean
- */
- public static boolean hasSkip(DirectoryListingEntry directoryListingEntry) {
- return (directoryListingEntry.getName().startsWith("/$")
- || directoryListingEntry.getName().startsWith("/#") || directoryListingEntry
- .getName().startsWith("::")) ? true : false;
- }
-
- /**
- * Writes byte[][] to the file
- *
- * @param buffer
- * @param fileToBeSaved
- * file name
- * @throws TikaException
- */
- public static void writeFile(byte[][] buffer, String fileToBeSaved) throws TikaException {
- FileOutputStream output = null;
- if (buffer != null && fileToBeSaved != null
- && !ChmCommons.isEmpty(fileToBeSaved)) {
- try {
- output = new FileOutputStream(fileToBeSaved);
- for (byte[] bufferEntry : buffer) {
- output.write(bufferEntry);
- }
- } catch (FileNotFoundException e) {
- throw new TikaException(e.getMessage());
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- if (output != null)
- try {
- output.flush();
- output.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- }
-
- /**
- * Reverses the order of given array
- *
- * @param array
- */
- public static void reverse(byte[] array) {
- if (array == null) {
- return;
- }
- int i = 0;
- int j = array.length - 1;
- byte tmp;
- while (j > i) {
- tmp = array[j];
- array[j] = array[i];
- array[i] = tmp;
- j--;
- i++;
- }
- }
-
- /**
- * Returns an index of the reset table
- *
- * @param text
- * @param pattern
- * @return index of the reset table
- * @throws ChmParsingException
- */
- public static final int indexOfResetTableBlock(byte[] text, byte[] pattern) throws ChmParsingException {
- return (indexOf(text, pattern)) - 4;
- }
-
- /**
- * Searches some pattern in byte[]
- *
- * @param text
- * byte[]
- * @param pattern
- * byte[]
- * @return an index, if nothing found returns -1
- * @throws ChmParsingException
- */
- public static int indexOf(byte[] text, byte[] pattern) throws ChmParsingException {
- int[] next = null;
- int i = 0, j = -1;
-
- /* Preprocessing */
- if (pattern != null && text != null) {
- next = new int[pattern.length];
- next[0] = -1;
- } else
- throw new ChmParsingException("pattern and/or text should not be null");
-
- /* Computes a failure function */
- while (i < pattern.length - 1) {
- if (j == -1 || pattern[i] == pattern[j]) {
- i++;
- j++;
- if (pattern[i] != pattern[j])
- next[i] = j;
- else
- next[i] = next[j];
- } else
- j = next[j];
- }
-
- /* Reinitializes local variables */
- i = j = 0;
-
- /* Matching */
- while (i < text.length && j < pattern.length) {
- if (j == -1 || pattern[j] == text[i]) {
- i++;
- j++;
- } else
- j = next[j];
- }
- if (j == pattern.length)
- return (i - j); // match found at offset i - M
- else
- return -1; // not found
- }
-
- /**
- * Searches for some pattern in the directory listing entry list
- *
- * @param list
- * @param pattern
- * @return an index, if nothing found returns -1
- */
- public static int indexOf(List<DirectoryListingEntry> list, String pattern) {
- int place = 0;
- for (DirectoryListingEntry directoryListingEntry : list) {
- if (directoryListingEntry.toString().contains(pattern)) return place;
- ++place;
- }
- return -1;// not found
- }
-
- /*
- * This method is added because of supporting of Java 5
- */
- public static byte[] copyOfRange(byte[] original, int from, int to) {
- checkCopyOfRangeParams(original, from, to);
- int newLength = to - from;
- if (newLength < 0)
- throw new IllegalArgumentException(from + " > " + to);
- byte[] copy = new byte[newLength];
- System.arraycopy(original, from, copy, 0, Math.min(original.length - from, newLength));
- return copy;
- }
-
- private static void checkCopyOfRangeParams(byte[] original, int from, int to) {
- if (original == null)
- throw new NullPointerException("array is null");
- if (from < 0)
- throw new IllegalArgumentException(from + " should be > 0");
- if (to < 0)
- throw new IllegalArgumentException(to + " should be > 0");
- }
-
- /*
- * This method is added because of supporting of Java 5
- */
- public static boolean isEmpty(String str) {
- return str == null || str.length() == 0;
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.core;
+
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * Static helpers shared by the CHM parser: small assertions, LZX related
+ * enums and constants, byte-array searching/copying and a debugging file
+ * writer. The class is not meant to be instantiated.
+ */
+public class ChmCommons {
+    /** Text returned by {@link #getLanguage(long)} for ids that are not in its table. */
+    private static final String UNKNOWN_LANGUAGE =
+            "unknown - http://msdn.microsoft.com/en-us/library/bb165625%28VS.80%29.aspx";
+
+    /* Prevents initialization */
+    private ChmCommons() {
+    }
+
+    /**
+     * Fails fast when a required byte array is missing.
+     *
+     * @param data
+     *            array to check
+     * @throws TikaException
+     *             if {@code data} is {@code null}
+     */
+    public static void assertByteArrayNotNull(byte[] data) throws TikaException {
+        if (data == null) {
+            throw new TikaException("byte[] is null");
+        }
+    }
+
+    /**
+     * Represents entry types: uncompressed, compressed
+     */
+    public enum EntryType {
+        UNCOMPRESSED, COMPRESSED
+    }
+
+    /**
+     * Represents lzx states: started decoding, not started decoding
+     */
+    public enum LzxState {
+        STARTED_DECODING, NOT_STARTED_DECODING
+    }
+
+    /**
+     * Represents intel file states during decompression
+     */
+    public enum IntelState {
+        STARTED, NOT_STARTED
+    }
+
+    /*
+     * The four constants below represent lzx block types in order to
+     * decompress differently.
+     */
+    public final static int UNDEFINED = 0;
+    public final static int VERBATIM = 1;
+    public final static int ALIGNED_OFFSET = 2;
+    public final static int UNCOMPRESSED = 3;
+
+    /**
+     * LZX supports window sizes of 2^15 (32Kb) through 2^21 (2Mb). Returns X,
+     * i.e. the exponent in 2^X, computed by counting how many times the value
+     * can be halved before it reaches 1. Values of 1 or less yield 0.
+     *
+     * @param window
+     *            chmLzxControlData.getWindowSize()
+     *
+     * @return the exponent X such that 2^X is the largest power of two not
+     *         exceeding {@code window}
+     */
+    public static int getWindowSize(int window) {
+        int win = 0;
+        while (window > 1) {
+            window >>>= 1;
+            win++;
+        }
+        return win;
+    }
+
+    /**
+     * Copies the bytes of one block out of {@code data}, using the reset
+     * table addresses to find where the block starts and how long it is.
+     * <p>
+     * For every block but the last one the length is the distance to the next
+     * table address; for the last block it is {@code lzxcBlockLength} minus
+     * the block's own address.
+     *
+     * @param data
+     *            source bytes
+     * @param resetTable
+     *            table whose {@code getBlockAddress()} entries are offsets
+     *            relative to {@code lzxcBlockOffset}
+     * @param blockNumber
+     *            index into the reset table
+     * @param lzxcBlockOffset
+     *            offset added to every table address
+     * @param lzxcBlockLength
+     *            total length used to size the last block
+     * @return a copy of the selected block, never {@code null}
+     * @throws TikaException
+     *             if {@link ChmAssert#assertChmBlockSegment} rejects the
+     *             arguments or {@code blockNumber} lies past the end of the
+     *             reset table
+     */
+    public static byte[] getChmBlockSegment(byte[] data,
+            ChmLzxcResetTable resetTable, int blockNumber, int lzxcBlockOffset,
+            int lzxcBlockLength) throws TikaException {
+        ChmAssert.assertChmBlockSegment(data, resetTable, blockNumber,
+                lzxcBlockOffset, lzxcBlockLength);
+        int blockCount = resetTable.getBlockAddress().length;
+        /*
+         * The table has no address for such a block, so there is nothing to
+         * index below; report it as a parse problem instead of letting an
+         * unchecked ArrayIndexOutOfBoundsException escape.
+         */
+        if (blockNumber >= blockCount) {
+            throw new TikaException("blockNumber " + blockNumber
+                    + " is outside of the reset table (" + blockCount + " entries)");
+        }
+        long blockAddress = resetTable.getBlockAddress()[blockNumber];
+        int blockLength;
+        // TODO add int_max_value checking
+        if (blockNumber < blockCount - 1) {
+            blockLength = (int) (resetTable.getBlockAddress()[blockNumber + 1] - blockAddress);
+        } else {
+            blockLength = (int) (lzxcBlockLength - blockAddress);
+        }
+        int start = (int) (lzxcBlockOffset + blockAddress);
+        int end = (int) (lzxcBlockOffset + blockAddress + blockLength);
+        return ChmCommons.copyOfRange(data, start, end);
+    }
+
+    /**
+     * Returns textual representation of LangID
+     *
+     * @param langID
+     *            numeric language identifier
+     *
+     * @return language name, or a text starting with "unknown" for ids that
+     *         are not in the table
+     */
+    public static String getLanguage(long langID) {
+        /*
+         * The switch works on int; ids that do not fit are rejected up front
+         * so the narrowing cast cannot wrap them onto a known id.
+         */
+        if (langID < Integer.MIN_VALUE || langID > Integer.MAX_VALUE) {
+            return UNKNOWN_LANGUAGE;
+        }
+        switch ((int) langID) {
+        case 1025:
+            return "Arabic";
+        case 1069:
+            return "Basque";
+        case 1027:
+            return "Catalan";
+        case 2052:
+            return "Chinese (Simplified)";
+        case 1028:
+            return "Chinese (Traditional)";
+        case 1029:
+            return "Czech";
+        case 1030:
+            return "Danish";
+        case 1043:
+            return "Dutch";
+        case 1033:
+            return "English (United States)";
+        case 1035:
+            return "Finnish";
+        case 1036:
+            return "French";
+        case 1031:
+            return "German";
+        case 1032:
+            return "Greek";
+        case 1037:
+            return "Hebrew";
+        case 1038:
+            return "Hungarian";
+        case 1040:
+            return "Italian";
+        case 1041:
+            return "Japanese";
+        case 1042:
+            return "Korean";
+        case 1044:
+            return "Norwegian";
+        case 1045:
+            return "Polish";
+        case 2070:
+            return "Portuguese";
+        case 1046:
+            return "Portuguese (Brazil)";
+        case 1049:
+            return "Russian";
+        case 1051:
+            return "Slovakian";
+        case 1060:
+            return "Slovenian";
+        case 3082:
+            return "Spanish";
+        case 1053:
+            return "Swedish";
+        case 1055:
+            return "Turkish";
+        default:
+            return UNKNOWN_LANGUAGE;
+        }
+    }
+
+    /**
+     * Checks skippable patterns: entry names starting with "/$", "/#" or "::".
+     *
+     * @param directoryListingEntry
+     *            entry whose name is inspected
+     *
+     * @return {@code true} if the entry name starts with one of the prefixes
+     */
+    public static boolean hasSkip(DirectoryListingEntry directoryListingEntry) {
+        return directoryListingEntry.getName().startsWith("/$")
+                || directoryListingEntry.getName().startsWith("/#")
+                || directoryListingEntry.getName().startsWith("::");
+    }
+
+    /**
+     * Writes byte[][] to the file, one row after another. Nothing is written
+     * when {@code buffer} is {@code null} or the file name is {@code null} or
+     * empty.
+     *
+     * @param buffer
+     *            rows to write in order
+     * @param fileToBeSaved
+     *            file name
+     * @throws TikaException
+     *             if the file cannot be opened or written
+     */
+    public static void writeFile(byte[][] buffer, String fileToBeSaved) throws TikaException {
+        if (buffer == null || ChmCommons.isEmpty(fileToBeSaved)) {
+            return;
+        }
+        FileOutputStream output = null;
+        try {
+            output = new FileOutputStream(fileToBeSaved);
+            for (byte[] bufferEntry : buffer) {
+                output.write(bufferEntry);
+            }
+            output.flush();
+        } catch (FileNotFoundException e) {
+            throw new TikaException(e.getMessage(), e);
+        } catch (IOException e) {
+            /* A partially written file must not be reported as success. */
+            throw new TikaException("Unable to write " + fileToBeSaved, e);
+        } finally {
+            if (output != null) {
+                try {
+                    output.close();
+                } catch (IOException ignored) {
+                    /*
+                     * Either the data was already flushed successfully or a
+                     * more specific failure is on its way to the caller.
+                     */
+                }
+            }
+        }
+    }
+
+    /**
+     * Reverses the order of given array in place. A {@code null} array is
+     * ignored.
+     *
+     * @param array
+     *            array to reverse
+     */
+    public static void reverse(byte[] array) {
+        if (array == null) {
+            return;
+        }
+        int i = 0;
+        int j = array.length - 1;
+        byte tmp;
+        while (j > i) {
+            tmp = array[j];
+            array[j] = array[i];
+            array[i] = tmp;
+            j--;
+            i++;
+        }
+    }
+
+    /**
+     * Returns an index of the reset table: the position of {@code pattern}
+     * inside {@code text} moved back by four bytes.
+     *
+     * @param text
+     *            bytes to search in
+     * @param pattern
+     *            bytes to search for
+     * @return index of the reset table; because 4 is always subtracted the
+     *         value is -5 when the pattern is absent and may also be negative
+     *         when the pattern starts within the first four bytes
+     * @throws ChmParsingException
+     *             if either argument is {@code null}
+     */
+    public static final int indexOfResetTableBlock(byte[] text, byte[] pattern) throws ChmParsingException {
+        return (indexOf(text, pattern)) - 4;
+    }
+
+    /**
+     * Searches some pattern in byte[] using a precomputed failure table
+     * (Knuth-Morris-Pratt style), so the text is scanned only once.
+     *
+     * @param text
+     *            byte[]
+     * @param pattern
+     *            byte[]
+     * @return index of the first occurrence, 0 for an empty pattern, -1 if
+     *         nothing found
+     * @throws ChmParsingException
+     *             if {@code text} or {@code pattern} is {@code null}
+     */
+    public static int indexOf(byte[] text, byte[] pattern) throws ChmParsingException {
+        if (pattern == null || text == null) {
+            throw new ChmParsingException("pattern and/or text should not be null");
+        }
+        /* An empty pattern has no failure table; it matches at the start. */
+        if (pattern.length == 0) {
+            return 0;
+        }
+
+        /* Preprocessing */
+        int[] next = new int[pattern.length];
+        next[0] = -1;
+        int i = 0, j = -1;
+
+        /* Computes a failure function */
+        while (i < pattern.length - 1) {
+            if (j == -1 || pattern[i] == pattern[j]) {
+                i++;
+                j++;
+                if (pattern[i] != pattern[j]) {
+                    next[i] = j;
+                } else {
+                    next[i] = next[j];
+                }
+            } else {
+                j = next[j];
+            }
+        }
+
+        /* Reinitializes local variables */
+        i = j = 0;
+
+        /* Matching */
+        while (i < text.length && j < pattern.length) {
+            if (j == -1 || pattern[j] == text[i]) {
+                i++;
+                j++;
+            } else {
+                j = next[j];
+            }
+        }
+        if (j == pattern.length) {
+            return (i - j); // match found at offset i - M
+        }
+        return -1; // not found
+    }
+
+    /**
+     * Searches for some pattern in the directory listing entry list by
+     * testing each entry's {@code toString()} text.
+     *
+     * @param list
+     *            entries to scan in order
+     * @param pattern
+     *            text that must be contained in an entry's string form
+     * @return index of the first matching entry, if nothing found returns -1
+     */
+    public static int indexOf(List<DirectoryListingEntry> list, String pattern) {
+        int place = 0;
+        for (DirectoryListingEntry directoryListingEntry : list) {
+            if (directoryListingEntry.toString().contains(pattern)) {
+                return place;
+            }
+            ++place;
+        }
+        return -1;// not found
+    }
+
+    /**
+     * Copies {@code original[from..to)} into a new array. When {@code to}
+     * reaches past the end of {@code original} the remainder of the result is
+     * left zero-filled.
+     * <p>
+     * This method is added because of supporting of Java 5
+     *
+     * @param original
+     *            source array
+     * @param from
+     *            first index to copy, inclusive
+     * @param to
+     *            last index to copy, exclusive; may exceed the source length
+     * @return new array of length {@code to - from}
+     * @throws NullPointerException
+     *             if {@code original} is {@code null}
+     * @throws IllegalArgumentException
+     *             if {@code from} or {@code to} is negative or
+     *             {@code from > to}
+     * @throws ArrayIndexOutOfBoundsException
+     *             if {@code from} lies past the end of {@code original}
+     */
+    public static byte[] copyOfRange(byte[] original, int from, int to) {
+        checkCopyOfRangeParams(original, from, to);
+        int newLength = to - from;
+        if (newLength < 0) {
+            throw new IllegalArgumentException(from + " > " + to);
+        }
+        /* Checked before allocating so a bad offset fails with a clear message. */
+        if (from > original.length) {
+            throw new ArrayIndexOutOfBoundsException(
+                    from + " is past the end of the array (length " + original.length + ")");
+        }
+        byte[] copy = new byte[newLength];
+        System.arraycopy(original, from, copy, 0, Math.min(original.length - from, newLength));
+        return copy;
+    }
+
+    /* Rejects a missing array and negative bounds for copyOfRange. */
+    private static void checkCopyOfRangeParams(byte[] original, int from, int to) {
+        if (original == null) {
+            throw new NullPointerException("array is null");
+        }
+        if (from < 0) {
+            throw new IllegalArgumentException(from + " should be >= 0");
+        }
+        if (to < 0) {
+            throw new IllegalArgumentException(to + " should be >= 0");
+        }
+    }
+
+    /**
+     * Tells whether a string is {@code null} or has no characters.
+     * <p>
+     * This method is added because of supporting of Java 5
+     *
+     * @param str
+     *            string to test
+     * @return {@code true} for {@code null} and for the empty string
+     */
+    public static boolean isEmpty(String str) {
+        return str == null || str.length() == 0;
+    }
+
+}
[37/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-office-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-office-bundle/pom.xml b/tika-parser-bundles/tika-parser-office-bundle/pom.xml
index c9db0da..f743aa2 100644
--- a/tika-parser-bundles/tika-parser-office-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-office-bundle/pom.xml
@@ -1,141 +1,141 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-bundles</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-office-bundle</artifactId>
- <packaging>bundle</packaging>
- <name>Apache Tika parser office bundle</name>
- <url>http://tika.apache.org/</url>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-office-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-package-bundle</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-web-bundle</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-text-bundle</artifactId>
- <version>${project.version}</version>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- <plugin>
- <groupId>org.apache.felix</groupId>
- <artifactId>maven-bundle-plugin</artifactId>
- <extensions>true</extensions>
- <configuration>
- <instructions>
- <Bundle-Activator>org.apache.tika.module.office.internal.Activator</Bundle-Activator>
- <Embed-Dependency>
- tika-parser-office-module;inline=true,
- commons-lang;inline=true,
- commons-io;inline=true,
- commons-codec;inline=true,
- poi;inline=true,
- poi-scratchpad;inline=true,
- poi-ooxml;inline=true,
- poi-ooxml-schemas;inline=true;
- jackcess;inline=true,
- jackcess-encrypt;inline=true,
- java-libpst;inline=true,
- curvesapi;inline=true,
- xmlbeans;inline=true,
- bcprov-jdk15on;inline=true,
- </Embed-Dependency>
- <Embed-Transitive>true</Embed-Transitive>
- <Export-Package>
- org.apache.tika.parser.chm.*,
- org.apache.tika.parser.mbox.*,
- org.apache.tika.parser.microsoft.*,
- org.apache.tika.parser.microsoft.ooxml.*,
- org.apache.tika.parser.opc.*,
- org.apache.tika.parser.odf.*,
- org.apache.tika.parser.opendocument.*,
- org.apache.tika.parser.rtf.*
- </Export-Package>
- <Import-Package>
- !org.junit,
- !org.junit.*,
- !junit.*,
- *,
- com.microsoft.schemas.office.powerpoint;resolution:=optional,
- com.microsoft.schemas.office.word;resolution:=optional,
- com.sun.javadoc;resolution:=optional,
- com.sun.xml.bind.marshaller;resolution:=optional,
- com.sun.xml.internal.bind.marshaller;resolution:=optional,
- com.sun.msv.datatype;resolution:=optional,
- com.sun.msv.datatype.xsd;resolution:=optional,
- com.sun.tools.javadoc;resolution:=optional,
- org.apache.crimson.jaxp;resolution:=optional,
- org.apache.jcp.xml.dsig.internal.dom;resolution:=optional,
- org.apache.tools.ant;resolution:=optional,
- org.apache.tools.ant.taskdefs;resolution:=optional,
- org.apache.tools.ant.types;resolution:=optional,
- org.apache.xml.resolver;resolution:=optional,
- org.apache.xml.resolver.tools;resolution:=optional,
- org.apache.xml.security;resolution:=optional,
- org.apache.xml.security.c14n;resolution:=optional,
- org.apache.xml.security.utils;resolution:=optional,
- org.apache.xmlbeans.impl.xpath.saxon;resolution:=optional,
- org.apache.xmlbeans.impl.xquery.saxon;resolution:=optional,
- org.bouncycastle.cert;resolution:=optional,
- org.bouncycastle.cert.jcajce;resolution:=optional,
- org.bouncycastle.cert.ocsp;resolution:=optional,
- org.bouncycastle.cms;resolution:=optional,
- org.bouncycastle.cms.bc;resolution:=optional,
- org.bouncycastle.operator;resolution:=optional,
- org.bouncycastle.operator.bc;resolution:=optional,
- org.bouncycastle.tsp;resolution:=optional,
- org.etsi.uri.x01903.v14;resolution:=optional,
- org.openxmlformats.schemas.officeDocument.x2006.math;resolution:=optional,
- org.openxmlformats.schemas.schemaLibrary.x2006.main;resolution:=optional,
- org.apache.tika.parser.html.HtmlParser;resolution:=optional,
- org.apache.tika.parser.pkg.ZipContainerDetector;resolution:=optional
- </Import-Package>
- </instructions>
- </configuration>
- </plugin>
- <plugin>
- <artifactId>maven-failsafe-plugin</artifactId>
- </plugin>
- <plugin>
- <artifactId>maven-assembly-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-bundles</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-office-bundle</artifactId>
+ <packaging>bundle</packaging>
+ <name>Apache Tika parser office bundle</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-office-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-package-bundle</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-web-bundle</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-text-bundle</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <Bundle-Activator>org.apache.tika.module.office.internal.Activator</Bundle-Activator>
+ <Embed-Dependency>
+ tika-parser-office-module;inline=true,
+ commons-lang;inline=true,
+ commons-io;inline=true,
+ commons-codec;inline=true,
+ poi;inline=true,
+ poi-scratchpad;inline=true,
+ poi-ooxml;inline=true,
+ poi-ooxml-schemas;inline=true;
+ jackcess;inline=true,
+ jackcess-encrypt;inline=true,
+ java-libpst;inline=true,
+ curvesapi;inline=true,
+ xmlbeans;inline=true,
+ bcprov-jdk15on;inline=true,
+ </Embed-Dependency>
+ <Embed-Transitive>true</Embed-Transitive>
+ <Export-Package>
+ org.apache.tika.parser.chm.*,
+ org.apache.tika.parser.mbox.*,
+ org.apache.tika.parser.microsoft.*,
+ org.apache.tika.parser.microsoft.ooxml.*,
+ org.apache.tika.parser.opc.*,
+ org.apache.tika.parser.odf.*,
+ org.apache.tika.parser.opendocument.*,
+ org.apache.tika.parser.rtf.*
+ </Export-Package>
+ <Import-Package>
+ !org.junit,
+ !org.junit.*,
+ !junit.*,
+ *,
+ com.microsoft.schemas.office.powerpoint;resolution:=optional,
+ com.microsoft.schemas.office.word;resolution:=optional,
+ com.sun.javadoc;resolution:=optional,
+ com.sun.xml.bind.marshaller;resolution:=optional,
+ com.sun.xml.internal.bind.marshaller;resolution:=optional,
+ com.sun.msv.datatype;resolution:=optional,
+ com.sun.msv.datatype.xsd;resolution:=optional,
+ com.sun.tools.javadoc;resolution:=optional,
+ org.apache.crimson.jaxp;resolution:=optional,
+ org.apache.jcp.xml.dsig.internal.dom;resolution:=optional,
+ org.apache.tools.ant;resolution:=optional,
+ org.apache.tools.ant.taskdefs;resolution:=optional,
+ org.apache.tools.ant.types;resolution:=optional,
+ org.apache.xml.resolver;resolution:=optional,
+ org.apache.xml.resolver.tools;resolution:=optional,
+ org.apache.xml.security;resolution:=optional,
+ org.apache.xml.security.c14n;resolution:=optional,
+ org.apache.xml.security.utils;resolution:=optional,
+ org.apache.xmlbeans.impl.xpath.saxon;resolution:=optional,
+ org.apache.xmlbeans.impl.xquery.saxon;resolution:=optional,
+ org.bouncycastle.cert;resolution:=optional,
+ org.bouncycastle.cert.jcajce;resolution:=optional,
+ org.bouncycastle.cert.ocsp;resolution:=optional,
+ org.bouncycastle.cms;resolution:=optional,
+ org.bouncycastle.cms.bc;resolution:=optional,
+ org.bouncycastle.operator;resolution:=optional,
+ org.bouncycastle.operator.bc;resolution:=optional,
+ org.bouncycastle.tsp;resolution:=optional,
+ org.etsi.uri.x01903.v14;resolution:=optional,
+ org.openxmlformats.schemas.officeDocument.x2006.math;resolution:=optional,
+ org.openxmlformats.schemas.schemaLibrary.x2006.main;resolution:=optional,
+ org.apache.tika.parser.html.HtmlParser;resolution:=optional,
+ org.apache.tika.parser.pkg.ZipContainerDetector;resolution:=optional
+ </Import-Package>
+ </instructions>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-package-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-package-bundle/pom.xml b/tika-parser-bundles/tika-parser-package-bundle/pom.xml
index 4d292d7..d2b55d7 100644
--- a/tika-parser-bundles/tika-parser-package-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-package-bundle/pom.xml
@@ -1,80 +1,80 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-bundles</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-package-bundle</artifactId>
- <packaging>bundle</packaging>
- <name>Apache Tika parser package bundle</name>
- <url>http://tika.apache.org/</url>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-package-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- <plugin>
- <groupId>org.apache.felix</groupId>
- <artifactId>maven-bundle-plugin</artifactId>
- <extensions>true</extensions>
- <configuration>
- <instructions>
- <Bundle-Activator>org.apache.tika.module.pkg.internal.Activator</Bundle-Activator>
- <Embed-Dependency>
- tika-parser-package-module;inline=true,
- commons-io;inline=true,
- commons-codec;inline=true,
- xz;inline=true,
- commons-compress;inline=true,
- junrar;inline=true
- </Embed-Dependency>
- <Embed-Transitive>true</Embed-Transitive>
- <Export-Package>
- org.apache.tika.parser.pkg.*,
- org.apache.tika.parser.iwork.*
- </Export-Package>
- <Import-Package>
- *,
- org.apache.commons.vfs2;resolution:=optional,
- org.apache.commons.vfs2.provider;resolution:=optional,
- org.apache.commons.vfs2.util;resolution:=optional,
-
- </Import-Package>
- </instructions>
- </configuration>
- </plugin>
- <plugin>
- <artifactId>maven-failsafe-plugin</artifactId>
- </plugin>
- <plugin>
- <artifactId>maven-assembly-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-bundles</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-package-bundle</artifactId>
+ <packaging>bundle</packaging>
+ <name>Apache Tika parser package bundle</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-package-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <Bundle-Activator>org.apache.tika.module.pkg.internal.Activator</Bundle-Activator>
+ <Embed-Dependency>
+ tika-parser-package-module;inline=true,
+ commons-io;inline=true,
+ commons-codec;inline=true,
+ xz;inline=true,
+ commons-compress;inline=true,
+ junrar;inline=true
+ </Embed-Dependency>
+ <Embed-Transitive>true</Embed-Transitive>
+ <Export-Package>
+ org.apache.tika.parser.pkg.*,
+ org.apache.tika.parser.iwork.*
+ </Export-Package>
+ <Import-Package>
+ *,
+ org.apache.commons.vfs2;resolution:=optional,
+ org.apache.commons.vfs2.provider;resolution:=optional,
+ org.apache.commons.vfs2.util;resolution:=optional,
+
+ </Import-Package>
+ </instructions>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml b/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml
index 25eef2e..fe1a269 100644
--- a/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml
@@ -1,109 +1,109 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-bundles</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-pdf-bundle</artifactId>
- <packaging>bundle</packaging>
- <name>Apache Tika parser pdf bundle</name>
- <url>http://tika.apache.org/</url>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-pdf-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- <plugin>
- <groupId>org.apache.felix</groupId>
- <artifactId>maven-bundle-plugin</artifactId>
- <extensions>true</extensions>
- <configuration>
- <instructions>
- <Bundle-Activator>org.apache.tika.module.pdf.internal.Activator</Bundle-Activator>
- <Embed-Dependency>
- tika-parser-pdf-module;inline=true,
- tika-parser-multimedia-module;inline=true,
- tika-parser-xmp-commons;inline=true,
- commons-io;inline=true,
- pdfbox;inline=true,
- pdfbox-tools;inline=true,
- pdfbox-debugger;inline=true,
- bcmail-jdk15on;inline=true,
- bcprov-jdk15on;inline=true,
- fontbox;inline=true,
- jempbox;inline=true,
- bcpkix-jdk15on;inline=true
- </Embed-Dependency>
- <Embed-Transitive>true</Embed-Transitive>
- <Export-Package>
- org.apache.tika.parser.pdf.*
- </Export-Package>
- <Import-Package>
- *,
- com.ibm.icu.text;resolution:=optional,
- com.coremedia.iso;resolution:=optional,
- com.coremedia.iso.boxes;resolution:=optional,
- com.coremedia.iso.boxes.apple;resolution:=optional,
- com.coremedia.iso.boxes.sampleentry;resolution:=optional,
- com.drew.imaging.jpeg;resolution:=optional,
- com.drew.imaging.riff;resolution:=optional,
- com.drew.imaging.tiff;resolution:=optional,
- com.drew.imaging.webp;resolution:=optional,
- com.drew.lang;resolution:=optional,
- com.drew.metadata;resolution:=optional,
- com.drew.metadata.exif;resolution:=optional,
- com.drew.metadata.iptc;resolution:=optional,
- com.drew.metadata.jpeg;resolution:=optional,
- com.googlecode.mp4parser;resolution:=optional,
- com.googlecode.mp4parser.boxes.apple;resolution:=optional,
- com.googlecode.mp4parser.util;resolution:=optional,
- javax.mail;resolution:=optional,
- javax.mail.internet;resolution:=optional,
- org.bouncycastle.cert;resolution:=optional,
- org.bouncycastle.cert.jcajce;resolution:=optional,
- org.bouncycastle.cert.ocsp;resolution:=optional,
- org.bouncycastle.cms.bc;resolution:=optional,
- org.bouncycastle.operator;resolution:=optional,
- org.bouncycastle.operator.bc;resolution:=optional,
- org.bouncycastle.tsp;resolution:=optional,
- org.apache.commons.exec;resolution:=optional,
- org.apache.commons.exec.environment;resolution:=optional
- </Import-Package>
- </instructions>
- </configuration>
- </plugin>
- <plugin>
- <artifactId>maven-failsafe-plugin</artifactId>
- </plugin>
- <plugin>
- <artifactId>maven-assembly-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-bundles</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-pdf-bundle</artifactId>
+ <packaging>bundle</packaging>
+ <name>Apache Tika parser pdf bundle</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-pdf-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <Bundle-Activator>org.apache.tika.module.pdf.internal.Activator</Bundle-Activator>
+ <Embed-Dependency>
+ tika-parser-pdf-module;inline=true,
+ tika-parser-multimedia-module;inline=true,
+ tika-parser-xmp-commons;inline=true,
+ commons-io;inline=true,
+ pdfbox;inline=true,
+ pdfbox-tools;inline=true,
+ pdfbox-debugger;inline=true,
+ bcmail-jdk15on;inline=true,
+ bcprov-jdk15on;inline=true,
+ fontbox;inline=true,
+ jempbox;inline=true,
+ bcpkix-jdk15on;inline=true
+ </Embed-Dependency>
+ <Embed-Transitive>true</Embed-Transitive>
+ <Export-Package>
+ org.apache.tika.parser.pdf.*
+ </Export-Package>
+ <Import-Package>
+ *,
+ com.ibm.icu.text;resolution:=optional,
+ com.coremedia.iso;resolution:=optional,
+ com.coremedia.iso.boxes;resolution:=optional,
+ com.coremedia.iso.boxes.apple;resolution:=optional,
+ com.coremedia.iso.boxes.sampleentry;resolution:=optional,
+ com.drew.imaging.jpeg;resolution:=optional,
+ com.drew.imaging.riff;resolution:=optional,
+ com.drew.imaging.tiff;resolution:=optional,
+ com.drew.imaging.webp;resolution:=optional,
+ com.drew.lang;resolution:=optional,
+ com.drew.metadata;resolution:=optional,
+ com.drew.metadata.exif;resolution:=optional,
+ com.drew.metadata.iptc;resolution:=optional,
+ com.drew.metadata.jpeg;resolution:=optional,
+ com.googlecode.mp4parser;resolution:=optional,
+ com.googlecode.mp4parser.boxes.apple;resolution:=optional,
+ com.googlecode.mp4parser.util;resolution:=optional,
+ javax.mail;resolution:=optional,
+ javax.mail.internet;resolution:=optional,
+ org.bouncycastle.cert;resolution:=optional,
+ org.bouncycastle.cert.jcajce;resolution:=optional,
+ org.bouncycastle.cert.ocsp;resolution:=optional,
+ org.bouncycastle.cms.bc;resolution:=optional,
+ org.bouncycastle.operator;resolution:=optional,
+ org.bouncycastle.operator.bc;resolution:=optional,
+ org.bouncycastle.tsp;resolution:=optional,
+ org.apache.commons.exec;resolution:=optional,
+ org.apache.commons.exec.environment;resolution:=optional
+ </Import-Package>
+ </instructions>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-scientific-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-scientific-bundle/pom.xml b/tika-parser-bundles/tika-parser-scientific-bundle/pom.xml
index 578ecab..9408859 100644
--- a/tika-parser-bundles/tika-parser-scientific-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-scientific-bundle/pom.xml
@@ -1,202 +1,202 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-bundles</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-scientific-bundle</artifactId>
- <packaging>bundle</packaging>
- <name>Apache Tika parser scientific bundle</name>
- <url>http://tika.apache.org/</url>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-scientific-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.ctakes</groupId>
- <artifactId>ctakes-core</artifactId>
- <version>3.2.2</version>
- <scope>provided</scope>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- <plugin>
- <groupId>org.apache.felix</groupId>
- <artifactId>maven-bundle-plugin</artifactId>
- <extensions>true</extensions>
- <configuration>
- <instructions>
- <Bundle-Activator>org.apache.tika.module.scientific.internal.Activator</Bundle-Activator>
- <Embed-Dependency>
- tika-parser-scientific-module;inline=true,
- commons-csv;inline=true,
- commons-exec;inline=true,
- commons-codec;inline=true,
- commons-io;inline=true,
- json-simple;inline=true,
- sis-utility;inline=true,
- sis-netcdf;inline=true,
- sis-metadata;inline=true,
- sis-storage;inline=true,
- netcdf4;inline=true,
- grib;inline=true,
- cdm;inline=true,
- httpservices;inline=true,
- jmatio;inline=true,
- jsr-275;inline=true,
- jcip-annotations;inline=true,
- opennlp-tools;inline=true,
- opennlp-maxent;inline=true,
- jwnl;inline=true,
- geoapi;inline=true
- </Embed-Dependency>
- <Embed-Transitive>true</Embed-Transitive>
- <Export-Package>
- org.apache.tika.parser.ctakes.*,
- org.apache.tika.parser.dif.*,
- org.apache.tika.parser.envi.*,
- org.apache.tika.parser.gdal.*
- </Export-Package>
- <Import-Package>
- !org.apache.ctakes.*,
- !org.apache.uima.*,
- *,
- colorspace;resolution:=optional,
- org.apache.sis;resolution:=optional,
- org.apache.sis.distance;resolution:=optional,
- org.apache.sis.geometry;resolution:=optional,
- com.beust.jcommander;resolution:=optional,
- com.google.common.base;resolution:=optional,
- com.google.common.math;resolution:=optional,
- com.google.protobuf;resolution:=optional,
- ucar.units;resolution:=optional,
- ucar.httpservices;resolution:=optional,
- ucar.nc2.util;resolution:=optional,
- ucar.nc2.util.cache;resolution:=optional,
- ucar.nc2.dataset;resolution:=optional,
- ucar.nc2;resolution:=optional,
- ucar.nc2.constants;resolution:=optional,
- ucar.nc2.dt;resolution:=optional,
- ucar.nc2.dt.grid;resolution:=optional,
- ucar.nc2.ft;resolution:=optional,
- ucar.nc2.iosp;resolution:=optional,
- ucar.nc2.iosp.hdf4;resolution:=optional,
- ucar.nc2.ncml;resolution:=optional,
- ucar.nc2.stream;resolution:=optional,
- ucar.nc2.time;resolution:=optional,
- ucar.nc2.units;resolution:=optional,
- ucar.nc2.wmo;resolution:=optional,
- ucar.nc2.write;resolution:=optional,
- ucar.ma2;resolution:=optional,
- ucar.grib;resolution:=optional,
- ucar.grib.grib1;resolution:=optional,
- ucar.grib.grib2;resolution:=optional,
- ucar.grid;resolution:=optional,
- ucar.unidata.geoloc;resolution:=optional,
- ucar.unidata.geoloc.projection;resolution:=optional,
- ucar.unidata.geoloc.projection.proj4;resolution:=optional,
- ucar.unidata.geoloc.projection.sat;resolution:=optional,
- ucar.unidata.io;resolution:=optional,
- ucar.unidata.util;resolution:=optional,
- com.jmatio.io;resolution:=optional,
- com.sun.jna;resolution:=optional,
- com.sun.jna.ptr;resolution:=optional,
- com.sun.xml.bind.marshaller;resolution:=optional,
- com.sun.xml.internal.bind.marshaller;resolution:=optional,
- com.sun.msv.datatype;resolution:=optional,
- com.sun.msv.datatype.xsd;resolution:=optional,
- com.sun.tools.javadoc;resolution:=optional,
- sun.misc;resolution:=optional,
- sun.reflect.generics.reflectiveObjects;resolution:=optional,
- org.quartz;resolution:=optional,
- org.quartz.impl;resolution:=optional,
- icc;resolution:=optional,
- org.jdom;resolution:=optional,
- org.jdom.input;resolution:=optional,
- org.jdom.output;resolution:=optional,
- org.jdom2;resolution:=optional,
- org.jdom2.input;resolution:=optional,
- org.jdom2.input.sax;resolution:=optional,
- org.jdom2.output;resolution:=optional,
- org.jdom2.filter;resolution:=optional,
- javax.measure.converter;resolution:=optional,
- javax.servlet.annotation;resolution:=optional,
- javax.servlet;resolution:=optional,
- javax.servlet.http;resolution:=optional,
- jj2000.j2k.codestream;resolution:=optional,
- jj2000.j2k.codestream.reader;resolution:=optional,
- jj2000.j2k.decoder;resolution:=optional,
- jj2000.j2k.entropy.decoder;resolution:=optional,
- jj2000.j2k.fileformat.reader;resolution:=optional,
- jj2000.j2k.image;resolution:=optional,
- jj2000.j2k.image.invcomptransf;resolution:=optional,
- jj2000.j2k.image.output;resolution:=optional,
- jj2000.j2k.io;resolution:=optional,
- jj2000.j2k.quantization.dequantizer;resolution:=optional,
- jj2000.j2k.roi;resolution:=optional,
- jj2000.j2k.util;resolution:=optional,
- jj2000.j2k.wavelet.synthesis;resolution:=optional,
- org.itadaki.bzip2;resolution:=optional,
- org.jsoup;resolution:=optional,
- org.jsoup.nodes;resolution:=optional,
- org.jsoup.select;resolution:=optional,
- opennlp.maxent;resolution:=optional,
- opennlp.tools.namefind;resolution:=optional,
- net.didion.jwnl;resolution:=optional,
- org.joda.time;resolution:=optional,
- org.joda.time.chrono;resolution:=optional,
- org.joda.time.field;resolution:=optional,
- org.joda.time.format;resolution:=optional,
- org.apache.http;resolution:=optional,
- org.apache.http.auth;resolution:=optional,
- org.apache.http.client;resolution:=optional,
- org.apache.http.client.entity;resolution:=optional,
- org.apache.http.client.methods;resolution:=optional,
- org.apache.http.conn;resolution:=optional,
- org.apache.http.conn.scheme;resolution:=optional,
- org.apache.http.cookie;resolution:=optional,
- org.apache.http.entity;resolution:=optional,
- org.apache.http.impl.client;resolution:=optional,
- org.apache.http.impl.conn;resolution:=optional,
- org.apache.http.message;resolution:=optional,
- org.apache.http.params;resolution:=optional,
- org.apache.http.protocol;resolution:=optional,
- org.apache.http.util;resolution:=optional
- </Import-Package>
- </instructions>
- </configuration>
- </plugin>
- <plugin>
- <artifactId>maven-failsafe-plugin</artifactId>
- </plugin>
- <plugin>
- <artifactId>maven-assembly-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-bundles</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-scientific-bundle</artifactId>
+ <packaging>bundle</packaging>
+ <name>Apache Tika parser scientific bundle</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-scientific-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.ctakes</groupId>
+ <artifactId>ctakes-core</artifactId>
+ <version>3.2.2</version>
+ <scope>provided</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <Bundle-Activator>org.apache.tika.module.scientific.internal.Activator</Bundle-Activator>
+ <Embed-Dependency>
+ tika-parser-scientific-module;inline=true,
+ commons-csv;inline=true,
+ commons-exec;inline=true,
+ commons-codec;inline=true,
+ commons-io;inline=true,
+ json-simple;inline=true,
+ sis-utility;inline=true,
+ sis-netcdf;inline=true,
+ sis-metadata;inline=true,
+ sis-storage;inline=true,
+ netcdf4;inline=true,
+ grib;inline=true,
+ cdm;inline=true,
+ httpservices;inline=true,
+ jmatio;inline=true,
+ jsr-275;inline=true,
+ jcip-annotations;inline=true,
+ opennlp-tools;inline=true,
+ opennlp-maxent;inline=true,
+ jwnl;inline=true,
+ geoapi;inline=true
+ </Embed-Dependency>
+ <Embed-Transitive>true</Embed-Transitive>
+ <Export-Package>
+ org.apache.tika.parser.ctakes.*,
+ org.apache.tika.parser.dif.*,
+ org.apache.tika.parser.envi.*,
+ org.apache.tika.parser.gdal.*
+ </Export-Package>
+ <Import-Package>
+ !org.apache.ctakes.*,
+ !org.apache.uima.*,
+ *,
+ colorspace;resolution:=optional,
+ org.apache.sis;resolution:=optional,
+ org.apache.sis.distance;resolution:=optional,
+ org.apache.sis.geometry;resolution:=optional,
+ com.beust.jcommander;resolution:=optional,
+ com.google.common.base;resolution:=optional,
+ com.google.common.math;resolution:=optional,
+ com.google.protobuf;resolution:=optional,
+ ucar.units;resolution:=optional,
+ ucar.httpservices;resolution:=optional,
+ ucar.nc2.util;resolution:=optional,
+ ucar.nc2.util.cache;resolution:=optional,
+ ucar.nc2.dataset;resolution:=optional,
+ ucar.nc2;resolution:=optional,
+ ucar.nc2.constants;resolution:=optional,
+ ucar.nc2.dt;resolution:=optional,
+ ucar.nc2.dt.grid;resolution:=optional,
+ ucar.nc2.ft;resolution:=optional,
+ ucar.nc2.iosp;resolution:=optional,
+ ucar.nc2.iosp.hdf4;resolution:=optional,
+ ucar.nc2.ncml;resolution:=optional,
+ ucar.nc2.stream;resolution:=optional,
+ ucar.nc2.time;resolution:=optional,
+ ucar.nc2.units;resolution:=optional,
+ ucar.nc2.wmo;resolution:=optional,
+ ucar.nc2.write;resolution:=optional,
+ ucar.ma2;resolution:=optional,
+ ucar.grib;resolution:=optional,
+ ucar.grib.grib1;resolution:=optional,
+ ucar.grib.grib2;resolution:=optional,
+ ucar.grid;resolution:=optional,
+ ucar.unidata.geoloc;resolution:=optional,
+ ucar.unidata.geoloc.projection;resolution:=optional,
+ ucar.unidata.geoloc.projection.proj4;resolution:=optional,
+ ucar.unidata.geoloc.projection.sat;resolution:=optional,
+ ucar.unidata.io;resolution:=optional,
+ ucar.unidata.util;resolution:=optional,
+ com.jmatio.io;resolution:=optional,
+ com.sun.jna;resolution:=optional,
+ com.sun.jna.ptr;resolution:=optional,
+ com.sun.xml.bind.marshaller;resolution:=optional,
+ com.sun.xml.internal.bind.marshaller;resolution:=optional,
+ com.sun.msv.datatype;resolution:=optional,
+ com.sun.msv.datatype.xsd;resolution:=optional,
+ com.sun.tools.javadoc;resolution:=optional,
+ sun.misc;resolution:=optional,
+ sun.reflect.generics.reflectiveObjects;resolution:=optional,
+ org.quartz;resolution:=optional,
+ org.quartz.impl;resolution:=optional,
+ icc;resolution:=optional,
+ org.jdom;resolution:=optional,
+ org.jdom.input;resolution:=optional,
+ org.jdom.output;resolution:=optional,
+ org.jdom2;resolution:=optional,
+ org.jdom2.input;resolution:=optional,
+ org.jdom2.input.sax;resolution:=optional,
+ org.jdom2.output;resolution:=optional,
+ org.jdom2.filter;resolution:=optional,
+ javax.measure.converter;resolution:=optional,
+ javax.servlet.annotation;resolution:=optional,
+ javax.servlet;resolution:=optional,
+ javax.servlet.http;resolution:=optional,
+ jj2000.j2k.codestream;resolution:=optional,
+ jj2000.j2k.codestream.reader;resolution:=optional,
+ jj2000.j2k.decoder;resolution:=optional,
+ jj2000.j2k.entropy.decoder;resolution:=optional,
+ jj2000.j2k.fileformat.reader;resolution:=optional,
+ jj2000.j2k.image;resolution:=optional,
+ jj2000.j2k.image.invcomptransf;resolution:=optional,
+ jj2000.j2k.image.output;resolution:=optional,
+ jj2000.j2k.io;resolution:=optional,
+ jj2000.j2k.quantization.dequantizer;resolution:=optional,
+ jj2000.j2k.roi;resolution:=optional,
+ jj2000.j2k.util;resolution:=optional,
+ jj2000.j2k.wavelet.synthesis;resolution:=optional,
+ org.itadaki.bzip2;resolution:=optional,
+ org.jsoup;resolution:=optional,
+ org.jsoup.nodes;resolution:=optional,
+ org.jsoup.select;resolution:=optional,
+ opennlp.maxent;resolution:=optional,
+ opennlp.tools.namefind;resolution:=optional,
+ net.didion.jwnl;resolution:=optional,
+ org.joda.time;resolution:=optional,
+ org.joda.time.chrono;resolution:=optional,
+ org.joda.time.field;resolution:=optional,
+ org.joda.time.format;resolution:=optional,
+ org.apache.http;resolution:=optional,
+ org.apache.http.auth;resolution:=optional,
+ org.apache.http.client;resolution:=optional,
+ org.apache.http.client.entity;resolution:=optional,
+ org.apache.http.client.methods;resolution:=optional,
+ org.apache.http.conn;resolution:=optional,
+ org.apache.http.conn.scheme;resolution:=optional,
+ org.apache.http.cookie;resolution:=optional,
+ org.apache.http.entity;resolution:=optional,
+ org.apache.http.impl.client;resolution:=optional,
+ org.apache.http.impl.conn;resolution:=optional,
+ org.apache.http.message;resolution:=optional,
+ org.apache.http.params;resolution:=optional,
+ org.apache.http.protocol;resolution:=optional,
+ org.apache.http.util;resolution:=optional
+ </Import-Package>
+ </instructions>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-text-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-text-bundle/pom.xml b/tika-parser-bundles/tika-parser-text-bundle/pom.xml
index bf4e14a..31d06ac 100644
--- a/tika-parser-bundles/tika-parser-text-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-text-bundle/pom.xml
@@ -1,79 +1,79 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-bundles</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-text-bundle</artifactId>
- <packaging>bundle</packaging>
- <name>Apache Tika parser text bundle</name>
- <url>http://tika.apache.org/</url>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-text-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- <plugin>
- <groupId>org.apache.felix</groupId>
- <artifactId>maven-bundle-plugin</artifactId>
- <extensions>true</extensions>
- <configuration>
- <instructions>
- <Bundle-Activator>org.apache.tika.module.text.internal.Activator</Bundle-Activator>
- <Embed-Dependency>
- tika-parser-text-module;inline=true,
- juniversalchardet;inline=true,
- commons-codec;inline=true,
- commons-io;inline=true
- </Embed-Dependency>
- <Embed-Transitive>true</Embed-Transitive>
- <Export-Package>
- org.apache.tika.parser.strings.*,
- org.apache.tika.parser.txt.*,
- org.apache.tika.parser.audio.*,
- org.apache.tika.parser.xml.*
- </Export-Package>
- <Import-Package>
- *,
- javax.servlet.annotation;resolution:=optional,
- javax.servlet;resolution:=optional,
- javax.servlet.http;resolution:=optional
- </Import-Package>
- </instructions>
- </configuration>
- </plugin>
- <plugin>
- <artifactId>maven-failsafe-plugin</artifactId>
- </plugin>
- <plugin>
- <artifactId>maven-assembly-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-bundles</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-text-bundle</artifactId>
+ <packaging>bundle</packaging>
+ <name>Apache Tika parser text bundle</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-text-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <Bundle-Activator>org.apache.tika.module.text.internal.Activator</Bundle-Activator>
+ <Embed-Dependency>
+ tika-parser-text-module;inline=true,
+ juniversalchardet;inline=true,
+ commons-codec;inline=true,
+ commons-io;inline=true
+ </Embed-Dependency>
+ <Embed-Transitive>true</Embed-Transitive>
+ <Export-Package>
+ org.apache.tika.parser.strings.*,
+ org.apache.tika.parser.txt.*,
+ org.apache.tika.parser.audio.*,
+ org.apache.tika.parser.xml.*
+ </Export-Package>
+ <Import-Package>
+ *,
+ javax.servlet.annotation;resolution:=optional,
+ javax.servlet;resolution:=optional,
+ javax.servlet.http;resolution:=optional
+ </Import-Package>
+ </instructions>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-bundles/tika-parser-web-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-web-bundle/pom.xml b/tika-parser-bundles/tika-parser-web-bundle/pom.xml
index 72d22da..a23267d 100644
--- a/tika-parser-bundles/tika-parser-web-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-web-bundle/pom.xml
@@ -1,93 +1,93 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-bundles</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-web-bundle</artifactId>
- <packaging>bundle</packaging>
- <name>Apache Tika parser web bundle</name>
- <url>http://tika.apache.org/</url>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-web-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- <plugin>
- <groupId>org.apache.felix</groupId>
- <artifactId>maven-bundle-plugin</artifactId>
- <extensions>true</extensions>
- <configuration>
- <instructions>
- <Bundle-Activator>org.apache.tika.module.web.internal.Activator</Bundle-Activator>
- <Embed-Dependency>
- tika-parser-web-module;inline=true,
- tagsoup;inline=true,
- boilerpipe;inline=true,
- rome;inline=true,
- rome-utils;inline=true,
- apache-mime4j-core;inline=true,
- apache-mime4j-dom;inline=true,
- commons-io;inline=true
- </Embed-Dependency>
- <Embed-Transitive>true</Embed-Transitive>
- <Export-Package>
- org.apache.tika.parser.feed.*,
- org.apache.tika.parser.html.*,
- org.apache.tika.parser.iptc.*,
- org.apache.tika.parser.mail.*
- </Export-Package>
- <Import-Package>
- *,
- org.apache.xerces.parsers;resolution:=optional,
- org.apache.xerces.util;resolution:=optional,
- org.apache.xerces.xni;resolution:=optional,
- org.apache.xerces.xni.parser;resolution:=optional,
- org.cyberneko.html.xercesbridge;resolution:=optional,
- org.jdom;resolution:=optional,
- org.jdom.input;resolution:=optional,
- org.jdom.output;resolution:=optional,
- org.jdom2;resolution:=optional,
- org.jdom2.input;resolution:=optional,
- org.jdom2.input.sax;resolution:=optional,
- org.jdom2.output;resolution:=optional,
- org.jdom2.filter;resolution:=optional,
- </Import-Package>
- </instructions>
- </configuration>
- </plugin>
- <plugin>
- <artifactId>maven-failsafe-plugin</artifactId>
- </plugin>
- <plugin>
- <artifactId>maven-assembly-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-bundles</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-web-bundle</artifactId>
+ <packaging>bundle</packaging>
+ <name>Apache Tika parser web bundle</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-web-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <Bundle-Activator>org.apache.tika.module.web.internal.Activator</Bundle-Activator>
+ <Embed-Dependency>
+ tika-parser-web-module;inline=true,
+ tagsoup;inline=true,
+ boilerpipe;inline=true,
+ rome;inline=true,
+ rome-utils;inline=true,
+ apache-mime4j-core;inline=true,
+ apache-mime4j-dom;inline=true,
+ commons-io;inline=true
+ </Embed-Dependency>
+ <Embed-Transitive>true</Embed-Transitive>
+ <Export-Package>
+ org.apache.tika.parser.feed.*,
+ org.apache.tika.parser.html.*,
+ org.apache.tika.parser.iptc.*,
+ org.apache.tika.parser.mail.*
+ </Export-Package>
+ <Import-Package>
+ *,
+ org.apache.xerces.parsers;resolution:=optional,
+ org.apache.xerces.util;resolution:=optional,
+ org.apache.xerces.xni;resolution:=optional,
+ org.apache.xerces.xni.parser;resolution:=optional,
+ org.cyberneko.html.xercesbridge;resolution:=optional,
+ org.jdom;resolution:=optional,
+ org.jdom.input;resolution:=optional,
+ org.jdom.output;resolution:=optional,
+ org.jdom2;resolution:=optional,
+ org.jdom2.input;resolution:=optional,
+ org.jdom2.input.sax;resolution:=optional,
+ org.jdom2.output;resolution:=optional,
+ org.jdom2.filter;resolution:=optional,
+ </Import-Package>
+ </instructions>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/pom.xml b/tika-parser-modules/pom.xml
index eca38f1..6912f8b 100644
--- a/tika-parser-modules/pom.xml
+++ b/tika-parser-modules/pom.xml
@@ -1,206 +1,206 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parent</artifactId>
- <version>2.0-SNAPSHOT</version>
- <relativePath>../tika-parent/pom.xml</relativePath>
- </parent>
-
- <artifactId>tika-parser-modules</artifactId>
- <packaging>pom</packaging>
- <name>Apache Tika parser modules</name>
- <url>http://tika.apache.org/</url>
-
- <properties>
- <poi.version>3.15-beta1</poi.version>
- <!-- NOTE: sync codec version with POI -->
- <codec.version>1.10</codec.version>
- <pdfbox.version>2.0.2</pdfbox.version>
- <jempbox.version>1.8.12</jempbox.version>
- <!-- used by POI, PDFBox and Jackcess ...try to sync -->
- <bouncycastle.version>1.54</bouncycastle.version>
- <commons.exec>1.3</commons.exec>
- </properties>
-
- <modules>
- <module>tika-parser-advanced-module</module>
- <module>tika-parser-cad-module</module>
- <module>tika-parser-code-module</module>
- <module>tika-parser-crypto-module</module>
- <module>tika-parser-database-module</module>
- <module>tika-parser-ebook-module</module>
- <module>tika-parser-journal-module</module>
- <module>tika-parser-multimedia-module</module>
- <module>tika-parser-office-module</module>
- <module>tika-parser-package-module</module>
- <module>tika-parser-pdf-module</module>
- <module>tika-parser-scientific-module</module>
- <module>tika-parser-text-module</module>
- <module>tika-parser-web-module</module>
- <module>tika-parser-xmp-commons</module>
- </modules>
-
- <dependencies>
- <dependency>
- <groupId>org.osgi</groupId>
- <artifactId>org.osgi.core</artifactId>
- <scope>provided</scope>
- <optional>true</optional>
- </dependency>
- <dependency>
- <groupId>org.osgi</groupId>
- <artifactId>org.osgi.compendium</artifactId>
- <scope>provided</scope>
- <optional>true</optional>
- </dependency>
- <!-- Test dependencies -->
- <dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- <type>test-jar</type>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-test-resources</artifactId>
- <version>${project.version}</version>
- <type>test-jar</type>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.mockito</groupId>
- <artifactId>mockito-core</artifactId>
- <version>1.7</version>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.ops4j.pax.exam</groupId>
- <artifactId>pax-exam-junit4</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.ops4j.pax.exam</groupId>
- <artifactId>pax-exam-container-native</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.felix</groupId>
- <artifactId>org.apache.felix.framework</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.ops4j.pax.exam</groupId>
- <artifactId>pax-exam-link-assembly</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.ops4j.pax.url</groupId>
- <artifactId>pax-url-aether</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-simple</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>javax.inject</groupId>
- <artifactId>javax.inject</artifactId>
- <scope>test</scope>
- </dependency>
- </dependencies>
- <build>
- <pluginManagement>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-jar-plugin</artifactId>
- <executions>
- <execution>
- <phase>package</phase>
- <goals>
- <goal>jar</goal>
- </goals>
- <configuration>
- <useDefaultManifestFile>true</useDefaultManifestFile>
- <includes>
- <include>org/apache/tika/**</include>
- <include>META-INF/**</include>
- </includes>
- </configuration>
- </execution>
- </executions>
- </plugin>
- <plugin>
- <artifactId>maven-failsafe-plugin</artifactId>
- <executions>
- <execution>
- <goals>
- <goal>integration-test</goal>
- <goal>verify</goal>
- </goals>
- </execution>
- </executions>
- <configuration>
- <systemPropertyVariables>
- <org.ops4j.pax.logging.DefaultServiceLog.level>
- WARN
- </org.ops4j.pax.logging.DefaultServiceLog.level>
- </systemPropertyVariables>
- <systemProperties>
- <property>
- <name>project.bundle.file</name>
- <value>target/${project.build.finalName}-bundle.jar</value>
- </property>
- </systemProperties>
- </configuration>
- </plugin>
- <plugin>
- <artifactId>maven-assembly-plugin</artifactId>
- <executions>
- <execution>
- <phase>pre-integration-test</phase>
- <goals>
- <goal>single</goal>
- </goals>
- <configuration>
- <descriptor>test-bundles.xml</descriptor>
- <finalName>test</finalName>
- <attach>false</attach>
- </configuration>
- </execution>
- </executions>
- </plugin>
- </plugins>
- </pluginManagement>
- </build>
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parent</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ <relativePath>../tika-parent/pom.xml</relativePath>
+ </parent>
+
+ <artifactId>tika-parser-modules</artifactId>
+ <packaging>pom</packaging>
+ <name>Apache Tika parser modules</name>
+ <url>http://tika.apache.org/</url>
+
+ <properties>
+ <poi.version>3.15-beta1</poi.version>
+ <!-- NOTE: sync codec version with POI -->
+ <codec.version>1.10</codec.version>
+ <pdfbox.version>2.0.2</pdfbox.version>
+ <jempbox.version>1.8.12</jempbox.version>
+ <!-- used by POI, PDFBox and Jackcess ...try to sync -->
+ <bouncycastle.version>1.54</bouncycastle.version>
+ <commons.exec>1.3</commons.exec>
+ </properties>
+
+ <modules>
+ <module>tika-parser-advanced-module</module>
+ <module>tika-parser-cad-module</module>
+ <module>tika-parser-code-module</module>
+ <module>tika-parser-crypto-module</module>
+ <module>tika-parser-database-module</module>
+ <module>tika-parser-ebook-module</module>
+ <module>tika-parser-journal-module</module>
+ <module>tika-parser-multimedia-module</module>
+ <module>tika-parser-office-module</module>
+ <module>tika-parser-package-module</module>
+ <module>tika-parser-pdf-module</module>
+ <module>tika-parser-scientific-module</module>
+ <module>tika-parser-text-module</module>
+ <module>tika-parser-web-module</module>
+ <module>tika-parser-xmp-commons</module>
+ </modules>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.osgi</groupId>
+ <artifactId>org.osgi.core</artifactId>
+ <scope>provided</scope>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.osgi</groupId>
+ <artifactId>org.osgi.compendium</artifactId>
+ <scope>provided</scope>
+ <optional>true</optional>
+ </dependency>
+ <!-- Test dependencies -->
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-test-resources</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.mockito</groupId>
+ <artifactId>mockito-core</artifactId>
+ <version>1.7</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.ops4j.pax.exam</groupId>
+ <artifactId>pax-exam-junit4</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.ops4j.pax.exam</groupId>
+ <artifactId>pax-exam-container-native</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>org.apache.felix.framework</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.ops4j.pax.exam</groupId>
+ <artifactId>pax-exam-link-assembly</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.ops4j.pax.url</groupId>
+ <artifactId>pax-url-aether</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-simple</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>javax.inject</groupId>
+ <artifactId>javax.inject</artifactId>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+ <build>
+ <pluginManagement>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <executions>
+ <execution>
+ <phase>package</phase>
+ <goals>
+ <goal>jar</goal>
+ </goals>
+ <configuration>
+ <useDefaultManifestFile>true</useDefaultManifestFile>
+ <includes>
+ <include>org/apache/tika/**</include>
+ <include>META-INF/**</include>
+ </includes>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ <executions>
+ <execution>
+ <goals>
+ <goal>integration-test</goal>
+ <goal>verify</goal>
+ </goals>
+ </execution>
+ </executions>
+ <configuration>
+ <systemPropertyVariables>
+ <org.ops4j.pax.logging.DefaultServiceLog.level>
+ WARN
+ </org.ops4j.pax.logging.DefaultServiceLog.level>
+ </systemPropertyVariables>
+ <systemProperties>
+ <property>
+ <name>project.bundle.file</name>
+ <value>target/${project.build.finalName}-bundle.jar</value>
+ </property>
+ </systemProperties>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <executions>
+ <execution>
+ <phase>pre-integration-test</phase>
+ <goals>
+ <goal>single</goal>
+ </goals>
+ <configuration>
+ <descriptor>test-bundles.xml</descriptor>
+ <finalName>test</finalName>
+ <attach>false</attach>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </pluginManagement>
+ </build>
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-advanced-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-advanced-module/pom.xml b/tika-parser-modules/tika-parser-advanced-module/pom.xml
index 2e02904..3263fab 100644
--- a/tika-parser-modules/tika-parser-advanced-module/pom.xml
+++ b/tika-parser-modules/tika-parser-advanced-module/pom.xml
@@ -1,69 +1,69 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-modules</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-advanced-module</artifactId>
- <name>Apache Tika parser advanced module</name>
- <url>http://tika.apache.org/</url>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>commons-io</groupId>
- <artifactId>commons-io</artifactId>
- <version>${commons.io.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.opennlp</groupId>
- <artifactId>opennlp-tools</artifactId>
- <version>1.5.3</version>
- </dependency>
- <dependency>
- <groupId>org.json</groupId>
- <artifactId>json</artifactId>
- <version>20140107</version>
- </dependency>
- <!-- Apache cTAKES -->
- <dependency>
- <groupId>org.apache.ctakes</groupId>
- <artifactId>ctakes-core</artifactId>
- <version>3.2.2</version>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-log4j12</artifactId>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-advanced-module</artifactId>
+ <name>Apache Tika parser advanced module</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>${commons.io.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp-tools</artifactId>
+ <version>1.5.3</version>
+ </dependency>
+ <dependency>
+ <groupId>org.json</groupId>
+ <artifactId>json</artifactId>
+ <version>20140107</version>
+ </dependency>
+ <!-- Apache cTAKES -->
+ <dependency>
+ <groupId>org.apache.ctakes</groupId>
+ <artifactId>ctakes-core</artifactId>
+ <version>3.2.2</version>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
[02/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index fadb6e9..3adaeee 100644
--- a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -1,1131 +1,1131 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import static java.nio.charset.StandardCharsets.ISO_8859_1;
-import static java.nio.charset.StandardCharsets.US_ASCII;
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-
-import javax.xml.transform.OutputKeys;
-import javax.xml.transform.sax.SAXTransformerFactory;
-import javax.xml.transform.sax.TransformerHandler;
-import javax.xml.transform.stream.StreamResult;
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.StringWriter;
-import java.io.Writer;
-import java.nio.charset.StandardCharsets;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.regex.Pattern;
-
-import org.apache.tika.Tika;
-import org.apache.tika.TikaTest;
-import org.apache.tika.detect.EncodingDetector;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Geographic;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.LinkContentHandler;
-import org.apache.tika.sax.TeeContentHandler;
-import org.ccil.cowan.tagsoup.HTMLSchema;
-import org.ccil.cowan.tagsoup.Schema;
-import org.junit.Ignore;
-import org.junit.Test;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.Locator;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-public class HtmlParserTest extends TikaTest {
-
- @Test
- public void testParseAscii() throws Exception {
- String path = "/test-documents/testHTML.html";
- final StringWriter href = new StringWriter();
- final StringWriter name = new StringWriter();
- ContentHandler body = new BodyContentHandler();
- Metadata metadata = new Metadata();
- try (InputStream stream = HtmlParserTest.class.getResourceAsStream(path)) {
- ContentHandler link = new DefaultHandler() {
- @Override
- public void startElement(
- String u, String l, String n, Attributes a)
- throws SAXException {
- if ("a".equals(l)) {
- if (a.getValue("href") != null) {
- href.append(a.getValue("href"));
- } else if (a.getValue("name") != null) {
- name.append(a.getValue("name"));
- }
- }
- }
- };
- new HtmlParser().parse(
- stream, new TeeContentHandler(body, link),
- metadata, new ParseContext());
- }
-
- assertEquals(
- "Title : Test Indexation Html", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Tika Developers", metadata.get("Author"));
- assertEquals("5", metadata.get("refresh"));
-
- assertEquals("51.2312", metadata.get(Geographic.LATITUDE));
- assertEquals("-5.1987", metadata.get(Geographic.LONGITUDE));
-
- assertEquals("http://www.apache.org/", href.toString());
- assertEquals("test-anchor", name.toString());
-
- String content = body.toString();
- assertTrue(
- "Did not contain expected text:" + "Test Indexation Html",
- content.contains("Test Indexation Html"));
- assertTrue(
- "Did not contain expected text:" + "Indexation du fichier",
- content.contains("Indexation du fichier"));
- }
-
- @Test
- @Ignore("The file 'testXHTML_utf8.html' is not available fo testing")
- public void XtestParseUTF8() throws IOException, SAXException, TikaException {
- String path = "/test-documents/testXHTML_utf8.html";
- Metadata metadata = new Metadata();
- String content = new Tika().parseToString(
- HtmlParserTest.class.getResourceAsStream(path), metadata);
-
- assertTrue("Did not contain expected text:"
- + "Title : Tilte with UTF-8 chars \u221a\u2202\u221a�\u221a\u2022", content
- .contains("Title : Tilte with UTF-8 chars \u221a\u2202\u221a�\u221a\u2022"));
-
- assertTrue("Did not contain expected text:"
- + "Content with UTF-8 chars", content
- .contains("Content with UTF-8 chars"));
-
- assertTrue("Did not contain expected text:" + "\u221a\u2022\u221a�\u221a\u2202", content
- .contains("\u221a\u2022\u221a�\u221a\u2202"));
- }
-
- @Test
- public void testXhtmlParsing() throws Exception {
- String path = "/test-documents/testXHTML.html";
- Metadata metadata = new Metadata();
- String content = new Tika().parseToString(
- HtmlParserTest.class.getResourceAsStream(path), metadata);
-
- //can't specify charset because default differs between OS's
- assertTrue(metadata.get(Metadata.CONTENT_TYPE).startsWith("application/xhtml+xml; charset="));
- assertEquals("XHTML test document", metadata.get(TikaCoreProperties.TITLE));
-
- assertEquals("Tika Developers", metadata.get("Author"));
- assertEquals("5", metadata.get("refresh"));
- assertContains("ability of Apache Tika", content);
- assertContains("extract content", content);
- assertContains("an XHTML document", content);
- }
-
- @Test
- public void testParseEmpty() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- new HtmlParser().parse(
- new ByteArrayInputStream(new byte[0]),
- handler, new Metadata(), new ParseContext());
- assertEquals("", handler.toString());
- }
-
- /**
- * Test case for TIKA-210
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a>
- */
- @Test
- public void testCharactersDirectlyUnderBodyElement() throws Exception {
- String test = "<html><body>test</body></html>";
- String content = new Tika().parseToString(
- new ByteArrayInputStream(test.getBytes(UTF_8)));
- assertEquals("test", content);
- }
-
- /**
- * Test case for TIKA-287
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-287">TIKA-287</a>
- */
- @Test
- public void testBaseHref() throws Exception {
- assertRelativeLink(
- "http://lucene.apache.org/tika/",
- "http://lucene.apache.org/", "tika/");
-
- assertRelativeLink(
- "http://domain.com/?pid=1",
- "http://domain.com", "?pid=1");
- assertRelativeLink(
- "http://domain.com/?pid=2",
- "http://domain.com?pid=1", "?pid=2");
-
- assertRelativeLink(
- "http://domain.com/file.html",
- "http://domain.com/path/", "/file.html");
- assertRelativeLink(
- "http://domain.com/path/file.html",
- "http://domain.com/path/", "./file.html");
- assertRelativeLink(
- "http://domain.com/path/file.html",
- "http://domain.com/path/", "file.html");
-
- assertRelativeLink(
- "http://domain2.com/newpath",
- "http://domain.com/path/to/file", "http://domain2.com/newpath");
-
- // See http://www.communities.hp.com/securitysoftware/blogs/jeff/archive/2007/12/19/RFC-1808-vs-2396-vs-3986_3A00_-Browsers-vs.-programing-languages.aspx
- // Also http://www.ietf.org/rfc/rfc3986.txt
- // Also http://issues.apache.org/jira/browse/NUTCH-566
- // Also http://issues.apache.org/jira/browse/NUTCH-436
- assertRelativeLink(
- "http://domain.com/path/?pid=1",
- "http://domain.com/path/", "?pid=1");
- assertRelativeLink(
- "http://domain.com/file?pid=1",
- "http://domain.com/file", "?pid=1");
- assertRelativeLink(
- "http://domain.com/path/d;p?pid=1",
- "http://domain.com/path/d;p?q#f", "?pid=1");
- }
-
- private void assertRelativeLink(String url, String base, String relative)
- throws Exception {
- String test =
- "<html><head><base href=\"" + base + "\"></head>"
- + "<body><a href=\"" + relative + "\">test</a></body></html>";
- final List<String> links = new ArrayList<String>();
- new HtmlParser().parse(
- new ByteArrayInputStream(test.getBytes(UTF_8)),
- new DefaultHandler() {
- @Override
- public void startElement(
- String u, String l, String name, Attributes atts) {
- if (name.equals("a") && atts.getValue("", "href") != null) {
- links.add(atts.getValue("", "href"));
- }
- }
- },
- new Metadata(),
- new ParseContext());
- assertEquals(1, links.size());
- assertEquals(url, links.get(0));
- }
-
- /**
- * Test case for TIKA-268
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-268">TIKA-268</a>
- */
- @Test
- public void testWhitespaceBetweenTableCells() throws Exception {
- String test =
- "<html><body><table><tr><td>a</td><td>b</td></table></body></html>";
- String content = new Tika().parseToString(
- new ByteArrayInputStream(test.getBytes(UTF_8)));
- assertContains("a", content);
- assertContains("b", content);
- assertFalse(content.contains("ab"));
- }
-
- /**
- * Test case for TIKA-332
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-332">TIKA-332</a>
- */
- @Test
- public void testHttpEquivCharset() throws Exception {
- String test =
- "<html><head><meta http-equiv=\"content-type\""
- + " content=\"text/html; charset=ISO-8859-1\" />"
- + "<title>the name is \u00e1ndre</title>"
- + "</head><body></body></html>";
- Metadata metadata = new Metadata();
- new HtmlParser().parse(
- new ByteArrayInputStream(test.getBytes(ISO_8859_1)),
- new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
- }
-
- /**
- * Test case for TIKA-892
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-892">TIKA-892</a>
- */
- @Test
- public void testHtml5Charset() throws Exception {
- String test =
- "<html><head><meta charset=\"ISO-8859-15\" />"
- + "<title>the name is \u00e1ndre</title>"
- + "</head><body></body></html>";
- Metadata metadata = new Metadata();
- new HtmlParser().parse(
- new ByteArrayInputStream(test.getBytes(ISO_8859_1)),
- new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
- }
-
- /**
- * Test case for TIKA-334
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-334">TIKA-334</a>
- */
- @Test
- public void testDetectOfCharset() throws Exception {
- String test =
- "<html><head><title>\u017d</title></head><body></body></html>";
- Metadata metadata = new Metadata();
- new HtmlParser().parse(
- new ByteArrayInputStream(test.getBytes(UTF_8)),
- new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("\u017d", metadata.get(TikaCoreProperties.TITLE));
- }
-
- /**
- * Test case for TIKA-341
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
- */
- @Test
- public void testUsingCharsetInContentTypeHeader() throws Exception {
- final String test =
- "<html><head><title>the name is \u00e1ndre</title></head>"
- + "<body></body></html>";
-
- Metadata metadata = new Metadata();
- new HtmlParser().parse(
- new ByteArrayInputStream(test.getBytes(UTF_8)),
- new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
-
- metadata = new Metadata();
- metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
- new HtmlParser().parse(
- new ByteArrayInputStream(test.getBytes(ISO_8859_1)),
- new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
- }
-
- /**
- * Test case for HTML content like
- * ">div<foo>br<bar>/div>" that should result
- * in three whitespace-separated tokens "foo", "bar" and "baz" instead
- * of a single token "foobarbaz".
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-343">TIKA-343</a>
- */
- @Test
- public void testLineBreak() throws Exception {
- String test = "<html><body><div>foo<br>bar</div>baz</body></html>";
- String text = new Tika().parseToString(
- new ByteArrayInputStream(test.getBytes(US_ASCII)));
- String[] parts = text.trim().split("\\s+");
- assertEquals(3, parts.length);
- assertEquals("foo", parts[0]);
- assertEquals("bar", parts[1]);
- assertEquals("baz", parts[2]);
- }
-
- /**
- * Test case for TIKA-339: Don't use language returned by CharsetDetector
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-339">TIKA-339</a>
- */
- @Test
- public void testIgnoreCharsetDetectorLanguage() throws Exception {
- String test = "<html><title>Simple Content</title><body></body></html>";
- Metadata metadata = new Metadata();
- metadata.add(Metadata.CONTENT_LANGUAGE, "en");
- new HtmlParser().parse(
- new ByteArrayInputStream(test.getBytes(UTF_8)),
- new BodyContentHandler(), metadata, new ParseContext());
-
- assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
- }
-
- /**
- * Test case for TIKA-349
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-349">TIKA-349</a>
- */
- @Test
- public void testHttpEquivCharsetFunkyAttributes() throws Exception {
- String test1 =
- "<html><head><meta http-equiv=\"content-type\""
- + " content=\"text/html; charset=ISO-8859-15; charset=iso-8859-15\" />"
- + "<title>the name is \u00e1ndre</title>"
- + "</head><body></body></html>";
- Metadata metadata = new Metadata();
- new HtmlParser().parse(
- new ByteArrayInputStream(test1.getBytes(ISO_8859_1)),
- new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
-
- // Some HTML pages have errors like ';;' versus '; ' as separator
- String test2 =
- "<html><head><meta http-equiv=\"content-type\""
- + " content=\"text/html;;charset=ISO-8859-15\" />"
- + "<title>the name is \u00e1ndre</title>"
- + "</head><body></body></html>";
- metadata = new Metadata();
- new HtmlParser().parse(
- new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
- new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
- }
-
- /**
- * Test case for TIKA-350
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-350">TIKA-350</a>
- */
- @Test
- public void testUsingFunkyCharsetInContentTypeHeader() throws Exception {
- final String test =
- "<html><head><title>the name is \u00e1ndre</title></head>"
- + "<body></body></html>";
-
- Metadata metadata = new Metadata();
- new HtmlParser().parse(
- new ByteArrayInputStream(test.getBytes(UTF_8)),
- new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
-
- metadata = new Metadata();
- metadata.set(Metadata.CONTENT_TYPE, "charset=ISO-8859-1;text/html");
- new HtmlParser().parse(
- new ByteArrayInputStream(test.getBytes(ISO_8859_1)),
- new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
- }
-
-
- /**
- * Test case for TIKA-357
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-357">TIKA-357</a>
- */
- @Test
- public void testMetaHttpEquivWithLotsOfPreambleText() throws Exception {
- String path = "/test-documents/big-preamble.html";
- Metadata metadata = new Metadata();
- new HtmlParser().parse(
- HtmlParserTest.class.getResourceAsStream(path),
- new BodyContentHandler(), metadata, new ParseContext());
-
- assertEquals("windows-1251", metadata.get(Metadata.CONTENT_ENCODING));
- }
-
- /**
- * Test case for TIKA-420
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-420">TIKA-420</a>
- */
- @Test
- public void testBoilerplateRemoval() throws Exception {
- String path = "/test-documents/boilerplate.html";
-
- Metadata metadata = new Metadata();
- BodyContentHandler handler = new BodyContentHandler();
- new HtmlParser().parse(
- HtmlParserTest.class.getResourceAsStream(path),
- new BoilerpipeContentHandler(handler), metadata, new ParseContext());
-
- String content = handler.toString();
- assertTrue(content.startsWith("This is the real meat"));
- assertTrue(content.endsWith("This is the end of the text.\n"));
- assertFalse(content.contains("boilerplate"));
- assertFalse(content.contains("footer"));
- }
-
- /**
- * Test case for TIKA-478. Don't emit <head> sub-elements inside of <body>.
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-478">TIKA-478</a>
- */
- @Test
- public void testElementOrdering() throws Exception {
- final String test = "<html><head><title>Title</title>" +
- "<meta http-equiv=\"content-type\" content=\"text/html\">" +
- "<link rel=\"next\" href=\"next.html\" />" +
- "</head><body><p>Simple Content</p></body></html>";
-
- StringWriter sw = new StringWriter();
- new HtmlParser().parse(
- new ByteArrayInputStream(test.getBytes(UTF_8)),
- makeHtmlTransformer(sw), new Metadata(), new ParseContext());
-
- String result = sw.toString();
-
- // Title element in <head> section
- assertTrue(Pattern.matches("(?s)<html.*<head>.*<title>Title</title>.*</head>.*$", result));
-
- // No meta elements in body
- assertFalse(Pattern.matches("(?s).*<body>.*<meta. *</body>.*$", result));
-
- // meta elements should show up in <head> section
- assertTrue(Pattern.matches("(?s)<html.*<head>.*<meta .*</head>.*$", result));
-
- // No link elements in body
- assertFalse(Pattern.matches("(?s).*<body>.*<link .*</body>.*$", result));
-
- // link element should be in <head> section
- assertTrue(Pattern.matches("(?s)<html.*<head>.*<link .*</head>.*$", result));
-
- // There should be ending elements.
- assertTrue(Pattern.matches("(?s).*</body>.*</html>$", result));
-
- }
-
- /**
- * Test case for TIKA-463. Don't skip elements that have URLs.
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
- */
- @Test
- public void testImgUrlExtraction() throws Exception {
- final String test = "<html><head><title>Title</title>" +
- "<base href=\"http://domain.com\" />" +
- "</head><body><img src=\"image.jpg\" /></body></html>";
-
- StringWriter sw = new StringWriter();
- new HtmlParser().parse(
- new ByteArrayInputStream(test.getBytes(UTF_8)),
- makeHtmlTransformer(sw), new Metadata(), new ParseContext());
-
- String result = sw.toString();
-
- // <img> tag should exist, with fully resolved URL
- assertTrue(Pattern.matches("(?s).*src=\"http://domain.com/image.jpg\".*$", result));
- }
-
- /**
- * Test case for TIKA-463. Don't skip elements that have URLs.
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
- */
- @Test
- public void testFrameSrcExtraction() throws Exception {
- final String test = "<html><head><title>Title</title>" +
- "<base href=\"http://domain.com\" />" +
- "</head><frameset><frame src=\"frame.html\" /></frameset></html>";
-
- StringWriter sw = new StringWriter();
- new HtmlParser().parse(
- new ByteArrayInputStream(test.getBytes(UTF_8)),
- makeHtmlTransformer(sw), new Metadata(), new ParseContext());
-
- String result = sw.toString();
-
- // <frame> tag should exist, with fully resolved URL
- assertTrue(Pattern.matches("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$", result));
- }
-
- /**
- * Test case for TIKA-463. Don't skip elements that have URLs.
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
- */
- @Test
- public void testIFrameSrcExtraction() throws Exception {
- final String test = "<html><head><title>Title</title>" +
- "<base href=\"http://domain.com\" />" +
- "</head><body><iframe src =\"framed.html\" width=\"100%\" height=\"300\">" +
- "<p>Your browser doesn't support iframes!</p></body></html>";
-
- StringWriter sw = new StringWriter();
- new HtmlParser().parse(
- new ByteArrayInputStream(test.getBytes(UTF_8)),
- makeHtmlTransformer(sw), new Metadata(), new ParseContext());
-
- String result = sw.toString();
-
- // <iframe> tag should exist, with fully resolved URL
- assertTrue(Pattern.matches("(?s).*<iframe .* src=\"http://domain.com/framed.html\".*$", result));
- }
-
- /**
- * Test case for TIKA-463. Don't skip elements that have URLs.
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
- */
- @Test
- public void testAreaExtraction() throws Exception {
- final String test = "<html><head><title>Title</title>" +
- "<base href=\"http://domain.com\" />" +
- "</head><body><p><map name=\"map\" id=\"map\">" +
- "<area shape=\"rect\" href=\"map.html\" alt=\"\" />" +
- "</map></p></body></html>";
-
- StringWriter sw = new StringWriter();
- new HtmlParser().parse(
- new ByteArrayInputStream(test.getBytes(UTF_8)),
- makeHtmlTransformer(sw), new Metadata(), new ParseContext());
-
- String result = sw.toString();
-
- // <map> tag should exist, with <area> tag with fully resolved URL
- assertTrue(Pattern.matches("(?s).*<map .*<area .* href=\"http://domain.com/map.html\".*</map>.*$", result));
- }
-
- /**
- * Test case for TIKA-463. Don't skip elements that have URLs.
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
- */
- @Test
- public void testObjectExtraction() throws Exception {
- final String test = "<html><head><title>Title</title>" +
- "<base href=\"http://domain.com\" />" +
- "</head><body><p><object data=\"object.data\" type=\"text/html\">" +
- "<param name=\"name\" value=\"value\" />" +
- "</object></p></body></html>";
-
- StringWriter sw = new StringWriter();
- new HtmlParser().parse(
- new ByteArrayInputStream(test.getBytes(UTF_8)),
- makeHtmlTransformer(sw), new Metadata(), new ParseContext());
-
- String result = sw.toString();
-
- // <object> tag should exist with fully resolved URLs
- assertTrue(
- "<object> tag not correctly found in:\n" + result,
- Pattern.matches("(?s).*<object data=\"http://domain.com/object.data\".*<param .* name=\"name\" value=\"value\"/>.*</object>.*$", result)
- );
- }
-
- /**
- * Test case for change related to TIKA-463. Verify proper handling of <meta> tags.
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
- */
- @Test
- public void testMetaTagHandling() throws Exception {
- final String test = "<html><body><h1>header</h1><p>some text</p></body></html>";
-
- Metadata metadata = new Metadata();
- metadata.add("Content-Type", "text/html; charset=utf-8");
- metadata.add("Language", null);
-
- StringWriter sw = new StringWriter();
- new HtmlParser().parse(
- new ByteArrayInputStream(test.getBytes(UTF_8)),
- makeHtmlTransformer(sw), metadata, new ParseContext());
-
- String result = sw.toString();
-
- // <meta> tag for Content-Type should exist, but nothing for Language
- assertTrue(Pattern.matches("(?s).*<meta name=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>.*$", result));
- assertFalse(Pattern.matches("(?s).*<meta name=\"Language\".*$", result));
- }
-
- /**
- * Test case for TIKA-457. Better handling for broken HTML that has <frameset> inside of <body>.
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-457">TIKA-457</a>
- */
- @Test
- public void testBrokenFrameset() throws Exception {
- final String test1 = "<html><head><title>Title</title>" +
- "<base href=\"http://domain.com\" />" +
- "</head><body><frameset><frame src=\"frame.html\" /></frameset></body></html>";
-
- StringWriter sw1 = new StringWriter();
- new HtmlParser().parse(
- new ByteArrayInputStream(test1.getBytes(UTF_8)),
- makeHtmlTransformer(sw1), new Metadata(), new ParseContext());
-
- String result = sw1.toString();
-
- // <frame> tag should exist, with fully resolved URL
- assertTrue(Pattern.matches("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$", result));
-
- // <body> tag should not exist.
- assertFalse(Pattern.matches("(?s).*<body>.*$", result));
-
- // Test the example from the Nutch project.
- final String test2 = "<html><head><title> my title </title></head><body>" +
- "<frameset rows=\"20,*\"><frame src=\"top.html\"></frame>" +
- "<frameset cols=\"20,*\"><frame src=\"left.html\"></frame>" +
- "<frame src=\"invalid.html\"/></frame>" +
- "<frame src=\"right.html\"></frame>" +
- "</frameset></frameset></body></html>";
-
- StringWriter sw2 = new StringWriter();
- new HtmlParser().parse(
- new ByteArrayInputStream(test2.getBytes(UTF_8)),
- makeHtmlTransformer(sw2), new Metadata(), new ParseContext());
-
- result = sw2.toString();
-
- // <frame> tags should exist, with relative URL (no base element specified)
- assertTrue(Pattern.matches("(?s).*<frame .* src=\"top.html\"/>.*$", result));
- assertTrue(Pattern.matches("(?s).*<frame .* src=\"left.html\"/>.*$", result));
- assertTrue(Pattern.matches("(?s).*<frame .* src=\"invalid.html\"/>.*$", result));
- assertTrue(Pattern.matches("(?s).*<frame .* src=\"right.html\"/>.*$", result));
-
- // <body> tag should not exist.
- assertFalse(Pattern.matches("(?s).*<body>.*$", result));
- }
-
- /**
- * Test case for TIKA-480: fix NPE when using BodyContentHandler or HtmlTransformer
- * as delegate for BoilerpipeContentHandler
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-480">TIKA-480</a>
- */
- @Test
- public void testBoilerplateDelegation() throws Exception {
- String path = "/test-documents/boilerplate.html";
-
- Metadata metadata = new Metadata();
- StringWriter sw = new StringWriter();
- new HtmlParser().parse(
- HtmlParserTest.class.getResourceAsStream(path),
- makeHtmlTransformer(sw), metadata, new ParseContext());
-
- String content = sw.toString();
-
- // Should have <html>, <head>, <title>, <body> elements
- assertTrue(Pattern.matches("(?s).*<html xmlns=\"http://www.w3.org/1999/xhtml\">.*</html>.*$", content));
- assertTrue(Pattern.matches("(?s).*<head>.*</head>.*$", content));
- assertTrue(Pattern.matches("(?s).*<title>Title</title>.*$", content));
- assertTrue(Pattern.matches("(?s).*<body>.*</body>.*$", content));
- }
-
- /**
- * Test case for TIKA-481. Verify href in <link> is resolved.
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-481">TIKA-481</a>
- */
- @Test
- public void testLinkHrefResolution() throws Exception {
- final String test = "<html><head><title>Title</title>" +
- "<base href=\"http://domain.com\" />" +
- "<link rel=\"next\" href=\"next.html\" />" +
- "</head><body></body></html>";
-
- StringWriter sw = new StringWriter();
- new HtmlParser().parse(
- new ByteArrayInputStream(test.getBytes(UTF_8)),
- makeHtmlTransformer(sw), new Metadata(), new ParseContext());
-
- String result = sw.toString();
-
- // <link> tag should exist in <head>, with fully resolved URL
- assertTrue(Pattern.matches("(?s).*<head>.*<link rel=\"next\" href=\"http://domain.com/next.html\"/>.*</head>.*$", result));
- }
-
-
- /**
- * Create ContentHandler that transforms SAX events into textual HTML output,
- * and writes it out to <writer> - typically this is a StringWriter.
- *
- * @param writer Where to write resulting HTML text.
- * @return ContentHandler suitable for passing to parse() methods.
- * @throws Exception
- */
- private ContentHandler makeHtmlTransformer(Writer writer) throws Exception {
- SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
- TransformerHandler handler = factory.newTransformerHandler();
- handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
- handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
- handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8");
- handler.setResult(new StreamResult(writer));
- return handler;
- }
-
- /**
- * Test case for TIKA-564. Support returning markup from BoilerpipeContentHandler.
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-564">TIKA-564</a>
- */
- @Test
- public void testBoilerplateWithMarkup() throws Exception {
- String path = "/test-documents/boilerplate.html";
-
- Metadata metadata = new Metadata();
- StringWriter sw = new StringWriter();
- ContentHandler ch = makeHtmlTransformer(sw);
- BoilerpipeContentHandler bpch = new BoilerpipeContentHandler(ch);
- bpch.setIncludeMarkup(true);
-
- new HtmlParser().parse(
- HtmlParserTest.class.getResourceAsStream(path),
- bpch, metadata, new ParseContext());
-
- String content = sw.toString();
- assertTrue("Has empty table elements", content.contains("<body><table><tr><td><table><tr><td>"));
- assertTrue("Has empty a element", content.contains("<a shape=\"rect\" href=\"Main.php\"/>"));
- assertTrue("Has real content", content.contains("<p>This is the real meat"));
- assertTrue("Ends with appropriate HTML", content.endsWith("</p></body></html>"));
- assertFalse(content.contains("boilerplate"));
- assertFalse(content.contains("footer"));
- }
-
- /**
- * Test case for TIKA-434 - Pushback buffer overflow in TagSoup
- */
- @Test
- public void testPushback() throws IOException, TikaException {
- String content = new Tika().parseToString(
- HtmlParserTest.class.getResourceAsStream("/test-documents/tika434.html"), new Metadata());
- assertNotNull(content);
- }
-
- /**
- * Test case for TIKA-869
- * IdentityHtmlMapper needs to lower-case tag names.
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-869">TIKA-869</a>
- */
- @Test
- public void testIdentityMapper() throws Exception {
- final String html = "<html><head><title>Title</title></head>" +
- "<body></body></html>";
- Metadata metadata = new Metadata();
- ParseContext parseContext = new ParseContext();
- parseContext.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
-
- StringWriter sw = new StringWriter();
-
- new HtmlParser().parse(
- new ByteArrayInputStream(html.getBytes(UTF_8)),
- makeHtmlTransformer(sw), metadata, parseContext);
-
- String result = sw.toString();
- // Make sure we don't get <body><BODY/></body>
- assertTrue(Pattern.matches("(?s).*<body/>.*$", result));
- }
-
- /**
- * Test case for TIKA-889
- * XHTMLContentHandler wont emit newline when html element matches ENDLINE set.
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-889">TIKA-889</a>
- */
- @Test
- public void testNewlineAndIndent() throws Exception {
- final String html = "<html><head><title>Title</title></head>" +
- "<body><ul><li>one</li></ul></body></html>";
-
- BodyContentHandler handler = new BodyContentHandler();
- new HtmlParser().parse(
- new ByteArrayInputStream(html.getBytes(UTF_8)),
- handler, new Metadata(), new ParseContext());
-
- // Make sure we get <tab>, "one", newline, newline
- String result = handler.toString();
-
- assertTrue(Pattern.matches("\tone\n\n", result));
- }
-
- /**
- * Test case for TIKA-961
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-961">TIKA-961</a>
- */
- @Test
- public void testBoilerplateWhitespace() throws Exception {
- String path = "/test-documents/boilerplate-whitespace.html";
-
- Metadata metadata = new Metadata();
- BodyContentHandler handler = new BodyContentHandler();
-
- BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(handler);
- bpHandler.setIncludeMarkup(true);
-
- new HtmlParser().parse(
- HtmlParserTest.class.getResourceAsStream(path),
- bpHandler, metadata, new ParseContext());
-
- String content = handler.toString();
-
- // Should not contain item_aitem_b
- assertFalse(content.contains("item_aitem_b"));
-
- // Should contain the two list items with a newline in between.
- assertContains("item_a\nitem_b", content);
-
- // Should contain \u6709\u4ec0\u4e48\u9700\u8981\u6211\u5e2e\u4f60\u7684 (can i help you) without whitespace
- assertContains("\u6709\u4ec0\u4e48\u9700\u8981\u6211\u5e2e\u4f60\u7684", content);
- }
-
- /**
- * Test case for TIKA-983: HTML parser should add Open Graph meta tag data to Metadata returned by parser
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-983">TIKA-983</a>
- */
- @Test
- public void testOpenGraphMetadata() throws Exception {
- String test1 =
- "<html><head><meta property=\"og:description\""
- + " content=\"some description\" />"
- + "<meta property=\"og:image\" content=\"http://example.com/image1.jpg\" />"
- + "<meta property=\"og:image\" content=\"http://example.com/image2.jpg\" />"
- + "<title>hello</title>"
- + "</head><body></body></html>";
- Metadata metadata = new Metadata();
- new HtmlParser().parse(
- new ByteArrayInputStream(test1.getBytes(ISO_8859_1)),
- new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("some description", metadata.get("og:description"));
- assertTrue(metadata.isMultiValued("og:image"));
- }
-
- // TIKA-1011
- @Test
- public void testUserDefinedCharset() throws Exception {
- String content = new Tika().parseToString(
- HtmlParserTest.class.getResourceAsStream("/test-documents/testUserDefinedCharset.mhtml"), new Metadata());
- assertNotNull(content);
- }
-
- //TIKA-1001
- @Test
- public void testNoisyMetaCharsetHeaders() throws Exception {
- Tika tika = new Tika();
- String hit = "\u0623\u0639\u0631\u0628";
-
- for (int i = 1; i <= 4; i++) {
- String fileName = "/test-documents/testHTMLNoisyMetaEncoding_" + i + ".html";
- String content = tika.parseToString(
- HtmlParserTest.class.getResourceAsStream(fileName));
- assertTrue("testing: " + fileName, content.contains(hit));
- }
- }
-
- // TIKA-1193
- @Test
- public void testCustomHtmlSchema() throws Exception {
- // Default schema does not allow tables inside anchors
- String test = "<html><body><a><table><tr><td>text</tr></tr></table></a></body></html>";
-
- Metadata metadata = new Metadata();
- LinkContentHandler linkContentHandler = new LinkContentHandler();
-
- new HtmlParser().parse(
- new ByteArrayInputStream(test.getBytes(ISO_8859_1)),
- linkContentHandler, metadata, new ParseContext());
-
- // Expect no anchor text
- assertEquals("", linkContentHandler.getLinks().get(0).getText());
-
- // We'll change the schema to allow tables inside anchors!
- Schema schema = new HTMLSchema();
- schema.elementType("a", HTMLSchema.M_ANY, 65535, 0);
-
- ParseContext parseContext = new ParseContext();
- parseContext.set(Schema.class, schema);
- linkContentHandler = new LinkContentHandler();
- new HtmlParser().parse(
- new ByteArrayInputStream(test.getBytes(ISO_8859_1)),
- linkContentHandler, metadata, parseContext);
-
- // Expect anchor text
- assertEquals("\ttext\n\n", linkContentHandler.getLinks().get(0).getText());
- }
-
- /**
- * Test case for TIKA-820: Locator is unset for HTML parser
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-820">TIKA-820</a>
- */
- @Test
- public void testLocator() throws Exception {
- final int line = 0;
- final int col = 1;
- final int[] textPosition = new int[2];
-
- new HtmlParser().parse(HtmlParserTest.class.getResourceAsStream("/test-documents/testHTML.html"),
- new ContentHandler() {
- Locator locator;
-
- public void setDocumentLocator(Locator locator) {
- this.locator = locator;
- }
-
- public void startDocument() throws SAXException {
- }
-
- public void endDocument() throws SAXException {
- }
-
- public void startPrefixMapping(String prefix, String uri)
- throws SAXException {
- }
-
- public void endPrefixMapping(String prefix)
- throws SAXException {
- }
-
- public void startElement(String uri, String localName,
- String qName, Attributes atts) throws SAXException {
- }
-
- public void endElement(String uri, String localName,
- String qName) throws SAXException {
- }
-
- public void characters(char[] ch, int start, int length)
- throws SAXException {
- String text = new String(ch, start, length);
- if (text.equals("Test Indexation Html") && locator != null) {
- textPosition[line] = locator.getLineNumber();
- textPosition[col] = locator.getColumnNumber();
- }
- }
-
- public void ignorableWhitespace(char[] ch, int start,
- int length) throws SAXException {
- }
-
- public void processingInstruction(String target, String data)
- throws SAXException {
- }
-
- public void skippedEntity(String name) throws SAXException {
- }
- },
- new Metadata(),
- new ParseContext());
-
- // The text occurs at line 24 (if lines start at 0) or 25 (if lines start at 1).
- assertEquals(24, textPosition[line]);
- // The column reported seems fuzzy, just test it is close enough.
- assertTrue(Math.abs(textPosition[col] - 47) < 10);
- }
-
-
- /**
- * Test case for TIKA-1303: HTML parse should use the first title tag to set value in meta data
- * and ignore any subsequent title tags found in HTML.
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-1303">TIKA-1303</a>
- */
- @Test
- public void testFirstTitleValueisSetToMetadata() throws Exception {
- String test = "<html><title>Simple Content</title><body><h1></h1>"
- + "<title>TitleToIgnore</title></body></html>";
- Metadata metadata = new Metadata();
-
- new HtmlParser().parse(
- new ByteArrayInputStream(test.getBytes(UTF_8)),
- new BodyContentHandler(), metadata, new ParseContext());
-
- //Expecting first title to be set in meta data and second one to be ignored.
- assertEquals("Simple Content", metadata.get(TikaCoreProperties.TITLE));
- }
-
- @Test
- public void testMisleadingMetaContentTypeTags() throws Exception {
- //TIKA-1519
-
- String test = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-ELEVEN\">" +
- "</head><title>title</title><body>body</body></html>";
- Metadata metadata = new Metadata();
-
- new HtmlParser().parse(
- new ByteArrayInputStream(test.getBytes(UTF_8)),
- new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("text/html; charset=UTF-ELEVEN", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
- assertEquals("text/html; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
-
- test = "<html><head><meta http-equiv=\"content-type\" content=\"application/pdf\">" +
- "</head><title>title</title><body>body</body></html>";
- metadata = new Metadata();
-
- new HtmlParser().parse(
- new ByteArrayInputStream(test.getBytes(UTF_8)),
- new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("application/pdf", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
- assertEquals("text/html; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
-
- //test two content values
- test = "<html><head><meta http-equiv=\"content-type\" content=\"application/pdf\" content=\"application/ms-word\">" +
- "</head><title>title</title><body>body</body></html>";
- metadata = new Metadata();
-
- new HtmlParser().parse(
- new ByteArrayInputStream(test.getBytes(UTF_8)),
- new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("application/ms-word", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
- assertEquals("text/html; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
- }
-
- @Test
- public void testXHTMLWithMisleading() throws Exception {
- //first test an acceptable XHTML header with http-equiv tags
- String test = "<?xml version=\"1.0\" ?>" +
- "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" +
- "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n" +
- "<head>\n" +
- "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\" />\n" +
- "<title>title</title></head><body>body</body></html>";
- Metadata metadata = new Metadata();
- new AutoDetectParser().parse(
- new ByteArrayInputStream(test.getBytes(UTF_8)),
- new BodyContentHandler(), metadata, new ParseContext());
-
- assertEquals("text/html; charset=iso-8859-1", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
- assertEquals("application/xhtml+xml; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
-
- test = "<?xml version=\"1.0\" ?>" +
- "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" +
- "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n" +
- "<head>\n" +
- "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-NUMBER_SEVEN\" />\n" +
- "<title>title</title></head><body>body</body></html>";
- metadata = new Metadata();
- new AutoDetectParser().parse(
- new ByteArrayInputStream(test.getBytes(UTF_8)),
- new BodyContentHandler(), metadata, new ParseContext());
-
- assertEquals("text/html; charset=iso-NUMBER_SEVEN", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
- assertEquals("application/xhtml+xml; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
-
- }
-
- @Test
- public void testSkippingCommentsInEncodingDetection() throws Exception {
-
- byte[] bytes = new String("<html><head>" +
- "<!--<meta http-equiv=\"Content-Type\" content=\"text/html; charset=ISO-8859-1\"> -->\n" +
- " <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"+
- "</head>"+
- "<body>"+
- "\u6709\u4ec0\u4e48\u9700\u8981\u6211\u5e2e\u4f60\u7684" +
- "</body></html>").getBytes(StandardCharsets.UTF_8);
- EncodingDetector htmlEncodingDetector = new HtmlEncodingDetector();
- XMLResult r = getXML(new ByteArrayInputStream(bytes), new AutoDetectParser(), new Metadata());
- assertContains("\u6709\u4ec0\u4e48\u9700\u8981\u6211\u5e2e\u4f60\u7684", r.xml);
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
+import org.apache.tika.Tika;
+import org.apache.tika.TikaTest;
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Geographic;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.LinkContentHandler;
+import org.apache.tika.sax.TeeContentHandler;
+import org.ccil.cowan.tagsoup.HTMLSchema;
+import org.ccil.cowan.tagsoup.Schema;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class HtmlParserTest extends TikaTest {
+
+ @Test
+ public void testParseAscii() throws Exception {
+ String path = "/test-documents/testHTML.html";
+ final StringWriter href = new StringWriter();
+ final StringWriter name = new StringWriter();
+ ContentHandler body = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ try (InputStream stream = HtmlParserTest.class.getResourceAsStream(path)) {
+ ContentHandler link = new DefaultHandler() {
+ @Override
+ public void startElement(
+ String u, String l, String n, Attributes a)
+ throws SAXException {
+ if ("a".equals(l)) {
+ if (a.getValue("href") != null) {
+ href.append(a.getValue("href"));
+ } else if (a.getValue("name") != null) {
+ name.append(a.getValue("name"));
+ }
+ }
+ }
+ };
+ new HtmlParser().parse(
+ stream, new TeeContentHandler(body, link),
+ metadata, new ParseContext());
+ }
+
+ assertEquals(
+ "Title : Test Indexation Html", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Tika Developers", metadata.get("Author"));
+ assertEquals("5", metadata.get("refresh"));
+
+ assertEquals("51.2312", metadata.get(Geographic.LATITUDE));
+ assertEquals("-5.1987", metadata.get(Geographic.LONGITUDE));
+
+ assertEquals("http://www.apache.org/", href.toString());
+ assertEquals("test-anchor", name.toString());
+
+ String content = body.toString();
+ assertTrue(
+ "Did not contain expected text:" + "Test Indexation Html",
+ content.contains("Test Indexation Html"));
+ assertTrue(
+ "Did not contain expected text:" + "Indexation du fichier",
+ content.contains("Indexation du fichier"));
+ }
+
+ @Test
+ @Ignore("The file 'testXHTML_utf8.html' is not available fo testing")
+ public void XtestParseUTF8() throws IOException, SAXException, TikaException {
+ String path = "/test-documents/testXHTML_utf8.html";
+ Metadata metadata = new Metadata();
+ String content = new Tika().parseToString(
+ HtmlParserTest.class.getResourceAsStream(path), metadata);
+
+ assertTrue("Did not contain expected text:"
+ + "Title : Tilte with UTF-8 chars \u221a\u2202\u221a�\u221a\u2022", content
+ .contains("Title : Tilte with UTF-8 chars \u221a\u2202\u221a�\u221a\u2022"));
+
+ assertTrue("Did not contain expected text:"
+ + "Content with UTF-8 chars", content
+ .contains("Content with UTF-8 chars"));
+
+ assertTrue("Did not contain expected text:" + "\u221a\u2022\u221a�\u221a\u2202", content
+ .contains("\u221a\u2022\u221a�\u221a\u2202"));
+ }
+
+ /**
+  * Parses the XHTML sample document and checks the reported content type,
+  * the title/author/refresh metadata and a few phrases of the extracted text.
+  */
+ @Test
+ public void testXhtmlParsing() throws Exception {
+     Metadata meta = new Metadata();
+     String text = new Tika().parseToString(
+             HtmlParserTest.class.getResourceAsStream("/test-documents/testXHTML.html"), meta);
+
+     // Only the prefix is compared: the charset part differs between OS's.
+     String contentType = meta.get(Metadata.CONTENT_TYPE);
+     assertTrue(contentType.startsWith("application/xhtml+xml; charset="));
+
+     assertEquals("XHTML test document", meta.get(TikaCoreProperties.TITLE));
+     assertEquals("Tika Developers", meta.get("Author"));
+     assertEquals("5", meta.get("refresh"));
+
+     assertContains("ability of Apache Tika", text);
+     assertContains("extract content", text);
+     assertContains("an XHTML document", text);
+ }
+
+ /**
+  * A zero-length document must be parsed without error and yield no body text.
+  */
+ @Test
+ public void testParseEmpty() throws Exception {
+     ByteArrayInputStream emptyDocument = new ByteArrayInputStream(new byte[0]);
+     ContentHandler bodyText = new BodyContentHandler();
+
+     new HtmlParser().parse(emptyDocument, bodyText, new Metadata(), new ParseContext());
+
+     assertEquals("", bodyText.toString());
+ }
+
+ /**
+ * Test case for TIKA-210
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a>
+ */
+ @Test
+ public void testCharactersDirectlyUnderBodyElement() throws Exception {
+ String test = "<html><body>test</body></html>";
+ String content = new Tika().parseToString(
+ new ByteArrayInputStream(test.getBytes(UTF_8)));
+ assertEquals("test", content);
+ }
+
+ /**
+ * Test case for TIKA-287
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-287">TIKA-287</a>
+ */
+ @Test
+ public void testBaseHref() throws Exception {
+ assertRelativeLink(
+ "http://lucene.apache.org/tika/",
+ "http://lucene.apache.org/", "tika/");
+
+ assertRelativeLink(
+ "http://domain.com/?pid=1",
+ "http://domain.com", "?pid=1");
+ assertRelativeLink(
+ "http://domain.com/?pid=2",
+ "http://domain.com?pid=1", "?pid=2");
+
+ assertRelativeLink(
+ "http://domain.com/file.html",
+ "http://domain.com/path/", "/file.html");
+ assertRelativeLink(
+ "http://domain.com/path/file.html",
+ "http://domain.com/path/", "./file.html");
+ assertRelativeLink(
+ "http://domain.com/path/file.html",
+ "http://domain.com/path/", "file.html");
+
+ assertRelativeLink(
+ "http://domain2.com/newpath",
+ "http://domain.com/path/to/file", "http://domain2.com/newpath");
+
+ // See http://www.communities.hp.com/securitysoftware/blogs/jeff/archive/2007/12/19/RFC-1808-vs-2396-vs-3986_3A00_-Browsers-vs.-programing-languages.aspx
+ // Also http://www.ietf.org/rfc/rfc3986.txt
+ // Also http://issues.apache.org/jira/browse/NUTCH-566
+ // Also http://issues.apache.org/jira/browse/NUTCH-436
+ assertRelativeLink(
+ "http://domain.com/path/?pid=1",
+ "http://domain.com/path/", "?pid=1");
+ assertRelativeLink(
+ "http://domain.com/file?pid=1",
+ "http://domain.com/file", "?pid=1");
+ assertRelativeLink(
+ "http://domain.com/path/d;p?pid=1",
+ "http://domain.com/path/d;p?q#f", "?pid=1");
+ }
+
+ private void assertRelativeLink(String url, String base, String relative)
+ throws Exception {
+ String test =
+ "<html><head><base href=\"" + base + "\"></head>"
+ + "<body><a href=\"" + relative + "\">test</a></body></html>";
+ final List<String> links = new ArrayList<String>();
+ new HtmlParser().parse(
+ new ByteArrayInputStream(test.getBytes(UTF_8)),
+ new DefaultHandler() {
+ @Override
+ public void startElement(
+ String u, String l, String name, Attributes atts) {
+ if (name.equals("a") && atts.getValue("", "href") != null) {
+ links.add(atts.getValue("", "href"));
+ }
+ }
+ },
+ new Metadata(),
+ new ParseContext());
+ assertEquals(1, links.size());
+ assertEquals(url, links.get(0));
+ }
+
+ /**
+ * Test case for TIKA-268
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-268">TIKA-268</a>
+ */
+ @Test
+ public void testWhitespaceBetweenTableCells() throws Exception {
+ String test =
+ "<html><body><table><tr><td>a</td><td>b</td></table></body></html>";
+ String content = new Tika().parseToString(
+ new ByteArrayInputStream(test.getBytes(UTF_8)));
+ assertContains("a", content);
+ assertContains("b", content);
+ assertFalse(content.contains("ab"));
+ }
+
+ /**
+  * Test case for TIKA-332: the charset declared in a
+  * {@code <meta http-equiv="content-type">} element is reported as the
+  * content encoding.
+  *
+  * @see <a href="https://issues.apache.org/jira/browse/TIKA-332">TIKA-332</a>
+  */
+ @Test
+ public void testHttpEquivCharset() throws Exception {
+     String html =
+             "<html><head><meta http-equiv=\"content-type\""
+                     + " content=\"text/html; charset=ISO-8859-1\" />"
+                     + "<title>the name is \u00e1ndre</title>"
+                     + "</head><body></body></html>";
+     ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(ISO_8859_1));
+     Metadata meta = new Metadata();
+
+     new HtmlParser().parse(input, new BodyContentHandler(), meta, new ParseContext());
+
+     assertEquals("ISO-8859-1", meta.get(Metadata.CONTENT_ENCODING));
+ }
+
+ /**
+  * Test case for TIKA-892: the HTML5 style {@code <meta charset="...">}
+  * declaration is reported as the content encoding.
+  *
+  * @see <a href="https://issues.apache.org/jira/browse/TIKA-892">TIKA-892</a>
+  */
+ @Test
+ public void testHtml5Charset() throws Exception {
+     String html =
+             "<html><head><meta charset=\"ISO-8859-15\" />"
+                     + "<title>the name is \u00e1ndre</title>"
+                     + "</head><body></body></html>";
+     ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(ISO_8859_1));
+     Metadata meta = new Metadata();
+
+     new HtmlParser().parse(input, new BodyContentHandler(), meta, new ParseContext());
+
+     assertEquals("ISO-8859-15", meta.get(Metadata.CONTENT_ENCODING));
+ }
+
+ /**
+  * Test case for TIKA-334: with no charset declared anywhere, a non-ASCII
+  * title encoded as UTF-8 must come back unchanged in the metadata.
+  *
+  * @see <a href="https://issues.apache.org/jira/browse/TIKA-334">TIKA-334</a>
+  */
+ @Test
+ public void testDetectOfCharset() throws Exception {
+     String html =
+             "<html><head><title>\u017d</title></head><body></body></html>";
+     ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));
+     Metadata meta = new Metadata();
+
+     new HtmlParser().parse(input, new BodyContentHandler(), meta, new ParseContext());
+
+     assertEquals("\u017d", meta.get(TikaCoreProperties.TITLE));
+ }
+
+ /**
+  * Test case for TIKA-341: a charset supplied through the Content-Type
+  * metadata entry is reported as the content encoding.
+  *
+  * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
+  */
+ @Test
+ public void testUsingCharsetInContentTypeHeader() throws Exception {
+     final String html =
+             "<html><head><title>the name is \u00e1ndre</title></head>"
+                     + "<body></body></html>";
+
+     // No hint at all: UTF-8 bytes are reported as UTF-8.
+     Metadata withoutHint = new Metadata();
+     new HtmlParser().parse(
+             new ByteArrayInputStream(html.getBytes(UTF_8)),
+             new BodyContentHandler(), withoutHint, new ParseContext());
+     assertEquals("UTF-8", withoutHint.get(Metadata.CONTENT_ENCODING));
+
+     // Charset given in the Content-Type metadata entry.
+     Metadata withHint = new Metadata();
+     withHint.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
+     new HtmlParser().parse(
+             new ByteArrayInputStream(html.getBytes(ISO_8859_1)),
+             new BodyContentHandler(), withHint, new ParseContext());
+     assertEquals("ISO-8859-1", withHint.get(Metadata.CONTENT_ENCODING));
+ }
+
+ /**
+  * Test case for HTML content like
+  * "&lt;div&gt;foo&lt;br&gt;bar&lt;/div&gt;baz" that should result
+  * in three whitespace-separated tokens "foo", "bar" and "baz" instead
+  * of a single token "foobarbaz".
+  *
+  * @see <a href="https://issues.apache.org/jira/browse/TIKA-343">TIKA-343</a>
+  */
+ @Test
+ public void testLineBreak() throws Exception {
+     String test = "<html><body><div>foo<br>bar</div>baz</body></html>";
+     String text = new Tika().parseToString(
+             new ByteArrayInputStream(test.getBytes(US_ASCII)));
+
+     // Splitting on runs of whitespace must give exactly the three words.
+     String[] parts = text.trim().split("\\s+");
+     assertEquals(3, parts.length);
+     assertEquals("foo", parts[0]);
+     assertEquals("bar", parts[1]);
+     assertEquals("baz", parts[2]);
+ }
+
+ /**
+  * Test case for TIKA-339: Don't use language returned by CharsetDetector.
+  * A Content-Language value set before parsing must still be there afterwards.
+  *
+  * @see <a href="https://issues.apache.org/jira/browse/TIKA-339">TIKA-339</a>
+  */
+ @Test
+ public void testIgnoreCharsetDetectorLanguage() throws Exception {
+     String html = "<html><title>Simple Content</title><body></body></html>";
+     ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));
+
+     Metadata meta = new Metadata();
+     meta.add(Metadata.CONTENT_LANGUAGE, "en");
+
+     new HtmlParser().parse(input, new BodyContentHandler(), meta, new ParseContext());
+
+     assertEquals("en", meta.get(Metadata.CONTENT_LANGUAGE));
+ }
+
+ /**
+  * Test case for TIKA-349: malformed http-equiv content values (a repeated
+  * charset parameter, a doubled separator) still yield the declared charset.
+  *
+  * @see <a href="https://issues.apache.org/jira/browse/TIKA-349">TIKA-349</a>
+  */
+ @Test
+ public void testHttpEquivCharsetFunkyAttributes() throws Exception {
+     // The charset parameter appears twice, with different case.
+     String repeatedCharset =
+             "<html><head><meta http-equiv=\"content-type\""
+                     + " content=\"text/html; charset=ISO-8859-15; charset=iso-8859-15\" />"
+                     + "<title>the name is \u00e1ndre</title>"
+                     + "</head><body></body></html>";
+     Metadata first = new Metadata();
+     new HtmlParser().parse(
+             new ByteArrayInputStream(repeatedCharset.getBytes(ISO_8859_1)),
+             new BodyContentHandler(), first, new ParseContext());
+     assertEquals("ISO-8859-15", first.get(Metadata.CONTENT_ENCODING));
+
+     // Some HTML pages have errors like ';;' versus '; ' as separator
+     String doubledSeparator =
+             "<html><head><meta http-equiv=\"content-type\""
+                     + " content=\"text/html;;charset=ISO-8859-15\" />"
+                     + "<title>the name is \u00e1ndre</title>"
+                     + "</head><body></body></html>";
+     Metadata second = new Metadata();
+     new HtmlParser().parse(
+             new ByteArrayInputStream(doubledSeparator.getBytes(ISO_8859_1)),
+             new BodyContentHandler(), second, new ParseContext());
+     assertEquals("ISO-8859-15", second.get(Metadata.CONTENT_ENCODING));
+ }
+
+ /**
+  * Test case for TIKA-350: a Content-Type metadata value with the charset
+  * parameter placed before the media type still yields that charset.
+  *
+  * @see <a href="https://issues.apache.org/jira/browse/TIKA-350">TIKA-350</a>
+  */
+ @Test
+ public void testUsingFunkyCharsetInContentTypeHeader() throws Exception {
+     final String html =
+             "<html><head><title>the name is \u00e1ndre</title></head>"
+                     + "<body></body></html>";
+
+     // No hint at all: UTF-8 bytes are reported as UTF-8.
+     Metadata withoutHint = new Metadata();
+     new HtmlParser().parse(
+             new ByteArrayInputStream(html.getBytes(UTF_8)),
+             new BodyContentHandler(), withoutHint, new ParseContext());
+     assertEquals("UTF-8", withoutHint.get(Metadata.CONTENT_ENCODING));
+
+     // Parameters in the wrong order: charset first, media type second.
+     Metadata withHint = new Metadata();
+     withHint.set(Metadata.CONTENT_TYPE, "charset=ISO-8859-1;text/html");
+     new HtmlParser().parse(
+             new ByteArrayInputStream(html.getBytes(ISO_8859_1)),
+             new BodyContentHandler(), withHint, new ParseContext());
+     assertEquals("ISO-8859-1", withHint.get(Metadata.CONTENT_ENCODING));
+ }
+
+
+ /**
+  * Test case for TIKA-357: the encoding reported for the big-preamble sample
+  * document must be windows-1251.
+  *
+  * @see <a href="https://issues.apache.org/jira/browse/TIKA-357">TIKA-357</a>
+  */
+ @Test
+ public void testMetaHttpEquivWithLotsOfPreambleText() throws Exception {
+     String path = "/test-documents/big-preamble.html";
+     Metadata metadata = new Metadata();
+     // parse() leaves the stream open, so it is closed here.
+     try (java.io.InputStream stream = HtmlParserTest.class.getResourceAsStream(path)) {
+         new HtmlParser().parse(
+                 stream, new BodyContentHandler(), metadata, new ParseContext());
+     }
+
+     assertEquals("windows-1251", metadata.get(Metadata.CONTENT_ENCODING));
+ }
+
+ /**
+  * Test case for TIKA-420: with a BoilerpipeContentHandler in front of the
+  * body handler, only the main text of the sample page remains.
+  *
+  * @see <a href="https://issues.apache.org/jira/browse/TIKA-420">TIKA-420</a>
+  */
+ @Test
+ public void testBoilerplateRemoval() throws Exception {
+     String path = "/test-documents/boilerplate.html";
+
+     Metadata metadata = new Metadata();
+     BodyContentHandler handler = new BodyContentHandler();
+     // parse() leaves the stream open, so it is closed here.
+     try (java.io.InputStream stream = HtmlParserTest.class.getResourceAsStream(path)) {
+         new HtmlParser().parse(
+                 stream, new BoilerpipeContentHandler(handler), metadata, new ParseContext());
+     }
+
+     String content = handler.toString();
+     assertTrue(content.startsWith("This is the real meat"));
+     assertTrue(content.endsWith("This is the end of the text.\n"));
+     assertFalse(content.contains("boilerplate"));
+     assertFalse(content.contains("footer"));
+ }
+
+ /**
+  * Test case for TIKA-478. Don't emit &lt;head&gt; sub-elements inside of &lt;body&gt;.
+  *
+  * @see <a href="https://issues.apache.org/jira/browse/TIKA-478">TIKA-478</a>
+  */
+ @Test
+ public void testElementOrdering() throws Exception {
+     final String test = "<html><head><title>Title</title>" +
+             "<meta http-equiv=\"content-type\" content=\"text/html\">" +
+             "<link rel=\"next\" href=\"next.html\" />" +
+             "</head><body><p>Simple Content</p></body></html>";
+
+     StringWriter sw = new StringWriter();
+     new HtmlParser().parse(
+             new ByteArrayInputStream(test.getBytes(UTF_8)),
+             makeHtmlTransformer(sw), new Metadata(), new ParseContext());
+
+     String result = sw.toString();
+
+     // Title element in <head> section
+     assertTrue(Pattern.matches("(?s)<html.*<head>.*<title>Title</title>.*</head>.*$", result));
+
+     // No meta elements in body. The pattern previously read "<meta. *</body>"
+     // (space on the wrong side of the dot), which could practically never
+     // match and so made this check vacuous.
+     assertFalse(Pattern.matches("(?s).*<body>.*<meta .*</body>.*$", result));
+
+     // meta elements should show up in <head> section
+     assertTrue(Pattern.matches("(?s)<html.*<head>.*<meta .*</head>.*$", result));
+
+     // No link elements in body
+     assertFalse(Pattern.matches("(?s).*<body>.*<link .*</body>.*$", result));
+
+     // link element should be in <head> section
+     assertTrue(Pattern.matches("(?s)<html.*<head>.*<link .*</head>.*$", result));
+
+     // There should be ending elements.
+     assertTrue(Pattern.matches("(?s).*</body>.*</html>$", result));
+ }
+
+ /**
+  * Test case for TIKA-463. Don't skip elements that have URLs: the
+  * {@code src} of an {@code <img>} is emitted resolved against the base.
+  *
+  * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
+  */
+ @Test
+ public void testImgUrlExtraction() throws Exception {
+     final String html = "<html><head><title>Title</title>" +
+             "<base href=\"http://domain.com\" />" +
+             "</head><body><img src=\"image.jpg\" /></body></html>";
+     ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));
+
+     StringWriter out = new StringWriter();
+     new HtmlParser().parse(input, makeHtmlTransformer(out), new Metadata(), new ParseContext());
+     String markup = out.toString();
+
+     // <img> tag should exist, with fully resolved URL
+     assertTrue(Pattern.matches("(?s).*src=\"http://domain.com/image.jpg\".*$", markup));
+ }
+
+ /**
+  * Test case for TIKA-463. Don't skip elements that have URLs: the
+  * {@code src} of a {@code <frame>} is emitted resolved against the base.
+  *
+  * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
+  */
+ @Test
+ public void testFrameSrcExtraction() throws Exception {
+     final String html = "<html><head><title>Title</title>" +
+             "<base href=\"http://domain.com\" />" +
+             "</head><frameset><frame src=\"frame.html\" /></frameset></html>";
+     ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));
+
+     StringWriter out = new StringWriter();
+     new HtmlParser().parse(input, makeHtmlTransformer(out), new Metadata(), new ParseContext());
+     String markup = out.toString();
+
+     // <frame> tag should exist, with fully resolved URL
+     assertTrue(Pattern.matches("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$", markup));
+ }
+
+ /**
+  * Test case for TIKA-463. Don't skip elements that have URLs: the
+  * {@code src} of an {@code <iframe>} is emitted resolved against the base.
+  *
+  * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
+  */
+ @Test
+ public void testIFrameSrcExtraction() throws Exception {
+     final String html = "<html><head><title>Title</title>" +
+             "<base href=\"http://domain.com\" />" +
+             "</head><body><iframe src =\"framed.html\" width=\"100%\" height=\"300\">" +
+             "<p>Your browser doesn't support iframes!</p></body></html>";
+     ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));
+
+     StringWriter out = new StringWriter();
+     new HtmlParser().parse(input, makeHtmlTransformer(out), new Metadata(), new ParseContext());
+     String markup = out.toString();
+
+     // <iframe> tag should exist, with fully resolved URL
+     assertTrue(Pattern.matches("(?s).*<iframe .* src=\"http://domain.com/framed.html\".*$", markup));
+ }
+
+ /**
+  * Test case for TIKA-463. Don't skip elements that have URLs: the
+  * {@code href} of an {@code <area>} inside a {@code <map>} is emitted
+  * resolved against the base.
+  *
+  * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
+  */
+ @Test
+ public void testAreaExtraction() throws Exception {
+     final String html = "<html><head><title>Title</title>" +
+             "<base href=\"http://domain.com\" />" +
+             "</head><body><p><map name=\"map\" id=\"map\">" +
+             "<area shape=\"rect\" href=\"map.html\" alt=\"\" />" +
+             "</map></p></body></html>";
+     ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));
+
+     StringWriter out = new StringWriter();
+     new HtmlParser().parse(input, makeHtmlTransformer(out), new Metadata(), new ParseContext());
+     String markup = out.toString();
+
+     // <map> tag should exist, with <area> tag with fully resolved URL
+     assertTrue(Pattern.matches("(?s).*<map .*<area .* href=\"http://domain.com/map.html\".*</map>.*$", markup));
+ }
+
+ /**
+ * Test case for TIKA-463. Don't skip elements that have URLs.
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
+ */
+ @Test
+ public void testObjectExtraction() throws Exception {
+ final String test = "<html><head><title>Title</title>" +
+ "<base href=\"http://domain.com\" />" +
+ "</head><body><p><object data=\"object.data\" type=\"text/html\">" +
+ "<param name=\"name\" value=\"value\" />" +
+ "</object></p></body></html>";
+
+ StringWriter sw = new StringWriter();
+ new HtmlParser().parse(
+ new ByteArrayInputStream(test.getBytes(UTF_8)),
+ makeHtmlTransformer(sw), new Metadata(), new ParseContext());
+
+ String result = sw.toString();
+
+ // <object> tag should exist with fully resolved URLs
+ assertTrue(
+ "<object> tag not correctly found in:\n" + result,
+ Pattern.matches("(?s).*<object data=\"http://domain.com/object.data\".*<param .* name=\"name\" value=\"value\"/>.*</object>.*$", result)
+ );
+ }
+
+ /**
+  * Test case for change related to TIKA-463. Verify proper handling of
+  * &lt;meta&gt; tags: a metadata entry with a value is written out as a
+  * meta element, an entry added with a null value is not.
+  *
+  * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
+  */
+ @Test
+ public void testMetaTagHandling() throws Exception {
+     final String html = "<html><body><h1>header</h1><p>some text</p></body></html>";
+     ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));
+
+     Metadata meta = new Metadata();
+     meta.add("Content-Type", "text/html; charset=utf-8");
+     meta.add("Language", null);
+
+     StringWriter out = new StringWriter();
+     new HtmlParser().parse(input, makeHtmlTransformer(out), meta, new ParseContext());
+     String markup = out.toString();
+
+     // <meta> tag for Content-Type should exist, but nothing for Language
+     assertTrue(Pattern.matches("(?s).*<meta name=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>.*$", markup));
+     assertFalse(Pattern.matches("(?s).*<meta name=\"Language\".*$", markup));
+ }
+
+ /**
+  * Test case for TIKA-457. Better handling for broken HTML that has
+  * &lt;frameset&gt; inside of &lt;body&gt;: the frames are kept and no
+  * &lt;body&gt; element is written.
+  *
+  * @see <a href="https://issues.apache.org/jira/browse/TIKA-457">TIKA-457</a>
+  */
+ @Test
+ public void testBrokenFrameset() throws Exception {
+     // Case 1: a single frame, with a base URL to resolve against.
+     final String singleFrame = "<html><head><title>Title</title>" +
+             "<base href=\"http://domain.com\" />" +
+             "</head><body><frameset><frame src=\"frame.html\" /></frameset></body></html>";
+
+     StringWriter firstOut = new StringWriter();
+     new HtmlParser().parse(
+             new ByteArrayInputStream(singleFrame.getBytes(UTF_8)),
+             makeHtmlTransformer(firstOut), new Metadata(), new ParseContext());
+     String markup = firstOut.toString();
+
+     // <frame> tag should exist, with fully resolved URL
+     assertTrue(Pattern.matches("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$", markup));
+
+     // <body> tag should not exist.
+     assertFalse(Pattern.matches("(?s).*<body>.*$", markup));
+
+     // Case 2: the example from the Nutch project.
+     final String nestedFrames = "<html><head><title> my title </title></head><body>" +
+             "<frameset rows=\"20,*\"><frame src=\"top.html\"></frame>" +
+             "<frameset cols=\"20,*\"><frame src=\"left.html\"></frame>" +
+             "<frame src=\"invalid.html\"/></frame>" +
+             "<frame src=\"right.html\"></frame>" +
+             "</frameset></frameset></body></html>";
+
+     StringWriter secondOut = new StringWriter();
+     new HtmlParser().parse(
+             new ByteArrayInputStream(nestedFrames.getBytes(UTF_8)),
+             makeHtmlTransformer(secondOut), new Metadata(), new ParseContext());
+     markup = secondOut.toString();
+
+     // <frame> tags should exist, with relative URL (no base element specified)
+     for (String page : new String[] {"top", "left", "invalid", "right"}) {
+         assertTrue(Pattern.matches("(?s).*<frame .* src=\"" + page + ".html\"/>.*$", markup));
+     }
+
+     // <body> tag should not exist.
+     assertFalse(Pattern.matches("(?s).*<body>.*$", markup));
+ }
+
+ /**
+  * Test case for TIKA-480: fix NPE when using BodyContentHandler or HtmlTransformer
+  * as delegate for BoilerpipeContentHandler
+  *
+  * NOTE(review): the handler passed to the parser here is the HTML
+  * transformer itself, not wrapped in a BoilerpipeContentHandler; confirm
+  * whether the wrapper was meant to be part of this test.
+  *
+  * @see <a href="https://issues.apache.org/jira/browse/TIKA-480">TIKA-480</a>
+  */
+ @Test
+ public void testBoilerplateDelegation() throws Exception {
+     String path = "/test-documents/boilerplate.html";
+
+     Metadata metadata = new Metadata();
+     StringWriter sw = new StringWriter();
+     // parse() leaves the stream open, so it is closed here.
+     try (java.io.InputStream stream = HtmlParserTest.class.getResourceAsStream(path)) {
+         new HtmlParser().parse(
+                 stream, makeHtmlTransformer(sw), metadata, new ParseContext());
+     }
+
+     String content = sw.toString();
+
+     // Should have <html>, <head>, <title>, <body> elements
+     assertTrue(Pattern.matches("(?s).*<html xmlns=\"http://www.w3.org/1999/xhtml\">.*</html>.*$", content));
+     assertTrue(Pattern.matches("(?s).*<head>.*</head>.*$", content));
+     assertTrue(Pattern.matches("(?s).*<title>Title</title>.*$", content));
+     assertTrue(Pattern.matches("(?s).*<body>.*</body>.*$", content));
+ }
+
+ /**
+  * Test case for TIKA-481. Verify href in &lt;link&gt; is resolved against
+  * the base and that the element stays inside &lt;head&gt;.
+  *
+  * @see <a href="https://issues.apache.org/jira/browse/TIKA-481">TIKA-481</a>
+  */
+ @Test
+ public void testLinkHrefResolution() throws Exception {
+     final String html = "<html><head><title>Title</title>" +
+             "<base href=\"http://domain.com\" />" +
+             "<link rel=\"next\" href=\"next.html\" />" +
+             "</head><body></body></html>";
+     ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));
+
+     StringWriter out = new StringWriter();
+     new HtmlParser().parse(input, makeHtmlTransformer(out), new Metadata(), new ParseContext());
+     String markup = out.toString();
+
+     // <link> tag should exist in <head>, with fully resolved URL
+     assertTrue(Pattern.matches("(?s).*<head>.*<link rel=\"next\" href=\"http://domain.com/next.html\"/>.*</head>.*$", markup));
+ }
+
+
+ /**
+  * Builds a ContentHandler that serializes the SAX events it receives as
+  * HTML text into the given writer (typically a StringWriter). The output
+  * method is "html", indentation is off and the encoding is "utf-8".
+  *
+  * @param writer Where to write resulting HTML text.
+  * @return ContentHandler suitable for passing to parse() methods.
+  * @throws Exception if the transformer handler cannot be created
+  */
+ private ContentHandler makeHtmlTransformer(Writer writer) throws Exception {
+     SAXTransformerFactory saxFactory =
+             (SAXTransformerFactory) SAXTransformerFactory.newInstance();
+     TransformerHandler serializer = saxFactory.newTransformerHandler();
+
+     // Output settings are applied before the result is attached.
+     serializer.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
+     serializer.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
+     serializer.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8");
+
+     StreamResult target = new StreamResult(writer);
+     serializer.setResult(target);
+     return serializer;
+ }
+
+ /**
+  * Test case for TIKA-564. Support returning markup from BoilerpipeContentHandler.
+  *
+  * @see <a href="https://issues.apache.org/jira/browse/TIKA-564">TIKA-564</a>
+  */
+ @Test
+ public void testBoilerplateWithMarkup() throws Exception {
+     String path = "/test-documents/boilerplate.html";
+
+     Metadata metadata = new Metadata();
+     StringWriter sw = new StringWriter();
+     ContentHandler ch = makeHtmlTransformer(sw);
+     BoilerpipeContentHandler bpch = new BoilerpipeContentHandler(ch);
+     bpch.setIncludeMarkup(true);
+
+     // parse() leaves the stream open, so it is closed here.
+     try (java.io.InputStream stream = HtmlParserTest.class.getResourceAsStream(path)) {
+         new HtmlParser().parse(stream, bpch, metadata, new ParseContext());
+     }
+
+     String content = sw.toString();
+     assertTrue("Has empty table elements", content.contains("<body><table><tr><td><table><tr><td>"));
+     assertTrue("Has empty a element", content.contains("<a shape=\"rect\" href=\"Main.php\"/>"));
+     assertTrue("Has real content", content.contains("<p>This is the real meat"));
+     assertTrue("Ends with appropriate HTML", content.endsWith("</p></body></html>"));
+     assertFalse(content.contains("boilerplate"));
+     assertFalse(content.contains("footer"));
+ }
+
+ /**
+  * Test case for TIKA-434 - Pushback buffer overflow in TagSoup.
+  * The sample document only has to parse to a non-null string.
+  */
+ @Test
+ public void testPushback() throws IOException, TikaException {
+     Tika facade = new Tika();
+     String text = facade.parseToString(
+             HtmlParserTest.class.getResourceAsStream("/test-documents/tika434.html"),
+             new Metadata());
+     assertNotNull(text);
+ }
+
+ /**
+  * Test case for TIKA-869
+  * IdentityHtmlMapper needs to lower-case tag names.
+  *
+  * @see <a href="https://issues.apache.org/jira/browse/TIKA-869">TIKA-869</a>
+  */
+ @Test
+ public void testIdentityMapper() throws Exception {
+     final String source = "<html><head><title>Title</title></head>" +
+             "<body></body></html>";
+     ByteArrayInputStream input = new ByteArrayInputStream(source.getBytes(UTF_8));
+
+     // Swap the default mapper for the identity mapper.
+     ParseContext context = new ParseContext();
+     context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
+
+     StringWriter out = new StringWriter();
+     new HtmlParser().parse(input, makeHtmlTransformer(out), new Metadata(), context);
+
+     // Make sure we don't get <body><BODY/></body>
+     String markup = out.toString();
+     assertTrue(Pattern.matches("(?s).*<body/>.*$", markup));
+ }
+
+ /**
+  * Test case for TIKA-889
+  * XHTMLContentHandler won't emit newline when html element matches ENDLINE set.
+  *
+  * @see <a href="https://issues.apache.org/jira/browse/TIKA-889">TIKA-889</a>
+  */
+ @Test
+ public void testNewlineAndIndent() throws Exception {
+     final String html = "<html><head><title>Title</title></head>" +
+             "<body><ul><li>one</li></ul></body></html>";
+
+     BodyContentHandler handler = new BodyContentHandler();
+     new HtmlParser().parse(
+             new ByteArrayInputStream(html.getBytes(UTF_8)),
+             handler, new Metadata(), new ParseContext());
+
+     // Make sure we get <tab>, "one", newline, newline.
+     // The expected text contains no regex metacharacters, so a plain equality
+     // check is equivalent to the full-match regex and reports the actual
+     // value on failure.
+     String result = handler.toString();
+
+     assertEquals("\tone\n\n", result);
+ }
+
+ /**
+  * Test case for TIKA-961: with markup included, the BoilerpipeContentHandler
+  * must keep list items on separate lines and must not insert whitespace
+  * into the Chinese phrase.
+  *
+  * @see <a href="https://issues.apache.org/jira/browse/TIKA-961">TIKA-961</a>
+  */
+ @Test
+ public void testBoilerplateWhitespace() throws Exception {
+     String path = "/test-documents/boilerplate-whitespace.html";
+
+     Metadata metadata = new Metadata();
+     BodyContentHandler handler = new BodyContentHandler();
+
+     BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(handler);
+     bpHandler.setIncludeMarkup(true);
+
+     // parse() leaves the stream open, so it is closed here.
+     try (java.io.InputStream stream = HtmlParserTest.class.getResourceAsStream(path)) {
+         new HtmlParser().parse(stream, bpHandler, metadata, new ParseContext());
+     }
+
+     String content = handler.toString();
+
+     // Should not contain item_aitem_b
+     assertFalse(content.contains("item_aitem_b"));
+
+     // Should contain the two list items with a newline in between.
+     assertContains("item_a\nitem_b", content);
+
+     // Should contain the Chinese phrase ("can i help you") without whitespace
+     assertContains("\u6709\u4ec0\u4e48\u9700\u8981\u6211\u5e2e\u4f60\u7684", content);
+ }
+
+ /**
+  * Test case for TIKA-983: HTML parser should add Open Graph meta tag data to
+  * Metadata returned by parser. A repeated property must end up multi-valued.
+  *
+  * @see <a href="https://issues.apache.org/jira/browse/TIKA-983">TIKA-983</a>
+  */
+ @Test
+ public void testOpenGraphMetadata() throws Exception {
+     String html =
+             "<html><head><meta property=\"og:description\""
+                     + " content=\"some description\" />"
+                     + "<meta property=\"og:image\" content=\"http://example.com/image1.jpg\" />"
+                     + "<meta property=\"og:image\" content=\"http://example.com/image2.jpg\" />"
+                     + "<title>hello</title>"
+                     + "</head><body></body></html>";
+     ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(ISO_8859_1));
+     Metadata meta = new Metadata();
+
+     new HtmlParser().parse(input, new BodyContentHandler(), meta, new ParseContext());
+
+     assertEquals("some description", meta.get("og:description"));
+     // og:image appears twice in the document.
+     assertTrue(meta.isMultiValued("og:image"));
+ }
+
+ /**
+  * TIKA-1011: the sample document with a user-defined charset only has to
+  * parse to a non-null string.
+  */
+ @Test
+ public void testUserDefinedCharset() throws Exception {
+     Tika facade = new Tika();
+     String text = facade.parseToString(
+             HtmlParserTest.class.getResourceAsStream("/test-documents/testUserDefinedCharset.mhtml"),
+             new Metadata());
+     assertNotNull(text);
+ }
+
+ //TIKA-1001
+ @Test
+ public void testNoisyMetaCharsetHeaders() throws Exception {
+ Tika tika = new Tika();
+ String hit = "\u0623\u0639\u0631\u0628";
+
+ for (int i = 1; i <= 4; i++) {
+ String fileName = "/test-documents/testHTMLNoisyMetaEncoding_" + i + ".html";
+ String content = tika.parseToString(
+ HtmlParserTest.class.getResourceAsStream(fileName));
+ assertTrue("testing: " + fileName, content.contains(hit));
+ }
+ }
+
+ //
<TRUNCATED>
[12/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index 973d9da..443eb9e 100644
--- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -1,287 +1,287 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.pkg;
-
-import static org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE;
-
-import java.io.BufferedInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Date;
-import java.util.Set;
-
-import org.apache.commons.compress.PasswordRequiredException;
-import org.apache.commons.compress.archivers.ArchiveEntry;
-import org.apache.commons.compress.archivers.ArchiveException;
-import org.apache.commons.compress.archivers.ArchiveInputStream;
-import org.apache.commons.compress.archivers.ArchiveStreamFactory;
-import org.apache.commons.compress.archivers.StreamingNotSupportedException;
-import org.apache.commons.compress.archivers.ar.ArArchiveInputStream;
-import org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream;
-import org.apache.commons.compress.archivers.dump.DumpArchiveInputStream;
-import org.apache.commons.compress.archivers.jar.JarArchiveInputStream;
-import org.apache.commons.compress.archivers.sevenz.SevenZFile;
-import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
-import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
-import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException.Feature;
-import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.exception.EncryptedDocumentException;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.PasswordProvider;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-/**
- * Parser for various packaging formats. Package entries will be written to
- * the XHTML event stream as <div class="package-entry"> elements that
- * contain the (optional) entry name as a <h1> element and the full
- * structured body content of the parsed entry.
- * <p>
- * User must have JCE Unlimited Strength jars installed for encryption to
- * work with 7Z files (see: COMPRESS-299 and TIKA-1521). If the jars
- * are not installed, an IOException will be thrown, and potentially
- * wrapped in a TikaException.
- */
-public class PackageParser extends AbstractParser {
-
- /** Serial version UID */
- private static final long serialVersionUID = -5331043266963888708L;
-
- private static final MediaType ZIP = MediaType.APPLICATION_ZIP;
- private static final MediaType JAR = MediaType.application("java-archive");
- private static final MediaType AR = MediaType.application("x-archive");
- private static final MediaType CPIO = MediaType.application("x-cpio");
- private static final MediaType DUMP = MediaType.application("x-tika-unix-dump");
- private static final MediaType TAR = MediaType.application("x-tar");
- private static final MediaType SEVENZ = MediaType.application("x-7z-compressed");
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- MediaType.set(ZIP, JAR, AR, CPIO, DUMP, TAR, SEVENZ);
-
- static MediaType getMediaType(ArchiveInputStream stream) {
- if (stream instanceof JarArchiveInputStream) {
- return JAR;
- } else if (stream instanceof ZipArchiveInputStream) {
- return ZIP;
- } else if (stream instanceof ArArchiveInputStream) {
- return AR;
- } else if (stream instanceof CpioArchiveInputStream) {
- return CPIO;
- } else if (stream instanceof DumpArchiveInputStream) {
- return DUMP;
- } else if (stream instanceof TarArchiveInputStream) {
- return TAR;
- } else if (stream instanceof SevenZWrapper) {
- return SEVENZ;
- } else {
- return MediaType.OCTET_STREAM;
- }
- }
-
- static boolean isZipArchive(MediaType type) {
- return type.equals(ZIP) || type.equals(JAR);
- }
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
-
- // Ensure that the stream supports the mark feature
- if (! TikaInputStream.isTikaInputStream(stream))
- stream = new BufferedInputStream(stream);
-
-
- TemporaryResources tmp = new TemporaryResources();
- ArchiveInputStream ais = null;
- try {
- ArchiveStreamFactory factory = context.get(ArchiveStreamFactory.class, new ArchiveStreamFactory());
- // At the end we want to close the archive stream to release
- // any associated resources, but the underlying document stream
- // should not be closed
- ais = factory.createArchiveInputStream(new CloseShieldInputStream(stream));
-
- } catch (StreamingNotSupportedException sne) {
- // Most archive formats work on streams, but a few need files
- if (sne.getFormat().equals(ArchiveStreamFactory.SEVEN_Z)) {
- // Rework as a file, and wrap
- stream.reset();
- TikaInputStream tstream = TikaInputStream.get(stream, tmp);
-
- // Seven Zip suports passwords, was one given?
- String password = null;
- PasswordProvider provider = context.get(PasswordProvider.class);
- if (provider != null) {
- password = provider.getPassword(metadata);
- }
-
- SevenZFile sevenz;
- if (password == null) {
- sevenz = new SevenZFile(tstream.getFile());
- } else {
- sevenz = new SevenZFile(tstream.getFile(), password.getBytes("UnicodeLittleUnmarked"));
- }
-
- // Pending a fix for COMPRESS-269 / TIKA-1525, this bit is a little nasty
- ais = new SevenZWrapper(sevenz);
- } else {
- tmp.close();
- throw new TikaException("Unknown non-streaming format " + sne.getFormat(), sne);
- }
- } catch (ArchiveException e) {
- tmp.close();
- throw new TikaException("Unable to unpack document stream", e);
- }
-
- MediaType type = getMediaType(ais);
- if (!type.equals(MediaType.OCTET_STREAM)) {
- metadata.set(CONTENT_TYPE, type.toString());
- }
- // Use the delegate parser to parse the contained document
- EmbeddedDocumentExtractor extractor = context.get(
- EmbeddedDocumentExtractor.class,
- new ParsingEmbeddedDocumentExtractor(context));
-
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
-
- try {
- ArchiveEntry entry = ais.getNextEntry();
- while (entry != null) {
- if (!entry.isDirectory()) {
- parseEntry(ais, entry, extractor, xhtml);
- }
- entry = ais.getNextEntry();
- }
- } catch (UnsupportedZipFeatureException zfe) {
- // If it's an encrypted document of unknown password, report as such
- if (zfe.getFeature() == Feature.ENCRYPTION) {
- throw new EncryptedDocumentException(zfe);
- }
- // Otherwise fall through to raise the exception as normal
- } catch (PasswordRequiredException pre) {
- throw new EncryptedDocumentException(pre);
- } finally {
- ais.close();
- tmp.close();
- }
-
- xhtml.endDocument();
- }
-
- private void parseEntry(
- ArchiveInputStream archive, ArchiveEntry entry,
- EmbeddedDocumentExtractor extractor, XHTMLContentHandler xhtml)
- throws SAXException, IOException, TikaException {
- String name = entry.getName();
- if (archive.canReadEntryData(entry)) {
- // Fetch the metadata on the entry contained in the archive
- Metadata entrydata = handleEntryMetadata(name, null,
- entry.getLastModifiedDate(), entry.getSize(), xhtml);
-
- // Recurse into the entry if desired
- if (extractor.shouldParseEmbedded(entrydata)) {
- // For detectors to work, we need a mark/reset supporting
- // InputStream, which ArchiveInputStream isn't, so wrap
- TemporaryResources tmp = new TemporaryResources();
- try {
- TikaInputStream tis = TikaInputStream.get(archive, tmp);
- extractor.parseEmbedded(tis, xhtml, entrydata, true);
- } finally {
- tmp.dispose();
- }
- }
- } else if (name != null && name.length() > 0) {
- xhtml.element("p", name);
- }
- }
-
- protected static Metadata handleEntryMetadata(
- String name, Date createAt, Date modifiedAt,
- Long size, XHTMLContentHandler xhtml)
- throws SAXException, IOException, TikaException {
- Metadata entrydata = new Metadata();
- if (createAt != null) {
- entrydata.set(TikaCoreProperties.CREATED, createAt);
- }
- if (modifiedAt != null) {
- entrydata.set(TikaCoreProperties.MODIFIED, modifiedAt);
- }
- if (size != null) {
- entrydata.set(Metadata.CONTENT_LENGTH, Long.toString(size));
- }
- if (name != null && name.length() > 0) {
- name = name.replace("\\", "/");
- entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
- AttributesImpl attributes = new AttributesImpl();
- attributes.addAttribute("", "class", "class", "CDATA", "embedded");
- attributes.addAttribute("", "id", "id", "CDATA", name);
- xhtml.startElement("div", attributes);
- xhtml.endElement("div");
-
- entrydata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, name);
- }
- return entrydata;
- }
-
- // Pending a fix for COMPRESS-269, we have to wrap ourselves
- private static class SevenZWrapper extends ArchiveInputStream {
- private SevenZFile file;
- private SevenZWrapper(SevenZFile file) {
- this.file = file;
- }
-
- @Override
- public int read() throws IOException {
- return file.read();
- }
- @Override
- public int read(byte[] b) throws IOException {
- return file.read(b);
- }
- @Override
- public int read(byte[] b, int off, int len) throws IOException {
- return file.read(b, off, len);
- }
-
- @Override
- public ArchiveEntry getNextEntry() throws IOException {
- return file.getNextEntry();
- }
-
- @Override
- public void close() throws IOException {
- file.close();
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Date;
+import java.util.Set;
+
+import org.apache.commons.compress.PasswordRequiredException;
+import org.apache.commons.compress.archivers.ArchiveEntry;
+import org.apache.commons.compress.archivers.ArchiveException;
+import org.apache.commons.compress.archivers.ArchiveInputStream;
+import org.apache.commons.compress.archivers.ArchiveStreamFactory;
+import org.apache.commons.compress.archivers.StreamingNotSupportedException;
+import org.apache.commons.compress.archivers.ar.ArArchiveInputStream;
+import org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream;
+import org.apache.commons.compress.archivers.dump.DumpArchiveInputStream;
+import org.apache.commons.compress.archivers.jar.JarArchiveInputStream;
+import org.apache.commons.compress.archivers.sevenz.SevenZFile;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
+import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException.Feature;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Parser for various packaging formats. Package entries will be written to
+ * the XHTML event stream as <div class="package-entry"> elements that
+ * contain the (optional) entry name as a <h1> element and the full
+ * structured body content of the parsed entry.
+ * <p>
+ * User must have JCE Unlimited Strength jars installed for encryption to
+ * work with 7Z files (see: COMPRESS-299 and TIKA-1521). If the jars
+ * are not installed, an IOException will be thrown, and potentially
+ * wrapped in a TikaException.
+ */
+public class PackageParser extends AbstractParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = -5331043266963888708L;
+
+ private static final MediaType ZIP = MediaType.APPLICATION_ZIP;
+ private static final MediaType JAR = MediaType.application("java-archive");
+ private static final MediaType AR = MediaType.application("x-archive");
+ private static final MediaType CPIO = MediaType.application("x-cpio");
+ private static final MediaType DUMP = MediaType.application("x-tika-unix-dump");
+ private static final MediaType TAR = MediaType.application("x-tar");
+ private static final MediaType SEVENZ = MediaType.application("x-7z-compressed");
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ MediaType.set(ZIP, JAR, AR, CPIO, DUMP, TAR, SEVENZ);
+
+ static MediaType getMediaType(ArchiveInputStream stream) {
+ if (stream instanceof JarArchiveInputStream) {
+ return JAR;
+ } else if (stream instanceof ZipArchiveInputStream) {
+ return ZIP;
+ } else if (stream instanceof ArArchiveInputStream) {
+ return AR;
+ } else if (stream instanceof CpioArchiveInputStream) {
+ return CPIO;
+ } else if (stream instanceof DumpArchiveInputStream) {
+ return DUMP;
+ } else if (stream instanceof TarArchiveInputStream) {
+ return TAR;
+ } else if (stream instanceof SevenZWrapper) {
+ return SEVENZ;
+ } else {
+ return MediaType.OCTET_STREAM;
+ }
+ }
+
+ static boolean isZipArchive(MediaType type) {
+ return type.equals(ZIP) || type.equals(JAR);
+ }
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+
+ // Ensure that the stream supports the mark feature
+ if (! TikaInputStream.isTikaInputStream(stream))
+ stream = new BufferedInputStream(stream);
+
+
+ TemporaryResources tmp = new TemporaryResources();
+ ArchiveInputStream ais = null;
+ try {
+ ArchiveStreamFactory factory = context.get(ArchiveStreamFactory.class, new ArchiveStreamFactory());
+ // At the end we want to close the archive stream to release
+ // any associated resources, but the underlying document stream
+ // should not be closed
+ ais = factory.createArchiveInputStream(new CloseShieldInputStream(stream));
+
+ } catch (StreamingNotSupportedException sne) {
+ // Most archive formats work on streams, but a few need files
+ if (sne.getFormat().equals(ArchiveStreamFactory.SEVEN_Z)) {
+ // Rework as a file, and wrap
+ stream.reset();
+ TikaInputStream tstream = TikaInputStream.get(stream, tmp);
+
+ // Seven Zip suports passwords, was one given?
+ String password = null;
+ PasswordProvider provider = context.get(PasswordProvider.class);
+ if (provider != null) {
+ password = provider.getPassword(metadata);
+ }
+
+ SevenZFile sevenz;
+ if (password == null) {
+ sevenz = new SevenZFile(tstream.getFile());
+ } else {
+ sevenz = new SevenZFile(tstream.getFile(), password.getBytes("UnicodeLittleUnmarked"));
+ }
+
+ // Pending a fix for COMPRESS-269 / TIKA-1525, this bit is a little nasty
+ ais = new SevenZWrapper(sevenz);
+ } else {
+ tmp.close();
+ throw new TikaException("Unknown non-streaming format " + sne.getFormat(), sne);
+ }
+ } catch (ArchiveException e) {
+ tmp.close();
+ throw new TikaException("Unable to unpack document stream", e);
+ }
+
+ MediaType type = getMediaType(ais);
+ if (!type.equals(MediaType.OCTET_STREAM)) {
+ metadata.set(CONTENT_TYPE, type.toString());
+ }
+ // Use the delegate parser to parse the contained document
+ EmbeddedDocumentExtractor extractor = context.get(
+ EmbeddedDocumentExtractor.class,
+ new ParsingEmbeddedDocumentExtractor(context));
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+ try {
+ ArchiveEntry entry = ais.getNextEntry();
+ while (entry != null) {
+ if (!entry.isDirectory()) {
+ parseEntry(ais, entry, extractor, xhtml);
+ }
+ entry = ais.getNextEntry();
+ }
+ } catch (UnsupportedZipFeatureException zfe) {
+ // If it's an encrypted document of unknown password, report as such
+ if (zfe.getFeature() == Feature.ENCRYPTION) {
+ throw new EncryptedDocumentException(zfe);
+ }
+ // Otherwise fall through to raise the exception as normal
+ } catch (PasswordRequiredException pre) {
+ throw new EncryptedDocumentException(pre);
+ } finally {
+ ais.close();
+ tmp.close();
+ }
+
+ xhtml.endDocument();
+ }
+
+ private void parseEntry(
+ ArchiveInputStream archive, ArchiveEntry entry,
+ EmbeddedDocumentExtractor extractor, XHTMLContentHandler xhtml)
+ throws SAXException, IOException, TikaException {
+ String name = entry.getName();
+ if (archive.canReadEntryData(entry)) {
+ // Fetch the metadata on the entry contained in the archive
+ Metadata entrydata = handleEntryMetadata(name, null,
+ entry.getLastModifiedDate(), entry.getSize(), xhtml);
+
+ // Recurse into the entry if desired
+ if (extractor.shouldParseEmbedded(entrydata)) {
+ // For detectors to work, we need a mark/reset supporting
+ // InputStream, which ArchiveInputStream isn't, so wrap
+ TemporaryResources tmp = new TemporaryResources();
+ try {
+ TikaInputStream tis = TikaInputStream.get(archive, tmp);
+ extractor.parseEmbedded(tis, xhtml, entrydata, true);
+ } finally {
+ tmp.dispose();
+ }
+ }
+ } else if (name != null && name.length() > 0) {
+ xhtml.element("p", name);
+ }
+ }
+
+ protected static Metadata handleEntryMetadata(
+ String name, Date createAt, Date modifiedAt,
+ Long size, XHTMLContentHandler xhtml)
+ throws SAXException, IOException, TikaException {
+ Metadata entrydata = new Metadata();
+ if (createAt != null) {
+ entrydata.set(TikaCoreProperties.CREATED, createAt);
+ }
+ if (modifiedAt != null) {
+ entrydata.set(TikaCoreProperties.MODIFIED, modifiedAt);
+ }
+ if (size != null) {
+ entrydata.set(Metadata.CONTENT_LENGTH, Long.toString(size));
+ }
+ if (name != null && name.length() > 0) {
+ name = name.replace("\\", "/");
+ entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+ attributes.addAttribute("", "id", "id", "CDATA", name);
+ xhtml.startElement("div", attributes);
+ xhtml.endElement("div");
+
+ entrydata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, name);
+ }
+ return entrydata;
+ }
+
+ // Pending a fix for COMPRESS-269, we have to wrap ourselves
+ private static class SevenZWrapper extends ArchiveInputStream {
+ private SevenZFile file;
+ private SevenZWrapper(SevenZFile file) {
+ this.file = file;
+ }
+
+ @Override
+ public int read() throws IOException {
+ return file.read();
+ }
+ @Override
+ public int read(byte[] b) throws IOException {
+ return file.read(b);
+ }
+ @Override
+ public int read(byte[] b, int off, int len) throws IOException {
+ return file.read(b, off, len);
+ }
+
+ @Override
+ public ArchiveEntry getNextEntry() throws IOException {
+ return file.getNextEntry();
+ }
+
+ @Override
+ public void close() throws IOException {
+ file.close();
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index 8276e9a..0a12e15 100644
--- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -1,324 +1,324 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.pkg;
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Enumeration;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Set;
-import java.util.regex.Pattern;
-
-import org.apache.commons.compress.archivers.ArchiveException;
-import org.apache.commons.compress.archivers.ArchiveInputStream;
-import org.apache.commons.compress.archivers.ArchiveStreamFactory;
-import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
-import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
-import org.apache.commons.compress.archivers.zip.ZipFile;
-import org.apache.commons.compress.compressors.CompressorException;
-import org.apache.commons.compress.compressors.CompressorInputStream;
-import org.apache.commons.compress.compressors.CompressorStreamFactory;
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.detect.AbstractDetector;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.iwork.IWorkPackageParser;
-import org.apache.tika.parser.iwork.IWorkPackageParser.IWORKDocumentType;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-/**
- * A detector that works on Zip documents and other archive and compression
- * formats to figure out exactly what the file is.
- */
-public class ZipContainerDetector extends AbstractDetector {
-
- /** Serial version UID */
- private static final long serialVersionUID = 2891763938430295453L;
-
- private final Detector opcDetector;
-
- public ZipContainerDetector() {
- this.opcDetector = createDetectorProxy("org.apache.tika.parser.opc.OPCDetector");
- }
-
- public MediaType detect(InputStream input, Metadata metadata)
- throws IOException {
- // Check if we have access to the document
- if (input == null) {
- return MediaType.OCTET_STREAM;
- }
-
- TemporaryResources tmp = new TemporaryResources();
- try {
- TikaInputStream tis = TikaInputStream.get(input, tmp);
-
- byte[] prefix = new byte[1024]; // enough for all known formats
- int length = tis.peek(prefix);
-
- MediaType type = detectArchiveFormat(prefix, length);
- if (PackageParser.isZipArchive(type)
- && TikaInputStream.isTikaInputStream(input)) {
- return detectZipFormat(tis);
- } else if (!type.equals(MediaType.OCTET_STREAM)) {
- return type;
- } else {
- return detectCompressorFormat(prefix, length);
- }
- } finally {
- try {
- tmp.dispose();
- } catch (TikaException e) {
- // ignore
- }
- }
- }
-
- private static MediaType detectCompressorFormat(byte[] prefix, int length) {
- try {
- CompressorStreamFactory factory = new CompressorStreamFactory();
- CompressorInputStream cis = factory.createCompressorInputStream(
- new ByteArrayInputStream(prefix, 0, length));
- try {
- return CompressorParser.getMediaType(cis);
- } finally {
- IOUtils.closeQuietly(cis);
- }
- } catch (CompressorException e) {
- return MediaType.OCTET_STREAM;
- }
- }
-
- private static MediaType detectArchiveFormat(byte[] prefix, int length) {
- try {
- ArchiveStreamFactory factory = new ArchiveStreamFactory();
- ArchiveInputStream ais = factory.createArchiveInputStream(
- new ByteArrayInputStream(prefix, 0, length));
- try {
- if ((ais instanceof TarArchiveInputStream)
- && !TarArchiveInputStream.matches(prefix, length)) {
- // ArchiveStreamFactory is too relaxed, see COMPRESS-117
- return MediaType.OCTET_STREAM;
- } else {
- return PackageParser.getMediaType(ais);
- }
- } finally {
- IOUtils.closeQuietly(ais);
- }
- } catch (ArchiveException e) {
- return MediaType.OCTET_STREAM;
- }
- }
-
- private MediaType detectZipFormat(TikaInputStream tis) {
- try {
- ZipFile zip = new ZipFile(tis.getFile()); // TODO: hasFile()?
- try {
- MediaType type = detectOpenDocument(zip);
- if (type == null) {
- type = detectOPCBased(zip, tis);
- }
- if (type == null) {
- type = detectIWork(zip);
- }
- if (type == null) {
- type = detectJar(zip);
- }
- if (type == null) {
- type = detectKmz(zip);
- }
- if (type == null) {
- type = detectIpa(zip);
- }
- if (type != null) {
- return type;
- }
- } finally {
- // TODO: shouldn't we record the open
- // container so it can be later
- // reused...?
- // tis.setOpenContainer(zip);
- try {
- zip.close();
- } catch (IOException e) {
- // ignore
- }
- }
- } catch (IOException e) {
- // ignore
- }
- // Fallback: it's still a zip file, we just don't know what kind of one
- return MediaType.APPLICATION_ZIP;
- }
-
- /**
- * OpenDocument files, along with EPub files and ASiC ones, have a
- * mimetype entry in the root of their Zip file. This entry contains
- * the mimetype of the overall file, stored as a single string.
- */
- private static MediaType detectOpenDocument(ZipFile zip) {
- try {
- ZipArchiveEntry mimetype = zip.getEntry("mimetype");
- if (mimetype != null) {
- try (InputStream stream = zip.getInputStream(mimetype)) {
- return MediaType.parse(IOUtils.toString(stream, UTF_8));
- }
- } else {
- return null;
- }
- } catch (IOException e) {
- return null;
- }
- }
-
- private MediaType detectOPCBased(ZipFile zip, TikaInputStream stream) {
- try {
- if (zip.getEntry("_rels/.rels") != null
- || zip.getEntry("[Content_Types].xml") != null) {
- MediaType type = this.opcDetector.detect(stream, null);
- if (type != null) return type;
-
- // We don't know what it is, sorry
- return null;
- } else {
- return null;
- }
- } catch (IOException e) {
- return null;
- } catch (RuntimeException e) {
- return null;
- }
- }
-
-
- private static MediaType detectIWork(ZipFile zip) {
- if (zip.getEntry(IWorkPackageParser.IWORK_COMMON_ENTRY) != null) {
- // Locate the appropriate index file entry, and reads from that
- // the root element of the document. That is used to the identify
- // the correct type of the keynote container.
- for (String entryName : IWorkPackageParser.IWORK_CONTENT_ENTRIES) {
- IWORKDocumentType type = IWORKDocumentType.detectType(zip.getEntry(entryName), zip);
- if (type != null) {
- return type.getType();
- }
- }
-
- // Not sure, fallback to the container type
- return MediaType.application("vnd.apple.iwork");
- } else {
- return null;
- }
- }
-
- private static MediaType detectJar(ZipFile zip) {
- if (zip.getEntry("META-INF/MANIFEST.MF") != null) {
- // It's a Jar file, or something based on Jar
-
- // Is it an Android APK?
- if (zip.getEntry("AndroidManifest.xml") != null) {
- return MediaType.application("vnd.android.package-archive");
- }
-
- // Check for WAR and EAR
- if (zip.getEntry("WEB-INF/") != null) {
- return MediaType.application("x-tika-java-web-archive");
- }
- if (zip.getEntry("META-INF/application.xml") != null) {
- return MediaType.application("x-tika-java-enterprise-archive");
- }
-
- // Looks like a regular Jar Archive
- return MediaType.application("java-archive");
- } else {
- // Some Android APKs miss the default Manifest
- if (zip.getEntry("AndroidManifest.xml") != null) {
- return MediaType.application("vnd.android.package-archive");
- }
-
- return null;
- }
- }
-
- private static MediaType detectKmz(ZipFile zip) {
- boolean kmlFound = false;
-
- Enumeration<ZipArchiveEntry> entries = zip.getEntries();
- while (entries.hasMoreElements()) {
- ZipArchiveEntry entry = entries.nextElement();
- String name = entry.getName();
- if (!entry.isDirectory()
- && name.indexOf('/') == -1 && name.indexOf('\\') == -1) {
- if (name.endsWith(".kml") && !kmlFound) {
- kmlFound = true;
- } else {
- return null;
- }
- }
- }
-
- if (kmlFound) {
- return MediaType.application("vnd.google-earth.kmz");
- } else {
- return null;
- }
- }
-
- /**
- * To be considered as an IPA file, it needs to match all of these
- */
- private static HashSet<Pattern> ipaEntryPatterns = new HashSet<Pattern>() {
- private static final long serialVersionUID = 6545295886322115362L;
- {
- add(Pattern.compile("^Payload/$"));
- add(Pattern.compile("^Payload/.*\\.app/$"));
- add(Pattern.compile("^Payload/.*\\.app/_CodeSignature/$"));
- add(Pattern.compile("^Payload/.*\\.app/_CodeSignature/CodeResources$"));
- add(Pattern.compile("^Payload/.*\\.app/Info\\.plist$"));
- add(Pattern.compile("^Payload/.*\\.app/PkgInfo$"));
- }};
- @SuppressWarnings("unchecked")
- private static MediaType detectIpa(ZipFile zip) {
- // Note - consider generalising this logic, if another format needs many regexp matching
- Set<Pattern> tmpPatterns = (Set<Pattern>)ipaEntryPatterns.clone();
-
- Enumeration<ZipArchiveEntry> entries = zip.getEntries();
- while (entries.hasMoreElements()) {
- ZipArchiveEntry entry = entries.nextElement();
- String name = entry.getName();
-
- Iterator<Pattern> ip = tmpPatterns.iterator();
- while (ip.hasNext()) {
- if (ip.next().matcher(name).matches()) {
- ip.remove();
- }
- }
- if (tmpPatterns.isEmpty()) {
- // We've found everything we need to find
- return MediaType.application("x-itunes-ipa");
- }
- }
-
- // If we get here, not all required entries were found
- return null;
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Enumeration;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import org.apache.commons.compress.archivers.ArchiveException;
+import org.apache.commons.compress.archivers.ArchiveInputStream;
+import org.apache.commons.compress.archivers.ArchiveStreamFactory;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipFile;
+import org.apache.commons.compress.compressors.CompressorException;
+import org.apache.commons.compress.compressors.CompressorInputStream;
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.detect.AbstractDetector;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.iwork.IWorkPackageParser;
+import org.apache.tika.parser.iwork.IWorkPackageParser.IWORKDocumentType;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * A detector that works on Zip documents and other archive and compression
+ * formats to figure out exactly what the file is.
+ */
+public class ZipContainerDetector extends AbstractDetector {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = 2891763938430295453L;
+
+ private final Detector opcDetector;
+
+ public ZipContainerDetector() {
+ this.opcDetector = createDetectorProxy("org.apache.tika.parser.opc.OPCDetector");
+ }
+
+ public MediaType detect(InputStream input, Metadata metadata)
+ throws IOException {
+ // Check if we have access to the document
+ if (input == null) {
+ return MediaType.OCTET_STREAM;
+ }
+
+ TemporaryResources tmp = new TemporaryResources();
+ try {
+ TikaInputStream tis = TikaInputStream.get(input, tmp);
+
+ byte[] prefix = new byte[1024]; // enough for all known formats
+ int length = tis.peek(prefix);
+
+ MediaType type = detectArchiveFormat(prefix, length);
+ if (PackageParser.isZipArchive(type)
+ && TikaInputStream.isTikaInputStream(input)) {
+ return detectZipFormat(tis);
+ } else if (!type.equals(MediaType.OCTET_STREAM)) {
+ return type;
+ } else {
+ return detectCompressorFormat(prefix, length);
+ }
+ } finally {
+ try {
+ tmp.dispose();
+ } catch (TikaException e) {
+ // ignore
+ }
+ }
+ }
+
+ private static MediaType detectCompressorFormat(byte[] prefix, int length) {
+ try {
+ CompressorStreamFactory factory = new CompressorStreamFactory();
+ CompressorInputStream cis = factory.createCompressorInputStream(
+ new ByteArrayInputStream(prefix, 0, length));
+ try {
+ return CompressorParser.getMediaType(cis);
+ } finally {
+ IOUtils.closeQuietly(cis);
+ }
+ } catch (CompressorException e) {
+ return MediaType.OCTET_STREAM;
+ }
+ }
+
+ private static MediaType detectArchiveFormat(byte[] prefix, int length) {
+ try {
+ ArchiveStreamFactory factory = new ArchiveStreamFactory();
+ ArchiveInputStream ais = factory.createArchiveInputStream(
+ new ByteArrayInputStream(prefix, 0, length));
+ try {
+ if ((ais instanceof TarArchiveInputStream)
+ && !TarArchiveInputStream.matches(prefix, length)) {
+ // ArchiveStreamFactory is too relaxed, see COMPRESS-117
+ return MediaType.OCTET_STREAM;
+ } else {
+ return PackageParser.getMediaType(ais);
+ }
+ } finally {
+ IOUtils.closeQuietly(ais);
+ }
+ } catch (ArchiveException e) {
+ return MediaType.OCTET_STREAM;
+ }
+ }
+
+ private MediaType detectZipFormat(TikaInputStream tis) {
+ try {
+ ZipFile zip = new ZipFile(tis.getFile()); // TODO: hasFile()?
+ try {
+ MediaType type = detectOpenDocument(zip);
+ if (type == null) {
+ type = detectOPCBased(zip, tis);
+ }
+ if (type == null) {
+ type = detectIWork(zip);
+ }
+ if (type == null) {
+ type = detectJar(zip);
+ }
+ if (type == null) {
+ type = detectKmz(zip);
+ }
+ if (type == null) {
+ type = detectIpa(zip);
+ }
+ if (type != null) {
+ return type;
+ }
+ } finally {
+ // TODO: shouldn't we record the open
+ // container so it can be later
+ // reused...?
+ // tis.setOpenContainer(zip);
+ try {
+ zip.close();
+ } catch (IOException e) {
+ // ignore
+ }
+ }
+ } catch (IOException e) {
+ // ignore
+ }
+ // Fallback: it's still a zip file, we just don't know what kind of one
+ return MediaType.APPLICATION_ZIP;
+ }
+
+ /**
+ * OpenDocument files, along with EPub files and ASiC ones, have a
+ * mimetype entry in the root of their Zip file. This entry contains
+ * the mimetype of the overall file, stored as a single string.
+ */
+ private static MediaType detectOpenDocument(ZipFile zip) {
+ try {
+ ZipArchiveEntry mimetype = zip.getEntry("mimetype");
+ if (mimetype != null) {
+ try (InputStream stream = zip.getInputStream(mimetype)) {
+ return MediaType.parse(IOUtils.toString(stream, UTF_8));
+ }
+ } else {
+ return null;
+ }
+ } catch (IOException e) {
+ return null;
+ }
+ }
+
+ private MediaType detectOPCBased(ZipFile zip, TikaInputStream stream) {
+ try {
+ if (zip.getEntry("_rels/.rels") != null
+ || zip.getEntry("[Content_Types].xml") != null) {
+ MediaType type = this.opcDetector.detect(stream, null);
+ if (type != null) return type;
+
+ // We don't know what it is, sorry
+ return null;
+ } else {
+ return null;
+ }
+ } catch (IOException e) {
+ return null;
+ } catch (RuntimeException e) {
+ return null;
+ }
+ }
+
+
+ private static MediaType detectIWork(ZipFile zip) {
+ if (zip.getEntry(IWorkPackageParser.IWORK_COMMON_ENTRY) != null) {
+ // Locate the appropriate index file entry, and reads from that
+ // the root element of the document. That is used to the identify
+ // the correct type of the keynote container.
+ for (String entryName : IWorkPackageParser.IWORK_CONTENT_ENTRIES) {
+ IWORKDocumentType type = IWORKDocumentType.detectType(zip.getEntry(entryName), zip);
+ if (type != null) {
+ return type.getType();
+ }
+ }
+
+ // Not sure, fallback to the container type
+ return MediaType.application("vnd.apple.iwork");
+ } else {
+ return null;
+ }
+ }
+
+ private static MediaType detectJar(ZipFile zip) {
+ if (zip.getEntry("META-INF/MANIFEST.MF") != null) {
+ // It's a Jar file, or something based on Jar
+
+ // Is it an Android APK?
+ if (zip.getEntry("AndroidManifest.xml") != null) {
+ return MediaType.application("vnd.android.package-archive");
+ }
+
+ // Check for WAR and EAR
+ if (zip.getEntry("WEB-INF/") != null) {
+ return MediaType.application("x-tika-java-web-archive");
+ }
+ if (zip.getEntry("META-INF/application.xml") != null) {
+ return MediaType.application("x-tika-java-enterprise-archive");
+ }
+
+ // Looks like a regular Jar Archive
+ return MediaType.application("java-archive");
+ } else {
+ // Some Android APKs miss the default Manifest
+ if (zip.getEntry("AndroidManifest.xml") != null) {
+ return MediaType.application("vnd.android.package-archive");
+ }
+
+ return null;
+ }
+ }
+
+ private static MediaType detectKmz(ZipFile zip) {
+ boolean kmlFound = false;
+
+ Enumeration<ZipArchiveEntry> entries = zip.getEntries();
+ while (entries.hasMoreElements()) {
+ ZipArchiveEntry entry = entries.nextElement();
+ String name = entry.getName();
+ if (!entry.isDirectory()
+ && name.indexOf('/') == -1 && name.indexOf('\\') == -1) {
+ if (name.endsWith(".kml") && !kmlFound) {
+ kmlFound = true;
+ } else {
+ return null;
+ }
+ }
+ }
+
+ if (kmlFound) {
+ return MediaType.application("vnd.google-earth.kmz");
+ } else {
+ return null;
+ }
+ }
+
+ /**
+ * To be considered as an IPA file, it needs to match all of these
+ */
+ private static HashSet<Pattern> ipaEntryPatterns = new HashSet<Pattern>() {
+ private static final long serialVersionUID = 6545295886322115362L;
+ {
+ add(Pattern.compile("^Payload/$"));
+ add(Pattern.compile("^Payload/.*\\.app/$"));
+ add(Pattern.compile("^Payload/.*\\.app/_CodeSignature/$"));
+ add(Pattern.compile("^Payload/.*\\.app/_CodeSignature/CodeResources$"));
+ add(Pattern.compile("^Payload/.*\\.app/Info\\.plist$"));
+ add(Pattern.compile("^Payload/.*\\.app/PkgInfo$"));
+ }};
+ @SuppressWarnings("unchecked")
+ private static MediaType detectIpa(ZipFile zip) {
+ // Note - consider generalising this logic, if another format needs many regexp matching
+ Set<Pattern> tmpPatterns = (Set<Pattern>)ipaEntryPatterns.clone();
+
+ Enumeration<ZipArchiveEntry> entries = zip.getEntries();
+ while (entries.hasMoreElements()) {
+ ZipArchiveEntry entry = entries.nextElement();
+ String name = entry.getName();
+
+ Iterator<Pattern> ip = tmpPatterns.iterator();
+ while (ip.hasNext()) {
+ if (ip.next().matcher(name).matches()) {
+ ip.remove();
+ }
+ }
+ if (tmpPatterns.isEmpty()) {
+ // We've found everything we need to find
+ return MediaType.application("x-itunes-ipa");
+ }
+ }
+
+ // If we get here, not all required entries were found
+ return null;
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java
index e7625b4..25dfc44 100644
--- a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java
+++ b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java
@@ -1,78 +1,78 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.iwork;
-
-import static org.junit.Assert.assertEquals;
-
-import org.junit.Test;
-
-/**
- * Test class for the <code>AutoPageNumberUtils</code> helper class.
- */
-public class AutoPageNumberUtilsTest {
-
- /**
- * Check upper-case alpha-numeric numbers are generated based on the
- * input page number.
- */
- @Test
- public void testAlphaUpper() {
- assertEquals("A", AutoPageNumberUtils.asAlphaNumeric(1));
- assertEquals("Z", AutoPageNumberUtils.asAlphaNumeric(26));
- assertEquals("AA", AutoPageNumberUtils.asAlphaNumeric(27));
- assertEquals("ZZ", AutoPageNumberUtils.asAlphaNumeric(52));
- assertEquals("AAA", AutoPageNumberUtils.asAlphaNumeric(53));
- assertEquals("ZZZ", AutoPageNumberUtils.asAlphaNumeric(78));
- }
-
- /**
- * Check lower-case alpha-numeric numbers are generated based on the
- * input page number.
- */
- @Test
- public void testAlphaLower() {
- assertEquals("a", AutoPageNumberUtils.asAlphaNumericLower(1));
- assertEquals("z", AutoPageNumberUtils.asAlphaNumericLower(26));
- assertEquals("aa", AutoPageNumberUtils.asAlphaNumericLower(27));
- assertEquals("zz", AutoPageNumberUtils.asAlphaNumericLower(52));
- assertEquals("aaa", AutoPageNumberUtils.asAlphaNumericLower(53));
- assertEquals("zzz", AutoPageNumberUtils.asAlphaNumericLower(78));
- }
-
- /**
- * Check upper-case Roman numerals numbers are generated based on the
- * input page number.
- */
- @Test
- public void testRomanUpper() {
- assertEquals("I", AutoPageNumberUtils.asRomanNumerals(1));
- assertEquals("XXVI", AutoPageNumberUtils.asRomanNumerals(26));
- assertEquals("XXVII", AutoPageNumberUtils.asRomanNumerals(27));
- }
-
- /**
- * Check lower-case Roman numerals numbers are generated based on the
- * input page number.
- */
- @Test
- public void testRomanLower() {
- assertEquals("i", AutoPageNumberUtils.asRomanNumeralsLower(1));
- assertEquals("xxvi", AutoPageNumberUtils.asRomanNumeralsLower(26));
- assertEquals("xxvii", AutoPageNumberUtils.asRomanNumeralsLower(27));
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.Test;
+
+/**
+ * Test class for the <code>AutoPageNumberUtils</code> helper class.
+ */
+public class AutoPageNumberUtilsTest {
+
+ /**
+ * Check upper-case alpha-numeric numbers are generated based on the
+ * input page number.
+ */
+ @Test
+ public void testAlphaUpper() {
+ assertEquals("A", AutoPageNumberUtils.asAlphaNumeric(1));
+ assertEquals("Z", AutoPageNumberUtils.asAlphaNumeric(26));
+ assertEquals("AA", AutoPageNumberUtils.asAlphaNumeric(27));
+ assertEquals("ZZ", AutoPageNumberUtils.asAlphaNumeric(52));
+ assertEquals("AAA", AutoPageNumberUtils.asAlphaNumeric(53));
+ assertEquals("ZZZ", AutoPageNumberUtils.asAlphaNumeric(78));
+ }
+
+ /**
+ * Check lower-case alpha-numeric numbers are generated based on the
+ * input page number.
+ */
+ @Test
+ public void testAlphaLower() {
+ assertEquals("a", AutoPageNumberUtils.asAlphaNumericLower(1));
+ assertEquals("z", AutoPageNumberUtils.asAlphaNumericLower(26));
+ assertEquals("aa", AutoPageNumberUtils.asAlphaNumericLower(27));
+ assertEquals("zz", AutoPageNumberUtils.asAlphaNumericLower(52));
+ assertEquals("aaa", AutoPageNumberUtils.asAlphaNumericLower(53));
+ assertEquals("zzz", AutoPageNumberUtils.asAlphaNumericLower(78));
+ }
+
+ /**
+ * Check upper-case Roman numerals numbers are generated based on the
+ * input page number.
+ */
+ @Test
+ public void testRomanUpper() {
+ assertEquals("I", AutoPageNumberUtils.asRomanNumerals(1));
+ assertEquals("XXVI", AutoPageNumberUtils.asRomanNumerals(26));
+ assertEquals("XXVII", AutoPageNumberUtils.asRomanNumerals(27));
+ }
+
+ /**
+ * Check lower-case Roman numerals numbers are generated based on the
+ * input page number.
+ */
+ @Test
+ public void testRomanLower() {
+ assertEquals("i", AutoPageNumberUtils.asRomanNumeralsLower(1));
+ assertEquals("xxvi", AutoPageNumberUtils.asRomanNumeralsLower(26));
+ assertEquals("xxvii", AutoPageNumberUtils.asRomanNumeralsLower(27));
+ }
+
+}
[22/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
index 75b556c..a32d406 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
@@ -1,496 +1,496 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.odf;
-
-import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
-
-import javax.xml.namespace.QName;
-import javax.xml.parsers.SAXParser;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.BitSet;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
-import java.util.Stack;
-
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.ElementMappingContentHandler;
-import org.apache.tika.sax.ElementMappingContentHandler.TargetElement;
-import org.apache.tika.sax.OfflineContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-import org.xml.sax.helpers.DefaultHandler;
-
-/**
- * Parser for ODF <code>content.xml</code> files.
- */
-public class OpenDocumentContentParser extends AbstractParser {
- private interface Style {
- }
-
- private static class TextStyle implements Style {
- public boolean italic;
- public boolean bold;
- public boolean underlined;
- }
-
- private static class ListStyle implements Style {
- public boolean ordered;
-
- public String getTag() {
- return ordered ? "ol" : "ul";
- }
- }
-
- private static final class OpenDocumentElementMappingContentHandler extends
- ElementMappingContentHandler {
- private final ContentHandler handler;
- private final BitSet textNodeStack = new BitSet();
- private int nodeDepth = 0;
- private int completelyFiltered = 0;
- private Stack<String> headingStack = new Stack<String>();
- private Map<String, TextStyle> textStyleMap = new HashMap<String, TextStyle>();
- private Map<String, ListStyle> listStyleMap = new HashMap<String, ListStyle>();
- private TextStyle textStyle;
- private TextStyle lastTextStyle;
- private Stack<ListStyle> listStyleStack = new Stack<ListStyle>();
- private ListStyle listStyle;
-
- private OpenDocumentElementMappingContentHandler(ContentHandler handler,
- Map<QName, TargetElement> mappings) {
- super(handler, mappings);
- this.handler = handler;
- }
-
- @Override
- public void characters(char[] ch, int start, int length)
- throws SAXException {
- // only forward content of tags from text:-namespace
- if (completelyFiltered == 0 && nodeDepth > 0
- && textNodeStack.get(nodeDepth - 1)) {
- lazyEndSpan();
- super.characters(ch, start, length);
- }
- }
-
- // helper for checking tags which need complete filtering
- // (with sub-tags)
- private boolean needsCompleteFiltering(
- String namespaceURI, String localName) {
- if (TEXT_NS.equals(namespaceURI)) {
- return localName.endsWith("-template")
- || localName.endsWith("-style");
- }
- return TABLE_NS.equals(namespaceURI) && "covered-table-cell".equals(localName);
- }
-
- // map the heading level to <hX> HTML tags
- private String getXHTMLHeaderTagName(Attributes atts) {
- String depthStr = atts.getValue(TEXT_NS, "outline-level");
- if (depthStr == null) {
- return "h1";
- }
-
- int depth = Integer.parseInt(depthStr);
- if (depth >= 6) {
- return "h6";
- } else if (depth <= 1) {
- return "h1";
- } else {
- return "h" + depth;
- }
- }
-
- /**
- * Check if a node is a text node
- */
- private boolean isTextNode(String namespaceURI, String localName) {
- if (TEXT_NS.equals(namespaceURI) && !localName.equals("page-number") && !localName.equals("page-count")) {
- return true;
- }
- if (SVG_NS.equals(namespaceURI)) {
- return "title".equals(localName) ||
- "desc".equals(localName);
- }
- return false;
- }
-
- private void startList(String name) throws SAXException {
- String elementName = "ul";
- if (name != null) {
- ListStyle style = listStyleMap.get(name);
- elementName = style != null ? style.getTag() : "ul";
- listStyleStack.push(style);
- }
- handler.startElement(XHTML, elementName, elementName, EMPTY_ATTRIBUTES);
- }
-
- private void endList() throws SAXException {
- String elementName = "ul";
- if (!listStyleStack.isEmpty()) {
- ListStyle style = listStyleStack.pop();
- elementName = style != null ? style.getTag() : "ul";
- }
- handler.endElement(XHTML, elementName, elementName);
- }
-
- private void startSpan(String name) throws SAXException {
- if (name == null) {
- return;
- }
-
- TextStyle style = textStyleMap.get(name);
- if (style == null) {
- return;
- }
-
- // End tags that refer to no longer valid styles
- if (!style.underlined && lastTextStyle != null && lastTextStyle.underlined) {
- handler.endElement(XHTML, "u", "u");
- }
- if (!style.italic && lastTextStyle != null && lastTextStyle.italic) {
- handler.endElement(XHTML, "i", "i");
- }
- if (!style.bold && lastTextStyle != null && lastTextStyle.bold) {
- handler.endElement(XHTML, "b", "b");
- }
-
- // Start tags for new styles
- if (style.bold && (lastTextStyle == null || !lastTextStyle.bold)) {
- handler.startElement(XHTML, "b", "b", EMPTY_ATTRIBUTES);
- }
- if (style.italic && (lastTextStyle == null || !lastTextStyle.italic)) {
- handler.startElement(XHTML, "i", "i", EMPTY_ATTRIBUTES);
- }
- if (style.underlined && (lastTextStyle == null || !lastTextStyle.underlined)) {
- handler.startElement(XHTML, "u", "u", EMPTY_ATTRIBUTES);
- }
-
- textStyle = style;
- lastTextStyle = null;
- }
-
- private void endSpan() throws SAXException {
- lastTextStyle = textStyle;
- textStyle = null;
- }
-
- private void lazyEndSpan() throws SAXException {
- if (lastTextStyle == null) {
- return;
- }
-
- if (lastTextStyle.underlined) {
- handler.endElement(XHTML, "u", "u");
- }
- if (lastTextStyle.italic) {
- handler.endElement(XHTML, "i", "i");
- }
- if (lastTextStyle.bold) {
- handler.endElement(XHTML, "b", "b");
- }
-
- lastTextStyle = null;
- }
-
- @Override
- public void startElement(
- String namespaceURI, String localName, String qName,
- Attributes attrs) throws SAXException {
- // keep track of current node type. If it is a text node,
- // a bit at the current depth its set in textNodeStack.
- // characters() checks the top bit to determine, if the
- // actual node is a text node to print out nodeDepth contains
- // the depth of the current node and also marks top of stack.
- assert nodeDepth >= 0;
-
- // Set styles
- if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
- String family = attrs.getValue(STYLE_NS, "family");
- if ("text".equals(family)) {
- textStyle = new TextStyle();
- String name = attrs.getValue(STYLE_NS, "name");
- textStyleMap.put(name, textStyle);
- }
- } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
- listStyle = new ListStyle();
- String name = attrs.getValue(STYLE_NS, "name");
- listStyleMap.put(name, listStyle);
- } else if (textStyle != null && STYLE_NS.equals(namespaceURI)
- && "text-properties".equals(localName)) {
- String fontStyle = attrs.getValue(FORMATTING_OBJECTS_NS, "font-style");
- if ("italic".equals(fontStyle) || "oblique".equals(fontStyle)) {
- textStyle.italic = true;
- }
- String fontWeight = attrs.getValue(FORMATTING_OBJECTS_NS, "font-weight");
- if ("bold".equals(fontWeight) || "bolder".equals(fontWeight)
- || (fontWeight != null && Character.isDigit(fontWeight.charAt(0))
- && Integer.valueOf(fontWeight) > 500)) {
- textStyle.bold = true;
- }
- String underlineStyle = attrs.getValue(STYLE_NS, "text-underline-style");
- if (underlineStyle != null) {
- textStyle.underlined = true;
- }
- } else if (listStyle != null && TEXT_NS.equals(namespaceURI)) {
- if ("list-level-style-bullet".equals(localName)) {
- listStyle.ordered = false;
- } else if ("list-level-style-number".equals(localName)) {
- listStyle.ordered = true;
- }
- }
-
- textNodeStack.set(nodeDepth++,
- isTextNode(namespaceURI, localName));
- // filter *all* content of some tags
- assert completelyFiltered >= 0;
-
- if (needsCompleteFiltering(namespaceURI, localName)) {
- completelyFiltered++;
- }
- // call next handler if no filtering
- if (completelyFiltered == 0) {
- // special handling of text:h, that are directly passed
- // to incoming handler
- if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
- final String el = headingStack.push(getXHTMLHeaderTagName(attrs));
- handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES);
- } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
- startList(attrs.getValue(TEXT_NS, "style-name"));
- } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
- startSpan(attrs.getValue(TEXT_NS, "style-name"));
- } else {
- super.startElement(namespaceURI, localName, qName, attrs);
- }
- }
- }
-
- @Override
- public void endElement(
- String namespaceURI, String localName, String qName)
- throws SAXException {
- if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
- textStyle = null;
- } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
- listStyle = null;
- }
-
- // call next handler if no filtering
- if (completelyFiltered == 0) {
- // special handling of text:h, that are directly passed
- // to incoming handler
- if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
- final String el = headingStack.pop();
- handler.endElement(XHTMLContentHandler.XHTML, el, el);
- } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
- endList();
- } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
- endSpan();
- } else {
- if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
- lazyEndSpan();
- }
- super.endElement(namespaceURI, localName, qName);
- }
-
- // special handling of tabulators
- if (TEXT_NS.equals(namespaceURI)
- && ("tab-stop".equals(localName)
- || "tab".equals(localName))) {
- this.characters(TAB, 0, TAB.length);
- }
- }
-
- // revert filter for *all* content of some tags
- if (needsCompleteFiltering(namespaceURI, localName)) {
- completelyFiltered--;
- }
- assert completelyFiltered >= 0;
-
- // reduce current node depth
- nodeDepth--;
- assert nodeDepth >= 0;
- }
-
- @Override
- public void startPrefixMapping(String prefix, String uri) {
- // remove prefix mappings as they should not occur in XHTML
- }
-
- @Override
- public void endPrefixMapping(String prefix) {
- // remove prefix mappings as they should not occur in XHTML
- }
- }
-
- public static final String TEXT_NS =
- "urn:oasis:names:tc:opendocument:xmlns:text:1.0";
-
- public static final String TABLE_NS =
- "urn:oasis:names:tc:opendocument:xmlns:table:1.0";
-
- public static final String STYLE_NS =
- "urn:oasis:names:tc:opendocument:xmlns:style:1.0";
-
- public static final String FORMATTING_OBJECTS_NS =
- "urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0";
-
- public static final String OFFICE_NS =
- "urn:oasis:names:tc:opendocument:xmlns:office:1.0";
-
- public static final String SVG_NS =
- "urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0";
-
- public static final String PRESENTATION_NS =
- "urn:oasis:names:tc:opendocument:xmlns:presentation:1.0";
-
- public static final String DRAW_NS =
- "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0";
-
- public static final String XLINK_NS = "http://www.w3.org/1999/xlink";
-
- protected static final char[] TAB = new char[]{'\t'};
-
- private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
-
- /**
- * Mappings between ODF tag names and XHTML tag names
- * (including attributes). All other tag names/attributes are ignored
- * and left out from event stream.
- */
- private static final HashMap<QName, TargetElement> MAPPINGS =
- new HashMap<QName, TargetElement>();
-
- static {
- // general mappings of text:-tags
- MAPPINGS.put(
- new QName(TEXT_NS, "p"),
- new TargetElement(XHTML, "p"));
- // text:h-tags are mapped specifically in startElement/endElement
- MAPPINGS.put(
- new QName(TEXT_NS, "line-break"),
- new TargetElement(XHTML, "br"));
- MAPPINGS.put(
- new QName(TEXT_NS, "list-item"),
- new TargetElement(XHTML, "li"));
- MAPPINGS.put(
- new QName(TEXT_NS, "note"),
- new TargetElement(XHTML, "div"));
- MAPPINGS.put(
- new QName(OFFICE_NS, "annotation"),
- new TargetElement(XHTML, "div"));
- MAPPINGS.put(
- new QName(PRESENTATION_NS, "notes"),
- new TargetElement(XHTML, "div"));
- MAPPINGS.put(
- new QName(DRAW_NS, "object"),
- new TargetElement(XHTML, "object"));
- MAPPINGS.put(
- new QName(DRAW_NS, "text-box"),
- new TargetElement(XHTML, "div"));
- MAPPINGS.put(
- new QName(SVG_NS, "title"),
- new TargetElement(XHTML, "span"));
- MAPPINGS.put(
- new QName(SVG_NS, "desc"),
- new TargetElement(XHTML, "span"));
- MAPPINGS.put(
- new QName(TEXT_NS, "span"),
- new TargetElement(XHTML, "span"));
-
- final HashMap<QName, QName> aAttsMapping =
- new HashMap<QName, QName>();
- aAttsMapping.put(
- new QName(XLINK_NS, "href"),
- new QName("href"));
- aAttsMapping.put(
- new QName(XLINK_NS, "title"),
- new QName("title"));
- MAPPINGS.put(
- new QName(TEXT_NS, "a"),
- new TargetElement(XHTML, "a", aAttsMapping));
-
- // create HTML tables from table:-tags
- MAPPINGS.put(
- new QName(TABLE_NS, "table"),
- new TargetElement(XHTML, "table"));
- // repeating of rows is ignored; for columns, see below!
- MAPPINGS.put(
- new QName(TABLE_NS, "table-row"),
- new TargetElement(XHTML, "tr"));
- // special mapping for rowspan/colspan attributes
- final HashMap<QName, QName> tableCellAttsMapping =
- new HashMap<QName, QName>();
- tableCellAttsMapping.put(
- new QName(TABLE_NS, "number-columns-spanned"),
- new QName("colspan"));
- tableCellAttsMapping.put(
- new QName(TABLE_NS, "number-rows-spanned"),
- new QName("rowspan"));
- /* TODO: The following is not correct, the cell should be repeated not spanned!
- * Code generates a HTML cell, spanning all repeated columns, to make the cell look correct.
- * Problems may occur when both spanning and repeating is given, which is not allowed by spec.
- * Cell spanning instead of repeating is not a problem, because OpenOffice uses it
- * only for empty cells.
- */
- tableCellAttsMapping.put(
- new QName(TABLE_NS, "number-columns-repeated"),
- new QName("colspan"));
- MAPPINGS.put(
- new QName(TABLE_NS, "table-cell"),
- new TargetElement(XHTML, "td", tableCellAttsMapping));
- }
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return Collections.emptySet(); // not a top-level parser
- }
-
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- parseInternal(stream,
- new XHTMLContentHandler(handler, metadata),
- metadata, context);
- }
-
- void parseInternal(
- InputStream stream, final ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
-
- DefaultHandler dh = new OpenDocumentElementMappingContentHandler(handler, MAPPINGS);
-
-
- SAXParser parser = context.getSAXParser();
- parser.parse(
- new CloseShieldInputStream(stream),
- new OfflineContentHandler(
- new NSNormalizerContentHandler(dh)));
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
+
+import javax.xml.namespace.QName;
+import javax.xml.parsers.SAXParser;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.BitSet;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.Stack;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.ElementMappingContentHandler;
+import org.apache.tika.sax.ElementMappingContentHandler.TargetElement;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Parser for ODF <code>content.xml</code> files.
+ */
+public class OpenDocumentContentParser extends AbstractParser {
+ private interface Style {
+ }
+
+ private static class TextStyle implements Style {
+ public boolean italic;
+ public boolean bold;
+ public boolean underlined;
+ }
+
+ private static class ListStyle implements Style {
+ public boolean ordered;
+
+ public String getTag() {
+ return ordered ? "ol" : "ul";
+ }
+ }
+
+ private static final class OpenDocumentElementMappingContentHandler extends
+ ElementMappingContentHandler {
+ private final ContentHandler handler;
+ private final BitSet textNodeStack = new BitSet();
+ private int nodeDepth = 0;
+ private int completelyFiltered = 0;
+ private Stack<String> headingStack = new Stack<String>();
+ private Map<String, TextStyle> textStyleMap = new HashMap<String, TextStyle>();
+ private Map<String, ListStyle> listStyleMap = new HashMap<String, ListStyle>();
+ private TextStyle textStyle;
+ private TextStyle lastTextStyle;
+ private Stack<ListStyle> listStyleStack = new Stack<ListStyle>();
+ private ListStyle listStyle;
+
+ private OpenDocumentElementMappingContentHandler(ContentHandler handler,
+ Map<QName, TargetElement> mappings) {
+ super(handler, mappings);
+ this.handler = handler;
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length)
+ throws SAXException {
+ // only forward content of tags from text:-namespace
+ if (completelyFiltered == 0 && nodeDepth > 0
+ && textNodeStack.get(nodeDepth - 1)) {
+ lazyEndSpan();
+ super.characters(ch, start, length);
+ }
+ }
+
+ // helper for checking tags which need complete filtering
+ // (with sub-tags)
+ private boolean needsCompleteFiltering(
+ String namespaceURI, String localName) {
+ if (TEXT_NS.equals(namespaceURI)) {
+ return localName.endsWith("-template")
+ || localName.endsWith("-style");
+ }
+ return TABLE_NS.equals(namespaceURI) && "covered-table-cell".equals(localName);
+ }
+
+ // map the heading level to <hX> HTML tags
+ private String getXHTMLHeaderTagName(Attributes atts) {
+ String depthStr = atts.getValue(TEXT_NS, "outline-level");
+ if (depthStr == null) {
+ return "h1";
+ }
+
+ int depth = Integer.parseInt(depthStr);
+ if (depth >= 6) {
+ return "h6";
+ } else if (depth <= 1) {
+ return "h1";
+ } else {
+ return "h" + depth;
+ }
+ }
+
+ /**
+ * Check if a node is a text node
+ */
+ private boolean isTextNode(String namespaceURI, String localName) {
+ if (TEXT_NS.equals(namespaceURI) && !localName.equals("page-number") && !localName.equals("page-count")) {
+ return true;
+ }
+ if (SVG_NS.equals(namespaceURI)) {
+ return "title".equals(localName) ||
+ "desc".equals(localName);
+ }
+ return false;
+ }
+
+ private void startList(String name) throws SAXException {
+ String elementName = "ul";
+ if (name != null) {
+ ListStyle style = listStyleMap.get(name);
+ elementName = style != null ? style.getTag() : "ul";
+ listStyleStack.push(style);
+ }
+ handler.startElement(XHTML, elementName, elementName, EMPTY_ATTRIBUTES);
+ }
+
+ private void endList() throws SAXException {
+ String elementName = "ul";
+ if (!listStyleStack.isEmpty()) {
+ ListStyle style = listStyleStack.pop();
+ elementName = style != null ? style.getTag() : "ul";
+ }
+ handler.endElement(XHTML, elementName, elementName);
+ }
+
+ private void startSpan(String name) throws SAXException {
+ if (name == null) {
+ return;
+ }
+
+ TextStyle style = textStyleMap.get(name);
+ if (style == null) {
+ return;
+ }
+
+ // End tags that refer to no longer valid styles
+ if (!style.underlined && lastTextStyle != null && lastTextStyle.underlined) {
+ handler.endElement(XHTML, "u", "u");
+ }
+ if (!style.italic && lastTextStyle != null && lastTextStyle.italic) {
+ handler.endElement(XHTML, "i", "i");
+ }
+ if (!style.bold && lastTextStyle != null && lastTextStyle.bold) {
+ handler.endElement(XHTML, "b", "b");
+ }
+
+ // Start tags for new styles
+ if (style.bold && (lastTextStyle == null || !lastTextStyle.bold)) {
+ handler.startElement(XHTML, "b", "b", EMPTY_ATTRIBUTES);
+ }
+ if (style.italic && (lastTextStyle == null || !lastTextStyle.italic)) {
+ handler.startElement(XHTML, "i", "i", EMPTY_ATTRIBUTES);
+ }
+ if (style.underlined && (lastTextStyle == null || !lastTextStyle.underlined)) {
+ handler.startElement(XHTML, "u", "u", EMPTY_ATTRIBUTES);
+ }
+
+ textStyle = style;
+ lastTextStyle = null;
+ }
+
+ private void endSpan() throws SAXException {
+ lastTextStyle = textStyle;
+ textStyle = null;
+ }
+
+ private void lazyEndSpan() throws SAXException {
+ if (lastTextStyle == null) {
+ return;
+ }
+
+ if (lastTextStyle.underlined) {
+ handler.endElement(XHTML, "u", "u");
+ }
+ if (lastTextStyle.italic) {
+ handler.endElement(XHTML, "i", "i");
+ }
+ if (lastTextStyle.bold) {
+ handler.endElement(XHTML, "b", "b");
+ }
+
+ lastTextStyle = null;
+ }
+
+ @Override
+ public void startElement(
+ String namespaceURI, String localName, String qName,
+ Attributes attrs) throws SAXException {
+ // keep track of current node type. If it is a text node,
+ // a bit at the current depth its set in textNodeStack.
+ // characters() checks the top bit to determine, if the
+ // actual node is a text node to print out nodeDepth contains
+ // the depth of the current node and also marks top of stack.
+ assert nodeDepth >= 0;
+
+ // Set styles
+ if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
+ String family = attrs.getValue(STYLE_NS, "family");
+ if ("text".equals(family)) {
+ textStyle = new TextStyle();
+ String name = attrs.getValue(STYLE_NS, "name");
+ textStyleMap.put(name, textStyle);
+ }
+ } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
+ listStyle = new ListStyle();
+ String name = attrs.getValue(STYLE_NS, "name");
+ listStyleMap.put(name, listStyle);
+ } else if (textStyle != null && STYLE_NS.equals(namespaceURI)
+ && "text-properties".equals(localName)) {
+ String fontStyle = attrs.getValue(FORMATTING_OBJECTS_NS, "font-style");
+ if ("italic".equals(fontStyle) || "oblique".equals(fontStyle)) {
+ textStyle.italic = true;
+ }
+ String fontWeight = attrs.getValue(FORMATTING_OBJECTS_NS, "font-weight");
+ if ("bold".equals(fontWeight) || "bolder".equals(fontWeight)
+ || (fontWeight != null && Character.isDigit(fontWeight.charAt(0))
+ && Integer.valueOf(fontWeight) > 500)) {
+ textStyle.bold = true;
+ }
+ String underlineStyle = attrs.getValue(STYLE_NS, "text-underline-style");
+ if (underlineStyle != null) {
+ textStyle.underlined = true;
+ }
+ } else if (listStyle != null && TEXT_NS.equals(namespaceURI)) {
+ if ("list-level-style-bullet".equals(localName)) {
+ listStyle.ordered = false;
+ } else if ("list-level-style-number".equals(localName)) {
+ listStyle.ordered = true;
+ }
+ }
+
+ textNodeStack.set(nodeDepth++,
+ isTextNode(namespaceURI, localName));
+ // filter *all* content of some tags
+ assert completelyFiltered >= 0;
+
+ if (needsCompleteFiltering(namespaceURI, localName)) {
+ completelyFiltered++;
+ }
+ // call next handler if no filtering
+ if (completelyFiltered == 0) {
+ // special handling of text:h, that are directly passed
+ // to incoming handler
+ if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
+ final String el = headingStack.push(getXHTMLHeaderTagName(attrs));
+ handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES);
+ } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
+ startList(attrs.getValue(TEXT_NS, "style-name"));
+ } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
+ startSpan(attrs.getValue(TEXT_NS, "style-name"));
+ } else {
+ super.startElement(namespaceURI, localName, qName, attrs);
+ }
+ }
+ }
+
+ @Override
+ public void endElement(
+ String namespaceURI, String localName, String qName)
+ throws SAXException {
+ if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
+ textStyle = null;
+ } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
+ listStyle = null;
+ }
+
+ // call next handler if no filtering
+ if (completelyFiltered == 0) {
+ // special handling of text:h, that are directly passed
+ // to incoming handler
+ if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
+ final String el = headingStack.pop();
+ handler.endElement(XHTMLContentHandler.XHTML, el, el);
+ } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
+ endList();
+ } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
+ endSpan();
+ } else {
+ if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
+ lazyEndSpan();
+ }
+ super.endElement(namespaceURI, localName, qName);
+ }
+
+ // special handling of tabulators
+ if (TEXT_NS.equals(namespaceURI)
+ && ("tab-stop".equals(localName)
+ || "tab".equals(localName))) {
+ this.characters(TAB, 0, TAB.length);
+ }
+ }
+
+ // revert filter for *all* content of some tags
+ if (needsCompleteFiltering(namespaceURI, localName)) {
+ completelyFiltered--;
+ }
+ assert completelyFiltered >= 0;
+
+ // reduce current node depth
+ nodeDepth--;
+ assert nodeDepth >= 0;
+ }
+
+ @Override
+ public void startPrefixMapping(String prefix, String uri) {
+ // remove prefix mappings as they should not occur in XHTML
+ }
+
+ @Override
+ public void endPrefixMapping(String prefix) {
+ // remove prefix mappings as they should not occur in XHTML
+ }
+ }
+
+ public static final String TEXT_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:text:1.0";
+
+ public static final String TABLE_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:table:1.0";
+
+ public static final String STYLE_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:style:1.0";
+
+ public static final String FORMATTING_OBJECTS_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0";
+
+ public static final String OFFICE_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:office:1.0";
+
+ public static final String SVG_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0";
+
+ public static final String PRESENTATION_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:presentation:1.0";
+
+ public static final String DRAW_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0";
+
+ public static final String XLINK_NS = "http://www.w3.org/1999/xlink";
+
+ protected static final char[] TAB = new char[]{'\t'};
+
+ private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
+
+ /**
+ * Mappings between ODF tag names and XHTML tag names
+ * (including attributes). All other tag names/attributes are ignored
+ * and left out from event stream.
+ */
+ private static final HashMap<QName, TargetElement> MAPPINGS =
+ new HashMap<QName, TargetElement>();
+
+ static {
+ // general mappings of text:-tags
+ MAPPINGS.put(
+ new QName(TEXT_NS, "p"),
+ new TargetElement(XHTML, "p"));
+ // text:h-tags are mapped specifically in startElement/endElement
+ MAPPINGS.put(
+ new QName(TEXT_NS, "line-break"),
+ new TargetElement(XHTML, "br"));
+ MAPPINGS.put(
+ new QName(TEXT_NS, "list-item"),
+ new TargetElement(XHTML, "li"));
+ MAPPINGS.put(
+ new QName(TEXT_NS, "note"),
+ new TargetElement(XHTML, "div"));
+ MAPPINGS.put(
+ new QName(OFFICE_NS, "annotation"),
+ new TargetElement(XHTML, "div"));
+ MAPPINGS.put(
+ new QName(PRESENTATION_NS, "notes"),
+ new TargetElement(XHTML, "div"));
+ MAPPINGS.put(
+ new QName(DRAW_NS, "object"),
+ new TargetElement(XHTML, "object"));
+ MAPPINGS.put(
+ new QName(DRAW_NS, "text-box"),
+ new TargetElement(XHTML, "div"));
+ MAPPINGS.put(
+ new QName(SVG_NS, "title"),
+ new TargetElement(XHTML, "span"));
+ MAPPINGS.put(
+ new QName(SVG_NS, "desc"),
+ new TargetElement(XHTML, "span"));
+ MAPPINGS.put(
+ new QName(TEXT_NS, "span"),
+ new TargetElement(XHTML, "span"));
+
+ final HashMap<QName, QName> aAttsMapping =
+ new HashMap<QName, QName>();
+ aAttsMapping.put(
+ new QName(XLINK_NS, "href"),
+ new QName("href"));
+ aAttsMapping.put(
+ new QName(XLINK_NS, "title"),
+ new QName("title"));
+ MAPPINGS.put(
+ new QName(TEXT_NS, "a"),
+ new TargetElement(XHTML, "a", aAttsMapping));
+
+ // create HTML tables from table:-tags
+ MAPPINGS.put(
+ new QName(TABLE_NS, "table"),
+ new TargetElement(XHTML, "table"));
+ // repeating of rows is ignored; for columns, see below!
+ MAPPINGS.put(
+ new QName(TABLE_NS, "table-row"),
+ new TargetElement(XHTML, "tr"));
+ // special mapping for rowspan/colspan attributes
+ final HashMap<QName, QName> tableCellAttsMapping =
+ new HashMap<QName, QName>();
+ tableCellAttsMapping.put(
+ new QName(TABLE_NS, "number-columns-spanned"),
+ new QName("colspan"));
+ tableCellAttsMapping.put(
+ new QName(TABLE_NS, "number-rows-spanned"),
+ new QName("rowspan"));
+ /* TODO: The following is not correct, the cell should be repeated not spanned!
+ * Code generates a HTML cell, spanning all repeated columns, to make the cell look correct.
+ * Problems may occur when both spanning and repeating is given, which is not allowed by spec.
+ * Cell spanning instead of repeating is not a problem, because OpenOffice uses it
+ * only for empty cells.
+ */
+ tableCellAttsMapping.put(
+ new QName(TABLE_NS, "number-columns-repeated"),
+ new QName("colspan"));
+ MAPPINGS.put(
+ new QName(TABLE_NS, "table-cell"),
+ new TargetElement(XHTML, "td", tableCellAttsMapping));
+ }
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return Collections.emptySet(); // not a top-level parser
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ parseInternal(stream,
+ new XHTMLContentHandler(handler, metadata),
+ metadata, context);
+ }
+
+ void parseInternal(
+ InputStream stream, final ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+
+ DefaultHandler dh = new OpenDocumentElementMappingContentHandler(handler, MAPPINGS);
+
+
+ SAXParser parser = context.getSAXParser();
+ parser.parse(
+ new CloseShieldInputStream(stream),
+ new OfflineContentHandler(
+ new NSNormalizerContentHandler(dh)));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
index 4713022..14b9674 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
@@ -1,199 +1,199 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.odf;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.DublinCore;
-import org.apache.tika.metadata.MSOffice;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLCore;
-import org.apache.tika.metadata.PagedText;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.xml.AttributeDependantMetadataHandler;
-import org.apache.tika.parser.xml.AttributeMetadataHandler;
-import org.apache.tika.parser.xml.ElementMetadataHandler;
-import org.apache.tika.parser.xml.MetadataHandler;
-import org.apache.tika.parser.xml.XMLParser;
-import org.apache.tika.sax.TeeContentHandler;
-import org.apache.tika.sax.xpath.CompositeMatcher;
-import org.apache.tika.sax.xpath.Matcher;
-import org.apache.tika.sax.xpath.MatchingContentHandler;
-import org.apache.tika.sax.xpath.XPathParser;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Parser for OpenDocument <code>meta.xml</code> files.
- */
-public class OpenDocumentMetaParser extends XMLParser {
- /**
- * Serial version UID
- */
- private static final long serialVersionUID = -8739250869531737584L;
-
- private static final String META_NS = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0";
- private static final XPathParser META_XPATH = new XPathParser("meta", META_NS);
-
- /**
- * @see OfficeOpenXMLCore#SUBJECT
- * @deprecated use OfficeOpenXMLCore#SUBJECT
- */
- @Deprecated
- private static final Property TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR =
- Property.composite(Office.INITIAL_AUTHOR,
- new Property[]{Property.externalText("initial-creator")});
-
- private static ContentHandler getDublinCoreHandler(
- Metadata metadata, Property property, String element) {
- return new ElementMetadataHandler(
- DublinCore.NAMESPACE_URI_DC, element,
- metadata, property);
- }
-
- private static ContentHandler getMeta(
- ContentHandler ch, Metadata md, Property property, String element) {
- Matcher matcher = new CompositeMatcher(
- META_XPATH.parse("//meta:" + element),
- META_XPATH.parse("//meta:" + element + "//text()"));
- ContentHandler branch =
- new MatchingContentHandler(new MetadataHandler(md, property), matcher);
- return new TeeContentHandler(ch, branch);
- }
-
- private static ContentHandler getUserDefined(
- ContentHandler ch, Metadata md) {
- Matcher matcher = new CompositeMatcher(
- META_XPATH.parse("//meta:user-defined/@meta:name"),
- META_XPATH.parse("//meta:user-defined//text()"));
- // eg <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes custom:Info1=Text1
- ContentHandler branch = new MatchingContentHandler(
- new AttributeDependantMetadataHandler(md, "meta:name", Metadata.USER_DEFINED_METADATA_NAME_PREFIX),
- matcher);
- return new TeeContentHandler(ch, branch);
- }
-
- @Deprecated
- private static ContentHandler getStatistic(
- ContentHandler ch, Metadata md, String name, String attribute) {
- Matcher matcher =
- META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
- ContentHandler branch = new MatchingContentHandler(
- new AttributeMetadataHandler(META_NS, attribute, md, name), matcher);
- return new TeeContentHandler(ch, branch);
- }
-
- private static ContentHandler getStatistic(
- ContentHandler ch, Metadata md, Property property, String attribute) {
- Matcher matcher =
- META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
- ContentHandler branch = new MatchingContentHandler(
- new AttributeMetadataHandler(META_NS, attribute, md, property), matcher);
- return new TeeContentHandler(ch, branch);
- }
-
- protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context) {
- // We can no longer extend DcXMLParser due to the handling of dc:subject and dc:date
- // Process the Dublin Core Attributes
- ch = new TeeContentHandler(super.getContentHandler(ch, md, context),
- getDublinCoreHandler(md, TikaCoreProperties.TITLE, "title"),
- getDublinCoreHandler(md, TikaCoreProperties.CREATOR, "creator"),
- getDublinCoreHandler(md, TikaCoreProperties.DESCRIPTION, "description"),
- getDublinCoreHandler(md, TikaCoreProperties.PUBLISHER, "publisher"),
- getDublinCoreHandler(md, TikaCoreProperties.CONTRIBUTOR, "contributor"),
- getDublinCoreHandler(md, TikaCoreProperties.TYPE, "type"),
- getDublinCoreHandler(md, TikaCoreProperties.FORMAT, "format"),
- getDublinCoreHandler(md, TikaCoreProperties.IDENTIFIER, "identifier"),
- getDublinCoreHandler(md, TikaCoreProperties.LANGUAGE, "language"),
- getDublinCoreHandler(md, TikaCoreProperties.RIGHTS, "rights"));
-
- // Process the OO Meta Attributes
- ch = getMeta(ch, md, TikaCoreProperties.CREATED, "creation-date");
- // ODF uses dc:date for modified
- ch = new TeeContentHandler(ch, new ElementMetadataHandler(
- DublinCore.NAMESPACE_URI_DC, "date",
- md, TikaCoreProperties.MODIFIED));
-
- // ODF uses dc:subject for description
- ch = new TeeContentHandler(ch, new ElementMetadataHandler(
- DublinCore.NAMESPACE_URI_DC, "subject",
- md, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT));
- ch = getMeta(ch, md, TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, "keyword");
-
- ch = getMeta(ch, md, Property.externalText(MSOffice.EDIT_TIME), "editing-duration");
- ch = getMeta(ch, md, Property.externalText("editing-cycles"), "editing-cycles");
- ch = getMeta(ch, md, TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR, "initial-creator");
- ch = getMeta(ch, md, Property.externalText("generator"), "generator");
-
- // Process the user defined Meta Attributes
- ch = getUserDefined(ch, md);
-
- // Process the OO Statistics Attributes
- ch = getStatistic(ch, md, Office.OBJECT_COUNT, "object-count");
- ch = getStatistic(ch, md, Office.IMAGE_COUNT, "image-count");
- ch = getStatistic(ch, md, Office.PAGE_COUNT, "page-count");
- ch = getStatistic(ch, md, PagedText.N_PAGES, "page-count");
- ch = getStatistic(ch, md, Office.TABLE_COUNT, "table-count");
- ch = getStatistic(ch, md, Office.PARAGRAPH_COUNT, "paragraph-count");
- ch = getStatistic(ch, md, Office.WORD_COUNT, "word-count");
- ch = getStatistic(ch, md, Office.CHARACTER_COUNT, "character-count");
-
- // Legacy, Tika-1.0 style attributes
- // TODO Remove these in Tika 2.0
- ch = getStatistic(ch, md, MSOffice.OBJECT_COUNT, "object-count");
- ch = getStatistic(ch, md, MSOffice.IMAGE_COUNT, "image-count");
- ch = getStatistic(ch, md, MSOffice.PAGE_COUNT, "page-count");
- ch = getStatistic(ch, md, MSOffice.TABLE_COUNT, "table-count");
- ch = getStatistic(ch, md, MSOffice.PARAGRAPH_COUNT, "paragraph-count");
- ch = getStatistic(ch, md, MSOffice.WORD_COUNT, "word-count");
- ch = getStatistic(ch, md, MSOffice.CHARACTER_COUNT, "character-count");
-
- // Legacy Statistics Attributes, replaced with real keys above
- // TODO Remove these shortly, eg after Tika 1.1 (TIKA-770)
- ch = getStatistic(ch, md, "nbPage", "page-count");
- ch = getStatistic(ch, md, "nbPara", "paragraph-count");
- ch = getStatistic(ch, md, "nbWord", "word-count");
- ch = getStatistic(ch, md, "nbCharacter", "character-count");
- ch = getStatistic(ch, md, "nbTab", "table-count");
- ch = getStatistic(ch, md, "nbObject", "object-count");
- ch = getStatistic(ch, md, "nbImg", "image-count");
-
- // Normalise the rest
- ch = new NSNormalizerContentHandler(ch);
- return ch;
- }
-
- @Override
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- super.parse(stream, handler, metadata, context);
- // Copy subject to description for OO2
- String odfSubject = metadata.get(OfficeOpenXMLCore.SUBJECT);
- if (odfSubject != null && !odfSubject.equals("") &&
- (metadata.get(TikaCoreProperties.DESCRIPTION) == null || metadata.get(TikaCoreProperties.DESCRIPTION).equals(""))) {
- metadata.set(TikaCoreProperties.DESCRIPTION, odfSubject);
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.DublinCore;
+import org.apache.tika.metadata.MSOffice;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.PagedText;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.xml.AttributeDependantMetadataHandler;
+import org.apache.tika.parser.xml.AttributeMetadataHandler;
+import org.apache.tika.parser.xml.ElementMetadataHandler;
+import org.apache.tika.parser.xml.MetadataHandler;
+import org.apache.tika.parser.xml.XMLParser;
+import org.apache.tika.sax.TeeContentHandler;
+import org.apache.tika.sax.xpath.CompositeMatcher;
+import org.apache.tika.sax.xpath.Matcher;
+import org.apache.tika.sax.xpath.MatchingContentHandler;
+import org.apache.tika.sax.xpath.XPathParser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for OpenDocument <code>meta.xml</code> files.
+ */
+public class OpenDocumentMetaParser extends XMLParser {
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = -8739250869531737584L;
+
+ private static final String META_NS = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0";
+ private static final XPathParser META_XPATH = new XPathParser("meta", META_NS);
+
+ /**
+ * @see OfficeOpenXMLCore#SUBJECT
+ * @deprecated use OfficeOpenXMLCore#SUBJECT
+ */
+ @Deprecated
+ private static final Property TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR =
+ Property.composite(Office.INITIAL_AUTHOR,
+ new Property[]{Property.externalText("initial-creator")});
+
+ private static ContentHandler getDublinCoreHandler(
+ Metadata metadata, Property property, String element) {
+ return new ElementMetadataHandler(
+ DublinCore.NAMESPACE_URI_DC, element,
+ metadata, property);
+ }
+
+ private static ContentHandler getMeta(
+ ContentHandler ch, Metadata md, Property property, String element) {
+ Matcher matcher = new CompositeMatcher(
+ META_XPATH.parse("//meta:" + element),
+ META_XPATH.parse("//meta:" + element + "//text()"));
+ ContentHandler branch =
+ new MatchingContentHandler(new MetadataHandler(md, property), matcher);
+ return new TeeContentHandler(ch, branch);
+ }
+
+ private static ContentHandler getUserDefined(
+ ContentHandler ch, Metadata md) {
+ Matcher matcher = new CompositeMatcher(
+ META_XPATH.parse("//meta:user-defined/@meta:name"),
+ META_XPATH.parse("//meta:user-defined//text()"));
+ // eg <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes custom:Info1=Text1
+ ContentHandler branch = new MatchingContentHandler(
+ new AttributeDependantMetadataHandler(md, "meta:name", Metadata.USER_DEFINED_METADATA_NAME_PREFIX),
+ matcher);
+ return new TeeContentHandler(ch, branch);
+ }
+
+ @Deprecated
+ private static ContentHandler getStatistic(
+ ContentHandler ch, Metadata md, String name, String attribute) {
+ Matcher matcher =
+ META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
+ ContentHandler branch = new MatchingContentHandler(
+ new AttributeMetadataHandler(META_NS, attribute, md, name), matcher);
+ return new TeeContentHandler(ch, branch);
+ }
+
+ private static ContentHandler getStatistic(
+ ContentHandler ch, Metadata md, Property property, String attribute) {
+ Matcher matcher =
+ META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
+ ContentHandler branch = new MatchingContentHandler(
+ new AttributeMetadataHandler(META_NS, attribute, md, property), matcher);
+ return new TeeContentHandler(ch, branch);
+ }
+
+ protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context) {
+ // We can no longer extend DcXMLParser due to the handling of dc:subject and dc:date
+ // Process the Dublin Core Attributes
+ ch = new TeeContentHandler(super.getContentHandler(ch, md, context),
+ getDublinCoreHandler(md, TikaCoreProperties.TITLE, "title"),
+ getDublinCoreHandler(md, TikaCoreProperties.CREATOR, "creator"),
+ getDublinCoreHandler(md, TikaCoreProperties.DESCRIPTION, "description"),
+ getDublinCoreHandler(md, TikaCoreProperties.PUBLISHER, "publisher"),
+ getDublinCoreHandler(md, TikaCoreProperties.CONTRIBUTOR, "contributor"),
+ getDublinCoreHandler(md, TikaCoreProperties.TYPE, "type"),
+ getDublinCoreHandler(md, TikaCoreProperties.FORMAT, "format"),
+ getDublinCoreHandler(md, TikaCoreProperties.IDENTIFIER, "identifier"),
+ getDublinCoreHandler(md, TikaCoreProperties.LANGUAGE, "language"),
+ getDublinCoreHandler(md, TikaCoreProperties.RIGHTS, "rights"));
+
+ // Process the OO Meta Attributes
+ ch = getMeta(ch, md, TikaCoreProperties.CREATED, "creation-date");
+ // ODF uses dc:date for modified
+ ch = new TeeContentHandler(ch, new ElementMetadataHandler(
+ DublinCore.NAMESPACE_URI_DC, "date",
+ md, TikaCoreProperties.MODIFIED));
+
+ // ODF uses dc:subject for description
+ ch = new TeeContentHandler(ch, new ElementMetadataHandler(
+ DublinCore.NAMESPACE_URI_DC, "subject",
+ md, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT));
+ ch = getMeta(ch, md, TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, "keyword");
+
+ ch = getMeta(ch, md, Property.externalText(MSOffice.EDIT_TIME), "editing-duration");
+ ch = getMeta(ch, md, Property.externalText("editing-cycles"), "editing-cycles");
+ ch = getMeta(ch, md, TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR, "initial-creator");
+ ch = getMeta(ch, md, Property.externalText("generator"), "generator");
+
+ // Process the user defined Meta Attributes
+ ch = getUserDefined(ch, md);
+
+ // Process the OO Statistics Attributes
+ ch = getStatistic(ch, md, Office.OBJECT_COUNT, "object-count");
+ ch = getStatistic(ch, md, Office.IMAGE_COUNT, "image-count");
+ ch = getStatistic(ch, md, Office.PAGE_COUNT, "page-count");
+ ch = getStatistic(ch, md, PagedText.N_PAGES, "page-count");
+ ch = getStatistic(ch, md, Office.TABLE_COUNT, "table-count");
+ ch = getStatistic(ch, md, Office.PARAGRAPH_COUNT, "paragraph-count");
+ ch = getStatistic(ch, md, Office.WORD_COUNT, "word-count");
+ ch = getStatistic(ch, md, Office.CHARACTER_COUNT, "character-count");
+
+ // Legacy, Tika-1.0 style attributes
+ // TODO Remove these in Tika 2.0
+ ch = getStatistic(ch, md, MSOffice.OBJECT_COUNT, "object-count");
+ ch = getStatistic(ch, md, MSOffice.IMAGE_COUNT, "image-count");
+ ch = getStatistic(ch, md, MSOffice.PAGE_COUNT, "page-count");
+ ch = getStatistic(ch, md, MSOffice.TABLE_COUNT, "table-count");
+ ch = getStatistic(ch, md, MSOffice.PARAGRAPH_COUNT, "paragraph-count");
+ ch = getStatistic(ch, md, MSOffice.WORD_COUNT, "word-count");
+ ch = getStatistic(ch, md, MSOffice.CHARACTER_COUNT, "character-count");
+
+ // Legacy Statistics Attributes, replaced with real keys above
+ // TODO Remove these shortly, eg after Tika 1.1 (TIKA-770)
+ ch = getStatistic(ch, md, "nbPage", "page-count");
+ ch = getStatistic(ch, md, "nbPara", "paragraph-count");
+ ch = getStatistic(ch, md, "nbWord", "word-count");
+ ch = getStatistic(ch, md, "nbCharacter", "character-count");
+ ch = getStatistic(ch, md, "nbTab", "table-count");
+ ch = getStatistic(ch, md, "nbObject", "object-count");
+ ch = getStatistic(ch, md, "nbImg", "image-count");
+
+ // Normalise the rest
+ ch = new NSNormalizerContentHandler(ch);
+ return ch;
+ }
+
+ @Override
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ super.parse(stream, handler, metadata, context);
+ // Copy subject to description for OO2
+ String odfSubject = metadata.get(OfficeOpenXMLCore.SUBJECT);
+ if (odfSubject != null && !odfSubject.equals("") &&
+ (metadata.get(TikaCoreProperties.DESCRIPTION) == null || metadata.get(TikaCoreProperties.DESCRIPTION).equals(""))) {
+ metadata.set(TikaCoreProperties.DESCRIPTION, odfSubject);
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index 2739340..00145d2 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -1,225 +1,225 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.odf;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.Enumeration;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipFile;
-import java.util.zip.ZipInputStream;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.EndDocumentShieldingContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-/**
- * OpenOffice parser
- */
-public class OpenDocumentParser extends AbstractParser {
-
- /**
- * Serial version UID
- */
- private static final long serialVersionUID = -6410276875438618287L;
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- MediaType.application("vnd.sun.xml.writer"),
- MediaType.application("vnd.oasis.opendocument.text"),
- MediaType.application("vnd.oasis.opendocument.graphics"),
- MediaType.application("vnd.oasis.opendocument.presentation"),
- MediaType.application("vnd.oasis.opendocument.spreadsheet"),
- MediaType.application("vnd.oasis.opendocument.chart"),
- MediaType.application("vnd.oasis.opendocument.image"),
- MediaType.application("vnd.oasis.opendocument.formula"),
- MediaType.application("vnd.oasis.opendocument.text-master"),
- MediaType.application("vnd.oasis.opendocument.text-web"),
- MediaType.application("vnd.oasis.opendocument.text-template"),
- MediaType.application("vnd.oasis.opendocument.graphics-template"),
- MediaType.application("vnd.oasis.opendocument.presentation-template"),
- MediaType.application("vnd.oasis.opendocument.spreadsheet-template"),
- MediaType.application("vnd.oasis.opendocument.chart-template"),
- MediaType.application("vnd.oasis.opendocument.image-template"),
- MediaType.application("vnd.oasis.opendocument.formula-template"),
- MediaType.application("x-vnd.oasis.opendocument.text"),
- MediaType.application("x-vnd.oasis.opendocument.graphics"),
- MediaType.application("x-vnd.oasis.opendocument.presentation"),
- MediaType.application("x-vnd.oasis.opendocument.spreadsheet"),
- MediaType.application("x-vnd.oasis.opendocument.chart"),
- MediaType.application("x-vnd.oasis.opendocument.image"),
- MediaType.application("x-vnd.oasis.opendocument.formula"),
- MediaType.application("x-vnd.oasis.opendocument.text-master"),
- MediaType.application("x-vnd.oasis.opendocument.text-web"),
- MediaType.application("x-vnd.oasis.opendocument.text-template"),
- MediaType.application("x-vnd.oasis.opendocument.graphics-template"),
- MediaType.application("x-vnd.oasis.opendocument.presentation-template"),
- MediaType.application("x-vnd.oasis.opendocument.spreadsheet-template"),
- MediaType.application("x-vnd.oasis.opendocument.chart-template"),
- MediaType.application("x-vnd.oasis.opendocument.image-template"),
- MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
-
- private static final String META_NAME = "meta.xml";
-
- private Parser meta = new OpenDocumentMetaParser();
-
- private Parser content = new OpenDocumentContentParser();
-
- public Parser getMetaParser() {
- return meta;
- }
-
- public void setMetaParser(Parser meta) {
- this.meta = meta;
- }
-
- public Parser getContentParser() {
- return content;
- }
-
- public void setContentParser(Parser content) {
- this.content = content;
- }
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- public void parse(
- InputStream stream, ContentHandler baseHandler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
-
- // Open the Zip stream
- // Use a File if we can, and an already open zip is even better
- ZipFile zipFile = null;
- ZipInputStream zipStream = null;
- if (stream instanceof TikaInputStream) {
- TikaInputStream tis = (TikaInputStream) stream;
- Object container = ((TikaInputStream) stream).getOpenContainer();
- if (container instanceof ZipFile) {
- zipFile = (ZipFile) container;
- } else if (tis.hasFile()) {
- zipFile = new ZipFile(tis.getFile());
- } else {
- zipStream = new ZipInputStream(stream);
- }
- } else {
- zipStream = new ZipInputStream(stream);
- }
-
- // Prepare to handle the content
- XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);
-
- // As we don't know which of the metadata or the content
- // we'll hit first, catch the endDocument call initially
- EndDocumentShieldingContentHandler handler =
- new EndDocumentShieldingContentHandler(xhtml);
-
- if (zipFile != null) {
- try {
- handleZipFile(zipFile, metadata, context, handler);
- } finally {
- //Do we want to close silently == catch an exception here?
- zipFile.close();
- }
- } else {
- try {
- handleZipStream(zipStream, metadata, context, handler);
- } finally {
- //Do we want to close silently == catch an exception here?
- zipStream.close();
- }
- }
-
- // Only now call the end document
- if (handler.getEndDocumentWasCalled()) {
- handler.reallyEndDocument();
- }
- }
-
- private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context, EndDocumentShieldingContentHandler handler) throws IOException, TikaException, SAXException {
- ZipEntry entry = zipStream.getNextEntry();
- while (entry != null) {
- handleZipEntry(entry, zipStream, metadata, context, handler);
- entry = zipStream.getNextEntry();
- }
- }
-
- private void handleZipFile(ZipFile zipFile, Metadata metadata,
- ParseContext context, EndDocumentShieldingContentHandler handler)
- throws IOException, TikaException, SAXException {
- // If we can, process the metadata first, then the
- // rest of the file afterwards (TIKA-1353)
- // Only possible to guarantee that when opened from a file not a stream
-
- ZipEntry entry = zipFile.getEntry(META_NAME);
- if (entry != null) {
- handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
- }
-
- Enumeration<? extends ZipEntry> entries = zipFile.entries();
- while (entries.hasMoreElements()) {
- entry = entries.nextElement();
- if (!META_NAME.equals(entry.getName())) {
- handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
- }
- }
- }
- private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
- ParseContext context, EndDocumentShieldingContentHandler handler)
- throws IOException, SAXException, TikaException {
- if (entry == null) return;
-
- if (entry.getName().equals("mimetype")) {
- String type = IOUtils.toString(zip, UTF_8);
- metadata.set(Metadata.CONTENT_TYPE, type);
- } else if (entry.getName().equals(META_NAME)) {
- meta.parse(zip, new DefaultHandler(), metadata, context);
- } else if (entry.getName().endsWith("content.xml")) {
- if (content instanceof OpenDocumentContentParser) {
- ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
- } else {
- // Foreign content parser was set:
- content.parse(zip, handler, metadata, context);
- }
- } else if (entry.getName().endsWith("styles.xml")) {
- if (content instanceof OpenDocumentContentParser) {
- ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
- } else {
- // Foreign content parser was set:
- content.parse(zip, handler, metadata, context);
- }
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+import java.util.zip.ZipInputStream;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.EndDocumentShieldingContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * OpenOffice parser
+ */
+public class OpenDocumentParser extends AbstractParser {
+
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = -6410276875438618287L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.application("vnd.sun.xml.writer"),
+ MediaType.application("vnd.oasis.opendocument.text"),
+ MediaType.application("vnd.oasis.opendocument.graphics"),
+ MediaType.application("vnd.oasis.opendocument.presentation"),
+ MediaType.application("vnd.oasis.opendocument.spreadsheet"),
+ MediaType.application("vnd.oasis.opendocument.chart"),
+ MediaType.application("vnd.oasis.opendocument.image"),
+ MediaType.application("vnd.oasis.opendocument.formula"),
+ MediaType.application("vnd.oasis.opendocument.text-master"),
+ MediaType.application("vnd.oasis.opendocument.text-web"),
+ MediaType.application("vnd.oasis.opendocument.text-template"),
+ MediaType.application("vnd.oasis.opendocument.graphics-template"),
+ MediaType.application("vnd.oasis.opendocument.presentation-template"),
+ MediaType.application("vnd.oasis.opendocument.spreadsheet-template"),
+ MediaType.application("vnd.oasis.opendocument.chart-template"),
+ MediaType.application("vnd.oasis.opendocument.image-template"),
+ MediaType.application("vnd.oasis.opendocument.formula-template"),
+ MediaType.application("x-vnd.oasis.opendocument.text"),
+ MediaType.application("x-vnd.oasis.opendocument.graphics"),
+ MediaType.application("x-vnd.oasis.opendocument.presentation"),
+ MediaType.application("x-vnd.oasis.opendocument.spreadsheet"),
+ MediaType.application("x-vnd.oasis.opendocument.chart"),
+ MediaType.application("x-vnd.oasis.opendocument.image"),
+ MediaType.application("x-vnd.oasis.opendocument.formula"),
+ MediaType.application("x-vnd.oasis.opendocument.text-master"),
+ MediaType.application("x-vnd.oasis.opendocument.text-web"),
+ MediaType.application("x-vnd.oasis.opendocument.text-template"),
+ MediaType.application("x-vnd.oasis.opendocument.graphics-template"),
+ MediaType.application("x-vnd.oasis.opendocument.presentation-template"),
+ MediaType.application("x-vnd.oasis.opendocument.spreadsheet-template"),
+ MediaType.application("x-vnd.oasis.opendocument.chart-template"),
+ MediaType.application("x-vnd.oasis.opendocument.image-template"),
+ MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
+
+ private static final String META_NAME = "meta.xml";
+
+ private Parser meta = new OpenDocumentMetaParser();
+
+ private Parser content = new OpenDocumentContentParser();
+
+ public Parser getMetaParser() {
+ return meta;
+ }
+
+ public void setMetaParser(Parser meta) {
+ this.meta = meta;
+ }
+
+ public Parser getContentParser() {
+ return content;
+ }
+
+ public void setContentParser(Parser content) {
+ this.content = content;
+ }
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler baseHandler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+
+ // Open the Zip stream
+ // Use a File if we can, and an already open zip is even better
+ ZipFile zipFile = null;
+ ZipInputStream zipStream = null;
+ if (stream instanceof TikaInputStream) {
+ TikaInputStream tis = (TikaInputStream) stream;
+ Object container = ((TikaInputStream) stream).getOpenContainer();
+ if (container instanceof ZipFile) {
+ zipFile = (ZipFile) container;
+ } else if (tis.hasFile()) {
+ zipFile = new ZipFile(tis.getFile());
+ } else {
+ zipStream = new ZipInputStream(stream);
+ }
+ } else {
+ zipStream = new ZipInputStream(stream);
+ }
+
+ // Prepare to handle the content
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);
+
+ // As we don't know which of the metadata or the content
+ // we'll hit first, catch the endDocument call initially
+ EndDocumentShieldingContentHandler handler =
+ new EndDocumentShieldingContentHandler(xhtml);
+
+ if (zipFile != null) {
+ try {
+ handleZipFile(zipFile, metadata, context, handler);
+ } finally {
+ //Do we want to close silently == catch an exception here?
+ zipFile.close();
+ }
+ } else {
+ try {
+ handleZipStream(zipStream, metadata, context, handler);
+ } finally {
+ //Do we want to close silently == catch an exception here?
+ zipStream.close();
+ }
+ }
+
+ // Only now call the end document
+ if (handler.getEndDocumentWasCalled()) {
+ handler.reallyEndDocument();
+ }
+ }
+
+ private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context, EndDocumentShieldingContentHandler handler) throws IOException, TikaException, SAXException {
+ ZipEntry entry = zipStream.getNextEntry();
+ while (entry != null) {
+ handleZipEntry(entry, zipStream, metadata, context, handler);
+ entry = zipStream.getNextEntry();
+ }
+ }
+
+ private void handleZipFile(ZipFile zipFile, Metadata metadata,
+ ParseContext context, EndDocumentShieldingContentHandler handler)
+ throws IOException, TikaException, SAXException {
+ // If we can, process the metadata first, then the
+ // rest of the file afterwards (TIKA-1353)
+ // Only possible to guarantee that when opened from a file not a stream
+
+ ZipEntry entry = zipFile.getEntry(META_NAME);
+ if (entry != null) {
+ handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
+ }
+
+ Enumeration<? extends ZipEntry> entries = zipFile.entries();
+ while (entries.hasMoreElements()) {
+ entry = entries.nextElement();
+ if (!META_NAME.equals(entry.getName())) {
+ handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
+ }
+ }
+ }
+ private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
+ ParseContext context, EndDocumentShieldingContentHandler handler)
+ throws IOException, SAXException, TikaException {
+ if (entry == null) return;
+
+ if (entry.getName().equals("mimetype")) {
+ String type = IOUtils.toString(zip, UTF_8);
+ metadata.set(Metadata.CONTENT_TYPE, type);
+ } else if (entry.getName().equals(META_NAME)) {
+ meta.parse(zip, new DefaultHandler(), metadata, context);
+ } else if (entry.getName().endsWith("content.xml")) {
+ if (content instanceof OpenDocumentContentParser) {
+ ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
+ } else {
+ // Foreign content parser was set:
+ content.parse(zip, handler, metadata, context);
+ }
+ } else if (entry.getName().endsWith("styles.xml")) {
+ if (content instanceof OpenDocumentContentParser) {
+ ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
+ } else {
+ // Foreign content parser was set:
+ content.parse(zip, handler, metadata, context);
+ }
+ }
+ }
+}
[03/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java
index 45f0388..da046aa 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java
@@ -1,43 +1,43 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import java.util.Locale;
-
-/**
- * Alternative HTML mapping rules that pass the input HTML as-is without any
- * modifications.
- *
- * @since Apache Tika 0.8
- */
-public class IdentityHtmlMapper implements HtmlMapper {
-
- public static final HtmlMapper INSTANCE = new IdentityHtmlMapper();
-
- public boolean isDiscardElement(String name) {
- return false;
- }
-
- public String mapSafeAttribute(String elementName, String attributeName) {
- return attributeName.toLowerCase(Locale.ENGLISH);
- }
-
- public String mapSafeElement(String name) {
- return name.toLowerCase(Locale.ENGLISH);
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.util.Locale;
+
+/**
+ * Alternative HTML mapping rules that pass the input HTML as-is without any
+ * modifications.
+ *
+ * @since Apache Tika 0.8
+ */
+public class IdentityHtmlMapper implements HtmlMapper {
+
+ public static final HtmlMapper INSTANCE = new IdentityHtmlMapper();
+
+ public boolean isDiscardElement(String name) {
+ return false;
+ }
+
+ public String mapSafeAttribute(String elementName, String attributeName) {
+ return attributeName.toLowerCase(Locale.ENGLISH);
+ }
+
+ public String mapSafeElement(String name) {
+ return name.toLowerCase(Locale.ENGLISH);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java
index 336ae75..221a87a 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java
@@ -1,78 +1,78 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import javax.xml.XMLConstants;
-import java.util.Locale;
-
-import org.apache.tika.sax.ContentHandlerDecorator;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-/**
- * Content handler decorator that downgrades XHTML elements to
- * old-style HTML elements before passing them on to the decorated
- * content handler. This downgrading consists of dropping all namespaces
- * (and namespaced attributes) and uppercasing all element names.
- * Used by the {@link HtmlParser} to make all incoming HTML look the same.
- */
-class XHTMLDowngradeHandler extends ContentHandlerDecorator {
-
- public XHTMLDowngradeHandler(ContentHandler handler) {
- super(handler);
- }
-
- @Override
- public void startElement(
- String uri, String localName, String name, Attributes atts)
- throws SAXException {
- String upper = localName.toUpperCase(Locale.ENGLISH);
-
- AttributesImpl attributes = new AttributesImpl();
- for (int i = 0; i < atts.getLength(); i++) {
- String auri = atts.getURI(i);
- String local = atts.getLocalName(i);
- String qname = atts.getQName(i);
- if (XMLConstants.NULL_NS_URI.equals(auri)
- && !local.equals(XMLConstants.XMLNS_ATTRIBUTE)
- && !qname.startsWith(XMLConstants.XMLNS_ATTRIBUTE + ":")) {
- attributes.addAttribute(
- auri, local, qname, atts.getType(i), atts.getValue(i));
- }
- }
-
- super.startElement(XMLConstants.NULL_NS_URI, upper, upper, attributes);
- }
-
- @Override
- public void endElement(String uri, String localName, String name)
- throws SAXException {
- String upper = localName.toUpperCase(Locale.ENGLISH);
- super.endElement(XMLConstants.NULL_NS_URI, upper, upper);
- }
-
- @Override
- public void startPrefixMapping(String prefix, String uri) {
- }
-
- @Override
- public void endPrefixMapping(String prefix) {
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import javax.xml.XMLConstants;
+import java.util.Locale;
+
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Content handler decorator that downgrades XHTML elements to
+ * old-style HTML elements before passing them on to the decorated
+ * content handler. This downgrading consists of dropping all namespaces
+ * (and namespaced attributes) and uppercasing all element names.
+ * Used by the {@link HtmlParser} to make all incoming HTML look the same.
+ */
+class XHTMLDowngradeHandler extends ContentHandlerDecorator {
+
+ public XHTMLDowngradeHandler(ContentHandler handler) {
+ super(handler);
+ }
+
+ @Override
+ public void startElement(
+ String uri, String localName, String name, Attributes atts)
+ throws SAXException {
+ String upper = localName.toUpperCase(Locale.ENGLISH);
+
+ AttributesImpl attributes = new AttributesImpl();
+ for (int i = 0; i < atts.getLength(); i++) {
+ String auri = atts.getURI(i);
+ String local = atts.getLocalName(i);
+ String qname = atts.getQName(i);
+ if (XMLConstants.NULL_NS_URI.equals(auri)
+ && !local.equals(XMLConstants.XMLNS_ATTRIBUTE)
+ && !qname.startsWith(XMLConstants.XMLNS_ATTRIBUTE + ":")) {
+ attributes.addAttribute(
+ auri, local, qname, atts.getType(i), atts.getValue(i));
+ }
+ }
+
+ super.startElement(XMLConstants.NULL_NS_URI, upper, upper, attributes);
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String name)
+ throws SAXException {
+ String upper = localName.toUpperCase(Locale.ENGLISH);
+ super.endElement(XMLConstants.NULL_NS_URI, upper, upper);
+ }
+
+ @Override
+ public void startPrefixMapping(String prefix, String uri) {
+ }
+
+ @Override
+ public void endPrefixMapping(String prefix) {
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index 9740eff..2c8942e 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -1,376 +1,376 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mail;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.text.DateFormat;
-import java.text.DateFormatSymbols;
-import java.text.ParseException;
-import java.text.SimpleDateFormat;
-import java.util.Date;
-import java.util.Locale;
-import java.util.TimeZone;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.james.mime4j.MimeException;
-import org.apache.james.mime4j.codec.DecodeMonitor;
-import org.apache.james.mime4j.codec.DecoderUtil;
-import org.apache.james.mime4j.dom.address.Address;
-import org.apache.james.mime4j.dom.address.AddressList;
-import org.apache.james.mime4j.dom.address.Mailbox;
-import org.apache.james.mime4j.dom.address.MailboxList;
-import org.apache.james.mime4j.dom.field.AddressListField;
-import org.apache.james.mime4j.dom.field.DateTimeField;
-import org.apache.james.mime4j.dom.field.MailboxListField;
-import org.apache.james.mime4j.dom.field.ParsedField;
-import org.apache.james.mime4j.dom.field.UnstructuredField;
-import org.apache.james.mime4j.field.LenientFieldParser;
-import org.apache.james.mime4j.parser.ContentHandler;
-import org.apache.james.mime4j.stream.BodyDescriptor;
-import org.apache.james.mime4j.stream.Field;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.SAXException;
-
-import static org.apache.tika.utils.DateUtils.MIDDAY;
-import static org.apache.tika.utils.DateUtils.UTC;
-
-/**
- * Bridge between mime4j's content handler and the generic Sax content handler
- * used by Tika. See
- * http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/ContentHandler.html
- */
-class MailContentHandler implements ContentHandler {
-
- //TIKA-1970 Mac Mail's format
- private static final Pattern GENERAL_TIME_ZONE_NO_MINUTES_PATTERN =
- Pattern.compile("(?:UTC|GMT)([+-])(\\d?\\d)\\Z");
-
- //find a time ending in am/pm without a space: 10:30am and
- //use this pattern to insert space: 10:30 am
- private static final Pattern AM_PM = Pattern.compile("(?i)(\\d)([ap]m)\\b");
-
- private static final DateFormat[] ALTERNATE_DATE_FORMATS = new DateFormat[] {
- //note that the string is "cleaned" before processing:
- //1) condense multiple whitespace to single space
- //2) trim()
- //3) strip out commas
- //4) insert space before am/pm
-
- //May 16 2016 1:32am
- createDateFormat("MMM dd yy hh:mm a", null),
-
- //this is a standard pattern handled by mime4j;
- //but mime4j fails with leading whitespace
- createDateFormat("EEE d MMM yy HH:mm:ss Z", UTC),
-
- createDateFormat("EEE d MMM yy HH:mm:ss z", UTC),
-
- createDateFormat("EEE d MMM yy HH:mm:ss", null),// no timezone
-
- createDateFormat("EEEEE MMM d yy hh:mm a", null),// Sunday, May 15 2016 1:32 PM
-
- //16 May 2016 at 09:30:32 GMT+1 (Mac Mail TIKA-1970)
- createDateFormat("d MMM yy 'at' HH:mm:ss z", UTC), // UTC/Zulu
-
- createDateFormat("yy-MM-dd HH:mm:ss", null),
-
- createDateFormat("MM/dd/yy hh:mm a", null, false),
-
- //now dates without times
- createDateFormat("MMM d yy", MIDDAY, false),
- createDateFormat("EEE d MMM yy", MIDDAY, false),
- createDateFormat("d MMM yy", MIDDAY, false),
- createDateFormat("yy/MM/dd", MIDDAY, false),
- createDateFormat("MM/dd/yy", MIDDAY, false)
- };
-
- private static DateFormat createDateFormat(String format, TimeZone timezone) {
- return createDateFormat(format, timezone, true);
- }
-
- private static DateFormat createDateFormat(String format, TimeZone timezone, boolean isLenient) {
- SimpleDateFormat sdf =
- new SimpleDateFormat(format, new DateFormatSymbols(Locale.US));
- if (timezone != null) {
- sdf.setTimeZone(timezone);
- }
- sdf.setLenient(isLenient);
- return sdf;
- }
-
- private boolean strictParsing = false;
-
- private XHTMLContentHandler handler;
- private Metadata metadata;
- private EmbeddedDocumentExtractor extractor;
-
- private boolean inPart = false;
-
- MailContentHandler(XHTMLContentHandler xhtml, Metadata metadata, ParseContext context, boolean strictParsing) {
- this.handler = xhtml;
- this.metadata = metadata;
- this.strictParsing = strictParsing;
-
- // Fetch / Build an EmbeddedDocumentExtractor with which
- // to handle/process the parts/attachments
-
- // Was an EmbeddedDocumentExtractor explicitly supplied?
- this.extractor = context.get(EmbeddedDocumentExtractor.class);
-
- // If there's no EmbeddedDocumentExtractor, then try using a normal parser
- // This will ensure that the contents are made available to the user, so
- // the see the text, but without fine-grained control/extraction
- // (This also maintains backward compatibility with older versions!)
- if (this.extractor == null) {
- // If the user gave a parser, use that, if not the default
- Parser parser = context.get(AutoDetectParser.class);
- if (parser == null) {
- parser = context.get(Parser.class);
- }
- if (parser == null) {
- TikaConfig tikaConfig = context.get(TikaConfig.class);
- if (tikaConfig == null) {
- tikaConfig = TikaConfig.getDefaultConfig();
- }
- parser = new AutoDetectParser(tikaConfig.getParser());
- }
- ParseContext ctx = new ParseContext();
- ctx.set(Parser.class, parser);
- extractor = new ParsingEmbeddedDocumentExtractor(ctx);
- }
- }
-
- public void body(BodyDescriptor body, InputStream is) throws MimeException,
- IOException {
- // use a different metadata object
- // in order to specify the mime type of the
- // sub part without damaging the main metadata
-
- Metadata submd = new Metadata();
- submd.set(Metadata.CONTENT_TYPE, body.getMimeType());
- submd.set(Metadata.CONTENT_ENCODING, body.getCharset());
-
- try {
- if (extractor.shouldParseEmbedded(submd)) {
- extractor.parseEmbedded(is, handler, submd, false);
- }
- } catch (SAXException e) {
- throw new MimeException(e);
- }
- }
-
- public void endBodyPart() throws MimeException {
- try {
- handler.endElement("p");
- handler.endElement("div");
- } catch (SAXException e) {
- throw new MimeException(e);
- }
- }
-
- public void endHeader() throws MimeException {
- }
-
- public void startMessage() throws MimeException {
- try {
- handler.startDocument();
- } catch (SAXException e) {
- throw new MimeException(e);
- }
- }
-
- public void endMessage() throws MimeException {
- try {
- handler.endDocument();
- } catch (SAXException e) {
- throw new MimeException(e);
- }
- }
-
- public void endMultipart() throws MimeException {
- inPart = false;
- }
-
- public void epilogue(InputStream is) throws MimeException, IOException {
- }
-
- /**
- * Header for the whole message or its parts
- *
- * @see <a href="http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/">
- * http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/</a>
- * Field.html
- */
- public void field(Field field) throws MimeException {
- // inPart indicates whether these metadata correspond to the
- // whole message or its parts
- if (inPart) {
- return;
- }
-
- try {
- String fieldname = field.getName();
- ParsedField parsedField = LenientFieldParser.getParser().parse(
- field, DecodeMonitor.SILENT);
- if (fieldname.equalsIgnoreCase("From")) {
- MailboxListField fromField = (MailboxListField) parsedField;
- MailboxList mailboxList = fromField.getMailboxList();
- if (fromField.isValidField() && mailboxList != null) {
- for (Address address : mailboxList) {
- String from = getDisplayString(address);
- metadata.add(Metadata.MESSAGE_FROM, from);
- metadata.add(TikaCoreProperties.CREATOR, from);
- }
- } else {
- String from = stripOutFieldPrefix(field, "From:");
- if (from.startsWith("<")) {
- from = from.substring(1);
- }
- if (from.endsWith(">")) {
- from = from.substring(0, from.length() - 1);
- }
- metadata.add(Metadata.MESSAGE_FROM, from);
- metadata.add(TikaCoreProperties.CREATOR, from);
- }
- } else if (fieldname.equalsIgnoreCase("Subject")) {
- metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE,
- ((UnstructuredField) parsedField).getValue());
- } else if (fieldname.equalsIgnoreCase("To")) {
- processAddressList(parsedField, "To:", Metadata.MESSAGE_TO);
- } else if (fieldname.equalsIgnoreCase("CC")) {
- processAddressList(parsedField, "Cc:", Metadata.MESSAGE_CC);
- } else if (fieldname.equalsIgnoreCase("BCC")) {
- processAddressList(parsedField, "Bcc:", Metadata.MESSAGE_BCC);
- } else if (fieldname.equalsIgnoreCase("Date")) {
- DateTimeField dateField = (DateTimeField) parsedField;
- Date date = dateField.getDate();
- if (date == null) {
- date = tryOtherDateFormats(field.getBody());
- }
- metadata.set(TikaCoreProperties.CREATED, date);
- }
- } catch (RuntimeException me) {
- if (strictParsing) {
- throw me;
- }
- }
- }
-
- private static synchronized Date tryOtherDateFormats(String text) {
- if (text == null) {
- return null;
- }
- //strip out additional spaces and trim
- text = text.replaceAll("\\s+", " ").trim();
-
- //strip out commas
- text = text.replaceAll(",", "");
- Matcher matcher = GENERAL_TIME_ZONE_NO_MINUTES_PATTERN.matcher(text);
- if (matcher.find()) {
- text = matcher.replaceFirst("GMT$1$2:00");
- }
-
- matcher = AM_PM.matcher(text);
- if (matcher.find()) {
- text = matcher.replaceFirst("$1 $2");
- }
-
- for (DateFormat format : ALTERNATE_DATE_FORMATS) {
- try {
- return format.parse(text);
- } catch (ParseException e) {
- }
- }
- return null;
- }
-
- private void processAddressList(ParsedField field, String addressListType,
- String metadataField) throws MimeException {
- AddressListField toField = (AddressListField) field;
- if (toField.isValidField()) {
- AddressList addressList = toField.getAddressList();
- for (Address address : addressList) {
- metadata.add(metadataField, getDisplayString(address));
- }
- } else {
- String to = stripOutFieldPrefix(field,
- addressListType);
- for (String eachTo : to.split(",")) {
- metadata.add(metadataField, eachTo.trim());
- }
- }
- }
-
- private String getDisplayString(Address address) {
- if (address instanceof Mailbox) {
- Mailbox mailbox = (Mailbox) address;
- String name = mailbox.getName();
- if (name != null && name.length() > 0) {
- name = DecoderUtil.decodeEncodedWords(name, DecodeMonitor.SILENT);
- return name + " <" + mailbox.getAddress() + ">";
- } else {
- return mailbox.getAddress();
- }
- } else {
- return address.toString();
- }
- }
-
- public void preamble(InputStream is) throws MimeException, IOException {
- }
-
- public void raw(InputStream is) throws MimeException, IOException {
- }
-
- public void startBodyPart() throws MimeException {
- try {
- handler.startElement("div", "class", "email-entry");
- handler.startElement("p");
- } catch (SAXException e) {
- throw new MimeException(e);
- }
- }
-
- public void startHeader() throws MimeException {
- // TODO Auto-generated method stub
-
- }
-
- public void startMultipart(BodyDescriptor descr) throws MimeException {
- inPart = true;
- }
-
- private String stripOutFieldPrefix(Field field, String fieldname) {
- String temp = field.getRaw().toString();
- int loc = fieldname.length();
- while (temp.charAt(loc) == ' ') {
- loc++;
- }
- return temp.substring(loc);
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mail;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.DateFormat;
+import java.text.DateFormatSymbols;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.Locale;
+import java.util.TimeZone;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.james.mime4j.MimeException;
+import org.apache.james.mime4j.codec.DecodeMonitor;
+import org.apache.james.mime4j.codec.DecoderUtil;
+import org.apache.james.mime4j.dom.address.Address;
+import org.apache.james.mime4j.dom.address.AddressList;
+import org.apache.james.mime4j.dom.address.Mailbox;
+import org.apache.james.mime4j.dom.address.MailboxList;
+import org.apache.james.mime4j.dom.field.AddressListField;
+import org.apache.james.mime4j.dom.field.DateTimeField;
+import org.apache.james.mime4j.dom.field.MailboxListField;
+import org.apache.james.mime4j.dom.field.ParsedField;
+import org.apache.james.mime4j.dom.field.UnstructuredField;
+import org.apache.james.mime4j.field.LenientFieldParser;
+import org.apache.james.mime4j.parser.ContentHandler;
+import org.apache.james.mime4j.stream.BodyDescriptor;
+import org.apache.james.mime4j.stream.Field;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+import static org.apache.tika.utils.DateUtils.MIDDAY;
+import static org.apache.tika.utils.DateUtils.UTC;
+
+/**
+ * Bridge between mime4j's content handler and the generic Sax content handler
+ * used by Tika. See
+ * http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/ContentHandler.html
+ */
+class MailContentHandler implements ContentHandler {
+
+ //TIKA-1970 Mac Mail's format
+ private static final Pattern GENERAL_TIME_ZONE_NO_MINUTES_PATTERN =
+ Pattern.compile("(?:UTC|GMT)([+-])(\\d?\\d)\\Z");
+
+ //find a time ending in am/pm without a space: 10:30am and
+ //use this pattern to insert space: 10:30 am
+ private static final Pattern AM_PM = Pattern.compile("(?i)(\\d)([ap]m)\\b");
+
+ private static final DateFormat[] ALTERNATE_DATE_FORMATS = new DateFormat[] {
+ //note that the string is "cleaned" before processing:
+ //1) condense multiple whitespace to single space
+ //2) trim()
+ //3) strip out commas
+ //4) insert space before am/pm
+
+ //May 16 2016 1:32am
+ createDateFormat("MMM dd yy hh:mm a", null),
+
+ //this is a standard pattern handled by mime4j;
+ //but mime4j fails with leading whitespace
+ createDateFormat("EEE d MMM yy HH:mm:ss Z", UTC),
+
+ createDateFormat("EEE d MMM yy HH:mm:ss z", UTC),
+
+ createDateFormat("EEE d MMM yy HH:mm:ss", null),// no timezone
+
+ createDateFormat("EEEEE MMM d yy hh:mm a", null),// Sunday, May 15 2016 1:32 PM
+
+ //16 May 2016 at 09:30:32 GMT+1 (Mac Mail TIKA-1970)
+ createDateFormat("d MMM yy 'at' HH:mm:ss z", UTC), // UTC/Zulu
+
+ createDateFormat("yy-MM-dd HH:mm:ss", null),
+
+ createDateFormat("MM/dd/yy hh:mm a", null, false),
+
+ //now dates without times
+ createDateFormat("MMM d yy", MIDDAY, false),
+ createDateFormat("EEE d MMM yy", MIDDAY, false),
+ createDateFormat("d MMM yy", MIDDAY, false),
+ createDateFormat("yy/MM/dd", MIDDAY, false),
+ createDateFormat("MM/dd/yy", MIDDAY, false)
+ };
+
+ private static DateFormat createDateFormat(String format, TimeZone timezone) {
+ return createDateFormat(format, timezone, true);
+ }
+
+ private static DateFormat createDateFormat(String format, TimeZone timezone, boolean isLenient) {
+ SimpleDateFormat sdf =
+ new SimpleDateFormat(format, new DateFormatSymbols(Locale.US));
+ if (timezone != null) {
+ sdf.setTimeZone(timezone);
+ }
+ sdf.setLenient(isLenient);
+ return sdf;
+ }
+
+ private boolean strictParsing = false;
+
+ private XHTMLContentHandler handler;
+ private Metadata metadata;
+ private EmbeddedDocumentExtractor extractor;
+
+ private boolean inPart = false;
+
+ MailContentHandler(XHTMLContentHandler xhtml, Metadata metadata, ParseContext context, boolean strictParsing) {
+ this.handler = xhtml;
+ this.metadata = metadata;
+ this.strictParsing = strictParsing;
+
+ // Fetch / Build an EmbeddedDocumentExtractor with which
+ // to handle/process the parts/attachments
+
+ // Was an EmbeddedDocumentExtractor explicitly supplied?
+ this.extractor = context.get(EmbeddedDocumentExtractor.class);
+
+ // If there's no EmbeddedDocumentExtractor, then try using a normal parser
+ // This will ensure that the contents are made available to the user, so
+ // the see the text, but without fine-grained control/extraction
+ // (This also maintains backward compatibility with older versions!)
+ if (this.extractor == null) {
+ // If the user gave a parser, use that, if not the default
+ Parser parser = context.get(AutoDetectParser.class);
+ if (parser == null) {
+ parser = context.get(Parser.class);
+ }
+ if (parser == null) {
+ TikaConfig tikaConfig = context.get(TikaConfig.class);
+ if (tikaConfig == null) {
+ tikaConfig = TikaConfig.getDefaultConfig();
+ }
+ parser = new AutoDetectParser(tikaConfig.getParser());
+ }
+ ParseContext ctx = new ParseContext();
+ ctx.set(Parser.class, parser);
+ extractor = new ParsingEmbeddedDocumentExtractor(ctx);
+ }
+ }
+
+ public void body(BodyDescriptor body, InputStream is) throws MimeException,
+ IOException {
+ // use a different metadata object
+ // in order to specify the mime type of the
+ // sub part without damaging the main metadata
+
+ Metadata submd = new Metadata();
+ submd.set(Metadata.CONTENT_TYPE, body.getMimeType());
+ submd.set(Metadata.CONTENT_ENCODING, body.getCharset());
+
+ try {
+ if (extractor.shouldParseEmbedded(submd)) {
+ extractor.parseEmbedded(is, handler, submd, false);
+ }
+ } catch (SAXException e) {
+ throw new MimeException(e);
+ }
+ }
+
+ public void endBodyPart() throws MimeException {
+ try {
+ handler.endElement("p");
+ handler.endElement("div");
+ } catch (SAXException e) {
+ throw new MimeException(e);
+ }
+ }
+
+ public void endHeader() throws MimeException {
+ }
+
+ public void startMessage() throws MimeException {
+ try {
+ handler.startDocument();
+ } catch (SAXException e) {
+ throw new MimeException(e);
+ }
+ }
+
+ public void endMessage() throws MimeException {
+ try {
+ handler.endDocument();
+ } catch (SAXException e) {
+ throw new MimeException(e);
+ }
+ }
+
+ public void endMultipart() throws MimeException {
+ inPart = false;
+ }
+
+ public void epilogue(InputStream is) throws MimeException, IOException {
+ }
+
+ /**
+ * Header for the whole message or its parts
+ *
+ * @see <a href="http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/">
+ * http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/</a>
+ * Field.html
+ */
+ public void field(Field field) throws MimeException {
+ // inPart indicates whether these metadata correspond to the
+ // whole message or its parts
+ if (inPart) {
+ return;
+ }
+
+ try {
+ String fieldname = field.getName();
+ ParsedField parsedField = LenientFieldParser.getParser().parse(
+ field, DecodeMonitor.SILENT);
+ if (fieldname.equalsIgnoreCase("From")) {
+ MailboxListField fromField = (MailboxListField) parsedField;
+ MailboxList mailboxList = fromField.getMailboxList();
+ if (fromField.isValidField() && mailboxList != null) {
+ for (Address address : mailboxList) {
+ String from = getDisplayString(address);
+ metadata.add(Metadata.MESSAGE_FROM, from);
+ metadata.add(TikaCoreProperties.CREATOR, from);
+ }
+ } else {
+ String from = stripOutFieldPrefix(field, "From:");
+ if (from.startsWith("<")) {
+ from = from.substring(1);
+ }
+ if (from.endsWith(">")) {
+ from = from.substring(0, from.length() - 1);
+ }
+ metadata.add(Metadata.MESSAGE_FROM, from);
+ metadata.add(TikaCoreProperties.CREATOR, from);
+ }
+ } else if (fieldname.equalsIgnoreCase("Subject")) {
+ metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE,
+ ((UnstructuredField) parsedField).getValue());
+ } else if (fieldname.equalsIgnoreCase("To")) {
+ processAddressList(parsedField, "To:", Metadata.MESSAGE_TO);
+ } else if (fieldname.equalsIgnoreCase("CC")) {
+ processAddressList(parsedField, "Cc:", Metadata.MESSAGE_CC);
+ } else if (fieldname.equalsIgnoreCase("BCC")) {
+ processAddressList(parsedField, "Bcc:", Metadata.MESSAGE_BCC);
+ } else if (fieldname.equalsIgnoreCase("Date")) {
+ DateTimeField dateField = (DateTimeField) parsedField;
+ Date date = dateField.getDate();
+ if (date == null) {
+ date = tryOtherDateFormats(field.getBody());
+ }
+ metadata.set(TikaCoreProperties.CREATED, date);
+ }
+ } catch (RuntimeException me) {
+ if (strictParsing) {
+ throw me;
+ }
+ }
+ }
+
+ private static synchronized Date tryOtherDateFormats(String text) {
+ if (text == null) {
+ return null;
+ }
+ //strip out additional spaces and trim
+ text = text.replaceAll("\\s+", " ").trim();
+
+ //strip out commas
+ text = text.replaceAll(",", "");
+ Matcher matcher = GENERAL_TIME_ZONE_NO_MINUTES_PATTERN.matcher(text);
+ if (matcher.find()) {
+ text = matcher.replaceFirst("GMT$1$2:00");
+ }
+
+ matcher = AM_PM.matcher(text);
+ if (matcher.find()) {
+ text = matcher.replaceFirst("$1 $2");
+ }
+
+ for (DateFormat format : ALTERNATE_DATE_FORMATS) {
+ try {
+ return format.parse(text);
+ } catch (ParseException e) {
+ }
+ }
+ return null;
+ }
+
+ private void processAddressList(ParsedField field, String addressListType,
+ String metadataField) throws MimeException {
+ AddressListField toField = (AddressListField) field;
+ if (toField.isValidField()) {
+ AddressList addressList = toField.getAddressList();
+ for (Address address : addressList) {
+ metadata.add(metadataField, getDisplayString(address));
+ }
+ } else {
+ String to = stripOutFieldPrefix(field,
+ addressListType);
+ for (String eachTo : to.split(",")) {
+ metadata.add(metadataField, eachTo.trim());
+ }
+ }
+ }
+
+ private String getDisplayString(Address address) {
+ if (address instanceof Mailbox) {
+ Mailbox mailbox = (Mailbox) address;
+ String name = mailbox.getName();
+ if (name != null && name.length() > 0) {
+ name = DecoderUtil.decodeEncodedWords(name, DecodeMonitor.SILENT);
+ return name + " <" + mailbox.getAddress() + ">";
+ } else {
+ return mailbox.getAddress();
+ }
+ } else {
+ return address.toString();
+ }
+ }
+
+ public void preamble(InputStream is) throws MimeException, IOException {
+ }
+
+ public void raw(InputStream is) throws MimeException, IOException {
+ }
+
+ public void startBodyPart() throws MimeException {
+ try {
+ handler.startElement("div", "class", "email-entry");
+ handler.startElement("p");
+ } catch (SAXException e) {
+ throw new MimeException(e);
+ }
+ }
+
+ public void startHeader() throws MimeException {
+ // TODO Auto-generated method stub
+
+ }
+
+ public void startMultipart(BodyDescriptor descr) throws MimeException {
+ inPart = true;
+ }
+
+ private String stripOutFieldPrefix(Field field, String fieldname) {
+ String temp = field.getRaw().toString();
+ int loc = fieldname.length();
+ while (temp.charAt(loc) == ' ') {
+ loc++;
+ }
+ return temp.substring(loc);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
index 9ac02a7..6299d3f 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
@@ -1,95 +1,95 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mail;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.james.mime4j.MimeException;
-import org.apache.james.mime4j.parser.MimeStreamParser;
-import org.apache.james.mime4j.stream.MimeConfig;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Uses apache-mime4j to parse emails. Each part is treated with the
- * corresponding parser and displayed within elements.
- * <p/>
- * A {@link MimeEntityConfig} object can be passed in the parsing context
- * to better control the parsing process.
- *
- * @author jnioche@digitalpebble.com
- */
-public class RFC822Parser extends AbstractParser {
- /**
- * Serial version UID
- */
- private static final long serialVersionUID = -5504243905998074168L;
-
- private static final Set<MediaType> SUPPORTED_TYPES = Collections
- .singleton(MediaType.parse("message/rfc822"));
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- public void parse(InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context) throws IOException,
- SAXException, TikaException {
- // Get the mime4j configuration, or use a default one
- MimeConfig config = new MimeConfig();
- config.setMaxLineLen(100000);
- config.setMaxHeaderLen(100000); // max length of any individual header
- config = context.get(MimeConfig.class, config);
-
- MimeStreamParser parser = new MimeStreamParser(config);
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-
- MailContentHandler mch = new MailContentHandler(
- xhtml, metadata, context, config.isStrictParsing());
- parser.setContentHandler(mch);
- parser.setContentDecoding(true);
-
- TikaInputStream tstream = TikaInputStream.get(stream);
- try {
- parser.parse(tstream);
- } catch (IOException e) {
- tstream.throwIfCauseOf(e);
- throw new TikaException("Failed to parse an email message", e);
- } catch (MimeException e) {
- // Unwrap the exception in case it was not thrown by mime4j
- Throwable cause = e.getCause();
- if (cause instanceof TikaException) {
- throw (TikaException) cause;
- } else if (cause instanceof SAXException) {
- throw (SAXException) cause;
- } else {
- throw new TikaException("Failed to parse an email message", e);
- }
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mail;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.james.mime4j.MimeException;
+import org.apache.james.mime4j.parser.MimeStreamParser;
+import org.apache.james.mime4j.stream.MimeConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Uses apache-mime4j to parse emails. Each part is treated with the
+ * corresponding parser and displayed within elements.
+ * <p/>
+ * A {@link MimeEntityConfig} object can be passed in the parsing context
+ * to better control the parsing process.
+ *
+ * @author jnioche@digitalpebble.com
+ */
+public class RFC822Parser extends AbstractParser {
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = -5504243905998074168L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES = Collections
+ .singleton(MediaType.parse("message/rfc822"));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context) throws IOException,
+ SAXException, TikaException {
+ // Get the mime4j configuration, or use a default one
+ MimeConfig config = new MimeConfig();
+ config.setMaxLineLen(100000);
+ config.setMaxHeaderLen(100000); // max length of any individual header
+ config = context.get(MimeConfig.class, config);
+
+ MimeStreamParser parser = new MimeStreamParser(config);
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
+ MailContentHandler mch = new MailContentHandler(
+ xhtml, metadata, context, config.isStrictParsing());
+ parser.setContentHandler(mch);
+ parser.setContentDecoding(true);
+
+ TikaInputStream tstream = TikaInputStream.get(stream);
+ try {
+ parser.parse(tstream);
+ } catch (IOException e) {
+ tstream.throwIfCauseOf(e);
+ throw new TikaException("Failed to parse an email message", e);
+ } catch (MimeException e) {
+ // Unwrap the exception in case it was not thrown by mime4j
+ Throwable cause = e.getCause();
+ if (cause instanceof TikaException) {
+ throw (TikaException) cause;
+ } else if (cause instanceof SAXException) {
+ throw (SAXException) cause;
+ } else {
+ throw new TikaException("Failed to parse an email message", e);
+ }
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java
index 5be4b0b..cc10dd2 100644
--- a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java
+++ b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java
@@ -1,75 +1,75 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.feed;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class FeedParserTest {
- @Test
- public void testRSSParser() throws Exception {
- try (InputStream input = FeedParserTest.class.getResourceAsStream(
- "/test-documents/rsstest.rss")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- ParseContext context = new ParseContext();
-
- new FeedParser().parse(input, handler, metadata, context);
-
- String content = handler.toString();
- assertFalse(content == null);
-
- assertEquals("Sample RSS File for Junit test",
- metadata.get(TikaCoreProperties.DESCRIPTION));
- assertEquals("TestChannel", metadata.get(TikaCoreProperties.TITLE));
-
- // TODO find a way of testing the paragraphs and anchors
- }
- }
-
-
- @Test
- public void testAtomParser() throws Exception {
- try (InputStream input = FeedParserTest.class.getResourceAsStream(
- "/test-documents/testATOM.atom")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- ParseContext context = new ParseContext();
-
- new FeedParser().parse(input, handler, metadata, context);
-
- String content = handler.toString();
- assertFalse(content == null);
-
- assertEquals("Sample Atom File for Junit test",
- metadata.get(TikaCoreProperties.DESCRIPTION));
- assertEquals("Test Atom Feed", metadata.get(TikaCoreProperties.TITLE));
-
- // TODO Check some more
- }
- }
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.feed;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class FeedParserTest {
+ @Test
+ public void testRSSParser() throws Exception {
+ try (InputStream input = FeedParserTest.class.getResourceAsStream(
+ "/test-documents/rsstest.rss")) {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ ParseContext context = new ParseContext();
+
+ new FeedParser().parse(input, handler, metadata, context);
+
+ String content = handler.toString();
+ assertFalse(content == null);
+
+ assertEquals("Sample RSS File for Junit test",
+ metadata.get(TikaCoreProperties.DESCRIPTION));
+ assertEquals("TestChannel", metadata.get(TikaCoreProperties.TITLE));
+
+ // TODO find a way of testing the paragraphs and anchors
+ }
+ }
+
+
+ @Test
+ public void testAtomParser() throws Exception {
+ try (InputStream input = FeedParserTest.class.getResourceAsStream(
+ "/test-documents/testATOM.atom")) {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ ParseContext context = new ParseContext();
+
+ new FeedParser().parse(input, handler, metadata, context);
+
+ String content = handler.toString();
+ assertFalse(content == null);
+
+ assertEquals("Sample Atom File for Junit test",
+ metadata.get(TikaCoreProperties.DESCRIPTION));
+ assertEquals("Test Atom Feed", metadata.get(TikaCoreProperties.TITLE));
+
+ // TODO Check some more
+ }
+ }
+
+}
[16/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
index 3d28b35..25e567f 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
@@ -1,382 +1,382 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-
-import java.util.List;
-
-import org.apache.tika.extractor.ContainerExtractor;
-import org.apache.tika.extractor.ParserContainerExtractor;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaMetadataKeys;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.RecursiveParserWrapper;
-import org.junit.Test;
-
-/**
- * Tests that the various POI powered parsers are
- * able to extract their embedded contents.
- */
-public class POIContainerExtractionTest extends AbstractPOIContainerExtractionTest {
-
- /**
- * For office files which don't have anything embedded in them
- */
- @Test
- public void testWithoutEmbedded() throws Exception {
- ContainerExtractor extractor = new ParserContainerExtractor();
-
- String[] files = new String[]{
- "testEXCEL.xls", "testWORD.doc", "testPPT.ppt",
- "testVISIO.vsd", "test-outlook.msg"
- };
- for (String file : files) {
- // Process it without recursing
- TrackingHandler handler = process(file, extractor, false);
-
- // Won't have fired
- assertEquals(0, handler.filenames.size());
- assertEquals(0, handler.mediaTypes.size());
-
- // Ditto with recursing
- handler = process(file, extractor, true);
- assertEquals(0, handler.filenames.size());
- assertEquals(0, handler.mediaTypes.size());
- }
- }
-
- /**
- * Office files with embedded images, but no other
- * office files in them
- */
- @Test
- public void testEmbeddedImages() throws Exception {
- ContainerExtractor extractor = new ParserContainerExtractor();
- TrackingHandler handler;
-
- // Excel with 1 image
- handler = process("testEXCEL_1img.xls", extractor, false);
- assertEquals(1, handler.filenames.size());
- assertEquals(1, handler.mediaTypes.size());
-
- assertEquals(null, handler.filenames.get(0));
- assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
-
-
- // PowerPoint with 2 images + sound
- // TODO
-
-
- // Word with 1 image
- handler = process("testWORD_1img.doc", extractor, false);
- assertEquals(1, handler.filenames.size());
- assertEquals(1, handler.mediaTypes.size());
-
- assertEquals("image1.png", handler.filenames.get(0));
- assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
-
-
- // Word with 3 images
- handler = process("testWORD_3imgs.doc", extractor, false);
- assertEquals(3, handler.filenames.size());
- assertEquals(3, handler.mediaTypes.size());
-
- assertEquals("image1.png", handler.filenames.get(0));
- assertEquals("image2.jpg", handler.filenames.get(1));
- assertEquals("image3.png", handler.filenames.get(2));
- assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
- assertEquals(TYPE_JPG, handler.mediaTypes.get(1));
- assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
- }
-
- /**
- * Office files which have other office files
- * embedded into them. The embedded office files
- * will sometimes have images in them.
- * <p/>
- * eg xls
- * -> word
- * -> image
- * -> image
- * -> powerpoint
- * -> excel
- * -> image
- */
- @Test
- public void testEmbeddedOfficeFiles() throws Exception {
- ContainerExtractor extractor = new ParserContainerExtractor();
- TrackingHandler handler;
-
-
- // Excel with a word doc and a powerpoint doc, both of which have images in them
- // Without recursion, should see both documents + the images
- handler = process("testEXCEL_embeded.xls", extractor, false);
- assertEquals(5, handler.filenames.size());
- assertEquals(5, handler.mediaTypes.size());
-
- // We don't know their filenames
- assertEquals(null, handler.filenames.get(0));
- assertEquals(null, handler.filenames.get(1));
- assertEquals(null, handler.filenames.get(2));
- assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
- assertEquals("MBD00032A24.doc", handler.filenames.get(4));
- // But we do know their types
- assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
- assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc
- assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
- assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded office doc
- assertEquals(TYPE_DOC, handler.mediaTypes.get(4)); // Embedded office doc
-
-
- // With recursion, should get the images embedded in the office files too
- handler = process("testEXCEL_embeded.xls", extractor, true);
- assertEquals(17, handler.filenames.size());
- assertEquals(17, handler.mediaTypes.size());
-
- assertEquals(null, handler.filenames.get(0));
- assertEquals(null, handler.filenames.get(1));
- assertEquals(null, handler.filenames.get(2));
- assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
- assertEquals("1", handler.filenames.get(4));
- assertEquals(null, handler.filenames.get(5));
- assertEquals("2", handler.filenames.get(6));
- assertEquals("image1.png", handler.filenames.get(7));
- assertEquals("image2.jpg", handler.filenames.get(8));
- assertEquals("image3.png", handler.filenames.get(9));
- assertEquals("image1.png", handler.filenames.get(16));
-
- assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
- assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc
- assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
- assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded presentation
- assertEquals(TYPE_XLS, handler.mediaTypes.get(4)); // Embedded XLS
- assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
- assertEquals(TYPE_DOC, handler.mediaTypes.get(6)); // Embedded office doc
- assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // Embedded image
- assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // Embedded image
- assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
- assertEquals(TYPE_DOC, handler.mediaTypes.get(15)); // Embedded office doc
- assertEquals(TYPE_PNG, handler.mediaTypes.get(16)); // Embedded image
-
- // Word with .docx, powerpoint and excel
- handler = process("testWORD_embeded.doc", extractor, false);
- assertEquals(9, handler.filenames.size());
- assertEquals(9, handler.mediaTypes.size());
-
- // Filenames are a bit iffy...
- // Should really be 3*embedded pictures then 3*icons then embedded docs
- assertEquals("image1.emf", handler.filenames.get(0));
- assertEquals("image4.png", handler.filenames.get(1));
- assertEquals("image5.jpg", handler.filenames.get(2));
- assertEquals("image6.png", handler.filenames.get(3));
- assertEquals("image2.emf", handler.filenames.get(4));
- assertEquals("image3.emf", handler.filenames.get(5));
- assertEquals(null, handler.filenames.get(6));
- assertEquals("_1345471035.ppt", handler.filenames.get(7));
- assertEquals("_1345470949.xls", handler.filenames.get(8));
-
- // But we do know their types
- assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc?
- assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image - logo
- assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image - safe
- assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - try
- assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded office doc?
- assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded office doc?
- assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office doc
- assertEquals(TYPE_PPT, handler.mediaTypes.get(7)); // Embedded office doc
- assertEquals(TYPE_XLS, handler.mediaTypes.get(8)); // Embedded office doc
-
-
- // With recursion, should get their images too
- handler = process("testWORD_embeded.doc", extractor, true);
- assertEquals(16, handler.filenames.size());
- assertEquals(16, handler.mediaTypes.size());
-
- // We don't know their filenames, except for doc images + docx
- assertEquals("image1.emf", handler.filenames.get(0));
- assertEquals("image4.png", handler.filenames.get(1));
- assertEquals("image5.jpg", handler.filenames.get(2));
- assertEquals("image6.png", handler.filenames.get(3));
- assertEquals("image2.emf", handler.filenames.get(4));
- assertEquals("image3.emf", handler.filenames.get(5));
- assertEquals(null, handler.filenames.get(6));
- assertEquals("image2.png", handler.filenames.get(7));
- assertEquals("image3.jpeg", handler.filenames.get(8));
- assertEquals("image4.png", handler.filenames.get(9));
- for (int i = 11; i < 14; i++) {
- assertNull(handler.filenames.get(i));
- }
- // But we do know their types
- assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
- assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image - logo
- assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image - safe
- assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - try
- assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded office doc
- assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded office doc
- assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office doc
- assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // PNG inside .docx
- assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // JPG inside .docx
- assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // PNG inside .docx
- assertEquals(TYPE_PPT, handler.mediaTypes.get(10)); // Embedded office doc
- assertEquals(TYPE_XLS, handler.mediaTypes.get(14)); // Embedded office doc
- assertEquals(TYPE_PNG, handler.mediaTypes.get(15)); // PNG inside .xls
-
-
- // PowerPoint with excel and word
- handler = process("testPPT_embeded.ppt", extractor, false);
- assertEquals(7, handler.filenames.size());
- assertEquals(7, handler.mediaTypes.size());
-
- // We don't get all that helpful filenames
- assertEquals("1", handler.filenames.get(0));
- assertEquals("2", handler.filenames.get(1));
- assertEquals(null, handler.filenames.get(2));
- assertEquals(null, handler.filenames.get(3));
- assertEquals(null, handler.filenames.get(4));
- assertEquals(null, handler.filenames.get(5));
- assertEquals(null, handler.filenames.get(6));
- // But we do know their types
- assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office doc
- assertEquals(TYPE_DOC, handler.mediaTypes.get(1)); // Embedded office doc
- assertEquals(TYPE_EMF, handler.mediaTypes.get(2)); // Icon of embedded office doc
- assertEquals(TYPE_EMF, handler.mediaTypes.get(3)); // Icon of embedded office doc
- assertEquals(TYPE_PNG, handler.mediaTypes.get(4)); // Embedded image
- assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
- assertEquals(TYPE_PNG, handler.mediaTypes.get(6)); // Embedded image
-
- // Run again on PowerPoint but with recursion
- handler = process("testPPT_embeded.ppt", extractor, true);
- assertEquals(11, handler.filenames.size());
- assertEquals(11, handler.mediaTypes.size());
-
- assertEquals("1", handler.filenames.get(0));
- assertEquals(null, handler.filenames.get(1));
- assertEquals("2", handler.filenames.get(2));
- assertEquals("image1.png", handler.filenames.get(3));
- assertEquals("image2.jpg", handler.filenames.get(4));
- assertEquals("image3.png", handler.filenames.get(5));
- assertEquals(null, handler.filenames.get(6));
- assertEquals(null, handler.filenames.get(7));
- assertEquals(null, handler.filenames.get(8));
- assertEquals(null, handler.filenames.get(9));
- assertEquals(null, handler.filenames.get(10));
-
- assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office doc
- assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // PNG inside .xls
- assertEquals(TYPE_DOC, handler.mediaTypes.get(2)); // Embedded office doc
- assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // PNG inside .docx
- assertEquals(TYPE_JPG, handler.mediaTypes.get(4)); // JPG inside .docx
- assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // PNG inside .docx
- assertEquals(TYPE_EMF, handler.mediaTypes.get(6)); // Icon of embedded office doc
- assertEquals(TYPE_EMF, handler.mediaTypes.get(7)); // Icon of embedded office doc
- assertEquals(TYPE_PNG, handler.mediaTypes.get(8)); // Embedded image
- assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
- assertEquals(TYPE_PNG, handler.mediaTypes.get(10)); // Embedded image
-
-
- // Word, with a non-office file (PDF)
- handler = process("testWORD_embedded_pdf.doc", extractor, true);
- assertEquals(2, handler.filenames.size());
- assertEquals(2, handler.mediaTypes.size());
-
- assertEquals("image1.emf", handler.filenames.get(0));
- assertEquals("_1402837031.pdf", handler.filenames.get(1));
-
- assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded pdf
- assertEquals(TYPE_PDF, handler.mediaTypes.get(1)); // The embedded PDF itself
-
-
- // Outlook with a text file and a word document
- handler = process("testMSG_att_doc.msg", extractor, true);
- assertEquals(2, handler.filenames.size());
- assertEquals(2, handler.mediaTypes.size());
-
- assertEquals("test-unicode.doc", handler.filenames.get(0));
- assertEquals(TYPE_DOC, handler.mediaTypes.get(0));
-
- assertEquals("pj1.txt", handler.filenames.get(1));
- assertEquals(TYPE_TXT, handler.mediaTypes.get(1));
-
-
- // Outlook with a pdf and another outlook message
- handler = process("testMSG_att_msg.msg", extractor, true);
- assertEquals(2, handler.filenames.size());
- assertEquals(2, handler.mediaTypes.size());
-
- assertEquals("__substg1.0_3701000D.msg", handler.filenames.get(0));
- assertEquals(TYPE_MSG, handler.mediaTypes.get(0));
-
- assertEquals("smbprn.00009008.KdcPjl.pdf", handler.filenames.get(1));
- assertEquals(TYPE_PDF, handler.mediaTypes.get(1));
- }
-
- @Test
- public void testEmbeddedOfficeFilesXML() throws Exception {
- ContainerExtractor extractor = new ParserContainerExtractor();
- TrackingHandler handler;
-
- handler = process("EmbeddedDocument.docx", extractor, false);
- assertTrue(handler.filenames.contains("Microsoft_Office_Excel_97-2003_Worksheet1.bin"));
- assertEquals(2, handler.filenames.size());
- }
-
- @Test
- public void testPowerpointImages() throws Exception {
- ContainerExtractor extractor = new ParserContainerExtractor();
- TrackingHandler handler;
-
- handler = process("pictures.ppt", extractor, false);
- assertTrue(handler.mediaTypes.contains(new MediaType("image", "jpeg")));
- assertTrue(handler.mediaTypes.contains(new MediaType("image", "png")));
- }
-
- @Test
- public void testEmbeddedStorageId() throws Exception {
-
- List<Metadata> list = getRecursiveJson("testWORD_embeded.doc");
- //.docx
- assertEquals("{F4754C9B-64F5-4B40-8AF4-679732AC0607}",
- list.get(10).get(TikaMetadataKeys.EMBEDDED_STORAGE_CLASS_ID));
- //_1345471035.ppt
- assertEquals("{64818D10-4F9B-11CF-86EA-00AA00B929E8}",
- list.get(14).get(TikaMetadataKeys.EMBEDDED_STORAGE_CLASS_ID));
- //_1345470949.xls
- assertEquals("{00020820-0000-0000-C000-000000000046}",
- list.get(16).get(TikaMetadataKeys.EMBEDDED_STORAGE_CLASS_ID));
-
- }
-
- @Test
- public void testEmbeddedGraphChart() throws Exception {
- //doc converts a chart to a actual xls file
- //so we only need to look in ppt and xls
- for (String suffix : new String[]{"ppt", "xls"}) {
- List<Metadata> list = getRecursiveJson("testMSChart-govdocs-428996."+suffix);
- boolean found = false;
- for (Metadata m : list) {
- if (m.get(Metadata.CONTENT_TYPE).equals(POIFSContainerDetector.MS_GRAPH_CHART.toString())) {
- found = true;
- }
- assertNull(m.get(RecursiveParserWrapper.EMBEDDED_EXCEPTION));
- }
- assertTrue("didn't find chart in "+suffix, found);
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.util.List;
+
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.junit.Test;
+
+/**
+ * Tests that the various POI powered parsers are
+ * able to extract their embedded contents.
+ */
+public class POIContainerExtractionTest extends AbstractPOIContainerExtractionTest {
+
+ /**
+ * For office files which don't have anything embedded in them
+ */
+ @Test
+ public void testWithoutEmbedded() throws Exception {
+ ContainerExtractor extractor = new ParserContainerExtractor();
+
+ String[] files = new String[]{
+ "testEXCEL.xls", "testWORD.doc", "testPPT.ppt",
+ "testVISIO.vsd", "test-outlook.msg"
+ };
+ for (String file : files) {
+ // Process it without recursing
+ TrackingHandler handler = process(file, extractor, false);
+
+ // Won't have fired
+ assertEquals(0, handler.filenames.size());
+ assertEquals(0, handler.mediaTypes.size());
+
+ // Ditto with recursing
+ handler = process(file, extractor, true);
+ assertEquals(0, handler.filenames.size());
+ assertEquals(0, handler.mediaTypes.size());
+ }
+ }
+
+ /**
+ * Office files with embedded images, but no other
+ * office files in them
+ */
+ @Test
+ public void testEmbeddedImages() throws Exception {
+ ContainerExtractor extractor = new ParserContainerExtractor();
+ TrackingHandler handler;
+
+ // Excel with 1 image
+ handler = process("testEXCEL_1img.xls", extractor, false);
+ assertEquals(1, handler.filenames.size());
+ assertEquals(1, handler.mediaTypes.size());
+
+ assertEquals(null, handler.filenames.get(0));
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+
+
+ // PowerPoint with 2 images + sound
+ // TODO
+
+
+ // Word with 1 image
+ handler = process("testWORD_1img.doc", extractor, false);
+ assertEquals(1, handler.filenames.size());
+ assertEquals(1, handler.mediaTypes.size());
+
+ assertEquals("image1.png", handler.filenames.get(0));
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+
+
+ // Word with 3 images
+ handler = process("testWORD_3imgs.doc", extractor, false);
+ assertEquals(3, handler.filenames.size());
+ assertEquals(3, handler.mediaTypes.size());
+
+ assertEquals("image1.png", handler.filenames.get(0));
+ assertEquals("image2.jpg", handler.filenames.get(1));
+ assertEquals("image3.png", handler.filenames.get(2));
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+ assertEquals(TYPE_JPG, handler.mediaTypes.get(1));
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
+ }
+
+ /**
+ * Office files which have other office files
+ * embedded into them. The embedded office files
+ * will sometimes have images in them.
+ * <p/>
+ * eg xls
+ * -> word
+ * -> image
+ * -> image
+ * -> powerpoint
+ * -> excel
+ * -> image
+ */
+ @Test
+ public void testEmbeddedOfficeFiles() throws Exception {
+ ContainerExtractor extractor = new ParserContainerExtractor();
+ TrackingHandler handler;
+
+
+ // Excel with a word doc and a powerpoint doc, both of which have images in them
+ // Without recursion, should see both documents + the images
+ handler = process("testEXCEL_embeded.xls", extractor, false);
+ assertEquals(5, handler.filenames.size());
+ assertEquals(5, handler.mediaTypes.size());
+
+ // We don't know their filenames
+ assertEquals(null, handler.filenames.get(0));
+ assertEquals(null, handler.filenames.get(1));
+ assertEquals(null, handler.filenames.get(2));
+ assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
+ assertEquals("MBD00032A24.doc", handler.filenames.get(4));
+ // But we do know their types
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
+ assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded office doc
+ assertEquals(TYPE_DOC, handler.mediaTypes.get(4)); // Embedded office doc
+
+
+ // With recursion, should get the images embedded in the office files too
+ handler = process("testEXCEL_embeded.xls", extractor, true);
+ assertEquals(17, handler.filenames.size());
+ assertEquals(17, handler.mediaTypes.size());
+
+ assertEquals(null, handler.filenames.get(0));
+ assertEquals(null, handler.filenames.get(1));
+ assertEquals(null, handler.filenames.get(2));
+ assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
+ assertEquals("1", handler.filenames.get(4));
+ assertEquals(null, handler.filenames.get(5));
+ assertEquals("2", handler.filenames.get(6));
+ assertEquals("image1.png", handler.filenames.get(7));
+ assertEquals("image2.jpg", handler.filenames.get(8));
+ assertEquals("image3.png", handler.filenames.get(9));
+ assertEquals("image1.png", handler.filenames.get(16));
+
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
+ assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded presentation
+ assertEquals(TYPE_XLS, handler.mediaTypes.get(4)); // Embedded XLS
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
+ assertEquals(TYPE_DOC, handler.mediaTypes.get(6)); // Embedded office doc
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // Embedded image
+ assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // Embedded image
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
+ assertEquals(TYPE_DOC, handler.mediaTypes.get(15)); // Embedded office doc
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(16)); // Embedded image
+
+ // Word with .docx, powerpoint and excel
+ handler = process("testWORD_embeded.doc", extractor, false);
+ assertEquals(9, handler.filenames.size());
+ assertEquals(9, handler.mediaTypes.size());
+
+ // Filenames are a bit iffy...
+ // Should really be 3*embedded pictures then 3*icons then embedded docs
+ assertEquals("image1.emf", handler.filenames.get(0));
+ assertEquals("image4.png", handler.filenames.get(1));
+ assertEquals("image5.jpg", handler.filenames.get(2));
+ assertEquals("image6.png", handler.filenames.get(3));
+ assertEquals("image2.emf", handler.filenames.get(4));
+ assertEquals("image3.emf", handler.filenames.get(5));
+ assertEquals(null, handler.filenames.get(6));
+ assertEquals("_1345471035.ppt", handler.filenames.get(7));
+ assertEquals("_1345470949.xls", handler.filenames.get(8));
+
+ // But we do know their types
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc?
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image - logo
+ assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image - safe
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - try
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded office doc?
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded office doc?
+ assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office doc
+ assertEquals(TYPE_PPT, handler.mediaTypes.get(7)); // Embedded office doc
+ assertEquals(TYPE_XLS, handler.mediaTypes.get(8)); // Embedded office doc
+
+
+ // With recursion, should get their images too
+ handler = process("testWORD_embeded.doc", extractor, true);
+ assertEquals(16, handler.filenames.size());
+ assertEquals(16, handler.mediaTypes.size());
+
+ // We don't know their filenames, except for doc images + docx
+ assertEquals("image1.emf", handler.filenames.get(0));
+ assertEquals("image4.png", handler.filenames.get(1));
+ assertEquals("image5.jpg", handler.filenames.get(2));
+ assertEquals("image6.png", handler.filenames.get(3));
+ assertEquals("image2.emf", handler.filenames.get(4));
+ assertEquals("image3.emf", handler.filenames.get(5));
+ assertEquals(null, handler.filenames.get(6));
+ assertEquals("image2.png", handler.filenames.get(7));
+ assertEquals("image3.jpeg", handler.filenames.get(8));
+ assertEquals("image4.png", handler.filenames.get(9));
+ for (int i = 11; i < 14; i++) {
+ assertNull(handler.filenames.get(i));
+ }
+ // But we do know their types
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image - logo
+ assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image - safe
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - try
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded office doc
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded office doc
+ assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office doc
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // PNG inside .docx
+ assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // JPG inside .docx
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // PNG inside .docx
+ assertEquals(TYPE_PPT, handler.mediaTypes.get(10)); // Embedded office doc
+ assertEquals(TYPE_XLS, handler.mediaTypes.get(14)); // Embedded office doc
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(15)); // PNG inside .xls
+
+
+ // PowerPoint with excel and word
+ handler = process("testPPT_embeded.ppt", extractor, false);
+ assertEquals(7, handler.filenames.size());
+ assertEquals(7, handler.mediaTypes.size());
+
+ // We don't get all that helpful filenames
+ assertEquals("1", handler.filenames.get(0));
+ assertEquals("2", handler.filenames.get(1));
+ assertEquals(null, handler.filenames.get(2));
+ assertEquals(null, handler.filenames.get(3));
+ assertEquals(null, handler.filenames.get(4));
+ assertEquals(null, handler.filenames.get(5));
+ assertEquals(null, handler.filenames.get(6));
+ // But we do know their types
+ assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office doc
+ assertEquals(TYPE_DOC, handler.mediaTypes.get(1)); // Embedded office doc
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(2)); // Icon of embedded office doc
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(3)); // Icon of embedded office doc
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(4)); // Embedded image
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(6)); // Embedded image
+
+ // Run again on PowerPoint but with recursion
+ handler = process("testPPT_embeded.ppt", extractor, true);
+ assertEquals(11, handler.filenames.size());
+ assertEquals(11, handler.mediaTypes.size());
+
+ assertEquals("1", handler.filenames.get(0));
+ assertEquals(null, handler.filenames.get(1));
+ assertEquals("2", handler.filenames.get(2));
+ assertEquals("image1.png", handler.filenames.get(3));
+ assertEquals("image2.jpg", handler.filenames.get(4));
+ assertEquals("image3.png", handler.filenames.get(5));
+ assertEquals(null, handler.filenames.get(6));
+ assertEquals(null, handler.filenames.get(7));
+ assertEquals(null, handler.filenames.get(8));
+ assertEquals(null, handler.filenames.get(9));
+ assertEquals(null, handler.filenames.get(10));
+
+ assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office doc
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // PNG inside .xls
+ assertEquals(TYPE_DOC, handler.mediaTypes.get(2)); // Embedded office doc
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // PNG inside .docx
+ assertEquals(TYPE_JPG, handler.mediaTypes.get(4)); // JPG inside .docx
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // PNG inside .docx
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(6)); // Icon of embedded office doc
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(7)); // Icon of embedded office doc
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(8)); // Embedded image
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(10)); // Embedded image
+
+
+ // Word, with a non-office file (PDF)
+ handler = process("testWORD_embedded_pdf.doc", extractor, true);
+ assertEquals(2, handler.filenames.size());
+ assertEquals(2, handler.mediaTypes.size());
+
+ assertEquals("image1.emf", handler.filenames.get(0));
+ assertEquals("_1402837031.pdf", handler.filenames.get(1));
+
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded pdf
+ assertEquals(TYPE_PDF, handler.mediaTypes.get(1)); // The embedded PDF itself
+
+
+ // Outlook with a text file and a word document
+ handler = process("testMSG_att_doc.msg", extractor, true);
+ assertEquals(2, handler.filenames.size());
+ assertEquals(2, handler.mediaTypes.size());
+
+ assertEquals("test-unicode.doc", handler.filenames.get(0));
+ assertEquals(TYPE_DOC, handler.mediaTypes.get(0));
+
+ assertEquals("pj1.txt", handler.filenames.get(1));
+ assertEquals(TYPE_TXT, handler.mediaTypes.get(1));
+
+
+ // Outlook with a pdf and another outlook message
+ handler = process("testMSG_att_msg.msg", extractor, true);
+ assertEquals(2, handler.filenames.size());
+ assertEquals(2, handler.mediaTypes.size());
+
+ assertEquals("__substg1.0_3701000D.msg", handler.filenames.get(0));
+ assertEquals(TYPE_MSG, handler.mediaTypes.get(0));
+
+ assertEquals("smbprn.00009008.KdcPjl.pdf", handler.filenames.get(1));
+ assertEquals(TYPE_PDF, handler.mediaTypes.get(1));
+ }
+
+ @Test
+ public void testEmbeddedOfficeFilesXML() throws Exception {
+ ContainerExtractor extractor = new ParserContainerExtractor();
+ TrackingHandler handler;
+
+ handler = process("EmbeddedDocument.docx", extractor, false);
+ assertTrue(handler.filenames.contains("Microsoft_Office_Excel_97-2003_Worksheet1.bin"));
+ assertEquals(2, handler.filenames.size());
+ }
+
+ @Test
+ public void testPowerpointImages() throws Exception {
+ ContainerExtractor extractor = new ParserContainerExtractor();
+ TrackingHandler handler;
+
+ handler = process("pictures.ppt", extractor, false);
+ assertTrue(handler.mediaTypes.contains(new MediaType("image", "jpeg")));
+ assertTrue(handler.mediaTypes.contains(new MediaType("image", "png")));
+ }
+
+ @Test
+ public void testEmbeddedStorageId() throws Exception {
+
+ List<Metadata> list = getRecursiveJson("testWORD_embeded.doc");
+ //.docx
+ assertEquals("{F4754C9B-64F5-4B40-8AF4-679732AC0607}",
+ list.get(10).get(TikaMetadataKeys.EMBEDDED_STORAGE_CLASS_ID));
+ //_1345471035.ppt
+ assertEquals("{64818D10-4F9B-11CF-86EA-00AA00B929E8}",
+ list.get(14).get(TikaMetadataKeys.EMBEDDED_STORAGE_CLASS_ID));
+ //_1345470949.xls
+ assertEquals("{00020820-0000-0000-C000-000000000046}",
+ list.get(16).get(TikaMetadataKeys.EMBEDDED_STORAGE_CLASS_ID));
+
+ }
+
+ @Test
+ public void testEmbeddedGraphChart() throws Exception {
+ //doc converts a chart to a actual xls file
+ //so we only need to look in ppt and xls
+ for (String suffix : new String[]{"ppt", "xls"}) {
+ List<Metadata> list = getRecursiveJson("testMSChart-govdocs-428996."+suffix);
+ boolean found = false;
+ for (Metadata m : list) {
+ if (m.get(Metadata.CONTENT_TYPE).equals(POIFSContainerDetector.MS_GRAPH_CHART.toString())) {
+ found = true;
+ }
+ assertNull(m.get(RecursiveParserWrapper.EMBEDDED_EXCEPTION));
+ }
+ assertTrue("didn't find chart in "+suffix, found);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index 32d462e..79d53d2 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -1,251 +1,251 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import java.io.InputStream;
-import java.util.List;
-import java.util.Locale;
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLCore;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class PowerPointParserTest extends TikaTest {
-
- @Test
- public void testPowerPointParser() throws Exception {
- try (InputStream input = PowerPointParserTest.class.getResourceAsStream(
- "/test-documents/testPPT.ppt")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- new OfficeParser().parse(input, handler, metadata, new ParseContext());
-
- assertEquals(
- "application/vnd.ms-powerpoint",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Sample Powerpoint Slide", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
- String content = handler.toString();
- assertContains("Sample Powerpoint Slide", content);
- assertContains("Powerpoint X for Mac", content);
- }
- }
-
- @Test
- public void testVarious() throws Exception {
- Metadata metadata = new Metadata();
- String xml = getXML("testPPT_various.ppt", metadata).xml;
- assertContains("<p>Footnote appears here", xml);
- assertContains("<p>[1] This is a footnote.", xml);
- assertContains("<p>This is the header text.</p>", xml);
- assertContains("<p>This is the footer text.</p>", xml);
- assertContains("<p>Here is a text box</p>", xml);
- assertContains("<p>Bold ", xml);
- assertContains("italic underline superscript subscript", xml);
- assertContains("underline", xml);
- assertContains("superscript", xml);
- assertContains("subscript", xml);
- assertContains("<p>Here is a citation:", xml);
- assertContains("Figure 1 This is a caption for Figure 1", xml);
- assertContains("(Kramer)", xml);
- assertContains("<table><tr>\t<td>Row 1 Col 1</td>", xml);
- assertContains("<td>Row 2 Col 2</td>\t<td>Row 2 Col 3</td></tr>", xml);
- assertContains("<p>Row 1 column 1</p>", xml);
- assertContains("<p>Row 2 column 2</p>", xml);
- assertContains("<p><a href=\"http://tika.apache.org/\">This is a hyperlink</a>", xml);
- assertContains("<p>Here is a list:", xml);
- for(int row=1;row<=3;row++) {
- //assertContains("�\tBullet " + row, content);
- //assertContains("\u00b7\tBullet " + row, content);
- assertContains("<li>Bullet " + row, xml);
- }
- assertContains("Here is a numbered list:", xml);
- for(int row=1;row<=3;row++) {
- //assertContains(row + ")\tNumber bullet " + row, content);
- //assertContains(row + ") Number bullet " + row, content);
- // TODO: OOXMLExtractor fails to number the bullets:
- assertContains("<li>Number bullet " + row, xml);
- }
-
- for(int row=1;row<=2;row++) {
- for(int col=1;col<=3;col++) {
- assertContains("Row " + row + " Col " + col, xml);
- }
- }
- assertContains("Keyword1 Keyword2", xml);
- assertEquals("Keyword1 Keyword2",
- metadata.get(TikaCoreProperties.KEYWORDS));
-
- assertContains("Subject is here", xml);
- assertEquals("Subject is here",
- metadata.get(OfficeOpenXMLCore.SUBJECT));
- // TODO: Remove subject in Tika 2.0
- assertEquals("Subject is here",
- metadata.get(Metadata.SUBJECT));
-
- assertContains("Suddenly some Japanese text:", xml);
- // Special version of (GHQ)
- assertContains("\uff08\uff27\uff28\uff31\uff09", xml);
- // 6 other characters
- assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f",
- xml);
-
- assertContains("And then some Gothic text:", xml);
- assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A",
- xml);
- }
-
- @Test
- public void testMasterFooter() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = PowerPointParserTest.class.getResourceAsStream(
- "/test-documents/testPPT_masterFooter.ppt")) {
- new OfficeParser().parse(stream, handler, metadata, new ParseContext());
- }
-
- String content = handler.toString();
- assertContains("Master footer is here", content);
-
- // Make sure boilerplate text didn't come through:
- assertEquals(-1, content.indexOf("Click to edit Master"));
-
- //TIKA-1171
- assertEquals(-1, content.indexOf("*"));
- }
-
- /**
- * TIKA-712 Master Slide Text from PPT and PPTX files
- * should be extracted too
- */
- @Test
- public void testMasterText() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = PowerPointParserTest.class.getResourceAsStream(
- "/test-documents/testPPT_masterText.ppt")) {
- new OfficeParser().parse(stream, handler, metadata, new ParseContext());
- }
-
- String content = handler.toString();
- assertContains("Text that I added to the master slide", content);
-
- // Make sure boilerplate text didn't come through:
- assertEquals(-1, content.indexOf("Click to edit Master"));
-
- //TIKA-1171
- assertEquals(-1, content.indexOf("*"));
- }
-
- @Test
- public void testMasterText2() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = PowerPointParserTest.class.getResourceAsStream(
- "/test-documents/testPPT_masterText2.ppt")) {
- new OfficeParser().parse(stream, handler, metadata, new ParseContext());
- }
-
- String content = handler.toString();
- assertContains("Text that I added to the master slide", content);
-
- // Make sure boilerplate text didn't come through:
- assertEquals(-1, content.indexOf("Click to edit Master"));
- //TIKA-1171
- assertEquals(-1, content.indexOf("*"));
- }
-
- /**
- * Ensures that custom OLE2 (HPSF) properties are extracted
- */
- @Test
- public void testCustomProperties() throws Exception {
- Metadata metadata = new Metadata();
-
- try (InputStream input = PowerPointParserTest.class.getResourceAsStream(
- "/test-documents/testPPT_custom_props.ppt")) {
- ContentHandler handler = new BodyContentHandler(-1);
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- new OfficeParser().parse(input, handler, metadata, context);
- }
-
- assertEquals("application/vnd.ms-powerpoint", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("JOUVIN ETIENNE", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("EJ04325S", metadata.get(TikaCoreProperties.MODIFIER));
- assertEquals("EJ04325S", metadata.get(Metadata.LAST_AUTHOR));
- assertEquals("2011-08-22T13:32:58Z", metadata.get(TikaCoreProperties.MODIFIED));
- assertEquals("2011-08-22T13:32:58Z", metadata.get(Metadata.DATE));
- assertEquals("2011-08-22T13:30:53Z", metadata.get(TikaCoreProperties.CREATED));
- assertEquals("2011-08-22T13:30:53Z", metadata.get(Metadata.CREATION_DATE));
- assertEquals("1", metadata.get(Office.SLIDE_COUNT));
- assertEquals("3", metadata.get(Office.WORD_COUNT));
- assertEquals("Test extraction properties pptx", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("true", metadata.get("custom:myCustomBoolean"));
- assertEquals("3", metadata.get("custom:myCustomNumber"));
- assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
- assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
- assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
- }
-
- // TIKA-1025
- @Test
- public void testEmbeddedPlacedholder() throws Exception {
- XMLResult result = getXML("testPPT_embedded2.ppt");
- assertContains("<div class=\"embedded\" id=\"1\" />", result.xml);
- assertContains("<div class=\"embedded\" id=\"14\" />", result.xml);
- }
-
- // TIKA-817
- @Test
- public void testAutoDatePPT() throws Exception {
- //decision was made in POI-52367 not to generate
- //autodate automatically. For pptx, where value is stored,
- //value is extracted. For ppt, however, no date is extracted.
- XMLResult result = getXML("testPPT_autodate.ppt");
- assertContains(
- "<div class=\"slide-content\"><p>Now</p>",
- result.xml);
- }
-
- @Test
- public void testCommentAuthorship() throws Exception {
- XMLResult r = getXML("testPPT_comment.ppt");
- assertContains("<p class=\"slide-comment\"><b>Allison, Timothy B. (ATB)", r.xml);
- }
-
- @Test
- public void testEmbeddedPDF() throws Exception {
- List<Metadata> metadataList = getRecursiveJson("testPPT_embeddedPDF.ppt");
- assertEquals("application/pdf", metadataList.get(1).get(Metadata.CONTENT_TYPE));
- assertEquals("3.pdf", metadataList.get(1).get(Metadata.RESOURCE_NAME_KEY));
- assertEquals("application/pdf", metadataList.get(2).get(Metadata.CONTENT_TYPE));
- assertEquals("4.pdf", metadataList.get(2).get(Metadata.RESOURCE_NAME_KEY));
- }
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.InputStream;
+import java.util.List;
+import java.util.Locale;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class PowerPointParserTest extends TikaTest {
+
+ @Test
+ public void testPowerPointParser() throws Exception {
+ try (InputStream input = PowerPointParserTest.class.getResourceAsStream(
+ "/test-documents/testPPT.ppt")) {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new OfficeParser().parse(input, handler, metadata, new ParseContext());
+
+ assertEquals(
+ "application/vnd.ms-powerpoint",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Sample Powerpoint Slide", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+ String content = handler.toString();
+ assertContains("Sample Powerpoint Slide", content);
+ assertContains("Powerpoint X for Mac", content);
+ }
+ }
+
+ @Test
+ public void testVarious() throws Exception {
+ Metadata metadata = new Metadata();
+ String xml = getXML("testPPT_various.ppt", metadata).xml;
+ assertContains("<p>Footnote appears here", xml);
+ assertContains("<p>[1] This is a footnote.", xml);
+ assertContains("<p>This is the header text.</p>", xml);
+ assertContains("<p>This is the footer text.</p>", xml);
+ assertContains("<p>Here is a text box</p>", xml);
+ assertContains("<p>Bold ", xml);
+ assertContains("italic underline superscript subscript", xml);
+ assertContains("underline", xml);
+ assertContains("superscript", xml);
+ assertContains("subscript", xml);
+ assertContains("<p>Here is a citation:", xml);
+ assertContains("Figure 1 This is a caption for Figure 1", xml);
+ assertContains("(Kramer)", xml);
+ assertContains("<table><tr>\t<td>Row 1 Col 1</td>", xml);
+ assertContains("<td>Row 2 Col 2</td>\t<td>Row 2 Col 3</td></tr>", xml);
+ assertContains("<p>Row 1 column 1</p>", xml);
+ assertContains("<p>Row 2 column 2</p>", xml);
+ assertContains("<p><a href=\"http://tika.apache.org/\">This is a hyperlink</a>", xml);
+ assertContains("<p>Here is a list:", xml);
+ for(int row=1;row<=3;row++) {
+ //assertContains("�\tBullet " + row, content);
+ //assertContains("\u00b7\tBullet " + row, content);
+ assertContains("<li>Bullet " + row, xml);
+ }
+ assertContains("Here is a numbered list:", xml);
+ for(int row=1;row<=3;row++) {
+ //assertContains(row + ")\tNumber bullet " + row, content);
+ //assertContains(row + ") Number bullet " + row, content);
+ // TODO: OOXMLExtractor fails to number the bullets:
+ assertContains("<li>Number bullet " + row, xml);
+ }
+
+ for(int row=1;row<=2;row++) {
+ for(int col=1;col<=3;col++) {
+ assertContains("Row " + row + " Col " + col, xml);
+ }
+ }
+ assertContains("Keyword1 Keyword2", xml);
+ assertEquals("Keyword1 Keyword2",
+ metadata.get(TikaCoreProperties.KEYWORDS));
+
+ assertContains("Subject is here", xml);
+ assertEquals("Subject is here",
+ metadata.get(OfficeOpenXMLCore.SUBJECT));
+ // TODO: Remove subject in Tika 2.0
+ assertEquals("Subject is here",
+ metadata.get(Metadata.SUBJECT));
+
+ assertContains("Suddenly some Japanese text:", xml);
+ // Special version of (GHQ)
+ assertContains("\uff08\uff27\uff28\uff31\uff09", xml);
+ // 6 other characters
+ assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f",
+ xml);
+
+ assertContains("And then some Gothic text:", xml);
+ assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A",
+ xml);
+ }
+
+ @Test
+ public void testMasterFooter() throws Exception {
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = PowerPointParserTest.class.getResourceAsStream(
+ "/test-documents/testPPT_masterFooter.ppt")) {
+ new OfficeParser().parse(stream, handler, metadata, new ParseContext());
+ }
+
+ String content = handler.toString();
+ assertContains("Master footer is here", content);
+
+ // Make sure boilerplate text didn't come through:
+ assertEquals(-1, content.indexOf("Click to edit Master"));
+
+ //TIKA-1171
+ assertEquals(-1, content.indexOf("*"));
+ }
+
+ /**
+ * TIKA-712 Master Slide Text from PPT and PPTX files
+ * should be extracted too
+ */
+ @Test
+ public void testMasterText() throws Exception {
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = PowerPointParserTest.class.getResourceAsStream(
+ "/test-documents/testPPT_masterText.ppt")) {
+ new OfficeParser().parse(stream, handler, metadata, new ParseContext());
+ }
+
+ String content = handler.toString();
+ assertContains("Text that I added to the master slide", content);
+
+ // Make sure boilerplate text didn't come through:
+ assertEquals(-1, content.indexOf("Click to edit Master"));
+
+ //TIKA-1171
+ assertEquals(-1, content.indexOf("*"));
+ }
+
+ @Test
+ public void testMasterText2() throws Exception {
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = PowerPointParserTest.class.getResourceAsStream(
+ "/test-documents/testPPT_masterText2.ppt")) {
+ new OfficeParser().parse(stream, handler, metadata, new ParseContext());
+ }
+
+ String content = handler.toString();
+ assertContains("Text that I added to the master slide", content);
+
+ // Make sure boilerplate text didn't come through:
+ assertEquals(-1, content.indexOf("Click to edit Master"));
+ //TIKA-1171
+ assertEquals(-1, content.indexOf("*"));
+ }
+
+ /**
+ * Ensures that custom OLE2 (HPSF) properties are extracted
+ */
+ @Test
+ public void testCustomProperties() throws Exception {
+ Metadata metadata = new Metadata();
+
+ try (InputStream input = PowerPointParserTest.class.getResourceAsStream(
+ "/test-documents/testPPT_custom_props.ppt")) {
+ ContentHandler handler = new BodyContentHandler(-1);
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+ new OfficeParser().parse(input, handler, metadata, context);
+ }
+
+ assertEquals("application/vnd.ms-powerpoint", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("JOUVIN ETIENNE", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("EJ04325S", metadata.get(TikaCoreProperties.MODIFIER));
+ assertEquals("EJ04325S", metadata.get(Metadata.LAST_AUTHOR));
+ assertEquals("2011-08-22T13:32:58Z", metadata.get(TikaCoreProperties.MODIFIED));
+ assertEquals("2011-08-22T13:32:58Z", metadata.get(Metadata.DATE));
+ assertEquals("2011-08-22T13:30:53Z", metadata.get(TikaCoreProperties.CREATED));
+ assertEquals("2011-08-22T13:30:53Z", metadata.get(Metadata.CREATION_DATE));
+ assertEquals("1", metadata.get(Office.SLIDE_COUNT));
+ assertEquals("3", metadata.get(Office.WORD_COUNT));
+ assertEquals("Test extraction properties pptx", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("true", metadata.get("custom:myCustomBoolean"));
+ assertEquals("3", metadata.get("custom:myCustomNumber"));
+ assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
+ assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
+ assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
+ }
+
+ // TIKA-1025
+ @Test
+ public void testEmbeddedPlacedholder() throws Exception {
+ XMLResult result = getXML("testPPT_embedded2.ppt");
+ assertContains("<div class=\"embedded\" id=\"1\" />", result.xml);
+ assertContains("<div class=\"embedded\" id=\"14\" />", result.xml);
+ }
+
+ // TIKA-817
+ @Test
+ public void testAutoDatePPT() throws Exception {
+ //decision was made in POI-52367 not to generate
+ //autodate automatically. For pptx, where value is stored,
+ //value is extracted. For ppt, however, no date is extracted.
+ XMLResult result = getXML("testPPT_autodate.ppt");
+ assertContains(
+ "<div class=\"slide-content\"><p>Now</p>",
+ result.xml);
+ }
+
+ @Test
+ public void testCommentAuthorship() throws Exception {
+ XMLResult r = getXML("testPPT_comment.ppt");
+ assertContains("<p class=\"slide-comment\"><b>Allison, Timothy B. (ATB)", r.xml);
+ }
+
+ @Test
+ public void testEmbeddedPDF() throws Exception {
+ List<Metadata> metadataList = getRecursiveJson("testPPT_embeddedPDF.ppt");
+ assertEquals("application/pdf", metadataList.get(1).get(Metadata.CONTENT_TYPE));
+ assertEquals("3.pdf", metadataList.get(1).get(Metadata.RESOURCE_NAME_KEY));
+ assertEquals("application/pdf", metadataList.get(2).get(Metadata.CONTENT_TYPE));
+ assertEquals("4.pdf", metadataList.get(2).get(Metadata.RESOURCE_NAME_KEY));
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java
index a3ccefc..a37e44d 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java
@@ -1,53 +1,53 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import static org.apache.tika.TikaTest.assertContains;
-import static org.junit.Assert.assertEquals;
-
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class PublisherParserTest {
-
- @Test
- public void testPublisherParser() throws Exception {
- try (InputStream input = PublisherParserTest.class.getResourceAsStream(
- "/test-documents/testPUBLISHER.pub")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- new OfficeParser().parse(input, handler, metadata, new ParseContext());
-
- assertEquals(
- "application/x-mspublisher",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals(null, metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Nick Burch", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Nick Burch", metadata.get(Metadata.AUTHOR));
- String content = handler.toString();
- assertContains("0123456789", content);
- assertContains("abcdef", content);
- }
- }
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class PublisherParserTest {
+
+ @Test
+ public void testPublisherParser() throws Exception {
+ try (InputStream input = PublisherParserTest.class.getResourceAsStream(
+ "/test-documents/testPUBLISHER.pub")) {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new OfficeParser().parse(input, handler, metadata, new ParseContext());
+
+ assertEquals(
+ "application/x-mspublisher",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals(null, metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Nick Burch", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Nick Burch", metadata.get(Metadata.AUTHOR));
+ String content = handler.toString();
+ assertContains("0123456789", content);
+ assertContains("abcdef", content);
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java
index 4edb5ee..8062555 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java
@@ -1,98 +1,98 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.tika.TikaTest.TrackingHandler;
-import org.apache.tika.detect.DefaultDetector;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.extractor.ContainerExtractor;
-import org.apache.tika.extractor.ParserContainerExtractor;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-/**
- * Tests for the TNEF (winmail.dat) parser
- */
-public class TNEFParserTest extends AbstractPOIContainerExtractionTest {
- private static final String file = "testWINMAIL.dat";
-
- @Test
- public void testBasics() throws Exception {
- Detector detector = new DefaultDetector();
- try (TikaInputStream stream = getTestFile(file)) {
- assertEquals(
- MediaType.application("vnd.ms-tnef"),
- detector.detect(stream, new Metadata()));
- }
- }
-
- @Test
- public void testMetadata() throws Exception {
- TikaInputStream stream = getTestFile(file);
-
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
-
- TNEFParser tnef = new TNEFParser();
- tnef.parse(stream, handler, metadata, new ParseContext());
-
- assertEquals("This is a test message", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("This is a test message", metadata.get(Metadata.SUBJECT));
- }
-
- /**
- * Check the Rtf and Attachments are returned
- * as expected
- */
- @Test
- public void testBodyAndAttachments() throws Exception {
- ContainerExtractor extractor = new ParserContainerExtractor();
-
- // Process it with recursing
- // Will have the message body RTF and the attachments
- TrackingHandler handler = process(file, extractor, true);
- assertEquals(6, handler.filenames.size());
- assertEquals(6, handler.mediaTypes.size());
-
- // We know the filenames for all of them
- assertEquals("message.rtf", handler.filenames.get(0));
- assertEquals(MediaType.application("rtf"), handler.mediaTypes.get(0));
-
- assertEquals("quick.doc", handler.filenames.get(1));
- assertEquals(MediaType.application("msword"), handler.mediaTypes.get(1));
-
- assertEquals("quick.html", handler.filenames.get(2));
- assertEquals(MediaType.text("html"), handler.mediaTypes.get(2));
-
- assertEquals("quick.pdf", handler.filenames.get(3));
- assertEquals(MediaType.application("pdf"), handler.mediaTypes.get(3));
-
- assertEquals("quick.txt", handler.filenames.get(4));
- assertEquals(MediaType.text("plain"), handler.mediaTypes.get(4));
-
- assertEquals("quick.xml", handler.filenames.get(5));
- assertEquals(MediaType.application("xml"), handler.mediaTypes.get(5));
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.tika.TikaTest.TrackingHandler;
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Tests for the TNEF (winmail.dat) parser
+ */
+public class TNEFParserTest extends AbstractPOIContainerExtractionTest {
+    /** Name of the sample TNEF file handed to getTestFile/process. */
+    private static final String file = "testWINMAIL.dat";
+
+    /**
+     * The default detector should report the sample as
+     * application/vnd.ms-tnef.
+     */
+    @Test
+    public void testBasics() throws Exception {
+        Detector detector = new DefaultDetector();
+        try (TikaInputStream stream = getTestFile(file)) {
+            assertEquals(
+                    MediaType.application("vnd.ms-tnef"),
+                    detector.detect(stream, new Metadata()));
+        }
+    }
+
+    /**
+     * Parsing the sample with {@link TNEFParser} should populate both the
+     * title and the subject with the message subject line.
+     */
+    @Test
+    public void testMetadata() throws Exception {
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+
+        // Release the stream once parsing is done, mirroring testBasics.
+        try (TikaInputStream stream = getTestFile(file)) {
+            TNEFParser tnef = new TNEFParser();
+            tnef.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        assertEquals("This is a test message", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("This is a test message", metadata.get(Metadata.SUBJECT));
+    }
+
+    /**
+     * Check the Rtf and Attachments are returned
+     * as expected
+     */
+    @Test
+    public void testBodyAndAttachments() throws Exception {
+        ContainerExtractor extractor = new ParserContainerExtractor();
+
+        // Process it with recursing
+        // Will have the message body RTF and the attachments
+        TrackingHandler handler = process(file, extractor, true);
+        assertEquals(6, handler.filenames.size());
+        assertEquals(6, handler.mediaTypes.size());
+
+        // We know the filenames for all of them
+        assertEquals("message.rtf", handler.filenames.get(0));
+        assertEquals(MediaType.application("rtf"), handler.mediaTypes.get(0));
+
+        assertEquals("quick.doc", handler.filenames.get(1));
+        assertEquals(MediaType.application("msword"), handler.mediaTypes.get(1));
+
+        assertEquals("quick.html", handler.filenames.get(2));
+        assertEquals(MediaType.text("html"), handler.mediaTypes.get(2));
+
+        assertEquals("quick.pdf", handler.filenames.get(3));
+        assertEquals(MediaType.application("pdf"), handler.mediaTypes.get(3));
+
+        assertEquals("quick.txt", handler.filenames.get(4));
+        assertEquals(MediaType.text("plain"), handler.mediaTypes.get(4));
+
+        assertEquals("quick.xml", handler.filenames.get(5));
+        assertEquals(MediaType.application("xml"), handler.mediaTypes.get(5));
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java
index 3002187..06320fe 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java
@@ -1,51 +1,51 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import static org.apache.tika.TikaTest.assertContains;
-import static org.junit.Assert.assertEquals;
-
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class VisioParserTest {
-
- @Test
- public void testVisioParser() throws Exception {
- try (InputStream input = VisioParserTest.class.getResourceAsStream(
- "/test-documents/testVISIO.vsd")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- new OfficeParser().parse(input, handler, metadata, new ParseContext());
-
- assertEquals(
- "application/vnd.visio",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Hogwarts", metadata.get(TikaCoreProperties.CREATOR));
- String content = handler.toString();
- assertContains("Some random text, on a page", content);
- }
- }
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Checks that {@link OfficeParser} handles a Microsoft Visio drawing.
+ */
+public class VisioParserTest {
+
+    /** Classpath location of the Visio sample document. */
+    private static final String SAMPLE = "/test-documents/testVISIO.vsd";
+
+    /**
+     * Parses the sample and verifies the reported content type, the
+     * title/creator metadata and a known sentence of the body text.
+     */
+    @Test
+    public void testVisioParser() throws Exception {
+        Metadata meta = new Metadata();
+        ContentHandler body = new BodyContentHandler();
+
+        try (InputStream in = VisioParserTest.class.getResourceAsStream(SAMPLE)) {
+            new OfficeParser().parse(in, body, meta, new ParseContext());
+
+            String contentType = meta.get(Metadata.CONTENT_TYPE);
+            assertEquals("application/vnd.visio", contentType);
+
+            // The title is expected to be present but empty.
+            assertEquals("", meta.get(TikaCoreProperties.TITLE));
+            assertEquals("Hogwarts", meta.get(TikaCoreProperties.CREATOR));
+
+            String text = body.toString();
+            assertContains("Some random text, on a page", text);
+        }
+    }
+
+}
[06/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java
index 49afdd7..e1a0ff0 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java
@@ -1,54 +1,54 @@
-/**
- * ******************************************************************************
- * Copyright (C) 2005, International Business Machines Corporation and *
- * others. All Rights Reserved. *
- * ******************************************************************************
- */
-package org.apache.tika.parser.txt;
-
-/**
- * Abstract class for recognizing a single charset.
- * Part of the implementation of ICU's CharsetDetector.
- *
- * Each specific charset that can be recognized will have an instance
- * of some subclass of this class. All interaction between the overall
- * CharsetDetector and the stuff specific to an individual charset happens
- * via the interface provided here.
- *
- * Instances of CharsetDetector DO NOT have or maintain
- * state pertaining to a specific match or detect operation.
- * The WILL be shared by multiple instances of CharsetDetector.
- * They encapsulate const charset-specific information.
- *
- * @internal
- */
-abstract class CharsetRecognizer {
- /**
- * Get the IANA name of this charset.
- * @return the charset name.
- */
- abstract String getName();
-
- /**
- * Get the ISO language code for this charset.
- * @return the language code, or <code>null</code> if the language cannot be determined.
- */
- public String getLanguage() {
- return null;
- }
-
- /**
- * Test the match of this charset with the input text data
- * which is obtained via the CharsetDetector object.
- *
- * @param det The CharsetDetector, which contains the input text
- * to be checked for being in this charset.
- * @return Two values packed into one int (Damn java, anyhow)
- * <br/>
- * bits 0-7: the match confidence, ranging from 0-100
- * <br/>
- * bits 8-15: The match reason, an enum-like value.
- */
- abstract int match(CharsetDetector det);
-
-}
+/**
+ * ******************************************************************************
+ * Copyright (C) 2005, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
+ * ******************************************************************************
+ */
+package org.apache.tika.parser.txt;
+
+/**
+ * Abstract class for recognizing a single charset.
+ * Part of the implementation of ICU's CharsetDetector.
+ *
+ * Each specific charset that can be recognized will have an instance
+ * of some subclass of this class. All interaction between the overall
+ * CharsetDetector and the stuff specific to an individual charset happens
+ * via the interface provided here.
+ *
+ * Instances of CharsetRecognizer DO NOT have or maintain
+ * state pertaining to a specific match or detect operation.
+ * They WILL be shared by multiple instances of CharsetDetector.
+ * They encapsulate const charset-specific information.
+ *
+ * @internal
+ */
+abstract class CharsetRecognizer {
+ /**
+ * Get the IANA name of this charset.
+ * @return the charset name.
+ */
+ abstract String getName();
+
+ /**
+ * Get the ISO language code for this charset.
+ * @return the language code, or <code>null</code> if the language cannot be determined.
+ */
+ public String getLanguage() {
+ return null;
+ }
+
+ /**
+ * Test the match of this charset with the input text data
+ * which is obtained via the CharsetDetector object.
+ *
+ * @param det The CharsetDetector, which contains the input text
+ * to be checked for being in this charset.
+ * @return Two values packed into one int:
+ * <br/>
+ * bits 0-7: the match confidence, ranging from 0-100
+ * <br/>
+ * bits 8-15: The match reason, an enum-like value.
+ */
+ abstract int match(CharsetDetector det);
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
index f704557..2b20495 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
@@ -1,98 +1,98 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.txt;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.Charset;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.config.ServiceLoader;
-import org.apache.tika.detect.AutoDetectReader;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Plain text parser. The text encoding of the document stream is
- * automatically detected based on the byte patterns found at the
- * beginning of the stream and the given document metadata, most
- * notably the <code>charset</code> parameter of a
- * {@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE} value.
- * <p/>
- * This parser sets the following output metadata entries:
- * <dl>
- * <dt>{@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE}</dt>
- * <dd><code>text/plain; charset=...</code></dd>
- * </dl>
- */
-public class TXTParser extends AbstractParser {
-
- /**
- * Serial version UID
- */
- private static final long serialVersionUID = -6656102320836888910L;
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.singleton(MediaType.TEXT_PLAIN);
-
- private static final ServiceLoader LOADER =
- new ServiceLoader(TXTParser.class.getClassLoader());
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- // Automatically detect the character encoding
- try (AutoDetectReader reader = new AutoDetectReader(
- new CloseShieldInputStream(stream), metadata,
- context.get(ServiceLoader.class, LOADER))) {
- Charset charset = reader.getCharset();
- MediaType type = new MediaType(MediaType.TEXT_PLAIN, charset);
- metadata.set(Metadata.CONTENT_TYPE, type.toString());
- // deprecated, see TIKA-431
- metadata.set(Metadata.CONTENT_ENCODING, charset.name());
-
- XHTMLContentHandler xhtml =
- new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
-
- xhtml.startElement("p");
- char[] buffer = new char[4096];
- int n = reader.read(buffer);
- while (n != -1) {
- xhtml.characters(buffer, 0, n);
- n = reader.read(buffer);
- }
- xhtml.endElement("p");
-
- xhtml.endDocument();
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.txt;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for plain text documents. The character encoding is detected
+ * automatically by an {@link AutoDetectReader}, which is given the raw
+ * stream together with the supplied document metadata (most notably any
+ * <code>charset</code> parameter of a
+ * {@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE} value).
+ * <p/>
+ * This parser sets the following output metadata entries:
+ * <dl>
+ * <dt>{@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE}</dt>
+ * <dd><code>text/plain; charset=...</code></dd>
+ * </dl>
+ */
+public class TXTParser extends AbstractParser {
+
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -6656102320836888910L;
+
+    /** The only media type this parser handles: text/plain. */
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.TEXT_PLAIN);
+
+    /** Fallback service loader used when the parse context supplies none. */
+    private static final ServiceLoader LOADER =
+            new ServiceLoader(TXTParser.class.getClassLoader());
+
+    /**
+     * @param context parse context (not consulted)
+     * @return the singleton set containing text/plain
+     */
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Detects the charset of the stream, records it in the metadata and
+     * emits the whole text as a single XHTML paragraph.
+     *
+     * @param stream   document stream; wrapped so it is not closed here
+     * @param handler  receiver of the XHTML SAX events
+     * @param metadata read for encoding hints, updated with the content
+     *                 type and content encoding
+     * @param context  may provide a {@link ServiceLoader}
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // Shield the caller's stream so that closing the reader leaves it open
+        InputStream shielded = new CloseShieldInputStream(stream);
+        ServiceLoader loader = context.get(ServiceLoader.class, LOADER);
+
+        // The reader detects the character encoding automatically
+        try (AutoDetectReader reader =
+                     new AutoDetectReader(shielded, metadata, loader)) {
+            Charset detected = reader.getCharset();
+            MediaType textType = new MediaType(MediaType.TEXT_PLAIN, detected);
+            metadata.set(Metadata.CONTENT_TYPE, textType.toString());
+            // deprecated, see TIKA-431
+            metadata.set(Metadata.CONTENT_ENCODING, detected.name());
+
+            XHTMLContentHandler xhtml =
+                    new XHTMLContentHandler(handler, metadata);
+            xhtml.startDocument();
+            xhtml.startElement("p");
+
+            // Copy the decoded text through in fixed-size chunks until EOF
+            char[] chunk = new char[4096];
+            for (int count = reader.read(chunk); count != -1; count = reader.read(chunk)) {
+                xhtml.characters(chunk, 0, count);
+            }
+
+            xhtml.endElement("p");
+            xhtml.endDocument();
+        }
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
index 11bea1d..d36f79c 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
@@ -1,93 +1,93 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.xml;
-
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.xml.sax.helpers.DefaultHandler;
-
-/**
- * Base class for SAX handlers that map SAX events into document metadata.
- *
- * @since Apache Tika 0.10
- */
-class AbstractMetadataHandler extends DefaultHandler {
-
- private final Metadata metadata;
- private final Property property;
- private final String name;
-
- protected AbstractMetadataHandler(Metadata metadata, String name) {
- this.metadata = metadata;
- this.property = null;
- this.name = name;
- }
- protected AbstractMetadataHandler(Metadata metadata, Property property) {
- this.metadata = metadata;
- this.property = property;
- this.name = property.getName();
- }
-
- /**
- * Adds the given metadata value. The value is ignored if it is
- * <code>null</code> or empty. If the metadata entry already exists,
- * then the given value is appended to it with a comma as the separator.
- *
- * @param value metadata value
- */
- protected void addMetadata(String value) {
- if (value != null && value.length() > 0) {
- if (metadata.isMultiValued(name)) {
- // Add the value, assuming it's not already there
- List<String> previous = Arrays.asList(metadata.getValues(name));
- if (!previous.contains(value)) {
- if (property != null) {
- metadata.add(property, value);
- } else {
- metadata.add(name, value);
- }
- }
- } else {
- // Set the value, assuming it's not already there
- String previous = metadata.get(name);
- if (previous != null && previous.length() > 0) {
- if (!previous.equals(value)) {
- if (property != null) {
- if (property.isMultiValuePermitted()) {
- metadata.add(property, value);
- } else {
- // Replace the existing value if isMultiValuePermitted is false
- metadata.set(property, value);
- }
- } else {
- metadata.add(name, value);
- }
- }
- } else {
- if (property != null) {
- metadata.set(property, value);
- } else {
- metadata.set(name, value);
- }
- }
- }
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Base class for SAX handlers that map SAX events into document metadata.
+ *
+ * @since Apache Tika 0.10
+ */
+class AbstractMetadataHandler extends DefaultHandler {
+
+    private final Metadata metadata;
+    private final Property property;
+    private final String name;
+
+    /**
+     * Creates a handler that records values under a plain metadata name.
+     *
+     * @param metadata metadata object to populate
+     * @param name     metadata key to write to
+     */
+    protected AbstractMetadataHandler(Metadata metadata, String name) {
+        this.metadata = metadata;
+        this.property = null;
+        this.name = name;
+    }
+
+    /**
+     * Creates a handler that records values under a typed property; the
+     * property's name is used for lookups of existing values.
+     *
+     * @param metadata metadata object to populate
+     * @param property property to write to
+     */
+    protected AbstractMetadataHandler(Metadata metadata, Property property) {
+        this.metadata = metadata;
+        this.property = property;
+        this.name = property.getName();
+    }
+
+    /**
+     * Records the given metadata value. A <code>null</code> or empty value
+     * is ignored, as is a value that is already stored. Otherwise the value
+     * is set when the entry is missing or empty, and added as a further
+     * value when one exists. The exception is a property that does not
+     * permit multiple values: there the new value replaces the old one.
+     *
+     * @param value metadata value
+     */
+    protected void addMetadata(String value) {
+        if (value == null || value.length() == 0) {
+            return;
+        }
+
+        if (metadata.isMultiValued(name)) {
+            // Several values already stored: add only if this one is new
+            List<String> existing = Arrays.asList(metadata.getValues(name));
+            if (!existing.contains(value)) {
+                addValue(value);
+            }
+            return;
+        }
+
+        String current = metadata.get(name);
+        if (current == null || current.length() == 0) {
+            // Nothing stored yet
+            setValue(value);
+        } else if (!current.equals(value)) {
+            if (property != null && !property.isMultiValuePermitted()) {
+                // Replace the existing value if isMultiValuePermitted is false
+                metadata.set(property, value);
+            } else {
+                addValue(value);
+            }
+        }
+    }
+
+    /** Adds the value via the property when there is one, else via the name. */
+    private void addValue(String value) {
+        if (property != null) {
+            metadata.add(property, value);
+        } else {
+            metadata.add(name, value);
+        }
+    }
+
+    /** Sets the value via the property when there is one, else via the name. */
+    private void setValue(String value) {
+        if (property != null) {
+            metadata.set(property, value);
+        } else {
+            metadata.set(name, value);
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java
index 2c6b054..c1795fa 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java
@@ -1,82 +1,82 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.xml;
-
-import org.apache.tika.metadata.Metadata;
-import org.xml.sax.Attributes;
-import org.xml.sax.helpers.DefaultHandler;
-
-/**
- * This adds a Metadata entry for a given node.
- * The textual content of the node is used as the
- * value, and the Metadata name is taken from
- * an attribute, with a prefix if required.
- */
-public class AttributeDependantMetadataHandler extends DefaultHandler {
-
- private final Metadata metadata;
-
- private final String nameHoldingAttribute;
- private final String namePrefix;
- private String name;
-
- private final StringBuilder buffer = new StringBuilder();
-
- public AttributeDependantMetadataHandler(Metadata metadata, String nameHoldingAttribute, String namePrefix) {
- this.metadata = metadata;
- this.nameHoldingAttribute = nameHoldingAttribute;
- this.namePrefix = namePrefix;
- }
-
- public void addMetadata(String value) {
- if(name == null || name.length() == 0) {
- // We didn't find the attribute which holds the name
- return;
- }
- if (value.length() > 0) {
- String previous = metadata.get(name);
- if (previous != null && previous.length() > 0) {
- value = previous + ", " + value;
- }
- metadata.set(name, value);
- }
- }
-
- public void endElement(String uri, String localName, String name) {
- addMetadata(buffer.toString());
- buffer.setLength(0);
- }
-
- public void startElement(
- String uri, String localName, String name, Attributes attributes) {
- String rawName = attributes.getValue(nameHoldingAttribute);
- if (rawName != null) {
- if (namePrefix == null) {
- this.name = rawName;
- } else {
- this.name = namePrefix + rawName;
- }
- }
- // All other attributes are ignored
- }
-
-
- public void characters(char[] ch, int start, int length) {
- buffer.append(ch, start, length);
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.Attributes;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * This adds a Metadata entry for a given node.
+ * The textual content of the node is used as the
+ * value, and the Metadata name is taken from
+ * an attribute, with a prefix if required.
+ */
+public class AttributeDependantMetadataHandler extends DefaultHandler {
+
+ private final Metadata metadata;
+
+ private final String nameHoldingAttribute;
+ private final String namePrefix;
+ private String name;
+
+ private final StringBuilder buffer = new StringBuilder();
+
+ public AttributeDependantMetadataHandler(Metadata metadata, String nameHoldingAttribute, String namePrefix) {
+ this.metadata = metadata;
+ this.nameHoldingAttribute = nameHoldingAttribute;
+ this.namePrefix = namePrefix;
+ }
+
+ public void addMetadata(String value) { // records value under the attribute-derived name
+ if(name == null || name.length() == 0) {
+ // We didn't find the attribute which holds the name
+ return;
+ }
+ if (value != null && value.length() > 0) { // null is ignored just like an empty value
+ String previous = metadata.get(name);
+ if (previous != null && previous.length() > 0) {
+ value = previous + ", " + value; // append to the existing entry, comma-separated
+ }
+ metadata.set(name, value);
+ }
+ }
+
+ public void endElement(String uri, String localName, String name) {
+ addMetadata(buffer.toString());
+ buffer.setLength(0);
+ }
+
+ public void startElement(
+ String uri, String localName, String name, Attributes attributes) {
+ String rawName = attributes.getValue(nameHoldingAttribute);
+ if (rawName != null) {
+ if (namePrefix == null) {
+ this.name = rawName;
+ } else {
+ this.name = namePrefix + rawName;
+ }
+ }
+ // All other attributes are ignored
+ }
+
+
+ public void characters(char[] ch, int start, int length) {
+ buffer.append(ch, start, length);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
index 0140421..dba5e4c 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
@@ -1,61 +1,61 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.xml;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-
-/**
- * SAX event handler that maps the contents of an XML attribute into
- * a metadata field.
- *
- * @since Apache Tika 0.10
- */
-public class AttributeMetadataHandler extends AbstractMetadataHandler {
-
- private final String uri;
-
- private final String localName;
-
- public AttributeMetadataHandler(
- String uri, String localName, Metadata metadata, String name) {
- super(metadata, name);
- this.uri = uri;
- this.localName = localName;
- }
- public AttributeMetadataHandler(
- String uri, String localName, Metadata metadata, Property property) {
- super(metadata, property);
- this.uri = uri;
- this.localName = localName;
- }
-
- @Override
- public void startElement(
- String uri, String localName, String qName, Attributes attributes)
- throws SAXException {
- for (int i = 0; i < attributes.getLength(); i++) {
- if (attributes.getURI(i).equals(this.uri)
- && attributes.getLocalName(i).equals(this.localName)) {
- addMetadata(attributes.getValue(i).trim());
- }
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+/**
+ * SAX event handler that maps the contents of an XML attribute into
+ * a metadata field.
+ *
+ * @since Apache Tika 0.10
+ */
+public class AttributeMetadataHandler extends AbstractMetadataHandler {
+
+ private final String uri;
+
+ private final String localName;
+
+ public AttributeMetadataHandler(
+ String uri, String localName, Metadata metadata, String name) {
+ super(metadata, name);
+ this.uri = uri;
+ this.localName = localName;
+ }
+ public AttributeMetadataHandler(
+ String uri, String localName, Metadata metadata, Property property) {
+ super(metadata, property);
+ this.uri = uri;
+ this.localName = localName;
+ }
+
+ @Override
+ public void startElement(
+ String uri, String localName, String qName, Attributes attributes)
+ throws SAXException {
+ for (int i = 0; i < attributes.getLength(); i++) {
+ if (attributes.getURI(i).equals(this.uri)
+ && attributes.getLocalName(i).equals(this.localName)) {
+ addMetadata(attributes.getValue(i).trim());
+ }
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
index 9e27801..5999773 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
@@ -1,60 +1,60 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.xml;
-
-import org.apache.tika.metadata.DublinCore;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.TeeContentHandler;
-import org.xml.sax.ContentHandler;
-
-/**
- * Dublin Core metadata parser
- */
-public class DcXMLParser extends XMLParser {
-
- /** Serial version UID */
- private static final long serialVersionUID = 4905318835463880819L;
-
- private static ContentHandler getDublinCoreHandler(
- Metadata metadata, Property property, String element) {
- return new ElementMetadataHandler(
- DublinCore.NAMESPACE_URI_DC, element,
- metadata, property);
- }
-
- protected ContentHandler getContentHandler(
- ContentHandler handler, Metadata metadata, ParseContext context) {
- return new TeeContentHandler(
- super.getContentHandler(handler, metadata, context),
- getDublinCoreHandler(metadata, TikaCoreProperties.TITLE, "title"),
- getDublinCoreHandler(metadata, TikaCoreProperties.KEYWORDS, "subject"),
- getDublinCoreHandler(metadata, TikaCoreProperties.CREATOR, "creator"),
- getDublinCoreHandler(metadata, TikaCoreProperties.DESCRIPTION, "description"),
- getDublinCoreHandler(metadata, TikaCoreProperties.PUBLISHER, "publisher"),
- getDublinCoreHandler(metadata, TikaCoreProperties.CONTRIBUTOR, "contributor"),
- getDublinCoreHandler(metadata, TikaCoreProperties.CREATED, "date"),
- getDublinCoreHandler(metadata, TikaCoreProperties.TYPE, "type"),
- getDublinCoreHandler(metadata, TikaCoreProperties.FORMAT, "format"),
- getDublinCoreHandler(metadata, TikaCoreProperties.IDENTIFIER, "identifier"),
- getDublinCoreHandler(metadata, TikaCoreProperties.LANGUAGE, "language"),
- getDublinCoreHandler(metadata, TikaCoreProperties.RIGHTS, "rights"));
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.tika.metadata.DublinCore;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.TeeContentHandler;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Dublin Core metadata parser
+ */
+public class DcXMLParser extends XMLParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = 4905318835463880819L;
+
+ private static ContentHandler getDublinCoreHandler(
+ Metadata metadata, Property property, String element) {
+ return new ElementMetadataHandler(
+ DublinCore.NAMESPACE_URI_DC, element,
+ metadata, property);
+ }
+
+ protected ContentHandler getContentHandler(
+ ContentHandler handler, Metadata metadata, ParseContext context) {
+ return new TeeContentHandler(
+ super.getContentHandler(handler, metadata, context),
+ getDublinCoreHandler(metadata, TikaCoreProperties.TITLE, "title"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.KEYWORDS, "subject"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.CREATOR, "creator"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.DESCRIPTION, "description"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.PUBLISHER, "publisher"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.CONTRIBUTOR, "contributor"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.CREATED, "date"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.TYPE, "type"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.FORMAT, "format"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.IDENTIFIER, "identifier"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.LANGUAGE, "language"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.RIGHTS, "rights"));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
index b69f65b..d5bfb1c 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
@@ -1,255 +1,255 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.xml;
-
-import java.util.Arrays;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.xml.sax.Attributes;
-
-/**
- * SAX event handler that maps the contents of an XML element into
- * a metadata field.
- *
- * @since Apache Tika 0.10
- */
-public class ElementMetadataHandler extends AbstractMetadataHandler {
- /**
- * Logger for this class
- */
- private static final Log logger = LogFactory
- .getLog(ElementMetadataHandler.class);
-
- private static final String LOCAL_NAME_RDF_BAG = "Bag";
- private static final String LOCAL_NAME_RDF_LI = "li";
- private static final String URI_RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
-
- private final String uri;
-
- private final String localName;
-
- private final Metadata metadata;
-
- private final String name;
- private Property targetProperty;
-
- private final boolean allowDuplicateValues;
- private final boolean allowEmptyValues;
-
- /**
- * The buffer used to capture characters when inside a bag li element.
- */
- private final StringBuilder bufferBagged = new StringBuilder();
-
- /**
- * The buffer used to capture characters inside standard elements.
- */
- private final StringBuilder bufferBagless = new StringBuilder();
-
- /**
- * Whether or not the value was found in a standard element structure or inside a bag.
- */
- private boolean isBagless = true;
-
- private int matchLevel = 0;
- private int parentMatchLevel = 0;
-
- /**
- * Constructor for string metadata keys.
- *
- * @param uri the uri of the namespace of the element
- * @param localName the local name of the element
- * @param metadata the Tika metadata object to populate
- * @param name the Tika metadata field key
- */
- public ElementMetadataHandler(
- String uri, String localName, Metadata metadata, String name) {
- super(metadata, name);
- this.uri = uri;
- this.localName = localName;
- this.metadata = metadata;
- this.name = name;
- this.allowDuplicateValues = false;
- this.allowEmptyValues = false;
- if (logger.isTraceEnabled()) {
- logger.trace("created simple handler for " + this.name);
- }
- }
-
- /**
- * Constructor for string metadata keys which allows change of behavior
- * for duplicate and empty entry values.
- *
- * @param uri the uri of the namespace of the element
- * @param localName the local name of the element
- * @param metadata the Tika metadata object to populate
- * @param name the Tika metadata field key
- * @param allowDuplicateValues add duplicate values to the Tika metadata
- * @param allowEmptyValues add empty values to the Tika metadata
- */
- public ElementMetadataHandler(
- String uri, String localName, Metadata metadata, String name, boolean allowDuplicateValues, boolean allowEmptyValues) {
- super(metadata, name);
- this.uri = uri;
- this.localName = localName;
- this.metadata = metadata;
- this.name = name;
- this.allowDuplicateValues = allowDuplicateValues;
- this.allowEmptyValues = allowEmptyValues;
- if (logger.isTraceEnabled()) {
- logger.trace("created simple handler for " + this.name);
- }
- }
-
- /**
- * Constructor for Property metadata keys.
- *
- * @param uri the uri of the namespace of the element
- * @param localName the local name of the element
- * @param metadata the Tika metadata object to populate
- * @param targetProperty the Tika metadata Property key
- */
- public ElementMetadataHandler(
- String uri, String localName, Metadata metadata, Property targetProperty) {
- super(metadata, targetProperty);
- this.uri = uri;
- this.localName = localName;
- this.metadata = metadata;
- this.targetProperty = targetProperty;
- this.name = targetProperty.getName();
- this.allowDuplicateValues = false;
- this.allowEmptyValues = false;
- if (logger.isTraceEnabled()) {
- logger.trace("created property handler for " + this.name);
- }
- }
-
- /**
- * Constructor for Property metadata keys which allows change of behavior
- * for duplicate and empty entry values.
- *
- * @param uri the uri of the namespace of the element
- * @param localName the local name of the element
- * @param metadata the Tika metadata object to populate
- * @param targetProperty the Tika metadata Property key
- * @param allowDuplicateValues add duplicate values to the Tika metadata
- * @param allowEmptyValues add empty values to the Tika metadata
- */
- public ElementMetadataHandler(
- String uri, String localName, Metadata metadata, Property targetProperty, boolean allowDuplicateValues, boolean allowEmptyValues) {
- super(metadata, targetProperty);
- this.uri = uri;
- this.localName = localName;
- this.metadata = metadata;
- this.targetProperty = targetProperty;
- this.name = targetProperty.getName();
- this.allowDuplicateValues = allowDuplicateValues;
- this.allowEmptyValues = allowEmptyValues;
- if (logger.isTraceEnabled()) {
- logger.trace("created property handler for " + this.name);
- }
- }
-
- protected boolean isMatchingParentElement(String uri, String localName) {
- return (uri.equals(this.uri) && localName.equals(this.localName));
- }
-
- protected boolean isMatchingElement(String uri, String localName) {
- // match if we're inside the parent element or within some bag element
- return (uri.equals(this.uri) && localName.equals(this.localName)) ||
- (parentMatchLevel > 0 &&
- ((uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_BAG)) ||
- (uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_LI))
- )
- );
- }
-
- @Override
- public void startElement(
- String uri, String localName, String name, Attributes attributes) {
- if (isMatchingElement(uri, localName)) {
- matchLevel++;
- }
- if (isMatchingParentElement(uri, localName)) {
- parentMatchLevel++;
- }
- }
-
- @Override
- public void endElement(String uri, String localName, String name) {
- if (isMatchingParentElement(uri, localName)) {
- parentMatchLevel--;
- }
- if (isMatchingElement(uri, localName)) {
- matchLevel--;
- if (matchLevel == 2) {
- // we're inside a bag li element, add the bagged buffer
- addMetadata(bufferBagged.toString().trim());
- bufferBagged.setLength(0);
- isBagless = false;
- }
- if (matchLevel == 0 && isBagless) {
- String valueBagless = bufferBagless.toString();
- if (valueBagless.length() > 0 && !valueBagless.contains(LOCAL_NAME_RDF_BAG)) {
- // we're in a standard element, add the bagless buffer
- addMetadata(valueBagless.trim());
- bufferBagless.setLength(0);
- }
- isBagless = true;
- }
- }
- }
-
- @Override
- public void characters(char[] ch, int start, int length) {
- // We need to append to both buffers since we don't if we're inside a bag until we're done
- if (parentMatchLevel > 0 && matchLevel > 2) {
- bufferBagged.append(ch, start, length);
- }
- if (parentMatchLevel > 0 && matchLevel > 0) {
- bufferBagless.append(ch, start, length);
- }
- }
-
- @Override
- public void ignorableWhitespace(char[] ch, int start, int length) {
- characters(ch, start, length);
- }
-
- @Override
- protected void addMetadata(String value) {
- if (logger.isTraceEnabled()) {
- logger.trace("adding " + name + "=" + value);
- }
- if (targetProperty != null && targetProperty.isMultiValuePermitted()) {
- if ((value != null && value.length() > 0) || allowEmptyValues) {
- if (value == null || value.length() == 0 && allowEmptyValues) {
- value = "";
- }
- String[] previous = metadata.getValues(name);
- if (previous == null || !Arrays.asList(previous).contains(value) || allowDuplicateValues) {
- metadata.add(targetProperty, value);
- }
- }
- } else {
- super.addMetadata(value);
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import java.util.Arrays;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.xml.sax.Attributes;
+
+/**
+ * SAX event handler that maps the contents of an XML element into
+ * a metadata field.
+ *
+ * @since Apache Tika 0.10
+ */
+public class ElementMetadataHandler extends AbstractMetadataHandler {
+ /**
+ * Logger for this class
+ */
+ private static final Log logger = LogFactory
+ .getLog(ElementMetadataHandler.class);
+
+ private static final String LOCAL_NAME_RDF_BAG = "Bag";
+ private static final String LOCAL_NAME_RDF_LI = "li";
+ private static final String URI_RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
+
+ private final String uri;
+
+ private final String localName;
+
+ private final Metadata metadata;
+
+ private final String name;
+ private Property targetProperty;
+
+ private final boolean allowDuplicateValues;
+ private final boolean allowEmptyValues;
+
+ /**
+ * The buffer used to capture characters when inside a bag li element.
+ */
+ private final StringBuilder bufferBagged = new StringBuilder();
+
+ /**
+ * The buffer used to capture characters inside standard elements.
+ */
+ private final StringBuilder bufferBagless = new StringBuilder();
+
+ /**
+ * Whether or not the value was found in a standard element structure or inside a bag.
+ */
+ private boolean isBagless = true;
+
+ private int matchLevel = 0;
+ private int parentMatchLevel = 0;
+
+ /**
+ * Constructor for string metadata keys.
+ *
+ * @param uri the uri of the namespace of the element
+ * @param localName the local name of the element
+ * @param metadata the Tika metadata object to populate
+ * @param name the Tika metadata field key
+ */
+ public ElementMetadataHandler(
+ String uri, String localName, Metadata metadata, String name) {
+ super(metadata, name);
+ this.uri = uri;
+ this.localName = localName;
+ this.metadata = metadata;
+ this.name = name;
+ this.allowDuplicateValues = false;
+ this.allowEmptyValues = false;
+ if (logger.isTraceEnabled()) {
+ logger.trace("created simple handler for " + this.name);
+ }
+ }
+
+ /**
+ * Constructor for string metadata keys which allows change of behavior
+ * for duplicate and empty entry values.
+ *
+ * @param uri the uri of the namespace of the element
+ * @param localName the local name of the element
+ * @param metadata the Tika metadata object to populate
+ * @param name the Tika metadata field key
+ * @param allowDuplicateValues add duplicate values to the Tika metadata
+ * @param allowEmptyValues add empty values to the Tika metadata
+ */
+ public ElementMetadataHandler(
+ String uri, String localName, Metadata metadata, String name, boolean allowDuplicateValues, boolean allowEmptyValues) {
+ super(metadata, name);
+ this.uri = uri;
+ this.localName = localName;
+ this.metadata = metadata;
+ this.name = name;
+ this.allowDuplicateValues = allowDuplicateValues;
+ this.allowEmptyValues = allowEmptyValues;
+ if (logger.isTraceEnabled()) {
+ logger.trace("created simple handler for " + this.name);
+ }
+ }
+
+ /**
+ * Constructor for Property metadata keys.
+ *
+ * @param uri the uri of the namespace of the element
+ * @param localName the local name of the element
+ * @param metadata the Tika metadata object to populate
+ * @param targetProperty the Tika metadata Property key
+ */
+ public ElementMetadataHandler(
+ String uri, String localName, Metadata metadata, Property targetProperty) {
+ super(metadata, targetProperty);
+ this.uri = uri;
+ this.localName = localName;
+ this.metadata = metadata;
+ this.targetProperty = targetProperty;
+ this.name = targetProperty.getName();
+ this.allowDuplicateValues = false;
+ this.allowEmptyValues = false;
+ if (logger.isTraceEnabled()) {
+ logger.trace("created property handler for " + this.name);
+ }
+ }
+
+ /**
+ * Constructor for Property metadata keys which allows change of behavior
+ * for duplicate and empty entry values.
+ *
+ * @param uri the uri of the namespace of the element
+ * @param localName the local name of the element
+ * @param metadata the Tika metadata object to populate
+ * @param targetProperty the Tika metadata Property key
+ * @param allowDuplicateValues add duplicate values to the Tika metadata
+ * @param allowEmptyValues add empty values to the Tika metadata
+ */
+ public ElementMetadataHandler(
+ String uri, String localName, Metadata metadata, Property targetProperty, boolean allowDuplicateValues, boolean allowEmptyValues) {
+ super(metadata, targetProperty);
+ this.uri = uri;
+ this.localName = localName;
+ this.metadata = metadata;
+ this.targetProperty = targetProperty;
+ this.name = targetProperty.getName();
+ this.allowDuplicateValues = allowDuplicateValues;
+ this.allowEmptyValues = allowEmptyValues;
+ if (logger.isTraceEnabled()) {
+ logger.trace("created property handler for " + this.name);
+ }
+ }
+
+ protected boolean isMatchingParentElement(String uri, String localName) {
+ return (uri.equals(this.uri) && localName.equals(this.localName));
+ }
+
+ protected boolean isMatchingElement(String uri, String localName) {
+ // match if we're inside the parent element or within some bag element
+ return (uri.equals(this.uri) && localName.equals(this.localName)) ||
+ (parentMatchLevel > 0 &&
+ ((uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_BAG)) ||
+ (uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_LI))
+ )
+ );
+ }
+
+ @Override
+ public void startElement(
+ String uri, String localName, String name, Attributes attributes) {
+ if (isMatchingElement(uri, localName)) {
+ matchLevel++;
+ }
+ if (isMatchingParentElement(uri, localName)) {
+ parentMatchLevel++;
+ }
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String name) {
+ if (isMatchingParentElement(uri, localName)) {
+ parentMatchLevel--;
+ }
+ if (isMatchingElement(uri, localName)) {
+ matchLevel--;
+ if (matchLevel == 2) {
+ // we're inside a bag li element, add the bagged buffer
+ addMetadata(bufferBagged.toString().trim());
+ bufferBagged.setLength(0);
+ isBagless = false;
+ }
+ if (matchLevel == 0) { // the outermost matching element has just been closed
+ String valueBagless = bufferBagless.toString();
+ if (isBagless && valueBagless.length() > 0 && !valueBagless.contains(LOCAL_NAME_RDF_BAG)) {
+ // we're in a standard element, add the bagless buffer
+ addMetadata(valueBagless.trim());
+ }
+ bufferBagless.setLength(0); // always drop the captured text so it cannot leak into the next match
+ isBagless = true; // the next top-level match starts out as a standard (bagless) element
+ }
+ }
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length) {
+ // We need to append to both buffers since we don't know whether we're inside a bag until we're done
+ if (parentMatchLevel > 0 && matchLevel > 2) {
+ bufferBagged.append(ch, start, length);
+ }
+ if (parentMatchLevel > 0 && matchLevel > 0) {
+ bufferBagless.append(ch, start, length);
+ }
+ }
+
+ @Override
+ public void ignorableWhitespace(char[] ch, int start, int length) {
+ characters(ch, start, length);
+ }
+
+ @Override
+ protected void addMetadata(String value) {
+ if (logger.isTraceEnabled()) {
+ logger.trace("adding " + name + "=" + value);
+ }
+ if (targetProperty != null && targetProperty.isMultiValuePermitted()) {
+ if ((value != null && value.length() > 0) || allowEmptyValues) {
+ if (value == null || value.length() == 0 && allowEmptyValues) {
+ value = "";
+ }
+ String[] previous = metadata.getValues(name);
+ if (previous == null || !Arrays.asList(previous).contains(value) || allowDuplicateValues) {
+ metadata.add(targetProperty, value);
+ }
+ }
+ } else {
+ super.addMetadata(value);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java
index 3c58c9e..e79bbfc 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java
@@ -1,117 +1,117 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.xml;
-
-import org.apache.commons.codec.binary.Base64;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaMetadataKeys;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.ParseContext;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.util.Collections;
-import java.util.Set;
-
-public class FictionBookParser extends XMLParser {
- private static final long serialVersionUID = 4195954546491524374L;
-
- @Override
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return Collections.singleton(MediaType.application("x-fictionbook+xml"));
- }
-
- @Override
- protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, ParseContext context) {
- EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
-
- if (ex == null) {
- ex = new ParsingEmbeddedDocumentExtractor(context);
- }
-
- return new BinaryElementsDataHandler(ex, handler);
- }
-
- private static class BinaryElementsDataHandler extends DefaultHandler {
- private static final String ELEMENT_BINARY = "binary";
-
- private boolean binaryMode = false;
- private static final String ATTRIBUTE_ID = "id";
-
- private final EmbeddedDocumentExtractor partExtractor;
- private final ContentHandler handler;
- private final StringBuilder binaryData = new StringBuilder();
- private Metadata metadata;
- private static final String ATTRIBUTE_CONTENT_TYPE = "content-type";
-
- private BinaryElementsDataHandler(EmbeddedDocumentExtractor partExtractor, ContentHandler handler) {
- this.partExtractor = partExtractor;
- this.handler = handler;
- }
-
- @Override
- public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
- binaryMode = ELEMENT_BINARY.equals(localName);
- if (binaryMode) {
- binaryData.setLength(0);
- metadata = new Metadata();
-
- metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, attributes.getValue(ATTRIBUTE_ID));
- metadata.set(Metadata.CONTENT_TYPE, attributes.getValue(ATTRIBUTE_CONTENT_TYPE));
- }
- }
-
- @Override
- public void endElement(String uri, String localName, String qName) throws SAXException {
- if (binaryMode) {
- try {
- partExtractor.parseEmbedded(
- new ByteArrayInputStream(Base64.decodeBase64(binaryData.toString())),
- handler,
- metadata,
- true
- );
- } catch (IOException e) {
- throw new SAXException("IOException in parseEmbedded", e);
- }
-
- binaryMode = false;
- binaryData.setLength(0);
- }
- }
-
- @Override
- public void characters(char[] ch, int start, int length) throws SAXException {
- if (!binaryMode) {
- handler.characters(ch, start, length);
- } else {
- binaryData.append(ch, start, length);
- }
- }
-
- @Override
- public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
- handler.ignorableWhitespace(ch, start, length);
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.commons.codec.binary.Base64;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Set;
+
+public class FictionBookParser extends XMLParser {
+ private static final long serialVersionUID = 4195954546491524374L;
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return Collections.singleton(MediaType.application("x-fictionbook+xml"));
+ }
+
+ @Override
+ protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, ParseContext context) {
+ EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
+
+ if (ex == null) {
+ ex = new ParsingEmbeddedDocumentExtractor(context);
+ }
+
+ return new BinaryElementsDataHandler(ex, handler);
+ }
+
+ private static class BinaryElementsDataHandler extends DefaultHandler {
+ private static final String ELEMENT_BINARY = "binary";
+
+ private boolean binaryMode = false;
+ private static final String ATTRIBUTE_ID = "id";
+
+ private final EmbeddedDocumentExtractor partExtractor;
+ private final ContentHandler handler;
+ private final StringBuilder binaryData = new StringBuilder();
+ private Metadata metadata;
+ private static final String ATTRIBUTE_CONTENT_TYPE = "content-type";
+
+ private BinaryElementsDataHandler(EmbeddedDocumentExtractor partExtractor, ContentHandler handler) {
+ this.partExtractor = partExtractor;
+ this.handler = handler;
+ }
+
+ @Override
+ public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
+ binaryMode = ELEMENT_BINARY.equals(localName);
+ if (binaryMode) {
+ binaryData.setLength(0);
+ metadata = new Metadata();
+
+ metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, attributes.getValue(ATTRIBUTE_ID));
+ metadata.set(Metadata.CONTENT_TYPE, attributes.getValue(ATTRIBUTE_CONTENT_TYPE));
+ }
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String qName) throws SAXException {
+ if (binaryMode) {
+ try {
+ partExtractor.parseEmbedded(
+ new ByteArrayInputStream(Base64.decodeBase64(binaryData.toString())),
+ handler,
+ metadata,
+ true
+ );
+ } catch (IOException e) {
+ throw new SAXException("IOException in parseEmbedded", e);
+ }
+
+ binaryMode = false;
+ binaryData.setLength(0);
+ }
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length) throws SAXException {
+ if (!binaryMode) {
+ handler.characters(ch, start, length);
+ } else {
+ binaryData.append(ch, start, length);
+ }
+ }
+
+ @Override
+ public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+ handler.ignorableWhitespace(ch, start, length);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
index edda097..3fee00a 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
@@ -1,85 +1,85 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.xml;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.xml.sax.Attributes;
-import org.xml.sax.helpers.DefaultHandler;
-
-/**
- * This adds Metadata entries with a specified name for
- * the textual content of a node (if present), and
- * all attribute values passed through the matcher
- * (but not their names).
- *
- * @deprecated Use the {@link AttributeMetadataHandler} and
- * {@link ElementMetadataHandler} classes instead
- */
-public class MetadataHandler extends DefaultHandler {
-
- private final Metadata metadata;
-
- private final Property property;
- private final String name;
-
- private final StringBuilder buffer = new StringBuilder();
-
- public MetadataHandler(Metadata metadata, String name) {
- this.metadata = metadata;
- this.property = null;
- this.name = name;
- }
- public MetadataHandler(Metadata metadata, Property property) {
- this.metadata = metadata;
- this.property = property;
- this.name = property.getName();
- }
-
- public void addMetadata(String value) {
- if (value.length() > 0) {
- String previous = metadata.get(name);
- if (previous != null && previous.length() > 0) {
- value = previous + ", " + value;
- }
-
- if (this.property != null) {
- metadata.set(property, value);
- } else {
- metadata.set(name, value);
- }
- }
- }
-
- public void endElement(String uri, String localName, String name) {
- addMetadata(buffer.toString());
- buffer.setLength(0);
- }
-
- public void startElement(
- String uri, String localName, String name, Attributes attributes) {
- for (int i = 0; i < attributes.getLength(); i++) {
- addMetadata(attributes.getValue(i));
- }
- }
-
-
- public void characters(char[] ch, int start, int length) {
- buffer.append(ch, start, length);
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.xml.sax.Attributes;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * This adds Metadata entries with a specified name for
+ * the textual content of a node (if present), and
+ * all attribute values passed through the matcher
+ * (but not their names).
+ *
+ * @deprecated Use the {@link AttributeMetadataHandler} and
+ * {@link ElementMetadataHandler} classes instead
+ */
+public class MetadataHandler extends DefaultHandler {
+
+ private final Metadata metadata;
+
+ private final Property property;
+ private final String name;
+
+ private final StringBuilder buffer = new StringBuilder();
+
+ public MetadataHandler(Metadata metadata, String name) {
+ this.metadata = metadata;
+ this.property = null;
+ this.name = name;
+ }
+ public MetadataHandler(Metadata metadata, Property property) {
+ this.metadata = metadata;
+ this.property = property;
+ this.name = property.getName();
+ }
+
+ public void addMetadata(String value) {
+ if (value.length() > 0) {
+ String previous = metadata.get(name);
+ if (previous != null && previous.length() > 0) {
+ value = previous + ", " + value;
+ }
+
+ if (this.property != null) {
+ metadata.set(property, value);
+ } else {
+ metadata.set(name, value);
+ }
+ }
+ }
+
+ public void endElement(String uri, String localName, String name) {
+ addMetadata(buffer.toString());
+ buffer.setLength(0);
+ }
+
+ public void startElement(
+ String uri, String localName, String name, Attributes attributes) {
+ for (int i = 0; i < attributes.getLength(); i++) {
+ addMetadata(attributes.getValue(i));
+ }
+ }
+
+
+ public void characters(char[] ch, int start, int length) {
+ buffer.append(ch, start, length);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/XMLParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/XMLParser.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/XMLParser.java
index 6e3d374..b17058d 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/XMLParser.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/XMLParser.java
@@ -1,89 +1,89 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.xml;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Set;
-
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.EmbeddedContentHandler;
-import org.apache.tika.sax.OfflineContentHandler;
-import org.apache.tika.sax.TaggedContentHandler;
-import org.apache.tika.sax.TextContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * XML parser.
- */
-public class XMLParser extends AbstractParser {
-
- /** Serial version UID */
- private static final long serialVersionUID = -6028836725280212837L;
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- MediaType.application("xml"),
- MediaType.image("svg+xml"))));
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- if (metadata.get(Metadata.CONTENT_TYPE) == null) {
- metadata.set(Metadata.CONTENT_TYPE, "application/xml");
- }
-
- final XHTMLContentHandler xhtml =
- new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
- xhtml.startElement("p");
-
- TaggedContentHandler tagged = new TaggedContentHandler(handler);
- try {
- context.getSAXParser().parse(
- new CloseShieldInputStream(stream),
- new OfflineContentHandler(new EmbeddedContentHandler(
- getContentHandler(tagged, metadata, context))));
- } catch (SAXException e) {
- tagged.throwIfCauseOf(e);
- throw new TikaException("XML parse error", e);
- } finally {
- xhtml.endElement("p");
- xhtml.endDocument();
- }
- }
-
- protected ContentHandler getContentHandler(
- ContentHandler handler, Metadata metadata, ParseContext context) {
- return new TextContentHandler(handler, true);
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.TaggedContentHandler;
+import org.apache.tika.sax.TextContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * XML parser.
+ */
+public class XMLParser extends AbstractParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = -6028836725280212837L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.application("xml"),
+ MediaType.image("svg+xml"))));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ if (metadata.get(Metadata.CONTENT_TYPE) == null) {
+ metadata.set(Metadata.CONTENT_TYPE, "application/xml");
+ }
+
+ final XHTMLContentHandler xhtml =
+ new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.startElement("p");
+
+ TaggedContentHandler tagged = new TaggedContentHandler(handler);
+ try {
+ context.getSAXParser().parse(
+ new CloseShieldInputStream(stream),
+ new OfflineContentHandler(new EmbeddedContentHandler(
+ getContentHandler(tagged, metadata, context))));
+ } catch (SAXException e) {
+ tagged.throwIfCauseOf(e);
+ throw new TikaException("XML parse error", e);
+ } finally {
+ xhtml.endElement("p");
+ xhtml.endDocument();
+ }
+ }
+
+ protected ContentHandler getContentHandler(
+ ContentHandler handler, Metadata metadata, ParseContext context) {
+ return new TextContentHandler(handler, true);
+ }
+}
[07/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java
index 2ccab7b..87f831b 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java
@@ -1,1353 +1,1353 @@
-/*
- ****************************************************************************
- * Copyright (C) 2005-2009, International Business Machines Corporation and *
- * others. All Rights Reserved. *
- ************************************************************************** *
- *
- */
-package org.apache.tika.parser.txt;
-
-import java.nio.ByteBuffer;
-
-/**
- * This class recognizes single-byte encodings. Because the encoding scheme is so
- * simple, language statistics are used to do the matching.
- * <p/>
- * The Recognizer works by first mapping from bytes in the encoding under test
- * into that Recognizer's ngram space. Normally this means performing a
- * lowercase, and excluding codepoints that don't correspond to numbers of
- * letters. (Accented letters may or may not be ignored or normalised, depending
- * on the needs of the ngrams)
- * Then, ngram analysis is run against the transformed text, and a confidence
- * is calculated.
- * <p/>
- * For many of our Recognizers, we have one ngram set per language in each
- * encoding, and do a simultanious language+charset detection.
- * <p/>
- * When adding new Recognizers, the easiest way is to byte map to an existing
- * encoding for which we have ngrams, excluding non text, and re-use the ngrams.
- *
- * @internal
- */
-abstract class CharsetRecog_sbcs extends CharsetRecognizer {
-
- protected boolean haveC1Bytes = false;
-
- /* (non-Javadoc)
- * @see com.ibm.icu.text.CharsetRecognizer#getName()
- */
- abstract String getName();
-
- /* (non-Javadoc)
- * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
- */
- abstract int match(CharsetDetector det);
-
- int match(CharsetDetector det, int[] ngrams, byte[] byteMap) {
- return match(det, ngrams, byteMap, (byte) 0x20);
- }
-
- int match(CharsetDetector det, int[] ngrams, byte[] byteMap, byte spaceChar) {
- NGramParser parser = new NGramParser(ngrams, byteMap);
-
- haveC1Bytes = det.fC1Bytes;
-
- return parser.parse(det, spaceChar);
- }
-
- static class NGramParser {
- // private static final int N_GRAM_SIZE = 3;
- private static final int N_GRAM_MASK = 0xFFFFFF;
-
- private int byteIndex = 0;
- private int ngram = 0;
-
- private int[] ngramList;
- private byte[] byteMap;
-
- private int ngramCount;
- private int hitCount;
-
- private byte spaceChar;
-
- public NGramParser(int[] theNgramList, byte[] theByteMap) {
- ngramList = theNgramList;
- byteMap = theByteMap;
-
- ngram = 0;
-
- ngramCount = hitCount = 0;
- }
-
- /*
- * Binary search for value in table, which must have exactly 64 entries.
- */
- private static int search(int[] table, int value) {
- int index = 0;
-
- if (table[index + 32] <= value) {
- index += 32;
- }
-
- if (table[index + 16] <= value) {
- index += 16;
- }
-
- if (table[index + 8] <= value) {
- index += 8;
- }
-
- if (table[index + 4] <= value) {
- index += 4;
- }
-
- if (table[index + 2] <= value) {
- index += 2;
- }
-
- if (table[index + 1] <= value) {
- index += 1;
- }
-
- if (table[index] > value) {
- index -= 1;
- }
-
- if (index < 0 || table[index] != value) {
- return -1;
- }
-
- return index;
- }
-
- private void lookup(int thisNgram) {
- ngramCount += 1;
-
- if (search(ngramList, thisNgram) >= 0) {
- hitCount += 1;
- }
-
- }
-
- private void addByte(int b) {
- ngram = ((ngram << 8) + (b & 0xFF)) & N_GRAM_MASK;
- lookup(ngram);
- }
-
- private int nextByte(CharsetDetector det) {
- if (byteIndex >= det.fInputLen) {
- return -1;
- }
-
- return det.fInputBytes[byteIndex++] & 0xFF;
- }
-
- public int parse(CharsetDetector det) {
- return parse(det, (byte) 0x20);
- }
-
- public int parse(CharsetDetector det, byte spaceCh) {
- int b;
- boolean ignoreSpace = false;
- this.spaceChar = spaceCh;
-
- while ((b = nextByte(det)) >= 0) {
- byte mb = byteMap[b];
-
- // TODO: 0x20 might not be a space in all character sets...
- if (mb != 0) {
- if (!(mb == spaceChar && ignoreSpace)) {
- addByte(mb);
- }
-
- ignoreSpace = (mb == spaceChar);
- } else if (mb == 0 && b != 0) {
- // Indicates an invalid character in the charset
- // Bump the ngram count up a bit to indicate uncertainty
- ngramCount += 4;
- }
- }
-
- // TODO: Is this OK? The buffer could have ended in the middle of a word...
- addByte(spaceChar);
-
- double rawPercent = (double) hitCount / (double) ngramCount;
-
-// if (rawPercent <= 2.0) {
-// return 0;
-// }
-
- // TODO - This is a bit of a hack to take care of a case
- // were we were getting a confidence of 135...
- if (rawPercent > 0.33) {
- return 98;
- }
-
- return (int) (rawPercent * 300.0);
- }
- }
-
- abstract static class CharsetRecog_8859_1 extends CharsetRecog_sbcs {
- protected static byte[] byteMap = {
-/* 0x00-0x07 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0x08-0x0f */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0x10-0x17 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0x18-0x1f */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0x20-0x27 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
-/* 0x28-0x2f */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0x30-0x37 */ (byte) 0x30, (byte) 0x31, (byte) 0x32, (byte) 0x33, (byte) 0x34, (byte) 0x35, (byte) 0x36, (byte) 0x37,
-/* 0x38-0x3f */ (byte) 0x38, (byte) 0x39, (byte) 0x40, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0x40-0x47 */ (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
-/* 0x48-0x4f */ (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-/* 0x50-0x57 */ (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
-/* 0x58-0x0f */ (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0x60-0x67 */ (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
-/* 0x68-0x6f */ (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-/* 0x70-0x77 */ (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
-/* 0x78-0x7f */ (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0x80-0x87 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0x88-0x8f */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0x90-0x97 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0x98-0x9f */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0xa0-0xa7 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0xa8-0xaf */ (byte) 0x20, (byte) 0x20, (byte) 0xAA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0xb0-0xb7 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xB5, (byte) 0x20, (byte) 0x20,
-/* 0xb8-0xbf */ (byte) 0x20, (byte) 0x20, (byte) 0xBA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0xc0-0xc7 */ (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
-/* 0xc8-0xcf */ (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
-/* 0xd0-0xd7 */ (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20,
-/* 0xd8-0xdf */ (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xDF,
-/* 0xe0-0xe7 */ (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
-/* 0xe8-0xef */ (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
-/* 0xf0-0xf7 */ (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20,
-/* 0xf8-0xff */ (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
- };
-
- public String getName() {
- return haveC1Bytes ? "windows-1252" : "ISO-8859-1";
- }
- }
-
- static class CharsetRecog_8859_1_da extends CharsetRecog_8859_1 {
- private static int[] ngrams = {
- 0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620,
- 0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320,
- 0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520,
- 0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572,
- };
-
- public String getLanguage() {
- return "da";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
-
- static class CharsetRecog_8859_1_de extends CharsetRecog_8859_1 {
- private static int[] ngrams = {
- 0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F,
- 0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220,
- 0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465,
- 0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572,
- };
-
- public String getLanguage() {
- return "de";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
-
- static class CharsetRecog_8859_1_en extends CharsetRecog_8859_1 {
- private static int[] ngrams = {
- 0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F,
- 0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74,
- 0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420,
- 0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320,
- };
-
- public String getLanguage() {
- return "en";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
-
- static class CharsetRecog_8859_1_es extends CharsetRecog_8859_1 {
- private static int[] ngrams = {
- 0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
- 0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C,
- 0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064,
- 0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20,
- };
-
- public String getLanguage() {
- return "es";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
-
- static class CharsetRecog_8859_1_fr extends CharsetRecog_8859_1 {
- private static int[] ngrams = {
- 0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61, 0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E,
- 0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20,
- 0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420, 0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420,
- 0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064, 0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220,
- };
-
- public String getLanguage() {
- return "fr";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
-
- static class CharsetRecog_8859_1_it extends CharsetRecog_8859_1 {
- private static int[] ngrams = {
- 0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E, 0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073,
- 0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064, 0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220,
- 0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20, 0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20,
- 0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572, 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F,
- };
-
- public String getLanguage() {
- return "it";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
-
- static class CharsetRecog_8859_1_nl extends CharsetRecog_8859_1 {
- private static int[] ngrams = {
- 0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665,
- 0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E,
- 0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F,
- 0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F,
- };
-
- public String getLanguage() {
- return "nl";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
-
- static class CharsetRecog_8859_1_no extends CharsetRecog_8859_1 {
- private static int[] ngrams = {
- 0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469,
- 0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574, 0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474,
- 0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520, 0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65,
- 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465, 0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572,
- };
-
- public String getLanguage() {
- return "no";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
-
- static class CharsetRecog_8859_1_pt extends CharsetRecog_8859_1 {
- private static int[] ngrams = {
- 0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61, 0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
- 0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20, 0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20,
- 0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065,
- 0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F,
- };
-
- public String getLanguage() {
- return "pt";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
-
- static class CharsetRecog_8859_1_sv extends CharsetRecog_8859_1 {
- private static int[] ngrams = {
- 0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E, 0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469,
- 0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220,
- 0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564, 0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20,
- 0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073, 0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220,
- };
-
- public String getLanguage() {
- return "sv";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
-
- abstract static class CharsetRecog_8859_2 extends CharsetRecog_sbcs {
- protected static byte[] byteMap = {
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
- (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
- (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
- (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
- (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
- (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
- (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0xB1, (byte) 0x20, (byte) 0xB3, (byte) 0x20, (byte) 0xB5, (byte) 0xB6, (byte) 0x20,
- (byte) 0x20, (byte) 0xB9, (byte) 0xBA, (byte) 0xBB, (byte) 0xBC, (byte) 0x20, (byte) 0xBE, (byte) 0xBF,
- (byte) 0x20, (byte) 0xB1, (byte) 0x20, (byte) 0xB3, (byte) 0x20, (byte) 0xB5, (byte) 0xB6, (byte) 0xB7,
- (byte) 0x20, (byte) 0xB9, (byte) 0xBA, (byte) 0xBB, (byte) 0xBC, (byte) 0x20, (byte) 0xBE, (byte) 0xBF,
- (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
- (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
- (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20,
- (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xDF,
- (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
- (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
- (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20,
- (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0x20,
- };
-
- public String getName() {
- return haveC1Bytes ? "windows-1250" : "ISO-8859-2";
- }
- }
-
- static class CharsetRecog_8859_2_cs extends CharsetRecog_8859_2 {
- private static int[] ngrams = {
- 0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64, 0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F,
- 0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073, 0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465,
- 0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E, 0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865,
- 0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20, 0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564,
- };
-
- public String getLanguage() {
- return "cs";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
-
- static class CharsetRecog_8859_2_hu extends CharsetRecog_8859_2 {
- private static int[] ngrams = {
- 0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F, 0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69,
- 0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073, 0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20,
- 0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061,
- 0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74, 0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320,
- };
-
- public String getLanguage() {
- return "hu";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
-
- static class CharsetRecog_8859_2_pl extends CharsetRecog_8859_2 {
- private static int[] ngrams = {
- 0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61, 0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779,
- 0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79, 0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20,
- 0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920, 0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769,
- 0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69, 0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720,
- };
-
- public String getLanguage() {
- return "pl";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
-
- static class CharsetRecog_8859_2_ro extends CharsetRecog_8859_2 {
- private static int[] ngrams = {
- 0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469, 0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69,
- 0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172, 0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070,
- 0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72,
- 0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520, 0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20,
- };
-
- public String getLanguage() {
- return "ro";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
-
- abstract static class CharsetRecog_8859_5 extends CharsetRecog_sbcs {
- protected static byte[] byteMap = {
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
- (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
- (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
- (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
- (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
- (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
- (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
- (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0x20, (byte) 0xFE, (byte) 0xFF,
- (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7,
- (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF,
- (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
- (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
- (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7,
- (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF,
- (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
- (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
- (byte) 0x20, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
- (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0x20, (byte) 0xFE, (byte) 0xFF,
- };
-
- public String getName() {
- return "ISO-8859-5";
- }
- }
-
- static class CharsetRecog_8859_5_ru extends CharsetRecog_8859_5 {
- private static int[] ngrams = {
- 0x20D220, 0x20D2DE, 0x20D4DE, 0x20D7D0, 0x20D820, 0x20DAD0, 0x20DADE, 0x20DDD0, 0x20DDD5, 0x20DED1, 0x20DFDE, 0x20DFE0, 0x20E0D0, 0x20E1DE, 0x20E1E2, 0x20E2DE,
- 0x20E7E2, 0x20EDE2, 0xD0DDD8, 0xD0E2EC, 0xD3DE20, 0xD5DBEC, 0xD5DDD8, 0xD5E1E2, 0xD5E220, 0xD820DF, 0xD8D520, 0xD8D820, 0xD8EF20, 0xDBD5DD, 0xDBD820, 0xDBECDD,
- 0xDDD020, 0xDDD520, 0xDDD8D5, 0xDDD8EF, 0xDDDE20, 0xDDDED2, 0xDE20D2, 0xDE20DF, 0xDE20E1, 0xDED220, 0xDED2D0, 0xDED3DE, 0xDED920, 0xDEDBEC, 0xDEDC20, 0xDEE1E2,
- 0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2, 0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520,
- };
-
- public String getLanguage() {
- return "ru";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
-
- abstract static class CharsetRecog_8859_6 extends CharsetRecog_sbcs {
- protected static byte[] byteMap = {
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
- (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
- (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
- (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
- (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
- (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
- (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0xC1, (byte) 0xC2, (byte) 0xC3, (byte) 0xC4, (byte) 0xC5, (byte) 0xC6, (byte) 0xC7,
- (byte) 0xC8, (byte) 0xC9, (byte) 0xCA, (byte) 0xCB, (byte) 0xCC, (byte) 0xCD, (byte) 0xCE, (byte) 0xCF,
- (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7,
- (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
- (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- };
-
- public String getName() {
- return "ISO-8859-6";
- }
- }
-
- static class CharsetRecog_8859_6_ar extends CharsetRecog_8859_6 {
- private static int[] ngrams = {
- 0x20C7E4, 0x20C7E6, 0x20C8C7, 0x20D9E4, 0x20E1EA, 0x20E4E4, 0x20E5E6, 0x20E8C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E420, 0xC7E4C3, 0xC7E4C7, 0xC7E4C8,
- 0xC7E4CA, 0xC7E4CC, 0xC7E4CD, 0xC7E4CF, 0xC7E4D3, 0xC7E4D9, 0xC7E4E2, 0xC7E4E5, 0xC7E4E8, 0xC7E4EA, 0xC7E520, 0xC7E620, 0xC7E6CA, 0xC820C7, 0xC920C7, 0xC920E1,
- 0xC920E4, 0xC920E5, 0xC920E8, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xD920C7, 0xD9E4E9, 0xE1EA20, 0xE420C7, 0xE4C920, 0xE4E920, 0xE4EA20,
- 0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4, 0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620,
- };
-
- public String getLanguage() {
- return "ar";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
-
- abstract static class CharsetRecog_8859_7 extends CharsetRecog_sbcs {
- protected static byte[] byteMap = {
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
- (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
- (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
- (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
- (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
- (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
- (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0xA1, (byte) 0xA2, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xDC, (byte) 0x20,
- (byte) 0xDD, (byte) 0xDE, (byte) 0xDF, (byte) 0x20, (byte) 0xFC, (byte) 0x20, (byte) 0xFD, (byte) 0xFE,
- (byte) 0xC0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
- (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
- (byte) 0xF0, (byte) 0xF1, (byte) 0x20, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
- (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF,
- (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
- (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
- (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
- (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0x20,
- };
-
- public String getName() {
- return haveC1Bytes ? "windows-1253" : "ISO-8859-7";
- }
- }
-
- static class CharsetRecog_8859_7_el extends CharsetRecog_8859_7 {
- private static int[] ngrams = {
- 0x20E1ED, 0x20E1F0, 0x20E3E9, 0x20E4E9, 0x20E5F0, 0x20E720, 0x20EAE1, 0x20ECE5, 0x20EDE1, 0x20EF20, 0x20F0E1, 0x20F0EF, 0x20F0F1, 0x20F3F4, 0x20F3F5, 0x20F4E7,
- 0x20F4EF, 0xDFE120, 0xE120E1, 0xE120F4, 0xE1E920, 0xE1ED20, 0xE1F0FC, 0xE1F220, 0xE3E9E1, 0xE5E920, 0xE5F220, 0xE720F4, 0xE7ED20, 0xE7F220, 0xE920F4, 0xE9E120,
- 0xE9EADE, 0xE9F220, 0xEAE1E9, 0xEAE1F4, 0xECE520, 0xED20E1, 0xED20E5, 0xED20F0, 0xEDE120, 0xEFF220, 0xEFF520, 0xF0EFF5, 0xF0F1EF, 0xF0FC20, 0xF220E1, 0xF220E5,
- 0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9, 0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20,
- };
-
- public String getLanguage() {
- return "el";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
-
- abstract static class CharsetRecog_8859_8 extends CharsetRecog_sbcs {
- protected static byte[] byteMap = {
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
- (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
- (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
- (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
- (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
- (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
- (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xB5, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
- (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
- (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
- (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- };
-
- public String getName() {
- return haveC1Bytes ? "windows-1255" : "ISO-8859-8";
- }
- }
-
- static class CharsetRecog_8859_8_I_he extends CharsetRecog_8859_8 {
- private static int[] ngrams = {
- 0x20E0E5, 0x20E0E7, 0x20E0E9, 0x20E0FA, 0x20E1E9, 0x20E1EE, 0x20E4E0, 0x20E4E5, 0x20E4E9, 0x20E4EE, 0x20E4F2, 0x20E4F9, 0x20E4FA, 0x20ECE0, 0x20ECE4, 0x20EEE0,
- 0x20F2EC, 0x20F9EC, 0xE0FA20, 0xE420E0, 0xE420E1, 0xE420E4, 0xE420EC, 0xE420EE, 0xE420F9, 0xE4E5E0, 0xE5E020, 0xE5ED20, 0xE5EF20, 0xE5F820, 0xE5FA20, 0xE920E4,
- 0xE9E420, 0xE9E5FA, 0xE9E9ED, 0xE9ED20, 0xE9EF20, 0xE9F820, 0xE9FA20, 0xEC20E0, 0xEC20E4, 0xECE020, 0xECE420, 0xED20E0, 0xED20E1, 0xED20E4, 0xED20EC, 0xED20EE,
- 0xED20F9, 0xEEE420, 0xEF20E4, 0xF0E420, 0xF0E920, 0xF0E9ED, 0xF2EC20, 0xF820E4, 0xF8E9ED, 0xF9EC20, 0xFA20E0, 0xFA20E1, 0xFA20E4, 0xFA20EC, 0xFA20EE, 0xFA20F9,
- };
-
- public String getName() {
- return haveC1Bytes ? "windows-1255" : /*"ISO-8859-8-I"*/ "ISO-8859-8";
- }
-
- public String getLanguage() {
- return "he";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
-
- static class CharsetRecog_8859_8_he extends CharsetRecog_8859_8 {
- private static int[] ngrams = {
- 0x20E0E5, 0x20E0EC, 0x20E4E9, 0x20E4EC, 0x20E4EE, 0x20E4F0, 0x20E9F0, 0x20ECF2, 0x20ECF9, 0x20EDE5, 0x20EDE9, 0x20EFE5, 0x20EFE9, 0x20F8E5, 0x20F8E9, 0x20FAE0,
- 0x20FAE5, 0x20FAE9, 0xE020E4, 0xE020EC, 0xE020ED, 0xE020FA, 0xE0E420, 0xE0E5E4, 0xE0EC20, 0xE0EE20, 0xE120E4, 0xE120ED, 0xE120FA, 0xE420E4, 0xE420E9, 0xE420EC,
- 0xE420ED, 0xE420EF, 0xE420F8, 0xE420FA, 0xE4EC20, 0xE5E020, 0xE5E420, 0xE7E020, 0xE9E020, 0xE9E120, 0xE9E420, 0xEC20E4, 0xEC20ED, 0xEC20FA, 0xECF220, 0xECF920,
- 0xEDE9E9, 0xEDE9F0, 0xEDE9F8, 0xEE20E4, 0xEE20ED, 0xEE20FA, 0xEEE120, 0xEEE420, 0xF2E420, 0xF920E4, 0xF920ED, 0xF920FA, 0xF9E420, 0xFAE020, 0xFAE420, 0xFAE5E9,
- };
-
- public String getLanguage() {
- return "he";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
-
- abstract static class CharsetRecog_8859_9 extends CharsetRecog_sbcs {
- protected static byte[] byteMap = {
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
- (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
- (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
- (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
- (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
- (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
- (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0xAA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xB5, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0xBA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
- (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
- (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20,
- (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0x69, (byte) 0xFE, (byte) 0xDF,
- (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
- (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
- (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20,
- (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
- };
-
- public String getName() {
- return haveC1Bytes ? "windows-1254" : "ISO-8859-9";
- }
- }
-
- static class CharsetRecog_8859_9_tr extends CharsetRecog_8859_9 {
- private static int[] ngrams = {
- 0x206261, 0x206269, 0x206275, 0x206461, 0x206465, 0x206765, 0x206861, 0x20696C, 0x206B61, 0x206B6F, 0x206D61, 0x206F6C, 0x207361, 0x207461, 0x207665, 0x207961,
- 0x612062, 0x616B20, 0x616C61, 0x616D61, 0x616E20, 0x616EFD, 0x617220, 0x617261, 0x6172FD, 0x6173FD, 0x617961, 0x626972, 0x646120, 0x646520, 0x646920, 0x652062,
- 0x65206B, 0x656469, 0x656E20, 0x657220, 0x657269, 0x657369, 0x696C65, 0x696E20, 0x696E69, 0x697220, 0x6C616E, 0x6C6172, 0x6C6520, 0x6C6572, 0x6E2061, 0x6E2062,
- 0x6E206B, 0x6E6461, 0x6E6465, 0x6E6520, 0x6E6920, 0x6E696E, 0x6EFD20, 0x72696E, 0x72FD6E, 0x766520, 0x796120, 0x796F72, 0xFD6E20, 0xFD6E64, 0xFD6EFD, 0xFDF0FD,
- };
-
- public String getLanguage() {
- return "tr";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
-
- static class CharsetRecog_windows_1251 extends CharsetRecog_sbcs {
- private static int[] ngrams = {
- 0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE,
- 0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED,
- 0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2,
- 0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520,
- };
-
- private static byte[] byteMap = {
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
- (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
- (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
- (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
- (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
- (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
- (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x90, (byte) 0x83, (byte) 0x20, (byte) 0x83, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x9A, (byte) 0x20, (byte) 0x9C, (byte) 0x9D, (byte) 0x9E, (byte) 0x9F,
- (byte) 0x90, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x9A, (byte) 0x20, (byte) 0x9C, (byte) 0x9D, (byte) 0x9E, (byte) 0x9F,
- (byte) 0x20, (byte) 0xA2, (byte) 0xA2, (byte) 0xBC, (byte) 0x20, (byte) 0xB4, (byte) 0x20, (byte) 0x20,
- (byte) 0xB8, (byte) 0x20, (byte) 0xBA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xBF,
- (byte) 0x20, (byte) 0x20, (byte) 0xB3, (byte) 0xB3, (byte) 0xB4, (byte) 0xB5, (byte) 0x20, (byte) 0x20,
- (byte) 0xB8, (byte) 0x20, (byte) 0xBA, (byte) 0x20, (byte) 0xBC, (byte) 0xBE, (byte) 0xBE, (byte) 0xBF,
- (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
- (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
- (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
- (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
- (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
- (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
- (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
- (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
- };
-
- public String getName() {
- return "windows-1251";
- }
-
- public String getLanguage() {
- return "ru";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
-
- static class CharsetRecog_IBM866_ru extends CharsetRecog_sbcs {
- private static int[] ngrams = {
- 0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE,
- 0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED,
- 0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2,
- 0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520,
- };
-
- // bytemap converts cp866 chars to cp1251 chars, so ngrams are still unchanged
- private static byte[] byteMap = {
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
- (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
- (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
- (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
- (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
- (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
- (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
- (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
- (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
- (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
- (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
- (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
- (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
- (byte) 0xB8, (byte) 0xB8, (byte) 0xBA, (byte) 0xBA, (byte) 0xBF, (byte) 0xBF, (byte) 0xA2, (byte) 0xA2,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- };
-
- public String getName() {
- return "IBM866";
- }
-
- public String getLanguage() {
- return "ru";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
-
- static class CharsetRecog_windows_1256 extends CharsetRecog_sbcs {
- private static int[] ngrams = {
- 0x20C7E1, 0x20C7E4, 0x20C8C7, 0x20DAE1, 0x20DDED, 0x20E1E1, 0x20E3E4, 0x20E6C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E120, 0xC7E1C3, 0xC7E1C7, 0xC7E1C8,
- 0xC7E1CA, 0xC7E1CC, 0xC7E1CD, 0xC7E1CF, 0xC7E1D3, 0xC7E1DA, 0xC7E1DE, 0xC7E1E3, 0xC7E1E6, 0xC7E1ED, 0xC7E320, 0xC7E420, 0xC7E4CA, 0xC820C7, 0xC920C7, 0xC920DD,
- 0xC920E1, 0xC920E3, 0xC920E6, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xDA20C7, 0xDAE1EC, 0xDDED20, 0xE120C7, 0xE1C920, 0xE1EC20, 0xE1ED20,
- 0xE320C7, 0xE3C720, 0xE3C920, 0xE3E420, 0xE420C7, 0xE520C7, 0xE5C720, 0xE6C7E1, 0xE6E420, 0xEC20C7, 0xED20C7, 0xED20E3, 0xED20E6, 0xEDC920, 0xEDD120, 0xEDE420,
- };
-
- private static byte[] byteMap = {
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
- (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
- (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
- (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
- (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
- (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
- (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x81, (byte) 0x20, (byte) 0x83, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x88, (byte) 0x20, (byte) 0x8A, (byte) 0x20, (byte) 0x9C, (byte) 0x8D, (byte) 0x8E, (byte) 0x8F,
- (byte) 0x90, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x98, (byte) 0x20, (byte) 0x9A, (byte) 0x20, (byte) 0x9C, (byte) 0x20, (byte) 0x20, (byte) 0x9F,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0xAA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xB5, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0xC0, (byte) 0xC1, (byte) 0xC2, (byte) 0xC3, (byte) 0xC4, (byte) 0xC5, (byte) 0xC6, (byte) 0xC7,
- (byte) 0xC8, (byte) 0xC9, (byte) 0xCA, (byte) 0xCB, (byte) 0xCC, (byte) 0xCD, (byte) 0xCE, (byte) 0xCF,
- (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0x20,
- (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF,
- (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
- (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xF4, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0xF9, (byte) 0x20, (byte) 0xFB, (byte) 0xFC, (byte) 0x20, (byte) 0x20, (byte) 0xFF,
- };
-
- public String getName() {
- return "windows-1256";
- }
-
- public String getLanguage() {
- return "ar";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
-
- static class CharsetRecog_KOI8_R extends CharsetRecog_sbcs {
- private static int[] ngrams = {
- 0x20C4CF, 0x20C920, 0x20CBC1, 0x20CBCF, 0x20CEC1, 0x20CEC5, 0x20CFC2, 0x20D0CF, 0x20D0D2, 0x20D2C1, 0x20D3CF, 0x20D3D4, 0x20D4CF, 0x20D720, 0x20D7CF, 0x20DAC1,
- 0x20DCD4, 0x20DED4, 0xC1CEC9, 0xC1D4D8, 0xC5CCD8, 0xC5CEC9, 0xC5D3D4, 0xC5D420, 0xC7CF20, 0xC920D0, 0xC9C520, 0xC9C920, 0xC9D120, 0xCCC5CE, 0xCCC920, 0xCCD8CE,
- 0xCEC120, 0xCEC520, 0xCEC9C5, 0xCEC9D1, 0xCECF20, 0xCECFD7, 0xCF20D0, 0xCF20D3, 0xCF20D7, 0xCFC7CF, 0xCFCA20, 0xCFCCD8, 0xCFCD20, 0xCFD3D4, 0xCFD720, 0xCFD7C1,
- 0xD0CFCC, 0xD0D2C5, 0xD0D2C9, 0xD0D2CF, 0xD2C1D7, 0xD2C5C4, 0xD3D120, 0xD3D4C1, 0xD3D4C9, 0xD3D4D7, 0xD4C5CC, 0xD4CF20, 0xD4CFD2, 0xD4D820, 0xD9C820, 0xDED4CF,
- };
-
- private static byte[] byteMap = {
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
- (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
- (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
- (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
- (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
- (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
- (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xA3, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xA3, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0xC0, (byte) 0xC1, (byte) 0xC2, (byte) 0xC3, (byte) 0xC4, (byte) 0xC5, (byte) 0xC6, (byte) 0xC7,
- (byte) 0xC8, (byte) 0xC9, (byte) 0xCA, (byte) 0xCB, (byte) 0xCC, (byte) 0xCD, (byte) 0xCE, (byte) 0xCF,
- (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7,
- (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF,
- (byte) 0xC0, (byte) 0xC1, (byte) 0xC2, (byte) 0xC3, (byte) 0xC4, (byte) 0xC5, (byte) 0xC6, (byte) 0xC7,
- (byte) 0xC8, (byte) 0xC9, (byte) 0xCA, (byte) 0xCB, (byte) 0xCC, (byte) 0xCD, (byte) 0xCE, (byte) 0xCF,
- (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7,
- (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF,
- };
-
- public String getName() {
- return "KOI8-R";
- }
-
- public String getLanguage() {
- return "ru";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
-
- abstract static class CharsetRecog_IBM424_he extends CharsetRecog_sbcs {
- protected static byte[] byteMap = {
-/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
-/* 0- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 1- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 2- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 3- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 4- */ (byte) 0x40, (byte) 0x41, (byte) 0x42, (byte) 0x43, (byte) 0x44, (byte) 0x45, (byte) 0x46, (byte) 0x47, (byte) 0x48, (byte) 0x49, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 5- */ (byte) 0x40, (byte) 0x51, (byte) 0x52, (byte) 0x53, (byte) 0x54, (byte) 0x55, (byte) 0x56, (byte) 0x57, (byte) 0x58, (byte) 0x59, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 6- */ (byte) 0x40, (byte) 0x40, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, (byte) 0x68, (byte) 0x69, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 7- */ (byte) 0x40, (byte) 0x71, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x00, (byte) 0x40, (byte) 0x40,
-/* 8- */ (byte) 0x40, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 9- */ (byte) 0x40, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* A- */ (byte) 0xA0, (byte) 0x40, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* B- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* C- */ (byte) 0x40, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* D- */ (byte) 0x40, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* E- */ (byte) 0x40, (byte) 0x40, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* F- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
- };
-
- public String getLanguage() {
- return "he";
- }
- }
-
- static class CharsetRecog_IBM424_he_rtl extends CharsetRecog_IBM424_he {
- private static int[] ngrams = {
- 0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641,
- 0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045,
- 0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056,
- 0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069,
- };
-
- public String getName() {
- return "IBM424_rtl";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap, (byte) 0x40);
- }
- }
-
- static class CharsetRecog_IBM424_he_ltr extends CharsetRecog_IBM424_he {
- private static int[] ngrams = {
- 0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x405462, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x407141,
- 0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x414645, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x454054,
- 0x454055, 0x454057, 0x454068, 0x454071, 0x455440, 0x464140, 0x464540, 0x484140, 0x514140, 0x514240, 0x514540, 0x544045, 0x544055, 0x544071, 0x546240, 0x546940,
- 0x555151, 0x555158, 0x555168, 0x564045, 0x564055, 0x564071, 0x564240, 0x564540, 0x624540, 0x694045, 0x694055, 0x694071, 0x694540, 0x714140, 0x714540, 0x714651
-
- };
-
- public String getName() {
- return "IBM424_ltr";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap, (byte) 0x40);
- }
- }
-
- abstract static class CharsetRecog_IBM420_ar extends CharsetRecog_sbcs {
- protected static byte[] byteMap = {
-/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
-/* 0- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 1- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 2- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 3- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 4- */ (byte) 0x40, (byte) 0x40, (byte) 0x42, (byte) 0x43, (byte) 0x44, (byte) 0x45, (byte) 0x46, (byte) 0x47, (byte) 0x48, (byte) 0x49, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 5- */ (byte) 0x40, (byte) 0x51, (byte) 0x52, (byte) 0x40, (byte) 0x40, (byte) 0x55, (byte) 0x56, (byte) 0x57, (byte) 0x58, (byte) 0x59, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 6- */ (byte) 0x40, (byte) 0x40, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, (byte) 0x68, (byte) 0x69, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 7- */ (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, (byte) 0x78, (byte) 0x79, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 8- */ (byte) 0x80, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x8A, (byte) 0x8B, (byte) 0x8C, (byte) 0x8D, (byte) 0x8E, (byte) 0x8F,
-/* 9- */ (byte) 0x90, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0x9A, (byte) 0x9B, (byte) 0x9C, (byte) 0x9D, (byte) 0x9E, (byte) 0x9F,
-/* A- */ (byte) 0xA0, (byte) 0x40, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0xAA, (byte) 0xAB, (byte) 0xAC, (byte) 0xAD, (byte) 0xAE, (byte) 0xAF,
-/* B- */ (byte) 0xB0, (byte) 0xB1, (byte) 0xB2, (byte) 0xB3, (byte) 0xB4, (byte) 0xB5, (byte) 0x40, (byte) 0x40, (byte) 0xB8, (byte) 0xB9, (byte) 0xBA, (byte) 0xBB, (byte) 0xBC, (byte) 0xBD, (byte) 0xBE, (byte) 0xBF,
-/* C- */ (byte) 0x40, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x40, (byte) 0xCB, (byte) 0x40, (byte) 0xCD, (byte) 0x40, (byte) 0xCF,
-/* D- */ (byte) 0x40, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF,
-/* E- */ (byte) 0x40, (byte) 0x40, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0xEA, (byte) 0xEB, (byte) 0x40, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
-/* F- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0x40,
- };
- protected static byte[] unshapeMap = {
-/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
-/* 0- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 1- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 2- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 3- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 4- */ (byte) 0x40, (byte) 0x40, (byte) 0x42, (byte) 0x42, (byte) 0x44, (byte) 0x45, (byte) 0x46, (byte) 0x47, (byte) 0x47, (byte) 0x49, (byte) 0x4A, (byte) 0x4B, (byte) 0x4C, (byte) 0x4D, (byte) 0x4E, (byte) 0x4F,
-/* 5- */ (byte) 0x50, (byte) 0x49, (byte) 0x52, (byte) 0x53, (byte) 0x54, (byte) 0x55, (byte) 0x56, (byte) 0x56, (byte) 0x58, (byte) 0x58, (byte) 0x5A, (byte) 0x5B, (byte) 0x5C, (byte) 0x5D, (byte) 0x5E, (byte) 0x5F,
-/* 6- */ (byte) 0x60, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x63, (byte) 0x65, (byte) 0x65, (byte) 0x67, (byte) 0x67, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-/* 7- */ (byte) 0x69, (byte) 0x71, (byte) 0x71, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, (byte) 0x77, (byte) 0x79, (byte) 0x7A, (byte) 0x7B, (byte) 0x7C, (byte)
<TRUNCATED>
[05/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
index 6d1c99a..9d9a138 100644
--- a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
+++ b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
@@ -1,274 +1,274 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.txt;
-
-import static java.nio.charset.StandardCharsets.ISO_8859_1;
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
-
-import java.io.ByteArrayInputStream;
-import java.io.StringWriter;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.WriteOutContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.helpers.DefaultHandler;
-
-public class TXTParserTest extends TikaTest {
-
- private Parser parser = new TXTParser();
-
- @Test
- public void testEnglishText() throws Exception {
- String text =
- "Hello, World! This is simple UTF-8 text content written"
- + " in English to test autodetection of both the character"
- + " encoding and the language of the input stream.";
-
- Metadata metadata = new Metadata();
- StringWriter writer = new StringWriter();
- parser.parse(
- new ByteArrayInputStream(text.getBytes(ISO_8859_1)),
- new WriteOutContentHandler(writer),
- metadata,
- new ParseContext());
- String content = writer.toString();
-
- assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
-
- // TIKA-501: Remove language detection from TXTParser
- assertNull(metadata.get(Metadata.CONTENT_LANGUAGE));
- assertNull(metadata.get(TikaCoreProperties.LANGUAGE));
-
- assertContains("Hello", content);
- assertContains("World", content);
- assertContains("autodetection", content);
- assertContains("stream", content);
- }
-
- @Test
- public void testUTF8Text() throws Exception {
- String text = "I\u00F1t\u00EBrn\u00E2ti\u00F4n\u00E0liz\u00E6ti\u00F8n";
-
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
- parser.parse(
- new ByteArrayInputStream(text.getBytes(UTF_8)),
- handler, metadata, new ParseContext());
- assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
-
- assertContains(text, handler.toString());
- }
-
- @Test
- public void testEmptyText() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
- parser.parse(
- new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
- assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("\n", handler.toString());
- }
-
- /**
- * Test for the heuristics that we use to assign an eight-bit character
- * encoding to mostly ASCII sequences. If a more specific match can not
- * be made, a string with a CR(LF) in it is most probably windows-1252,
- * otherwise ISO-8859-1, except if it contains the currency/euro symbol
- * (byte 0xa4) in which case it's more likely to be ISO-8859-15.
- */
- @Test
- public void testLatinDetectionHeuristics() throws Exception {
- String windows = "test\r\n";
- String unix = "test\n";
- String euro = "test \u20ac\n";
-
- Metadata metadata;
-
- metadata = new Metadata();
- parser.parse(
- new ByteArrayInputStream(windows.getBytes("ISO-8859-15")),
- new DefaultHandler(), metadata, new ParseContext());
- assertEquals(
- "text/plain; charset=windows-1252",
- metadata.get(Metadata.CONTENT_TYPE));
-
- metadata = new Metadata();
- parser.parse(
- new ByteArrayInputStream(unix.getBytes("ISO-8859-15")),
- new DefaultHandler(), metadata, new ParseContext());
- assertEquals(
- "text/plain; charset=ISO-8859-1",
- metadata.get(Metadata.CONTENT_TYPE));
-
- metadata = new Metadata();
- parser.parse(
- new ByteArrayInputStream(euro.getBytes("ISO-8859-15")),
- new DefaultHandler(), metadata, new ParseContext());
- assertEquals(
- "text/plain; charset=ISO-8859-15",
- metadata.get(Metadata.CONTENT_TYPE));
- }
-
- /**
- * Test case for TIKA-240: Drop the BOM when extracting plain text
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-240">TIKA-240</a>
- */
- @Test
- public void testDropByteOrderMark() throws Exception {
- assertExtractText("UTF-8 BOM", "test", new byte[]{
- (byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 't', 'e', 's', 't'});
- assertExtractText("UTF-16 BE BOM", "test", new byte[]{
- (byte) 0xFE, (byte) 0xFF, 0, 't', 0, 'e', 0, 's', 0, 't'});
- assertExtractText("UTF-16 LE BOM", "test", new byte[]{
- (byte) 0xFF, (byte) 0xFE, 't', 0, 'e', 0, 's', 0, 't', 0});
- }
-
- /**
- * Test case for TIKA-335: using incoming charset
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a>
- */
- @Test
- public void testUseIncomingCharsetAsHint() throws Exception {
- // Could be ISO 8859-1 or ISO 8859-15 or ...
- // u00e1 is latin small letter a with acute
- final String test2 = "the name is \u00e1ndre";
-
- Metadata metadata = new Metadata();
- parser.parse(
- new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
- new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
-
- metadata.set(Metadata.CONTENT_TYPE, "text/plain; charset=ISO-8859-15");
- parser.parse(
- new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
- new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
- }
-
- /**
- * Test case for TIKA-341: using charset in content-type
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
- */
- @Test
- public void testUsingCharsetInContentTypeHeader() throws Exception {
- // Could be ISO 8859-1 or ISO 8859-15 or ...
- // u00e1 is latin small letter a with acute
- final String test2 = "the name is \u00e1ndre";
-
- Metadata metadata = new Metadata();
- parser.parse(
- new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
- new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
-
- metadata = new Metadata();
- metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-15");
- parser.parse(
- new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
- new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
- }
-
- private void assertExtractText(String msg, String expected, byte[] input)
- throws Exception {
- ContentHandler handler = new BodyContentHandler() {
- public void ignorableWhitespace(char[] ch, int off, int len) {
- // Ignore the whitespace added by XHTMLContentHandler
- }
- };
- Metadata metadata = new Metadata();
- parser.parse(new ByteArrayInputStream(input), handler, metadata, new ParseContext());
- assertEquals(msg, expected, handler.toString());
- }
-
- /**
- * Test case for TIKA-339: don't override incoming language
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a>
- */
- @Test
- public void testRetainIncomingLanguage() throws Exception {
- final String test = "Simple Content";
-
- Metadata metadata = new Metadata();
- metadata.set(TikaCoreProperties.LANGUAGE, "en");
-
- parser.parse(
- new ByteArrayInputStream(test.getBytes(UTF_8)),
- new BodyContentHandler(), metadata, new ParseContext());
-
- assertEquals("en", metadata.get(TikaCoreProperties.LANGUAGE));
- }
-
- @Test
- public void testCP866() throws Exception {
- XMLResult r = getXML("russian.cp866.txt", parser);
- assertEquals("text/plain; charset=IBM866", r.metadata.get(Metadata.CONTENT_TYPE));
- }
-
- @Test
- public void testEBCDIC_CP500() throws Exception {
- XMLResult r = getXML("english.cp500.txt", parser);
- assertEquals("text/plain; charset=IBM500", r.metadata.get(Metadata.CONTENT_TYPE));
-
- // Additional check that it isn't too eager on short blocks of text
- r = getXML(
- new ByteArrayInputStream("<html><body>hello world</body></html>".getBytes(ISO_8859_1)),
- parser, new Metadata());
-
- assertEquals("text/plain; charset=ISO-8859-1", r.metadata.get(Metadata.CONTENT_TYPE));
- }
-
- /**
- * Test case for TIKA-771: "Hello, World!" in UTF-8/ASCII gets detected as IBM500
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-771">TIKA-771</a>
- */
- @Test
- public void testCharsetDetectionWithShortSnipet() throws Exception {
- final String text = "Hello, World!";
- XMLResult r = getXML(
- new ByteArrayInputStream(text.getBytes(UTF_8)), parser, new Metadata());
- assertEquals("text/plain; charset=ISO-8859-1", r.metadata.get(Metadata.CONTENT_TYPE));
-
- // Now verify that if we tell the parser the encoding is UTF-8, that's what
- // we get back (see TIKA-868)
- r.metadata.set(Metadata.CONTENT_TYPE, "application/binary; charset=UTF-8");
- parser.parse(
- new ByteArrayInputStream(text.getBytes(UTF_8)),
- new BodyContentHandler(), r.metadata, new ParseContext());
- assertEquals("text/plain; charset=UTF-8", r.metadata.get(Metadata.CONTENT_TYPE));
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.txt;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+import java.io.ByteArrayInputStream;
+import java.io.StringWriter;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class TXTParserTest extends TikaTest {
+
+ private Parser parser = new TXTParser();
+
+ @Test
+ public void testEnglishText() throws Exception {
+ String text =
+ "Hello, World! This is simple UTF-8 text content written"
+ + " in English to test autodetection of both the character"
+ + " encoding and the language of the input stream.";
+
+ Metadata metadata = new Metadata();
+ StringWriter writer = new StringWriter();
+ parser.parse(
+ new ByteArrayInputStream(text.getBytes(ISO_8859_1)),
+ new WriteOutContentHandler(writer),
+ metadata,
+ new ParseContext());
+ String content = writer.toString();
+
+ assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
+
+ // TIKA-501: Remove language detection from TXTParser
+ assertNull(metadata.get(Metadata.CONTENT_LANGUAGE));
+ assertNull(metadata.get(TikaCoreProperties.LANGUAGE));
+
+ assertContains("Hello", content);
+ assertContains("World", content);
+ assertContains("autodetection", content);
+ assertContains("stream", content);
+ }
+
+ @Test
+ public void testUTF8Text() throws Exception {
+ String text = "I\u00F1t\u00EBrn\u00E2ti\u00F4n\u00E0liz\u00E6ti\u00F8n";
+
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ parser.parse(
+ new ByteArrayInputStream(text.getBytes(UTF_8)),
+ handler, metadata, new ParseContext());
+ assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
+
+ assertContains(text, handler.toString());
+ }
+
+ @Test
+ public void testEmptyText() throws Exception {
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ parser.parse(
+ new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
+ assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("\n", handler.toString());
+ }
+
+ /**
+ * Test for the heuristics that we use to assign an eight-bit character
+ * encoding to mostly ASCII sequences. If a more specific match can not
+ * be made, a string with a CR(LF) in it is most probably windows-1252,
+ * otherwise ISO-8859-1, except if it contains the currency/euro symbol
+ * (byte 0xa4) in which case it's more likely to be ISO-8859-15.
+ */
+ @Test
+ public void testLatinDetectionHeuristics() throws Exception {
+ String windows = "test\r\n";
+ String unix = "test\n";
+ String euro = "test \u20ac\n";
+
+ Metadata metadata;
+
+ metadata = new Metadata();
+ parser.parse(
+ new ByteArrayInputStream(windows.getBytes("ISO-8859-15")),
+ new DefaultHandler(), metadata, new ParseContext());
+ assertEquals(
+ "text/plain; charset=windows-1252",
+ metadata.get(Metadata.CONTENT_TYPE));
+
+ metadata = new Metadata();
+ parser.parse(
+ new ByteArrayInputStream(unix.getBytes("ISO-8859-15")),
+ new DefaultHandler(), metadata, new ParseContext());
+ assertEquals(
+ "text/plain; charset=ISO-8859-1",
+ metadata.get(Metadata.CONTENT_TYPE));
+
+ metadata = new Metadata();
+ parser.parse(
+ new ByteArrayInputStream(euro.getBytes("ISO-8859-15")),
+ new DefaultHandler(), metadata, new ParseContext());
+ assertEquals(
+ "text/plain; charset=ISO-8859-15",
+ metadata.get(Metadata.CONTENT_TYPE));
+ }
+
+ /**
+ * Test case for TIKA-240: Drop the BOM when extracting plain text
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-240">TIKA-240</a>
+ */
+ @Test
+ public void testDropByteOrderMark() throws Exception {
+ assertExtractText("UTF-8 BOM", "test", new byte[]{
+ (byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 't', 'e', 's', 't'});
+ assertExtractText("UTF-16 BE BOM", "test", new byte[]{
+ (byte) 0xFE, (byte) 0xFF, 0, 't', 0, 'e', 0, 's', 0, 't'});
+ assertExtractText("UTF-16 LE BOM", "test", new byte[]{
+ (byte) 0xFF, (byte) 0xFE, 't', 0, 'e', 0, 's', 0, 't', 0});
+ }
+
+ /**
+ * Test case for TIKA-335: using incoming charset
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a>
+ */
+ @Test
+ public void testUseIncomingCharsetAsHint() throws Exception {
+ // Could be ISO 8859-1 or ISO 8859-15 or ...
+ // u00e1 is latin small letter a with acute
+ final String test2 = "the name is \u00e1ndre";
+
+ Metadata metadata = new Metadata();
+ parser.parse(
+ new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
+ new BodyContentHandler(), metadata, new ParseContext());
+ assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
+
+ metadata.set(Metadata.CONTENT_TYPE, "text/plain; charset=ISO-8859-15");
+ parser.parse(
+ new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
+ new BodyContentHandler(), metadata, new ParseContext());
+ assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
+ }
+
+ /**
+ * Test case for TIKA-341: using charset in content-type
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
+ */
+ @Test
+ public void testUsingCharsetInContentTypeHeader() throws Exception {
+ // Could be ISO 8859-1 or ISO 8859-15 or ...
+ // u00e1 is latin small letter a with acute
+ final String test2 = "the name is \u00e1ndre";
+
+ Metadata metadata = new Metadata();
+ parser.parse(
+ new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
+ new BodyContentHandler(), metadata, new ParseContext());
+ assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
+
+ metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-15");
+ parser.parse(
+ new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
+ new BodyContentHandler(), metadata, new ParseContext());
+ assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
+ }
+
+ private void assertExtractText(String msg, String expected, byte[] input)
+ throws Exception {
+ ContentHandler handler = new BodyContentHandler() {
+ public void ignorableWhitespace(char[] ch, int off, int len) {
+ // Ignore the whitespace added by XHTMLContentHandler
+ }
+ };
+ Metadata metadata = new Metadata();
+ parser.parse(new ByteArrayInputStream(input), handler, metadata, new ParseContext());
+ assertEquals(msg, expected, handler.toString());
+ }
+
+ /**
+ * Test case for TIKA-339: don't override incoming language
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a>
+ */
+ @Test
+ public void testRetainIncomingLanguage() throws Exception {
+ final String test = "Simple Content";
+
+ Metadata metadata = new Metadata();
+ metadata.set(TikaCoreProperties.LANGUAGE, "en");
+
+ parser.parse(
+ new ByteArrayInputStream(test.getBytes(UTF_8)),
+ new BodyContentHandler(), metadata, new ParseContext());
+
+ assertEquals("en", metadata.get(TikaCoreProperties.LANGUAGE));
+ }
+
+ @Test
+ public void testCP866() throws Exception {
+ XMLResult r = getXML("russian.cp866.txt", parser);
+ assertEquals("text/plain; charset=IBM866", r.metadata.get(Metadata.CONTENT_TYPE));
+ }
+
+ @Test
+ public void testEBCDIC_CP500() throws Exception {
+ XMLResult r = getXML("english.cp500.txt", parser);
+ assertEquals("text/plain; charset=IBM500", r.metadata.get(Metadata.CONTENT_TYPE));
+
+ // Additional check that it isn't too eager on short blocks of text
+ r = getXML(
+ new ByteArrayInputStream("<html><body>hello world</body></html>".getBytes(ISO_8859_1)),
+ parser, new Metadata());
+
+ assertEquals("text/plain; charset=ISO-8859-1", r.metadata.get(Metadata.CONTENT_TYPE));
+ }
+
+ /**
+ * Test case for TIKA-771: "Hello, World!" in UTF-8/ASCII gets detected as IBM500
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-771">TIKA-771</a>
+ */
+ @Test
+ public void testCharsetDetectionWithShortSnipet() throws Exception {
+ final String text = "Hello, World!";
+ XMLResult r = getXML(
+ new ByteArrayInputStream(text.getBytes(UTF_8)), parser, new Metadata());
+ assertEquals("text/plain; charset=ISO-8859-1", r.metadata.get(Metadata.CONTENT_TYPE));
+
+ // Now verify that if we tell the parser the encoding is UTF-8, that's what
+ // we get back (see TIKA-868)
+ r.metadata.set(Metadata.CONTENT_TYPE, "application/binary; charset=UTF-8");
+ parser.parse(
+ new ByteArrayInputStream(text.getBytes(UTF_8)),
+ new BodyContentHandler(), r.metadata, new ParseContext());
+ assertEquals("text/plain; charset=UTF-8", r.metadata.get(Metadata.CONTENT_TYPE));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
index 665151d..2458963 100644
--- a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
+++ b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
@@ -1,87 +1,87 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.xml;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.junit.Test;
-
-public class DcXMLParserTest extends TikaTest {
-
- @Test
- public void testXMLParserAsciiChars() throws Exception {
- XMLResult result = getXML("testXML.xml", new DcXMLParser());
- Metadata metadata = result.metadata;
- assertEquals(
- "application/xml",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Tika test document", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Rida Benjelloun", metadata.get(TikaCoreProperties.CREATOR));
-
- // The file contains 5 dc:subject tags, which come through as
- // a multi-valued Tika Metadata entry in file order
- assertEquals(true, metadata.isMultiValued(TikaCoreProperties.KEYWORDS));
- assertEquals(5, metadata.getValues(TikaCoreProperties.KEYWORDS).length);
- assertEquals("Java", metadata.getValues(TikaCoreProperties.KEYWORDS)[0]);
- assertEquals("XML", metadata.getValues(TikaCoreProperties.KEYWORDS)[1]);
- assertEquals("XSLT", metadata.getValues(TikaCoreProperties.KEYWORDS)[2]);
- assertEquals("JDOM", metadata.getValues(TikaCoreProperties.KEYWORDS)[3]);
- assertEquals("Indexation", metadata.getValues(TikaCoreProperties.KEYWORDS)[4]);
- assertEquals(true, metadata.isMultiValued(Metadata.SUBJECT));
- assertEquals(5, metadata.getValues(Metadata.SUBJECT).length);
- assertEquals("Java", metadata.getValues(Metadata.SUBJECT)[0]);
- assertEquals("XML", metadata.getValues(Metadata.SUBJECT)[1]);
- assertEquals("XSLT", metadata.getValues(Metadata.SUBJECT)[2]);
- assertEquals("JDOM", metadata.getValues(Metadata.SUBJECT)[3]);
- assertEquals("Indexation", metadata.getValues(Metadata.SUBJECT)[4]);
-
- assertEquals(
- "Framework d\'indexation des documents XML, HTML, PDF etc..",
- metadata.get(TikaCoreProperties.DESCRIPTION));
- assertEquals(
- "http://www.apache.org",
- metadata.get(TikaCoreProperties.IDENTIFIER));
- assertEquals("test", metadata.get(TikaCoreProperties.TYPE));
- assertEquals("application/msword", metadata.get(TikaCoreProperties.FORMAT));
- assertEquals("Fr", metadata.get(TikaCoreProperties.LANGUAGE));
- assertTrue(metadata.get(TikaCoreProperties.RIGHTS).contains("testing chars"));
-
- assertContains("Tika test document", result.xml);
-
- assertEquals("2000-12-01T00:00:00.000Z", metadata.get(TikaCoreProperties.CREATED));
-
- }
-
- @Test
- public void testXMLParserNonAsciiChars() throws Exception {
- XMLResult r = getXML("testXML.xml", new DcXMLParser());
- final String expected = "Archim\u00E8de et Lius \u00E0 Ch\u00E2teauneuf testing chars en \u00E9t\u00E9";
- assertEquals(expected, r.metadata.get(TikaCoreProperties.RIGHTS));
- }
-
- // TIKA-1048
- @Test
- public void testNoSpaces() throws Exception {
- String text = getXML("testXML2.xml").xml;
- assertFalse(text.contains("testSubject"));
- }
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Test;
+
+public class DcXMLParserTest extends TikaTest {
+
+ @Test
+ public void testXMLParserAsciiChars() throws Exception {
+ XMLResult result = getXML("testXML.xml", new DcXMLParser());
+ Metadata metadata = result.metadata;
+ assertEquals(
+ "application/xml",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Tika test document", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Rida Benjelloun", metadata.get(TikaCoreProperties.CREATOR));
+
+ // The file contains 5 dc:subject tags, which come through as
+ // a multi-valued Tika Metadata entry in file order
+ assertEquals(true, metadata.isMultiValued(TikaCoreProperties.KEYWORDS));
+ assertEquals(5, metadata.getValues(TikaCoreProperties.KEYWORDS).length);
+ assertEquals("Java", metadata.getValues(TikaCoreProperties.KEYWORDS)[0]);
+ assertEquals("XML", metadata.getValues(TikaCoreProperties.KEYWORDS)[1]);
+ assertEquals("XSLT", metadata.getValues(TikaCoreProperties.KEYWORDS)[2]);
+ assertEquals("JDOM", metadata.getValues(TikaCoreProperties.KEYWORDS)[3]);
+ assertEquals("Indexation", metadata.getValues(TikaCoreProperties.KEYWORDS)[4]);
+ assertEquals(true, metadata.isMultiValued(Metadata.SUBJECT));
+ assertEquals(5, metadata.getValues(Metadata.SUBJECT).length);
+ assertEquals("Java", metadata.getValues(Metadata.SUBJECT)[0]);
+ assertEquals("XML", metadata.getValues(Metadata.SUBJECT)[1]);
+ assertEquals("XSLT", metadata.getValues(Metadata.SUBJECT)[2]);
+ assertEquals("JDOM", metadata.getValues(Metadata.SUBJECT)[3]);
+ assertEquals("Indexation", metadata.getValues(Metadata.SUBJECT)[4]);
+
+ assertEquals(
+ "Framework d\'indexation des documents XML, HTML, PDF etc..",
+ metadata.get(TikaCoreProperties.DESCRIPTION));
+ assertEquals(
+ "http://www.apache.org",
+ metadata.get(TikaCoreProperties.IDENTIFIER));
+ assertEquals("test", metadata.get(TikaCoreProperties.TYPE));
+ assertEquals("application/msword", metadata.get(TikaCoreProperties.FORMAT));
+ assertEquals("Fr", metadata.get(TikaCoreProperties.LANGUAGE));
+ assertTrue(metadata.get(TikaCoreProperties.RIGHTS).contains("testing chars"));
+
+ assertContains("Tika test document", result.xml);
+
+ assertEquals("2000-12-01T00:00:00.000Z", metadata.get(TikaCoreProperties.CREATED));
+
+ }
+
+ @Test
+ public void testXMLParserNonAsciiChars() throws Exception {
+ XMLResult r = getXML("testXML.xml", new DcXMLParser());
+ final String expected = "Archim\u00E8de et Lius \u00E0 Ch\u00E2teauneuf testing chars en \u00E9t\u00E9";
+ assertEquals(expected, r.metadata.get(TikaCoreProperties.RIGHTS));
+ }
+
+ // TIKA-1048
+ @Test
+ public void testNoSpaces() throws Exception {
+ String text = getXML("testXML2.xml").xml;
+ assertFalse(text.contains("testSubject"));
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java
index 536f9d7..39e15d3 100644
--- a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java
+++ b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java
@@ -1,116 +1,116 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.xml;
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.TeeContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class EmptyAndDuplicateElementsXMLParserTest extends TikaTest {
-
- private Property FIRST_NAME = Property.internalTextBag(
- "custom" + Metadata.NAMESPACE_PREFIX_DELIMITER + "FirstName");
- private Property LAST_NAME = Property.internalTextBag(
- "custom" + Metadata.NAMESPACE_PREFIX_DELIMITER + "LastName");
-
- @Test
- public void testDefaultBehavior() throws Exception {
- XMLResult r = getXML("testXML3.xml", new DefaultCustomXMLTestParser());
- Metadata metadata = r.metadata;
-
- assertEquals(4, metadata.getValues(FIRST_NAME).length);
- assertEquals(2, metadata.getValues(LAST_NAME).length);
-
- assertEquals("John", metadata.getValues(FIRST_NAME)[0]);
- assertEquals("Smith", metadata.getValues(LAST_NAME)[0]);
-
- assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]);
- assertEquals("Doe", metadata.getValues(LAST_NAME)[1]);
-
- // We didn't know Bob's last name, but now we don't know an entry existed
- assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]);
-
- // We don't know Kate's last name because it was a duplicate
- assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]);
- }
-
- @Test
- public void testEmptiesAndRepeats() throws Exception {
- XMLResult r = getXML("testXML3.xml", new AllowEmptiesAndDuplicatesCustomXMLTestParser());
- Metadata metadata = r.metadata;
-
- assertEquals(4, metadata.getValues(FIRST_NAME).length);
- assertEquals(4, metadata.getValues(LAST_NAME).length);
-
- assertEquals("John", metadata.getValues(FIRST_NAME)[0]);
- assertEquals("Smith", metadata.getValues(LAST_NAME)[0]);
-
- assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]);
- assertEquals("Doe", metadata.getValues(LAST_NAME)[1]);
-
- assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]);
- assertEquals("", metadata.getValues(LAST_NAME)[2]);
-
- assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]);
- assertEquals("Smith", metadata.getValues(LAST_NAME)[3]);
-
- }
-
- private class DefaultCustomXMLTestParser extends XMLParser {
-
- private static final long serialVersionUID = 2458579047014545931L;
-
- protected ElementMetadataHandler getCustomElementHandler(Metadata metadata, Property tikaProperty, String localPart) {
- return new ElementMetadataHandler(
- "http://custom",
- localPart,
- metadata,
- tikaProperty);
- }
-
- protected ContentHandler getContentHandler(
- ContentHandler handler, Metadata metadata, ParseContext context) {
- return new TeeContentHandler(
- super.getContentHandler(handler, metadata, context),
- getCustomElementHandler(metadata, FIRST_NAME, "FirstName"),
- getCustomElementHandler(metadata, LAST_NAME, "LastName"));
- }
- }
-
- private class AllowEmptiesAndDuplicatesCustomXMLTestParser extends DefaultCustomXMLTestParser {
-
- private static final long serialVersionUID = 3735646809954466229L;
-
- protected ElementMetadataHandler getCustomElementHandler(Metadata metadata, Property tikaProperty, String localPart) {
- return new ElementMetadataHandler(
- "http://custom",
- localPart,
- metadata,
- tikaProperty,
- true,
- true);
- }
- }
-
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.TeeContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class EmptyAndDuplicateElementsXMLParserTest extends TikaTest {
+
+ private Property FIRST_NAME = Property.internalTextBag(
+ "custom" + Metadata.NAMESPACE_PREFIX_DELIMITER + "FirstName");
+ private Property LAST_NAME = Property.internalTextBag(
+ "custom" + Metadata.NAMESPACE_PREFIX_DELIMITER + "LastName");
+
+ @Test
+ public void testDefaultBehavior() throws Exception {
+ XMLResult r = getXML("testXML3.xml", new DefaultCustomXMLTestParser());
+ Metadata metadata = r.metadata;
+
+ assertEquals(4, metadata.getValues(FIRST_NAME).length);
+ assertEquals(2, metadata.getValues(LAST_NAME).length);
+
+ assertEquals("John", metadata.getValues(FIRST_NAME)[0]);
+ assertEquals("Smith", metadata.getValues(LAST_NAME)[0]);
+
+ assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]);
+ assertEquals("Doe", metadata.getValues(LAST_NAME)[1]);
+
+ // We didn't know Bob's last name, but now we don't know an entry existed
+ assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]);
+
+ // We don't know Kate's last name because it was a duplicate
+ assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]);
+ }
+
+ @Test
+ public void testEmptiesAndRepeats() throws Exception {
+ XMLResult r = getXML("testXML3.xml", new AllowEmptiesAndDuplicatesCustomXMLTestParser());
+ Metadata metadata = r.metadata;
+
+ assertEquals(4, metadata.getValues(FIRST_NAME).length);
+ assertEquals(4, metadata.getValues(LAST_NAME).length);
+
+ assertEquals("John", metadata.getValues(FIRST_NAME)[0]);
+ assertEquals("Smith", metadata.getValues(LAST_NAME)[0]);
+
+ assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]);
+ assertEquals("Doe", metadata.getValues(LAST_NAME)[1]);
+
+ assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]);
+ assertEquals("", metadata.getValues(LAST_NAME)[2]);
+
+ assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]);
+ assertEquals("Smith", metadata.getValues(LAST_NAME)[3]);
+
+ }
+
+ private class DefaultCustomXMLTestParser extends XMLParser {
+
+ private static final long serialVersionUID = 2458579047014545931L;
+
+ protected ElementMetadataHandler getCustomElementHandler(Metadata metadata, Property tikaProperty, String localPart) {
+ return new ElementMetadataHandler(
+ "http://custom",
+ localPart,
+ metadata,
+ tikaProperty);
+ }
+
+ protected ContentHandler getContentHandler(
+ ContentHandler handler, Metadata metadata, ParseContext context) {
+ return new TeeContentHandler(
+ super.getContentHandler(handler, metadata, context),
+ getCustomElementHandler(metadata, FIRST_NAME, "FirstName"),
+ getCustomElementHandler(metadata, LAST_NAME, "LastName"));
+ }
+ }
+
+ private class AllowEmptiesAndDuplicatesCustomXMLTestParser extends DefaultCustomXMLTestParser {
+
+ private static final long serialVersionUID = 3735646809954466229L;
+
+ protected ElementMetadataHandler getCustomElementHandler(Metadata metadata, Property tikaProperty, String localPart) {
+ return new ElementMetadataHandler(
+ "http://custom",
+ localPart,
+ metadata,
+ tikaProperty,
+ true,
+ true);
+ }
+ }
+
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java
index aee7307..8ee966c 100644
--- a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java
+++ b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java
@@ -1,54 +1,54 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.xml;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.InputStream;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.extractor.ContainerExtractor;
-import org.apache.tika.extractor.ParserContainerExtractor;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.junit.Test;
-
-public class FictionBookParserTest extends TikaTest {
-
- @Test
- public void testFB2() throws Exception {
- XMLResult r = getXML("test.fb2", new FictionBookParser(), new Metadata(), new ParseContext());
- assertContains("1812", r.xml);
- }
-
- @Test
- public void testEmbedded() throws Exception {
- try (InputStream input = getTestDocumentAsStream("test.fb2")) {
- ContainerExtractor extractor = new ParserContainerExtractor();
- TikaInputStream stream = TikaInputStream.get(input);
-
- assertEquals(true, extractor.isSupported(stream));
-
- // Process it
- TrackingHandler handler = new TrackingHandler();
- extractor.extract(stream, null, handler);
-
- assertEquals(2, handler.filenames.size());
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.junit.Test;
+
+public class FictionBookParserTest extends TikaTest {
+
+ @Test
+ public void testFB2() throws Exception {
+ XMLResult r = getXML("test.fb2", new FictionBookParser(), new Metadata(), new ParseContext());
+ assertContains("1812", r.xml);
+ }
+
+ @Test
+ public void testEmbedded() throws Exception {
+ try (InputStream input = getTestDocumentAsStream("test.fb2")) {
+ ContainerExtractor extractor = new ParserContainerExtractor();
+ TikaInputStream stream = TikaInputStream.get(input);
+
+ assertEquals(true, extractor.isSupported(stream));
+
+ // Process it
+ TrackingHandler handler = new TrackingHandler();
+ extractor.extract(stream, null, handler);
+
+ assertEquals(2, handler.filenames.size());
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/pom.xml b/tika-parser-modules/tika-parser-web-module/pom.xml
index 53aadb2..ee9e24c 100644
--- a/tika-parser-modules/tika-parser-web-module/pom.xml
+++ b/tika-parser-modules/tika-parser-web-module/pom.xml
@@ -1,89 +1,89 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-modules</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-web-module</artifactId>
- <name>Apache Tika parser web module</name>
- <url>http://tika.apache.org/</url>
-
- <properties>
- <mime4j.version>0.7.2</mime4j.version>
- </properties>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>org.ccil.cowan.tagsoup</groupId>
- <artifactId>tagsoup</artifactId>
- <version>1.2.1</version>
- </dependency>
- <dependency>
- <groupId>de.l3s.boilerpipe</groupId>
- <artifactId>boilerpipe</artifactId>
- <version>1.1.0</version>
- </dependency>
- <dependency>
- <groupId>com.rometools</groupId>
- <artifactId>rome</artifactId>
- <version>1.5.1</version>
- </dependency>
- <dependency>
- <groupId>org.apache.james</groupId>
- <artifactId>apache-mime4j-core</artifactId>
- <version>${mime4j.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.james</groupId>
- <artifactId>apache-mime4j-dom</artifactId>
- <version>${mime4j.version}</version>
- </dependency>
- <dependency>
- <groupId>commons-io</groupId>
- <artifactId>commons-io</artifactId>
- <version>${commons.io.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-text-module</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-package-module</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
-</project>
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-web-module</artifactId>
+ <name>Apache Tika parser web module</name>
+ <url>http://tika.apache.org/</url>
+
+ <properties>
+ <mime4j.version>0.7.2</mime4j.version>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.ccil.cowan.tagsoup</groupId>
+ <artifactId>tagsoup</artifactId>
+ <version>1.2.1</version>
+ </dependency>
+ <dependency>
+ <groupId>de.l3s.boilerpipe</groupId>
+ <artifactId>boilerpipe</artifactId>
+ <version>1.1.0</version>
+ </dependency>
+ <dependency>
+ <groupId>com.rometools</groupId>
+ <artifactId>rome</artifactId>
+ <version>1.5.1</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.james</groupId>
+ <artifactId>apache-mime4j-core</artifactId>
+ <version>${mime4j.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.james</groupId>
+ <artifactId>apache-mime4j-dom</artifactId>
+ <version>${mime4j.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>${commons.io.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-text-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-package-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
+</project>
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/module/web/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/module/web/internal/Activator.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/module/web/internal/Activator.java
index 53e28ca..4c728cc 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/module/web/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/module/web/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.web.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
- @Override
- public void start(BundleContext context) throws Exception {
-
- registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
- }
-
- @Override
- public void stop(BundleContext context) throws Exception {
-
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.web.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+ @Override
+ public void start(BundleContext context) throws Exception {
+
+ registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+ }
+
+ @Override
+ public void stop(BundleContext context) throws Exception {
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/feed/FeedParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/feed/FeedParser.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/feed/FeedParser.java
index b69e677..428ff83 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/feed/FeedParser.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/feed/FeedParser.java
@@ -1,127 +1,127 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.feed;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Set;
-
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
-
-import com.rometools.rome.feed.synd.SyndContent;
-import com.rometools.rome.feed.synd.SyndEntry;
-import com.rometools.rome.feed.synd.SyndFeed;
-import com.rometools.rome.io.FeedException;
-import com.rometools.rome.io.SyndFeedInput;
-
-/**
- * Feed parser.
- * <p>
- * Uses Rome for parsing the feeds. A feed description is put in a paragraph
- * with its link and title in an anchor.
- */
-public class FeedParser extends AbstractParser {
-
- /** Serial version UID */
- private static final long serialVersionUID = -3785361933034525186L;
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- MediaType.application("rss+xml"),
- MediaType.application("atom+xml"))));
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- // set the encoding?
- try {
- SyndFeed feed = new SyndFeedInput().build(
- new InputSource(new CloseShieldInputStream(stream)));
-
- String title = stripTags(feed.getTitleEx());
- String description = stripTags(feed.getDescriptionEx());
-
- metadata.set(TikaCoreProperties.TITLE, title);
- metadata.set(TikaCoreProperties.DESCRIPTION, description);
- // store the other fields in the metadata
-
- XHTMLContentHandler xhtml =
- new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
-
- xhtml.element("h1", title);
- xhtml.element("p", description);
-
- xhtml.startElement("ul");
- for (Object e : feed.getEntries()) {
- SyndEntry entry = (SyndEntry) e;
- String link = entry.getLink();
- if (link != null) {
- xhtml.startElement("li");
- xhtml.startElement("a", "href", link);
- xhtml.characters(stripTags(entry.getTitleEx()));
- xhtml.endElement("a");
- SyndContent content = entry.getDescription();
- if (content != null) {
- xhtml.newline();
- xhtml.characters(stripTags(content));
- }
- xhtml.endElement("li");
- }
- }
- xhtml.endElement("ul");
-
- xhtml.endDocument();
- } catch (FeedException e) {
- throw new TikaException("RSS parse error", e);
- }
-
- }
-
- private static String stripTags(SyndContent c) {
- if (c == null)
- return "";
-
- String value = c.getValue();
-
- String[] parts = value.split("<[^>]*>");
- StringBuffer buf = new StringBuffer();
-
- for (String part : parts)
- buf.append(part);
-
- return buf.toString().trim();
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.feed;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import com.rometools.rome.feed.synd.SyndContent;
+import com.rometools.rome.feed.synd.SyndEntry;
+import com.rometools.rome.feed.synd.SyndFeed;
+import com.rometools.rome.io.FeedException;
+import com.rometools.rome.io.SyndFeedInput;
+
+/**
+ * Feed parser.
+ * <p>
+ * Uses Rome for parsing the feeds. A feed description is put in a paragraph
+ * with its link and title in an anchor.
+ */
+public class FeedParser extends AbstractParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = -3785361933034525186L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.application("rss+xml"),
+ MediaType.application("atom+xml"))));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ // set the encoding?
+ try {
+ SyndFeed feed = new SyndFeedInput().build(
+ new InputSource(new CloseShieldInputStream(stream)));
+
+ String title = stripTags(feed.getTitleEx());
+ String description = stripTags(feed.getDescriptionEx());
+
+ metadata.set(TikaCoreProperties.TITLE, title);
+ metadata.set(TikaCoreProperties.DESCRIPTION, description);
+ // store the other fields in the metadata
+
+ XHTMLContentHandler xhtml =
+ new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+ xhtml.element("h1", title);
+ xhtml.element("p", description);
+
+ xhtml.startElement("ul");
+ for (Object e : feed.getEntries()) {
+ SyndEntry entry = (SyndEntry) e;
+ String link = entry.getLink();
+ if (link != null) {
+ xhtml.startElement("li");
+ xhtml.startElement("a", "href", link);
+ xhtml.characters(stripTags(entry.getTitleEx()));
+ xhtml.endElement("a");
+ SyndContent content = entry.getDescription();
+ if (content != null) {
+ xhtml.newline();
+ xhtml.characters(stripTags(content));
+ }
+ xhtml.endElement("li");
+ }
+ }
+ xhtml.endElement("ul");
+
+ xhtml.endDocument();
+ } catch (FeedException e) {
+ throw new TikaException("RSS parse error", e);
+ }
+
+ }
+
+ private static String stripTags(SyndContent c) {
+ if (c == null)
+ return "";
+
+ String value = c.getValue();
+
+ String[] parts = value.split("<[^>]*>");
+ StringBuffer buf = new StringBuffer();
+
+ for (String part : parts)
+ buf.append(part);
+
+ return buf.toString().trim();
+ }
+}
[20/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
index cf92406..6c86765 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
@@ -1,1423 +1,1423 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.rtf;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.PushbackInputStream;
-import java.nio.ByteBuffer;
-import java.nio.CharBuffer;
-import java.nio.charset.Charset;
-import java.nio.charset.CharsetDecoder;
-import java.nio.charset.CoderResult;
-import java.nio.charset.CodingErrorAction;
-import java.util.Calendar;
-import java.util.HashMap;
-import java.util.LinkedList;
-import java.util.Locale;
-import java.util.Map;
-import java.util.TimeZone;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLCore;
-import org.apache.tika.metadata.OfficeOpenXMLExtended;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.apache.tika.utils.CharsetUtils;
-import org.xml.sax.SAXException;
-
-/* Tokenizes and performs a "shallow" parse of the RTF
- * document, just enough to properly decode the text.
- *
- * TODO: we should cutover to a "real" tokenizer (eg JFlex);
- * it should give better perf, by replacing the excessive
- * "else if" string compares with FSA traversal. */
-
-final class TextExtractor {
-
- private static final Charset ASCII = Charset.forName("US-ASCII");
- private static final Charset WINDOWS_1252 = getCharset("WINDOWS-1252");
- private static final Charset MAC_ROMAN = getCharset("MacRoman");
- private static final Charset SHIFT_JIS = getCharset("Shift_JIS");
- private static final Charset WINDOWS_57011 = getCharset("windows-57011");
- private static final Charset WINDOWS_57010 = getCharset("windows-57010");
- private static final Charset WINDOWS_57009 = getCharset("windows-57009");
- private static final Charset WINDOWS_57008 = getCharset("windows-57008");
- private static final Charset WINDOWS_57007 = getCharset("windows-57007");
- private static final Charset WINDOWS_57006 = getCharset("windows-57006");
- private static final Charset WINDOWS_57005 = getCharset("windows-57005");
- private static final Charset WINDOWS_57004 = getCharset("windows-57004");
- private static final Charset WINDOWS_57003 = getCharset("windows-57003");
- private static final Charset X_ISCII91 = getCharset("x-ISCII91");
- private static final Charset X_MAC_CENTRAL_EUROPE = getCharset("x-MacCentralEurope");
- private static final Charset MAC_CYRILLIC = getCharset("MacCyrillic");
- private static final Charset X_JOHAB = getCharset("x-Johab");
- private static final Charset CP12582 = getCharset("CP1258");
- private static final Charset CP12572 = getCharset("CP1257");
- private static final Charset CP12562 = getCharset("CP1256");
- private static final Charset CP12552 = getCharset("CP1255");
- private static final Charset CP12542 = getCharset("CP1254");
- private static final Charset CP12532 = getCharset("CP1253");
- private static final Charset CP1252 = getCharset("CP1252");
- private static final Charset CP12512 = getCharset("CP1251");
- private static final Charset CP12502 = getCharset("CP1250");
- private static final Charset CP950 = getCharset("CP950");
- private static final Charset CP949 = getCharset("CP949");
- private static final Charset MS9362 = getCharset("MS936");
- private static final Charset MS8742 = getCharset("MS874");
- private static final Charset CP866 = getCharset("CP866");
- private static final Charset CP865 = getCharset("CP865");
- private static final Charset CP864 = getCharset("CP864");
- private static final Charset CP863 = getCharset("CP863");
- private static final Charset CP862 = getCharset("CP862");
- private static final Charset CP860 = getCharset("CP860");
- private static final Charset CP852 = getCharset("CP852");
- private static final Charset CP8502 = getCharset("CP850");
- private static final Charset CP819 = getCharset("CP819");
- private static final Charset WINDOWS_720 = getCharset("windows-720");
- private static final Charset WINDOWS_711 = getCharset("windows-711");
- private static final Charset WINDOWS_710 = getCharset("windows-710");
- private static final Charset WINDOWS_709 = getCharset("windows-709");
- private static final Charset ISO_8859_6 = getCharset("ISO-8859-6");
- private static final Charset CP4372 = getCharset("CP437");
- private static final Charset CP850 = getCharset("cp850");
- private static final Charset CP437 = getCharset("cp437");
- private static final Charset MS874 = getCharset("ms874");
- private static final Charset CP1257 = getCharset("cp1257");
- private static final Charset CP1256 = getCharset("cp1256");
- private static final Charset CP1255 = getCharset("cp1255");
- private static final Charset CP1258 = getCharset("cp1258");
- private static final Charset CP1254 = getCharset("cp1254");
- private static final Charset CP1253 = getCharset("cp1253");
- private static final Charset MS950 = getCharset("ms950");
- private static final Charset MS936 = getCharset("ms936");
- private static final Charset MS1361 = getCharset("ms1361");
- private static final Charset MS932 = getCharset("MS932");
- private static final Charset CP1251 = getCharset("cp1251");
- private static final Charset CP1250 = getCharset("cp1250");
- private static final Charset MAC_THAI = getCharset("MacThai");
- private static final Charset MAC_TURKISH = getCharset("MacTurkish");
- private static final Charset MAC_GREEK = getCharset("MacGreek");
- private static final Charset MAC_ARABIC = getCharset("MacArabic");
- private static final Charset MAC_HEBREW = getCharset("MacHebrew");
- private static final Charset JOHAB = getCharset("johab");
- private static final Charset BIG5 = getCharset("Big5");
- private static final Charset GB2312 = getCharset("GB2312");
- private static final Charset MS949 = getCharset("ms949");
- // The RTF doc has a "font table" that assigns ords
- // (f0, f1, f2, etc.) to fonts and charsets, using the
- // \fcharsetN control word. This mapping maps from the
- // N to corresponding Java charset:
- private static final Map<Integer, Charset> FCHARSET_MAP =
- new HashMap<Integer, Charset>();
- // The RTF may specify the \ansicpgN charset in the
- // header; this maps the N to the corresponding Java
- // character set:
- private static final Map<Integer, Charset> ANSICPG_MAP =
- new HashMap<Integer, Charset>();
-
- static {
- FCHARSET_MAP.put(0, WINDOWS_1252); // ANSI
- // charset 1 is Default
- // charset 2 is Symbol
-
- FCHARSET_MAP.put(77, MAC_ROMAN); // Mac Roman
- FCHARSET_MAP.put(78, SHIFT_JIS); // Mac Shift Jis
- FCHARSET_MAP.put(79, MS949); // Mac Hangul
- FCHARSET_MAP.put(80, GB2312); // Mac GB2312
- FCHARSET_MAP.put(81, BIG5); // Mac Big5
- FCHARSET_MAP.put(82, JOHAB); // Mac Johab (old)
- FCHARSET_MAP.put(83, MAC_HEBREW); // Mac Hebrew
- FCHARSET_MAP.put(84, MAC_ARABIC); // Mac Arabic
- FCHARSET_MAP.put(85, MAC_GREEK); // Mac Greek
- FCHARSET_MAP.put(86, MAC_TURKISH); // Mac Turkish
- FCHARSET_MAP.put(87, MAC_THAI); // Mac Thai
- FCHARSET_MAP.put(88, CP1250); // Mac East Europe
- FCHARSET_MAP.put(89, CP1251); // Mac Russian
-
- FCHARSET_MAP.put(128, MS932); // Shift JIS
- FCHARSET_MAP.put(129, MS949); // Hangul
- FCHARSET_MAP.put(130, MS1361); // Johab
- FCHARSET_MAP.put(134, MS936); // GB2312
- FCHARSET_MAP.put(136, MS950); // Big5
- FCHARSET_MAP.put(161, CP1253); // Greek
- FCHARSET_MAP.put(162, CP1254); // Turkish
- FCHARSET_MAP.put(163, CP1258); // Vietnamese
- FCHARSET_MAP.put(177, CP1255); // Hebrew
- FCHARSET_MAP.put(178, CP1256); // Arabic
- // FCHARSET_MAP.put( 179, "" ); // Arabic Traditional
- // FCHARSET_MAP.put( 180, "" ); // Arabic user
- // FCHARSET_MAP.put( 181, "" ); // Hebrew user
- FCHARSET_MAP.put(186, CP1257); // Baltic
-
- FCHARSET_MAP.put(204, CP1251); // Russian
- FCHARSET_MAP.put(222, MS874); // Thai
- FCHARSET_MAP.put(238, CP1250); // Eastern European
- FCHARSET_MAP.put(254, CP437); // PC 437
- FCHARSET_MAP.put(255, CP850); // OEM
- }
-
- // Populates ANSICPG_MAP: the N of the \ansicpgN control word -> Java charset.
- // Each code page is registered under its own number exactly once.
- static {
-     ANSICPG_MAP.put(437, CP4372); // US IBM
-     ANSICPG_MAP.put(708, ISO_8859_6); // Arabic (ASMO 708)
-
-     ANSICPG_MAP.put(709, WINDOWS_709); // Arabic (ASMO 449+, BCON V4)
-     ANSICPG_MAP.put(710, WINDOWS_710); // Arabic (transparent Arabic)
-     ANSICPG_MAP.put(711, WINDOWS_711); // Arabic (Nafitha Enhanced)
-     ANSICPG_MAP.put(720, WINDOWS_720); // Arabic (transparent ASMO)
-     ANSICPG_MAP.put(819, CP819); // Windows 3.1 (US & Western Europe)
-
-     ANSICPG_MAP.put(850, CP8502); // IBM Multilingual
-     ANSICPG_MAP.put(852, CP852); // Eastern European
-     ANSICPG_MAP.put(860, CP860); // Portuguese
-     ANSICPG_MAP.put(862, CP862); // Hebrew
-     ANSICPG_MAP.put(863, CP863); // French Canadian
-     ANSICPG_MAP.put(864, CP864); // Arabic
-     ANSICPG_MAP.put(865, CP865); // Norwegian
-     ANSICPG_MAP.put(866, CP866); // Soviet Union
-     ANSICPG_MAP.put(874, MS8742); // Thai
-     ANSICPG_MAP.put(932, MS932); // Japanese
-     ANSICPG_MAP.put(936, MS9362); // Simplified Chinese
-     ANSICPG_MAP.put(949, CP949); // Korean
-     ANSICPG_MAP.put(950, CP950); // Traditional Chinese
-     ANSICPG_MAP.put(1250, CP12502); // Eastern European
-     ANSICPG_MAP.put(1251, CP12512); // Cyrillic
-     ANSICPG_MAP.put(1252, CP1252); // Western European
-     ANSICPG_MAP.put(1253, CP12532); // Greek
-     ANSICPG_MAP.put(1254, CP12542); // Turkish
-     ANSICPG_MAP.put(1255, CP12552); // Hebrew
-     ANSICPG_MAP.put(1256, CP12562); // Arabic
-     ANSICPG_MAP.put(1257, CP12572); // Baltic
-     ANSICPG_MAP.put(1258, CP12582); // Vietnamese
-     ANSICPG_MAP.put(1361, X_JOHAB); // Johab
-     ANSICPG_MAP.put(10000, MAC_ROMAN); // Mac Roman
-     ANSICPG_MAP.put(10001, SHIFT_JIS); // Mac Japan
-     ANSICPG_MAP.put(10004, MAC_ARABIC); // Mac Arabic
-     ANSICPG_MAP.put(10005, MAC_HEBREW); // Mac Hebrew
-     ANSICPG_MAP.put(10006, MAC_GREEK); // Mac Greek
-     ANSICPG_MAP.put(10007, MAC_CYRILLIC); // Mac Cyrillic
-     ANSICPG_MAP.put(10029, X_MAC_CENTRAL_EUROPE); // MAC Latin2
-     ANSICPG_MAP.put(10081, MAC_TURKISH); // Mac Turkish
-     ANSICPG_MAP.put(57002, X_ISCII91); // Devanagari
-
-     // TODO: in theory these other charsets are simple
-     // shifts off of Devanagari, so we could impl that
-     // here:
-     ANSICPG_MAP.put(57003, WINDOWS_57003); // Bengali
-     ANSICPG_MAP.put(57004, WINDOWS_57004); // Tamil
-     ANSICPG_MAP.put(57005, WINDOWS_57005); // Telugu
-     ANSICPG_MAP.put(57006, WINDOWS_57006); // Assamese
-     ANSICPG_MAP.put(57007, WINDOWS_57007); // Oriya
-     ANSICPG_MAP.put(57008, WINDOWS_57008); // Kannada
-     ANSICPG_MAP.put(57009, WINDOWS_57009); // Malayalam
-     ANSICPG_MAP.put(57010, WINDOWS_57010); // Gujarati
-     ANSICPG_MAP.put(57011, WINDOWS_57011); // Punjabi
- }
-
- // Used when we decode bytes -> chars using CharsetDecoder:
- private final char[] outputArray = new char[128];
- private final CharBuffer outputBuffer = CharBuffer.wrap(outputArray);
- // Holds the font table from this RTF doc, mapping
- // the font number (from \fN control word) to the
- // corresponding charset:
- private final Map<Integer, Charset> fontToCharset =
- new HashMap<Integer, Charset>();
- // Group stack: when we open a new group, we push
- // the previous group state onto the stack; when we
- // close the group, we restore it
- private final LinkedList<GroupState> groupStates = new LinkedList<GroupState>();
- private final StringBuilder pendingBuffer = new StringBuilder();
- private final XHTMLContentHandler out;
- private final Metadata metadata;
- private final RTFEmbObjHandler embObjHandler;
- // How many next ansi chars we should skip; this
- // is 0 except when we are still in the "ansi
- // shadow" after seeing a unicode escape, at which
- // point it's set to the last ucN skip we had seen:
- int ansiSkip = 0;
- private int written = 0;
- // Hold pending bytes (encoded in the current charset)
- // for text output:
- private byte[] pendingBytes = new byte[16];
- private int pendingByteCount;
- private ByteBuffer pendingByteBuffer = ByteBuffer.wrap(pendingBytes);
- // Holds pending chars for text output
- private char[] pendingChars = new char[10];
- private int pendingCharCount;
- // Holds chars for a still-being-tokenized control word
- private byte[] pendingControl = new byte[10];
- private int pendingControlCount;
- // Reused when possible:
- private CharsetDecoder decoder;
- private Charset lastCharset;
- private Charset globalCharset = WINDOWS_1252;
- private int globalDefaultFont = -1;
- private int curFontID = -1;
- // Current group state; in theory this initial
- // GroupState is unused because the RTF doc should
- // immediately open the top group (start with {):
- private GroupState groupState = new GroupState();
- private boolean inHeader = true;
- private int fontTableState;
- private int fontTableDepth;
- // Non null if we are processing metadata (title,
- // keywords, etc.) inside the info group:
- private Property nextMetaData;
- private boolean inParagraph;
- // Non-zero if we are processing inside a field destination:
- private int fieldState;
- // Non-zero list index
- private int pendingListEnd;
- private Map<Integer, ListDescriptor> listTable = new HashMap<Integer, ListDescriptor>();
- private Map<Integer, ListDescriptor> listOverrideTable = new HashMap<Integer, ListDescriptor>();
- private Map<Integer, ListDescriptor> currentListTable;
- private ListDescriptor currentList;
- private int listTableLevel = -1;
- private boolean ignoreLists;
- // Non-null if we've seen the url for a HYPERLINK but not yet
- // its text:
- private String pendingURL;
- // Used to process the sub-groups inside the upr
- // group:
- private int uprState = -1;
- // Used when extracting CREATION date:
- private int year, month, day, hour, minute;
-
- /**
-  * Creates an extractor that keeps references to the given collaborators.
-  *
-  * @param out           handler stored in {@code this.out}
-  * @param metadata      metadata object stored in {@code this.metadata}
-  * @param embObjHandler handler stored in {@code this.embObjHandler}
-  */
- public TextExtractor(XHTMLContentHandler out, Metadata metadata,
-                      RTFEmbObjHandler embObjHandler) {
-     this.out = out;
-     this.metadata = metadata;
-     this.embObjHandler = embObjHandler;
- }
-
- // Looks the charset up by name through CharsetUtils; any failure to
- // resolve it falls back to ASCII instead of propagating.
- private static Charset getCharset(String name) {
-     Charset resolved;
-     try {
-         resolved = CharsetUtils.forName(name);
-     } catch (Exception e) {
-         resolved = ASCII;
-     }
-     return resolved;
- }
-
- // True for the ASCII hexadecimal digits 0-9, a-f and A-F.
- protected static boolean isHexChar(int ch) {
-     if (ch >= '0' && ch <= '9') {
-         return true;
-     }
-     if (ch >= 'a' && ch <= 'f') {
-         return true;
-     }
-     return ch >= 'A' && ch <= 'F';
- }
-
- // True for the ASCII letters a-z and A-Z.
- private static boolean isAlpha(int ch) {
-     final boolean lower = ch >= 'a' && ch <= 'z';
-     final boolean upper = ch >= 'A' && ch <= 'Z';
-     return lower || upper;
- }
-
- // True for the ASCII decimal digits 0-9.
- private static boolean isDigit(int ch) {
-     return !(ch < '0' || ch > '9');
- }
-
- /**
-  * Returns the numeric value (0-15) of a hexadecimal digit.
-  *
-  * @param ch a character for which {@link #isHexChar(int)} returns true
-  * @return the value of the digit
-  */
- protected static int hexValue(int ch) {
-     if (ch >= '0' && ch <= '9') {
-         return ch - '0';
-     } else if (ch >= 'a' && ch <= 'f') {
-         return 10 + (ch - 'a');
-     } else {
-         // Same letter range as isHexChar(); anything else is a caller error
-         assert ch >= 'A' && ch <= 'F' : "not a hex digit: " + ch;
-         return 10 + (ch - 'A');
-     }
- }
-
- /**
-  * @return the current value of the ignore-lists flag
-  */
- public boolean isIgnoringLists() {
-     return this.ignoreLists;
- }
-
- /**
-  * @param ignore new value of the ignore-lists flag
-  */
- public void setIgnoreLists(boolean ignore) {
-     ignoreLists = ignore;
- }
-
- // Flushes whichever kind of pending text is buffered: bytes when any
- // are pending, chars otherwise.
- private void pushText() throws IOException, SAXException, TikaException {
-     if (pendingByteCount == 0) {
-         pushChars();
-         return;
-     }
-     assert pendingCharCount == 0;
-     pushBytes();
- }
-
- // Queues one byte (a unit in the current charset) for output. Inside a
- // pict group the byte is handed to the embedded-object handler instead.
- private void addOutputByte(int b) throws IOException, SAXException, TikaException {
-     assert b >= 0 && b < 256 : "byte value out of range: " + b;
-
-     // Chars buffered earlier must be emitted before any bytes
-     if (pendingCharCount != 0) {
-         pushChars();
-     }
-
-     if (groupState.pictDepth > 0) {
-         embObjHandler.writeMetadataChar((char) b);
-         return;
-     }
-
-     if (pendingByteCount == pendingBytes.length) {
-         // Buffer is full: grow it by 25% and re-wrap it
-         final int grownLength = (int) (pendingBytes.length * 1.25);
-         final byte[] grown = new byte[grownLength];
-         System.arraycopy(pendingBytes, 0, grown, 0, pendingBytes.length);
-         pendingBytes = grown;
-         pendingByteBuffer = ByteBuffer.wrap(grown);
-     }
-     pendingBytes[pendingByteCount] = (byte) b;
-     pendingByteCount++;
- }
-
- // Appends one letter to the control word currently being tokenized.
- private void addControl(int b) {
-     assert isAlpha(b);
-     final int capacity = pendingControl.length;
-     if (pendingControlCount == capacity) {
-         // Buffer is full: grow it by 25%
-         final byte[] grown = new byte[(int) (capacity * 1.25)];
-         System.arraycopy(pendingControl, 0, grown, 0, capacity);
-         pendingControl = grown;
-     }
-     pendingControl[pendingControlCount] = (byte) b;
-     pendingControlCount++;
- }
-
- // Queues one UTF16 code unit for output. Depending on state it goes to
- // pendingBuffer, to the embedded-object handler, or to pendingChars.
- private void addOutputChar(char ch) throws IOException, SAXException, TikaException {
-     // Bytes buffered earlier must be emitted before any chars
-     if (pendingByteCount != 0) {
-         pushBytes();
-     }
-
-     if (inHeader || fieldState == 1) {
-         pendingBuffer.append(ch);
-         return;
-     }
-     if (groupState.sn || groupState.sv) {
-         embObjHandler.writeMetadataChar(ch);
-         return;
-     }
-
-     final int capacity = pendingChars.length;
-     if (pendingCharCount == capacity) {
-         // Buffer is full: grow it by 25%
-         final char[] grown = new char[(int) (capacity * 1.25)];
-         System.arraycopy(pendingChars, 0, grown, 0, capacity);
-         pendingChars = grown;
-     }
-     pendingChars[pendingCharCount] = ch;
-     pendingCharCount++;
- }
-
- /**
-  * Shallow-parses the whole document, writing output to this.out and
-  * this.metadata. The stream is wrapped so that up to two bytes can be
-  * pushed back while tokenizing.
-  *
-  * @param in the stream to read the document from
-  */
- public void extract(InputStream in) throws IOException, SAXException, TikaException {
-     final PushbackInputStream pushback = new PushbackInputStream(in, 2);
-     extract(pushback);
- }
-
- // Main tokenizer loop: reads the stream byte by byte and dispatches on
- // control tokens, group delimiters, embedded-object data and plain text.
- private void extract(PushbackInputStream in) throws IOException, SAXException, TikaException {
-     out.startDocument();
-
-     for (int b = in.read(); b != -1; b = in.read()) {
-         if (b == '\\') {
-             parseControlToken(in);
-         } else if (b == '{') {
-             pushText();
-             processGroupStart(in);
-         } else if (b == '}') {
-             pushText();
-             processGroupEnd();
-             if (groupStates.isEmpty()) {
-                 // parsed document closing brace
-                 break;
-             }
-         } else if (groupState.objdata || groupState.pictDepth == 1) {
-             embObjHandler.writeHexChar(b);
-         } else if (b != '\r' && b != '\n'
-                 && (!groupState.ignore || nextMetaData != null
-                 || groupState.sn || groupState.sv)) {
-             // Linefeed and carriage return are not significant
-             if (ansiSkip == 0) {
-                 addOutputByte(b);
-             } else {
-                 ansiSkip--;
-             }
-         }
-     }
-
-     endParagraph(false);
-     out.endDocument();
- }
-
- // Called after a backslash was read: decides from the next byte whether
- // this is a hex escape, a control word, an escaped char or a control symbol.
- private void parseControlToken(PushbackInputStream in) throws IOException, SAXException, TikaException {
-     final int next = in.read();
-     switch (next) {
-         case '\'':
-             // escaped hex char
-             parseHexChar(in);
-             break;
-         case '{':
-         case '}':
-         case '\\':
-         case '\r':
-         case '\n':
-             // escaped char
-             addOutputByte(next);
-             break;
-         default:
-             if (isAlpha(next)) {
-                 // control word
-                 parseControlWord((char) next, in);
-             } else if (next != -1) {
-                 // control symbol, eg \* or \~
-                 processControlSymbol((char) next);
-             }
-             break;
-     }
- }
-
- /**
-  * Parses the two hex digits of a hex escape and buffers the resulting
-  * byte, unless it falls into the ansi shadow of a unicode escape.
-  * A malformed escape is ignored; the offending byte is pushed back so the
-  * caller sees it again.
-  *
-  * @param in stream positioned just after the escape introducer
-  */
- private void parseHexChar(PushbackInputStream in) throws IOException, SAXException, TikaException {
-     int hex1 = in.read();
-     if (!isHexChar(hex1)) {
-         // DOC ERROR (malformed hex escape): ignore
-         unreadUnlessEof(in, hex1);
-         return;
-     }
-
-     int hex2 = in.read();
-     if (!isHexChar(hex2)) {
-         // TODO: log a warning here, somehow?
-         // DOC ERROR (malformed hex escape):
-         // ignore
-         unreadUnlessEof(in, hex2);
-         return;
-     }
-
-     if (ansiSkip != 0) {
-         // Skip this ansi char since we are
-         // still in the shadow of a unicode
-         // escape:
-         ansiSkip--;
-     } else {
-         // Unescape:
-         addOutputByte(16 * hexValue(hex1) + hexValue(hex2));
-     }
- }
-
- // Pushes b back unless it is the end-of-stream marker: unread(-1) would
- // insert a spurious 0xFF byte into the stream.
- private static void unreadUnlessEof(PushbackInputStream in, int b) throws IOException {
-     if (b != -1) {
-         in.unread(b);
-     }
- }
-
- /**
-  * Tokenizes one control word: its letters, an optional sign and an
-  * optional decimal parameter, then dispatches to the matching
-  * processControlWord overload.
-  *
-  * @param firstChar first letter of the control word, already read
-  * @param in        stream positioned just after {@code firstChar}
-  */
- private void parseControlWord(int firstChar, PushbackInputStream in) throws IOException, SAXException, TikaException {
-     addControl(firstChar);
-
-     int b = in.read();
-     while (isAlpha(b)) {
-         addControl(b);
-         b = in.read();
-     }
-
-     boolean hasParam = false;
-     boolean negParam = false;
-     if (b == '-') {
-         negParam = true;
-         hasParam = true;
-         b = in.read();
-     }
-
-     int param = 0;
-     while (isDigit(b)) {
-         param *= 10;
-         param += (b - '0');
-         hasParam = true;
-         b = in.read();
-     }
-
-     // space is consumed as part of the
-     // control word, but is not added to the
-     // control word. End of stream must not be
-     // pushed back either: unread(-1) would insert
-     // a spurious 0xFF byte into the stream.
-     if (b != ' ' && b != -1) {
-         in.unread(b);
-     }
-
-     if (hasParam) {
-         if (negParam) {
-             param = -param;
-         }
-         processControlWord(param, in);
-     } else {
-         processControlWord();
-     }
-
-     pendingControlCount = 0;
- }
-
- // Opens a paragraph (or list item) if none is open yet, closing and
- // re-opening bold/italic around it and starting/ending lists as needed.
- private void lazyStartParagraph() throws IOException, SAXException, TikaException {
-     if (inParagraph) {
-         return;
-     }
-
-     // Ensure </i></b> order
-     if (groupState.italic) {
-         end("i");
-     }
-     if (groupState.bold) {
-         end("b");
-     }
-
-     final boolean otherListPending =
-             pendingListEnd != 0 && groupState.list != pendingListEnd;
-     if (otherListPending) {
-         endList(pendingListEnd);
-         pendingListEnd = 0;
-     }
-     if (inList() && pendingListEnd != groupState.list) {
-         startList(groupState.list);
-     }
-     out.startElement(inList() ? "li" : "p");
-
-     // Ensure <b><i> order
-     if (groupState.bold) {
-         start("b");
-     }
-     if (groupState.italic) {
-         start("i");
-     }
-     inParagraph = true;
- }
-
- // Flushes pending text and closes the current paragraph or list item,
- // closing any open italic/bold elements first. When preserveStyles is
- // true and bold or italic is still set, a new "p" with those styles is
- // opened right away; otherwise the extractor is left outside a paragraph.
- // When preserveStyles is false, a list still pending to end is closed too.
- private void endParagraph(boolean preserveStyles) throws IOException, SAXException, TikaException {
- pushText();
- //maintain consecutive new lines
- if (!inParagraph) {
- lazyStartParagraph();
- }
- if (inParagraph) {
- if (groupState.italic) {
- end("i");
- // the style flag survives only when styles are preserved
- groupState.italic = preserveStyles;
- }
- if (groupState.bold) {
- end("b");
- // the style flag survives only when styles are preserved
- groupState.bold = preserveStyles;
- }
- if (inList()) {
- out.endElement("li");
- } else {
- out.endElement("p");
- }
-
- if (preserveStyles && (groupState.bold || groupState.italic)) {
- // reopen a paragraph carrying the preserved styles, <b> before <i>
- start("p");
- if (groupState.bold) {
- start("b");
- }
- if (groupState.italic) {
- start("i");
- }
- inParagraph = true;
- } else {
- inParagraph = false;
- }
- }
-
- // Ensure closing the list at document end
- if (!preserveStyles && pendingListEnd != 0) {
- endList(pendingListEnd);
- pendingListEnd = 0;
- }
- }
-
- // Sends any buffered UTF16 units to the out ContentHandler, opening a
- // paragraph first if needed, and empties the buffer.
- private void pushChars() throws IOException, SAXException, TikaException {
-     if (pendingCharCount == 0) {
-         return;
-     }
-     lazyStartParagraph();
-     out.characters(pendingChars, 0, pendingCharCount);
-     pendingCharCount = 0;
- }
-
- // Decodes the buffered bytes in pendingBytes
- // into UTF16 code units, and sends the characters
- // to the out ContentHandler, if we are in the body,
- // else appends the characters to the pendingBuffer.
- // Nothing is decoded when the current group is ignored and no
- // metadata field is pending; the byte count is reset in every case.
- private void pushBytes() throws IOException, SAXException, TikaException {
- if (pendingByteCount > 0 && (!groupState.ignore || nextMetaData != null)) {
-
- final CharsetDecoder decoder = getDecoder();
- // expose only the bytes buffered so far to the decoder
- pendingByteBuffer.limit(pendingByteCount);
- assert pendingByteBuffer.position() == 0;
- assert outputBuffer.position() == 0;
-
- // Decode in rounds: outputBuffer is drained after each round, so a
- // round that does not end in UNDERFLOW is followed by another one
- while (true) {
- // We pass true for endOfInput because, when
- // we are called, we should have seen a
- // complete sequence of characters for this
- // charset:
- final CoderResult result = decoder.decode(pendingByteBuffer, outputBuffer, true);
-
- final int pos = outputBuffer.position();
- if (pos > 0) {
- if (inHeader || fieldState == 1) {
- pendingBuffer.append(outputArray, 0, pos);
- } else {
- lazyStartParagraph();
- out.characters(outputArray, 0, pos);
- }
- outputBuffer.position(0);
- }
-
- if (result == CoderResult.UNDERFLOW) {
- break;
- }
- }
-
- // Flush whatever the decoder still holds, draining outputBuffer the same way
- while (true) {
- final CoderResult result = decoder.flush(outputBuffer);
-
- final int pos = outputBuffer.position();
- if (pos > 0) {
- if (inHeader || fieldState == 1) {
- pendingBuffer.append(outputArray, 0, pos);
- } else {
- lazyStartParagraph();
- out.characters(outputArray, 0, pos);
- }
- outputBuffer.position(0);
- }
-
- if (result == CoderResult.UNDERFLOW) {
- break;
- }
- }
-
- // Reset for next decode
- decoder.reset();
- pendingByteBuffer.position(0);
- }
-
- pendingByteCount = 0;
- }
-
- // Compares the control word buffered in pendingControl with s.
- // NOTE: s must be ascii alpha only
- private boolean equals(String s) {
-     final int length = s.length();
-     if (length != pendingControlCount) {
-         return false;
-     }
-     int i = 0;
-     while (i < length) {
-         final char expected = s.charAt(i);
-         assert isAlpha(expected);
-         if (pendingControl[i] != (byte) expected) {
-             return false;
-         }
-         i++;
-     }
-     return true;
- }
-
- // Translates the control symbols that produce text into their unicode
- // characters; every other symbol produces no output here.
- private void processControlSymbol(char ch) throws IOException, SAXException, TikaException {
-     if (ch == '~') {
-         // Non-breaking space -> unicode NON-BREAKING SPACE
-         addOutputChar('\u00a0');
-     } else if (ch == '-') {
-         // Optional hyphen -> unicode SOFT HYPHEN
-         addOutputChar('\u00ad');
-     } else if (ch == '_') {
-         // Non-breaking hyphen -> unicode NON-BREAKING HYPHEN
-         addOutputChar('\u2011');
-     }
-     // '*' marks an ignorable destination (control words defined after
-     // the 1987 RTF spec); these are already handled by processGroupStart()
- }
-
- // Returns a decoder for the charset currently in use. The decoder from
- // the previous call is reused when the charset has not changed.
- private CharsetDecoder getDecoder() throws TikaException {
-     final Charset wanted = getCharset();
-
-     final boolean reusable = lastCharset != null && wanted.equals(lastCharset);
-     if (!reusable) {
-         // Malformed or unmappable input is replaced rather than reported
-         decoder = wanted.newDecoder();
-         decoder.onMalformedInput(CodingErrorAction.REPLACE);
-         decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
-         lastCharset = wanted;
-     }
-
-     return decoder;
- }
-
- // Resolves the charset in use: the current font's charset, else the
- // charset of the global default font (outside the header only), else
- // the global charset.
- private Charset getCharset() throws TikaException {
-     // If a specific font (fN) was set, use its charset
-     final Charset fromFont = groupState.fontCharset;
-     if (fromFont != null) {
-         return fromFont;
-     }
-
-     // Else, if global default font (defN) was set, use that one
-     if (globalDefaultFont != -1 && !inHeader) {
-         final Charset fromDefaultFont = fontToCharset.get(globalDefaultFont);
-         if (fromDefaultFont != null) {
-             return fromDefaultFont;
-         }
-     }
-
-     // Else, use the global charset
-     if (globalCharset != null) {
-         return globalCharset;
-     }
-     throw new TikaException("unable to determine charset");
- }
-
- // Handle control word that takes a parameter:
- // the word itself is the one buffered in pendingControl (compared via
- // equals(String)); param is its numeric argument and in is the stream,
- // which is only read here for the "bin" control word.
- private void processControlWord(int param, PushbackInputStream in) throws IOException, SAXException, TikaException {
-
- // TODO: afN? (associated font number)
-
- // TODO: do these alter text output...?
- /*
- } else if (equals("stshfdbch")) {
- // font to be used by default in
- // style sheet for East Asian chars
- // arg N is font table entry
- } else if (equals("stshfloch")) {
- // font to be used by default in
- // style sheet for ASCII chars
- // arg N is font table entry
- } else if (equals("stshfhich")) {
- // font to be used by default in
- // style sheet for High Ansi chars
- // arg N is font table entry
- } else if (equals("stshfbi")) {
- // style sheet for Complex Scripts (BIDI) chars
- // arg N is font table entry
- */
-
- // TODO: inefficient that we check equals N times;
- // we'd get better perf w/ real lexer (eg
- // JFlex), which uses single-pass FSM to do cmp:
- if (inHeader) {
- if (equals("ansicpg")) {
- // ANSI codepage
- Charset cs = ANSICPG_MAP.get(param);
- if (cs != null) {
- globalCharset = cs;
- }
- } else if (equals("deff")) {
- // Default font
- globalDefaultFont = param;
- } else if (equals("nofpages")) {
- metadata.add(Office.PAGE_COUNT, Integer.toString(param));
- } else if (equals("nofwords")) {
- metadata.add(Office.WORD_COUNT, Integer.toString(param));
- } else if (equals("nofchars")) {
- metadata.add(Office.CHARACTER_COUNT, Integer.toString(param));
- } else if (equals("yr")) {
- // date parts are only recorded in the fields here
- year = param;
- } else if (equals("mo")) {
- month = param;
- } else if (equals("dy")) {
- day = param;
- } else if (equals("hr")) {
- hour = param;
- } else if (equals("min")) {
- minute = param;
- }
-
- if (fontTableState == 1) {
- // Still inside font table -- record the
- // mappings of fN to the fcharset:
- if (groupState.depth < fontTableDepth) {
- fontTableState = 2;
- } else {
- if (equals("f")) {
- // Start new font definition
- curFontID = param;
- } else if (equals("fcharset")) {
- // unknown fcharset numbers leave the font unmapped
- Charset cs = FCHARSET_MAP.get(param);
- if (cs != null) {
- fontToCharset.put(curFontID, cs);
- }
- }
- }
- }
-
- if (currentList != null) {
- if (equals("listid")) {
- currentList.id = param;
- currentListTable.put(currentList.id, currentList);
- } else if (equals("listtemplateid")) {
- currentList.templateID = param;
- } else if (equals("levelnfc") || equals("levelnfcn")) {
- //sanity check to make sure list information isn't corrupt
- if (listTableLevel > -1 &&
- listTableLevel < currentList.numberType.length) {
- currentList.numberType[listTableLevel] = param;
- }
- }
- }
- } else {
- // In document
- if (equals("b")) {
- // b0
- assert param == 0;
- if (groupState.bold) {
- pushText();
- // italic is closed and reopened around </b> to keep the tags nested
- if (groupState.italic) {
- end("i");
- }
- end("b");
- if (groupState.italic) {
- start("i");
- }
- groupState.bold = false;
- }
- } else if (equals("i")) {
- // i0
- assert param == 0;
- if (groupState.italic) {
- pushText();
- end("i");
- groupState.italic = false;
- }
- } else if (equals("f")) {
- // Change current font
- Charset fontCharset = fontToCharset.get(param);
-
- // Push any buffered text before changing
- // font:
- pushText();
-
- if (fontCharset != null) {
- groupState.fontCharset = fontCharset;
- } else {
- // DOC ERROR: font change referenced a
- // non-table'd font number
- // TODO: log a warning? Throw an exc?
- groupState.fontCharset = null;
- }
- } else if (equals("ls")) {
- groupState.list = param;
- } else if (equals("lslvl")) {
- groupState.listLevel = param;
- }
- }
-
- // Process unicode escape. This can appear in doc
- // or in header, since the metadata (info) fields
- // in the header can be unicode escaped as well:
- if (equals("u")) {
- // Unicode escape
- if (!groupState.ignore || groupState.sv || groupState.sn) {
- // only the low 16 bits of param are kept
- final char utf16CodeUnit = (char) (param & 0xffff);
- addOutputChar(utf16CodeUnit);
- }
-
- // After seeing a unicode escape we must
- // skip the next ucSkip ansi chars (the
- // "unicode shadow")
- ansiSkip = groupState.ucSkip;
- } else if (equals("uc")) {
- // Change unicode shadow length
- groupState.ucSkip = param;
- } else if (equals("bin")) {
- // param bytes of raw data follow: they are handed to the embedded-object
- // handler when pictDepth is 1, and skipped otherwise
- if (param >= 0) {
- if (groupState.pictDepth == 1) {
- try {
- embObjHandler.writeBytes(in, param);
- } catch (IOException e) {
- //param was out of bounds or something went wrong during writing.
- //skip this obj and move on
- //TODO: log.warn
- embObjHandler.reset();
- }
- } else {
- IOUtils.skipFully(in, param);
- }
- } else {
- // log some warning?
- }
- }
- }
-
- // True when lists are not ignored and the current group has a non-zero list id.
- private boolean inList() {
-     if (ignoreLists) {
-         return false;
-     }
-     return groupState.list != 0;
- }
-
- /**
-  * Remembers the current list as pending to end and clears it from the group state.
-  * Deferring the end tag allows list items of the same list to be merged within one
-  * enclosing list tag (ie. either <code>"ul"</code>, or <code>"ol"</code>).
-  */
- private void pendingListEnd() {
-     final int currentListId = groupState.list;
-     pendingListEnd = currentListId;
-     groupState.list = 0;
- }
-
- /**
-  * Writes the closing tag of a list unless lists are ignored. The tag name is
-  * chosen with {@link #isUnorderedList(int)} for the given <code>listID</code>.
-  *
-  * @param listID The ID of the list.
-  * @throws IOException
-  * @throws SAXException
-  * @throws TikaException
-  */
- private void endList(int listID) throws IOException, SAXException, TikaException {
-     if (ignoreLists) {
-         return;
-     }
-     final String tag = isUnorderedList(listID) ? "ul" : "ol";
-     out.endElement(tag);
- }
-
- /**
-  * Writes the opening tag of a list unless lists are ignored. The tag name is
-  * chosen with {@link #isUnorderedList(int)} for the given <code>listID</code>.
-  *
-  * @param listID The ID of the list.
-  * @throws IOException
-  * @throws SAXException
-  * @throws TikaException
-  */
- private void startList(int listID) throws IOException, SAXException, TikaException {
-     if (ignoreLists) {
-         return;
-     }
-     final String tag = isUnorderedList(listID) ? "ul" : "ol";
-     out.startElement(tag);
- }
-
- private boolean isUnorderedList(int listID) {
- ListDescriptor list = listTable.get(listID);
- if (list != null) {
- return list.isUnordered(groupState.listLevel);
- }
- return true;
- }
-
- // Shorthand: writes the closing tag for the given element to the output handler.
- private void end(String tag) throws IOException, SAXException, TikaException {
-     out.endElement(tag);
- }
-
- // Shorthand: writes the opening tag for the given element to the output handler.
- private void start(String tag) throws IOException, SAXException, TikaException {
-     out.startElement(tag);
- }
-
- // Handle non-parameter control word.
- //
- // The first half depends on where we are: while still in the header it
- // selects the global charset, marks skippable tables as ignored, selects the
- // list table being filled, arms metadata capture, tracks the font table and
- // detects the end of the header; in the body it only opens bold/italic.
- // The second half (from "pard" on) runs for header and body alike.
- private void processControlWord() throws IOException, SAXException, TikaException {
- if (inHeader) {
- // Keywords that choose the document-wide default charset:
- if (equals("ansi")) {
- globalCharset = WINDOWS_1252;
- } else if (equals("pca")) {
- globalCharset = CP850;
- } else if (equals("pc")) {
- globalCharset = CP437;
- } else if (equals("mac")) {
- globalCharset = MAC_ROMAN;
- }
-
- // Table destinations: the first three are ignored as text, the list
- // tables select which map the following list definitions belong to.
- if (equals("colortbl") || equals("stylesheet") || equals("fonttbl")) {
- groupState.ignore = true;
- } else if (equals("listtable")) {
- currentListTable = listTable;
- } else if (equals("listoverridetable")) {
- currentListTable = listOverrideTable;
- }
-
- // Only arm metadata capture while no upr group is being processed
- // (uprState is -1 outside of upr handling).
- if (uprState == -1) {
- // TODO: we can also parse \creatim, \revtim,
- // \printim, \version, etc.
- if (equals("author")) {
- nextMetaData = TikaCoreProperties.CREATOR;
- } else if (equals("title")) {
- nextMetaData = TikaCoreProperties.TITLE;
- } else if (equals("subject")) {
- // TODO: Move to OO subject in Tika 2.0
- nextMetaData = TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT;
- } else if (equals("keywords")) {
- nextMetaData = TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT;
- } else if (equals("category")) {
- nextMetaData = OfficeOpenXMLCore.CATEGORY;
- } else if (equals("comment")) {
- nextMetaData = TikaCoreProperties.COMMENTS;
- } else if (equals("company")) {
- nextMetaData = OfficeOpenXMLExtended.COMPANY;
- } else if (equals("manager")) {
- nextMetaData = OfficeOpenXMLExtended.MANAGER;
- } else if (equals("template")) {
- nextMetaData = OfficeOpenXMLExtended.TEMPLATE;
- } else if (equals("creatim")) {
- nextMetaData = TikaCoreProperties.CREATED;
- }
- }
-
- // Font table tracking: 0 = not seen yet, 1 = inside, 2 = finished.
- if (fontTableState == 0) {
- // Didn't see font table yet
- if (equals("fonttbl")) {
- fontTableState = 1;
- fontTableDepth = groupState.depth;
- }
- } else if (fontTableState == 1) {
- // Inside font table
- if (groupState.depth < fontTableDepth) {
- fontTableState = 2;
- }
- }
-
- // List table handling
- if (currentListTable != null) {
- if (equals("list") || equals("listoverride")) {
- // A new list definition starts; its level counter restarts at -1.
- currentList = new ListDescriptor();
- listTableLevel = -1;
- } else if (currentList != null) {
- if (equals("liststylename")) {
- currentList.isStyle = true;
- } else if (equals("listlevel")) {
- listTableLevel++;
- }
- }
- }
-
- // Any of these keywords in a non-ignored group ends the header.
- if (!groupState.ignore && (equals("par") || equals("pard") || equals("sect") || equals("sectd") || equals("plain") || equals("ltrch") || equals("rtlch"))) {
- inHeader = false;
- }
- } else {
- if (equals("b")) {
- if (!groupState.bold) {
- pushText();
- lazyStartParagraph();
- if (groupState.italic) {
- // Make sure nesting is always <b><i>
- end("i");
- }
- groupState.bold = true;
- start("b");
- if (groupState.italic) {
- start("i");
- }
- }
- } else if (equals("i")) {
- if (!groupState.italic) {
- pushText();
- lazyStartParagraph();
- groupState.italic = true;
- start("i");
- }
- }
- }
-
- // Snapshot taken before the branches below may change groupState.ignore.
- final boolean ignored = groupState.ignore;
-
- if (equals("pard")) {
- // Reset styles
- pushText();
- if (groupState.italic) {
- end("i");
- groupState.italic = false;
- }
- if (groupState.bold) {
- end("b");
- groupState.bold = false;
- }
- if (inList()) { // && (groupStates.size() == 1 || groupStates.peekLast().list < 0))
- pendingListEnd();
- }
- } else if (equals("par")) {
- if (!ignored) {
- endParagraph(true);
- }
- } else if (equals("shptxt")) {
- pushText();
- // Text inside a shape
- groupState.ignore = false;
- } else if (equals("atnid")) {
- pushText();
- // Annotation ID
- groupState.ignore = false;
- } else if (equals("atnauthor")) {
- pushText();
- // Annotation author
- groupState.ignore = false;
- } else if (equals("annotation")) {
- pushText();
- // Annotation
- groupState.ignore = false;
- } else if (equals("listtext")) {
- groupState.ignore = true;
- } else if (equals("cell")) {
- // TODO: we should produce a table output here?
- //addOutputChar(' ');
- endParagraph(true);
- } else if (equals("sp")) {
- groupState.sp = true;
- } else if (equals("sn")) {
- embObjHandler.startSN();
- groupState.sn = true;
- } else if (equals("sv")) {
- embObjHandler.startSV();
- groupState.sv = true;
- } else if (equals("object")) {
- pushText();
- embObjHandler.setInObject(true);
- groupState.object = true;
- } else if (equals("objdata")) {
- groupState.objdata = true;
- embObjHandler.startObjData();
- } else if (equals("pict")) {
- pushText();
- // TODO: create img tag? but can that support
- // embedded image data?
- groupState.pictDepth = 1;
- embObjHandler.startPict();
- } else if (equals("line")) {
- if (!ignored) {
- addOutputChar('\n');
- }
- } else if (equals("column")) {
- if (!ignored) {
- addOutputChar(' ');
- }
- } else if (equals("page")) {
- if (!ignored) {
- addOutputChar('\n');
- }
- } else if (equals("softline")) {
- if (!ignored) {
- addOutputChar('\n');
- }
- } else if (equals("softcolumn")) {
- if (!ignored) {
- addOutputChar(' ');
- }
- } else if (equals("softpage")) {
- if (!ignored) {
- addOutputChar('\n');
- }
- } else if (equals("tab")) {
- if (!ignored) {
- addOutputChar('\t');
- }
- } else if (equals("upr")) {
- // The next group start will see uprState == 0 and begin ignoring.
- uprState = 0;
- } else if (equals("ud") && uprState == 1) {
- uprState = -1;
- // 2nd group inside the upr destination, which
- // contains the unicode encoding of the text, so
- // we want to keep that:
- groupState.ignore = false;
- } else if (equals("bullet")) {
- if (!ignored) {
- // unicode BULLET
- addOutputChar('\u2022');
- }
- } else if (equals("endash")) {
- if (!ignored) {
- // unicode EN DASH
- addOutputChar('\u2013');
- }
- } else if (equals("emdash")) {
- if (!ignored) {
- // unicode EM DASH
- addOutputChar('\u2014');
- }
- } else if (equals("enspace")) {
- if (!ignored) {
- // unicode EN SPACE
- addOutputChar('\u2002');
- }
- } else if (equals("qmspace")) {
- if (!ignored) {
- // quarter em space -> unicode FOUR-PER-EM SPACE
- addOutputChar('\u2005');
- }
- } else if (equals("emspace")) {
- if (!ignored) {
- // unicode EM SPACE
- addOutputChar('\u2003');
- }
- } else if (equals("lquote")) {
- if (!ignored) {
- // unicode LEFT SINGLE QUOTATION MARK
- addOutputChar('\u2018');
- }
- } else if (equals("rquote")) {
- if (!ignored) {
- // unicode RIGHT SINGLE QUOTATION MARK
- addOutputChar('\u2019');
- }
- } else if (equals("ldblquote")) {
- if (!ignored) {
- // unicode LEFT DOUBLE QUOTATION MARK
- addOutputChar('\u201C');
- }
- } else if (equals("rdblquote")) {
- if (!ignored) {
- // unicode RIGHT DOUBLE QUOTATION MARK
- addOutputChar('\u201D');
- }
- } else if (equals("fldinst")) {
- // Field instruction: its text is collected so the group end can
- // look for a HYPERLINK target.
- fieldState = 1;
- groupState.ignore = false;
- } else if (equals("fldrslt") && fieldState == 2) {
- // Field result following a parsed HYPERLINK: wrap it in an anchor.
- assert pendingURL != null;
- lazyStartParagraph();
- out.startElement("a", "href", pendingURL);
- pendingURL = null;
- fieldState = 3;
- groupState.ignore = false;
- }
- }
-
- // Push new GroupState
- private void processGroupStart(PushbackInputStream in) throws IOException {
- ansiSkip = 0;
- // Push current groupState onto the stack
- groupStates.add(groupState);
-
- // Make new GroupState
- groupState = new GroupState(groupState);
- assert groupStates.size() == groupState.depth : "size=" + groupStates.size() + " depth=" + groupState.depth;
-
- if (uprState == 0) {
- uprState = 1;
- groupState.ignore = true;
- }
-
- // Check for ignorable groups. Note that
- // sometimes we un-ignore within this group, eg
- // when handling upr escape.
- int b2 = in.read();
- if (b2 == '\\') {
- int b3 = in.read();
- if (b3 == '*') {
- groupState.ignore = true;
- }
- in.unread(b3);
- }
- in.unread(b2);
- }
-
- // Pop current GroupState.
- //
- // In order: stores any metadata value collected for this group, notifies the
- // embedded-object handler about finished object/picture data, restores the
- // enclosing group state while re-balancing <b>/<i> tags, and finally advances
- // the hyperlink field state machine.
- private void processGroupEnd() throws IOException, SAXException, TikaException {
- if (inHeader) {
- if (nextMetaData != null) {
- if (nextMetaData == TikaCoreProperties.CREATED) {
- // Built from the year/month/day/hour/minute fields; month is
- // shifted by one because Calendar months are zero-based.
- Calendar cal = Calendar.getInstance(TimeZone.getDefault(), Locale.ROOT);
- cal.set(year, month - 1, day, hour, minute, 0);
- metadata.set(nextMetaData, cal.getTime());
- } else if (nextMetaData.isMultiValuePermitted()) {
- metadata.add(nextMetaData, pendingBuffer.toString());
- } else {
- metadata.set(nextMetaData, pendingBuffer.toString());
- }
- nextMetaData = null;
- }
- // Header text is discarded once the group closes.
- pendingBuffer.setLength(0);
- }
-
- assert groupState.depth > 0;
- ansiSkip = 0;
-
- // Tell the embedded-object handler which kind of group just ended.
- if (groupState.objdata == true) {
- embObjHandler.handleCompletedObject();
- groupState.objdata = false;
- } else if (groupState.pictDepth > 0) {
- if (groupState.sn == true) {
- embObjHandler.endSN();
- } else if (groupState.sv == true) {
- embObjHandler.endSV();
- } else if (groupState.sp == true) {
- embObjHandler.endSP();
- } else if (groupState.pictDepth == 1) {
- embObjHandler.handleCompletedObject();
- }
- }
-
- if (groupState.object == true) {
- embObjHandler.setInObject(false);
- }
-
- // Be robust if RTF doc is corrupt (has too many
- // closing }s):
- // TODO: log a warning?
- if (groupStates.size() > 0) {
- // Restore group state:
- final GroupState outerGroupState = groupStates.removeLast();
-
- // Close italic, if outer does not have italic or
- // bold changed:
- if (groupState.italic) {
- if (!outerGroupState.italic ||
- groupState.bold != outerGroupState.bold) {
- end("i");
- groupState.italic = false;
- }
- }
-
- // Close bold
- if (groupState.bold && !outerGroupState.bold) {
- end("b");
- }
-
- // Open bold
- if (!groupState.bold && outerGroupState.bold) {
- start("b");
- }
-
- // Open italic
- if (!groupState.italic && outerGroupState.italic) {
- start("i");
- }
- groupState = outerGroupState;
- }
- assert groupStates.size() == groupState.depth;
-
- // fieldState 1: a field instruction group just closed; its text is in
- // pendingBuffer.
- if (fieldState == 1) {
- String s = pendingBuffer.toString().trim();
- pendingBuffer.setLength(0);
- if (s.startsWith("HYPERLINK")) {
- // Drop the 9-character "HYPERLINK" keyword.
- s = s.substring(9).trim();
- // TODO: what other instructions can be in a
- // HYPERLINK destination?
- final boolean isLocalLink = s.contains("\\l ");
- // Prefer the text between the first pair of double quotes, if any.
- int idx = s.indexOf('"');
- if (idx != -1) {
- int idx2 = s.indexOf('"', 1 + idx);
- if (idx2 != -1) {
- s = s.substring(1 + idx, idx2);
- }
- }
- // Local links (\l switch) become fragment references.
- pendingURL = (isLocalLink ? "#" : "") + s;
- fieldState = 2;
- } else {
- fieldState = 0;
- }
-
- // TODO: we could process the other known field
- // types. Right now, we will extract their text
- // inlined, but fail to record them in metadata
- // as a field value.
- } else if (fieldState == 3) {
- // The field result that was wrapped in <a> just ended.
- out.endElement("a");
- fieldState = 0;
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.rtf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.CodingErrorAction;
+import java.util.Calendar;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Locale;
+import java.util.Map;
+import java.util.TimeZone;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.CharsetUtils;
+import org.xml.sax.SAXException;
+
+/* Tokenizes and performs a "shallow" parse of the RTF
+ * document, just enough to properly decode the text.
+ *
+ * TODO: we should cutover to a "real" tokenizer (eg JFlex);
+ * it should give better perf, by replacing the excessive
+ * "else if" string compares with FSA traversal. */
+
+final class TextExtractor {
+
+ // Charsets used by the lookup tables below. Each one is resolved once, at
+ // class initialization, through getCharset(name), which returns ASCII when
+ // the lookup throws. ASCII must therefore stay the first constant: static
+ // initializers run in textual order and the later constants may need it as
+ // their fallback.
+ private static final Charset ASCII = Charset.forName("US-ASCII");
+ private static final Charset WINDOWS_1252 = getCharset("WINDOWS-1252");
+ private static final Charset MAC_ROMAN = getCharset("MacRoman");
+ private static final Charset SHIFT_JIS = getCharset("Shift_JIS");
+ private static final Charset WINDOWS_57011 = getCharset("windows-57011");
+ private static final Charset WINDOWS_57010 = getCharset("windows-57010");
+ private static final Charset WINDOWS_57009 = getCharset("windows-57009");
+ private static final Charset WINDOWS_57008 = getCharset("windows-57008");
+ private static final Charset WINDOWS_57007 = getCharset("windows-57007");
+ private static final Charset WINDOWS_57006 = getCharset("windows-57006");
+ private static final Charset WINDOWS_57005 = getCharset("windows-57005");
+ private static final Charset WINDOWS_57004 = getCharset("windows-57004");
+ private static final Charset WINDOWS_57003 = getCharset("windows-57003");
+ private static final Charset X_ISCII91 = getCharset("x-ISCII91");
+ private static final Charset X_MAC_CENTRAL_EUROPE = getCharset("x-MacCentralEurope");
+ private static final Charset MAC_CYRILLIC = getCharset("MacCyrillic");
+ private static final Charset X_JOHAB = getCharset("x-Johab");
+ // The constants with a trailing "2" in their name (CP12582, CP8502, MS9362, ...)
+ // look up the same code page names, in upper case, as the shorter-named
+ // constants further down (CP1258, CP850, MS936, ...).
+ private static final Charset CP12582 = getCharset("CP1258");
+ private static final Charset CP12572 = getCharset("CP1257");
+ private static final Charset CP12562 = getCharset("CP1256");
+ private static final Charset CP12552 = getCharset("CP1255");
+ private static final Charset CP12542 = getCharset("CP1254");
+ private static final Charset CP12532 = getCharset("CP1253");
+ private static final Charset CP1252 = getCharset("CP1252");
+ private static final Charset CP12512 = getCharset("CP1251");
+ private static final Charset CP12502 = getCharset("CP1250");
+ private static final Charset CP950 = getCharset("CP950");
+ private static final Charset CP949 = getCharset("CP949");
+ private static final Charset MS9362 = getCharset("MS936");
+ private static final Charset MS8742 = getCharset("MS874");
+ private static final Charset CP866 = getCharset("CP866");
+ private static final Charset CP865 = getCharset("CP865");
+ private static final Charset CP864 = getCharset("CP864");
+ private static final Charset CP863 = getCharset("CP863");
+ private static final Charset CP862 = getCharset("CP862");
+ private static final Charset CP860 = getCharset("CP860");
+ private static final Charset CP852 = getCharset("CP852");
+ private static final Charset CP8502 = getCharset("CP850");
+ private static final Charset CP819 = getCharset("CP819");
+ private static final Charset WINDOWS_720 = getCharset("windows-720");
+ private static final Charset WINDOWS_711 = getCharset("windows-711");
+ private static final Charset WINDOWS_710 = getCharset("windows-710");
+ private static final Charset WINDOWS_709 = getCharset("windows-709");
+ private static final Charset ISO_8859_6 = getCharset("ISO-8859-6");
+ private static final Charset CP4372 = getCharset("CP437");
+ private static final Charset CP850 = getCharset("cp850");
+ private static final Charset CP437 = getCharset("cp437");
+ private static final Charset MS874 = getCharset("ms874");
+ private static final Charset CP1257 = getCharset("cp1257");
+ private static final Charset CP1256 = getCharset("cp1256");
+ private static final Charset CP1255 = getCharset("cp1255");
+ private static final Charset CP1258 = getCharset("cp1258");
+ private static final Charset CP1254 = getCharset("cp1254");
+ private static final Charset CP1253 = getCharset("cp1253");
+ private static final Charset MS950 = getCharset("ms950");
+ private static final Charset MS936 = getCharset("ms936");
+ private static final Charset MS1361 = getCharset("ms1361");
+ private static final Charset MS932 = getCharset("MS932");
+ private static final Charset CP1251 = getCharset("cp1251");
+ private static final Charset CP1250 = getCharset("cp1250");
+ private static final Charset MAC_THAI = getCharset("MacThai");
+ private static final Charset MAC_TURKISH = getCharset("MacTurkish");
+ private static final Charset MAC_GREEK = getCharset("MacGreek");
+ private static final Charset MAC_ARABIC = getCharset("MacArabic");
+ private static final Charset MAC_HEBREW = getCharset("MacHebrew");
+ private static final Charset JOHAB = getCharset("johab");
+ private static final Charset BIG5 = getCharset("Big5");
+ private static final Charset GB2312 = getCharset("GB2312");
+ private static final Charset MS949 = getCharset("ms949");
+ // The RTF doc has a "font table" that assigns ords
+ // (f0, f1, f2, etc.) to fonts and charsets, using the
+ // \fcharsetN control word. This mapping maps from the
+ // N to corresponding Java charset:
+ private static final Map<Integer, Charset> FCHARSET_MAP =
+ new HashMap<Integer, Charset>();
+ // The RTF may specify the \ansicpgN charset in the
+ // header; this maps the N to the corresponding Java
+ // character set:
+ private static final Map<Integer, Charset> ANSICPG_MAP =
+ new HashMap<Integer, Charset>();
+
+ // Fills FCHARSET_MAP: key is the N of a \fcharsetN control word, value the
+ // Java charset used to decode text in a font declared with that N. Values
+ // with no entry here (see the notes and commented-out lines) are not mapped.
+ static {
+ FCHARSET_MAP.put(0, WINDOWS_1252); // ANSI
+ // charset 1 is Default
+ // charset 2 is Symbol
+
+ // 77-89: Mac charsets
+ FCHARSET_MAP.put(77, MAC_ROMAN); // Mac Roman
+ FCHARSET_MAP.put(78, SHIFT_JIS); // Mac Shift Jis
+ FCHARSET_MAP.put(79, MS949); // Mac Hangul
+ FCHARSET_MAP.put(80, GB2312); // Mac GB2312
+ FCHARSET_MAP.put(81, BIG5); // Mac Big5
+ FCHARSET_MAP.put(82, JOHAB); // Mac Johab (old)
+ FCHARSET_MAP.put(83, MAC_HEBREW); // Mac Hebrew
+ FCHARSET_MAP.put(84, MAC_ARABIC); // Mac Arabic
+ FCHARSET_MAP.put(85, MAC_GREEK); // Mac Greek
+ FCHARSET_MAP.put(86, MAC_TURKISH); // Mac Turkish
+ FCHARSET_MAP.put(87, MAC_THAI); // Mac Thai
+ FCHARSET_MAP.put(88, CP1250); // Mac East Europe
+ FCHARSET_MAP.put(89, CP1251); // Mac Russian
+
+ // 128 and up: Windows / PC charsets
+ FCHARSET_MAP.put(128, MS932); // Shift JIS
+ FCHARSET_MAP.put(129, MS949); // Hangul
+ FCHARSET_MAP.put(130, MS1361); // Johab
+ FCHARSET_MAP.put(134, MS936); // GB2312
+ FCHARSET_MAP.put(136, MS950); // Big5
+ FCHARSET_MAP.put(161, CP1253); // Greek
+ FCHARSET_MAP.put(162, CP1254); // Turkish
+ FCHARSET_MAP.put(163, CP1258); // Vietnamese
+ FCHARSET_MAP.put(177, CP1255); // Hebrew
+ FCHARSET_MAP.put(178, CP1256); // Arabic
+ // FCHARSET_MAP.put( 179, "" ); // Arabic Traditional
+ // FCHARSET_MAP.put( 180, "" ); // Arabic user
+ // FCHARSET_MAP.put( 181, "" ); // Hebrew user
+ FCHARSET_MAP.put(186, CP1257); // Baltic
+
+ FCHARSET_MAP.put(204, CP1251); // Russian
+ FCHARSET_MAP.put(222, MS874); // Thai
+ FCHARSET_MAP.put(238, CP1250); // Eastern European
+ FCHARSET_MAP.put(254, CP437); // PC 437
+ FCHARSET_MAP.put(255, CP850); // OEM
+ }
+
+ // Fills ANSICPG_MAP: key is the N of an \ansicpgN control word (a code page
+ // number), value the Java charset used for it. Each code page is registered
+ // under its own number; in particular 711 and 720 have their own keys rather
+ // than overwriting the entry for 710.
+ static {
+     ANSICPG_MAP.put(437, CP4372); // US IBM
+     ANSICPG_MAP.put(708, ISO_8859_6); // Arabic (ASMO 708)
+
+     ANSICPG_MAP.put(709, WINDOWS_709); // Arabic (ASMO 449+, BCON V4)
+     ANSICPG_MAP.put(710, WINDOWS_710); // Arabic (transparent Arabic)
+     ANSICPG_MAP.put(711, WINDOWS_711); // Arabic (Nafitha Enhanced)
+     ANSICPG_MAP.put(720, WINDOWS_720); // Arabic (transparent ASMO)
+
+     ANSICPG_MAP.put(819, CP819); // Windows 3.1 (US & Western Europe)
+     ANSICPG_MAP.put(850, CP8502); // IBM Multilingual
+     ANSICPG_MAP.put(852, CP852); // Eastern European
+     ANSICPG_MAP.put(860, CP860); // Portuguese
+     ANSICPG_MAP.put(862, CP862); // Hebrew
+     ANSICPG_MAP.put(863, CP863); // French Canadian
+     ANSICPG_MAP.put(864, CP864); // Arabic
+     ANSICPG_MAP.put(865, CP865); // Norwegian
+     ANSICPG_MAP.put(866, CP866); // Soviet Union
+     ANSICPG_MAP.put(874, MS8742); // Thai
+     ANSICPG_MAP.put(932, MS932); // Japanese
+     ANSICPG_MAP.put(936, MS9362); // Simplified Chinese
+     ANSICPG_MAP.put(949, CP949); // Korean
+     ANSICPG_MAP.put(950, CP950); // Traditional Chinese
+     ANSICPG_MAP.put(1250, CP12502); // Eastern European
+     ANSICPG_MAP.put(1251, CP12512); // Cyrillic
+     ANSICPG_MAP.put(1252, CP1252); // Western European
+     ANSICPG_MAP.put(1253, CP12532); // Greek
+     ANSICPG_MAP.put(1254, CP12542); // Turkish
+     ANSICPG_MAP.put(1255, CP12552); // Hebrew
+     ANSICPG_MAP.put(1256, CP12562); // Arabic
+     ANSICPG_MAP.put(1257, CP12572); // Baltic
+     ANSICPG_MAP.put(1258, CP12582); // Vietnamese
+     ANSICPG_MAP.put(1361, X_JOHAB); // Johab
+     ANSICPG_MAP.put(10000, MAC_ROMAN); // Mac Roman
+     ANSICPG_MAP.put(10001, SHIFT_JIS); // Mac Japan
+     ANSICPG_MAP.put(10004, MAC_ARABIC); // Mac Arabic
+     ANSICPG_MAP.put(10005, MAC_HEBREW); // Mac Hebrew
+     ANSICPG_MAP.put(10006, MAC_GREEK); // Mac Greek
+     ANSICPG_MAP.put(10007, MAC_CYRILLIC); // Mac Cyrillic
+     ANSICPG_MAP.put(10029, X_MAC_CENTRAL_EUROPE); // MAC Latin2
+     ANSICPG_MAP.put(10081, MAC_TURKISH); // Mac Turkish
+     ANSICPG_MAP.put(57002, X_ISCII91); // Devanagari
+
+     // TODO: in theory these other charsets are simple
+     // shifts off of Devanagari, so we could impl that
+     // here:
+     ANSICPG_MAP.put(57003, WINDOWS_57003); // Bengali
+     ANSICPG_MAP.put(57004, WINDOWS_57004); // Tamil
+     ANSICPG_MAP.put(57005, WINDOWS_57005); // Telugu
+     ANSICPG_MAP.put(57006, WINDOWS_57006); // Assamese
+     ANSICPG_MAP.put(57007, WINDOWS_57007); // Oriya
+     ANSICPG_MAP.put(57008, WINDOWS_57008); // Kannada
+     ANSICPG_MAP.put(57009, WINDOWS_57009); // Malayalam
+     ANSICPG_MAP.put(57010, WINDOWS_57010); // Gujarati
+     ANSICPG_MAP.put(57011, WINDOWS_57011); // Punjabi
+ }
+
+ // Used when we decode bytes -> chars using CharsetDecoder:
+ private final char[] outputArray = new char[128];
+ private final CharBuffer outputBuffer = CharBuffer.wrap(outputArray);
+ // Holds the font table from this RTF doc, mapping
+ // the font number (from \fN control word) to the
+ // corresponding charset:
+ private final Map<Integer, Charset> fontToCharset =
+ new HashMap<Integer, Charset>();
+ // Group stack: when we open a new group, we push
+ // the previous group state onto the stack; when we
+ // close the group, we restore it
+ private final LinkedList<GroupState> groupStates = new LinkedList<GroupState>();
+ // Collects text while in the header (e.g. metadata values) and while
+ // reading a field instruction (fieldState == 1):
+ private final StringBuilder pendingBuffer = new StringBuilder();
+ // Collaborators supplied through the constructor:
+ private final XHTMLContentHandler out;
+ private final Metadata metadata;
+ private final RTFEmbObjHandler embObjHandler;
+ // How many next ansi chars we should skip; this
+ // is 0 except when we are still in the "ansi
+ // shadow" after seeing a unicode escape, at which
+ // point it's set to the last ucN skip we had seen:
+ int ansiSkip = 0;
+ // NOTE(review): not referenced in the visible part of this class - confirm
+ // it is still needed.
+ private int written = 0;
+ // Hold pending bytes (encoded in the current charset)
+ // for text output:
+ private byte[] pendingBytes = new byte[16];
+ private int pendingByteCount;
+ private ByteBuffer pendingByteBuffer = ByteBuffer.wrap(pendingBytes);
+ // Holds pending chars for text output
+ private char[] pendingChars = new char[10];
+ private int pendingCharCount;
+ // Holds chars for a still-being-tokenized control word
+ private byte[] pendingControl = new byte[10];
+ private int pendingControlCount;
+ // Reused when possible:
+ private CharsetDecoder decoder;
+ private Charset lastCharset;
+ // Document-wide default charset; replaced when the header contains one of
+ // the ansi/pca/pc/mac keywords:
+ private Charset globalCharset = WINDOWS_1252;
+ // -1 until set; presumably assigned while handling parameterized control
+ // words outside this view - TODO confirm:
+ private int globalDefaultFont = -1;
+ private int curFontID = -1;
+ // Current group state; in theory this initial
+ // GroupState is unused because the RTF doc should
+ // immediately open the top group (start with {):
+ private GroupState groupState = new GroupState();
+ // True until the first body keyword (par, pard, sect, ...) is seen in a
+ // non-ignored group:
+ private boolean inHeader = true;
+ // 0 = font table not seen yet, 1 = inside it, 2 = past it; fontTableDepth
+ // is the group depth at which the font table started:
+ private int fontTableState;
+ private int fontTableDepth;
+ // Non null if we are processing metadata (title,
+ // keywords, etc.) inside the info group:
+ private Property nextMetaData;
+ // True while a <p> or <li> element is open in the output:
+ private boolean inParagraph;
+ // Non-zero if we are processing inside a field destination:
+ // 1 = reading the field instruction, 2 = HYPERLINK target parsed and
+ // waiting for the field result, 3 = inside the field result (<a> is open).
+ private int fieldState;
+ // Non-zero list index
+ private int pendingListEnd;
+ private Map<Integer, ListDescriptor> listTable = new HashMap<Integer, ListDescriptor>();
+ private Map<Integer, ListDescriptor> listOverrideTable = new HashMap<Integer, ListDescriptor>();
+ // Points at listTable or listOverrideTable while that table is being read:
+ private Map<Integer, ListDescriptor> currentListTable;
+ private ListDescriptor currentList;
+ private int listTableLevel = -1;
+ // When true no ul/ol/li markup is produced:
+ private boolean ignoreLists;
+ // Non-null if we've seen the url for a HYPERLINK but not yet
+ // its text:
+ private String pendingURL;
+ // Used to process the sub-groups inside the upr
+ // group: -1 = not in upr handling, 0 = upr keyword seen, 1 = first
+ // sub-group opened (and ignored).
+ private int uprState = -1;
+ // Used when extracting CREATION date:
+ private int year, month, day, hour, minute;
+
+ // Creates an extractor that writes XHTML to 'out', document properties to
+ // 'metadata', and hands embedded object/picture data to 'embObjHandler'.
+ public TextExtractor(XHTMLContentHandler out, Metadata metadata,
+         RTFEmbObjHandler embObjHandler) {
+     this.out = out;
+     this.embObjHandler = embObjHandler;
+     this.metadata = metadata;
+ }
+
+ // Resolves a charset by name via CharsetUtils; any failure falls back to
+ // US-ASCII so that class initialization never fails on an unknown charset.
+ private static Charset getCharset(String name) {
+     Charset resolved;
+     try {
+         resolved = CharsetUtils.forName(name);
+     } catch (Exception e) {
+         resolved = ASCII;
+     }
+     return resolved;
+ }
+
+ // True for the ASCII hexadecimal digits 0-9, a-f and A-F; false for
+ // anything else, including -1 (end of stream).
+ protected static boolean isHexChar(int ch) {
+     if (ch >= '0' && ch <= '9') {
+         return true;
+     }
+     if (ch >= 'a' && ch <= 'f') {
+         return true;
+     }
+     return ch >= 'A' && ch <= 'F';
+ }
+
+ // True for the ASCII letters a-z and A-Z only.
+ private static boolean isAlpha(int ch) {
+     if (ch >= 'a' && ch <= 'z') {
+         return true;
+     }
+     return ch >= 'A' && ch <= 'Z';
+ }
+
+ // True for the ASCII digits 0-9 only.
+ private static boolean isDigit(int ch) {
+     return !(ch < '0' || ch > '9');
+ }
+
+ /**
+  * Returns the numeric value (0-15) of a single hexadecimal digit.
+  *
+  * @param ch an ASCII hex digit: 0-9, a-f or A-F. Callers are expected to check
+  *           the input with {@link #isHexChar(int)} first; with assertions
+  *           enabled any other value fails the assertion.
+  * @return the digit's value
+  */
+ protected static int hexValue(int ch) {
+     if (ch >= '0' && ch <= '9') {
+         return ch - '0';
+     } else if (ch >= 'a' && ch <= 'f') {
+         // Hex letters stop at 'f'; accepting up to 'z' would return values > 15.
+         return 10 + (ch - 'a');
+     } else {
+         assert ch >= 'A' && ch <= 'F' : "not a hex digit: " + ch;
+         return 10 + (ch - 'A');
+     }
+ }
+
+ /**
+  * @return <code>true</code> if list markup is currently suppressed
+  */
+ public boolean isIgnoringLists() {
+     return this.ignoreLists;
+ }
+
+ /**
+  * Turns list markup on or off.
+  *
+  * @param ignore <code>true</code> to suppress list markup
+  */
+ public void setIgnoreLists(boolean ignore) {
+     ignoreLists = ignore;
+ }
+
+ // Flushes whatever text is buffered: the pending bytes if there are any,
+ // otherwise the pending chars.
+ private void pushText() throws IOException, SAXException, TikaException {
+     if (pendingByteCount == 0) {
+         pushChars();
+         return;
+     }
+     // Bytes and chars are not expected to be buffered at the same time.
+     assert pendingCharCount == 0;
+     pushBytes();
+ }
+
+ // Queues one byte (a unit in the current charset) for output. Inside a
+ // picture group the byte goes to the embedded-object handler instead of
+ // the text buffer.
+ private void addOutputByte(int b) throws IOException, SAXException, TikaException {
+     assert b >= 0 && b < 256 : "byte value out of range: " + b;
+
+     // Flush buffered chars first so that output order is kept.
+     if (pendingCharCount != 0) {
+         pushChars();
+     }
+     if (groupState.pictDepth > 0) {
+         embObjHandler.writeMetadataChar((char) b);
+         return;
+     }
+
+     if (pendingByteCount == pendingBytes.length) {
+         // Buffer is full: grow it by a quarter and re-wrap it.
+         final int grownLength = (int) (pendingBytes.length * 1.25);
+         final byte[] grown = new byte[grownLength];
+         System.arraycopy(pendingBytes, 0, grown, 0, pendingByteCount);
+         pendingBytes = grown;
+         pendingByteBuffer = ByteBuffer.wrap(grown);
+     }
+     pendingBytes[pendingByteCount] = (byte) b;
+     pendingByteCount++;
+ }
+
+ // Appends one letter to the control word that is currently being tokenized.
+ private void addControl(int b) {
+     assert isAlpha(b);
+     if (pendingControlCount == pendingControl.length) {
+         // Buffer is full: grow it by a quarter.
+         final int grownLength = (int) (pendingControl.length * 1.25);
+         final byte[] grown = new byte[grownLength];
+         System.arraycopy(pendingControl, 0, grown, 0, pendingControlCount);
+         pendingControl = grown;
+     }
+     pendingControl[pendingControlCount] = (byte) b;
+     pendingControlCount++;
+ }
+
+ // Queues one UTF16 code unit. Depending on the parser state it goes to the
+ // header/field-instruction buffer, to the embedded-object handler (inside
+ // sn/sv groups), or to the pending char buffer for body text.
+ private void addOutputChar(char ch) throws IOException, SAXException, TikaException {
+     // Flush buffered bytes first so that output order is kept.
+     if (pendingByteCount != 0) {
+         pushBytes();
+     }
+
+     if (inHeader || fieldState == 1) {
+         pendingBuffer.append(ch);
+         return;
+     }
+     if (groupState.sn || groupState.sv) {
+         embObjHandler.writeMetadataChar(ch);
+         return;
+     }
+
+     if (pendingCharCount == pendingChars.length) {
+         // Buffer is full: grow it by a quarter.
+         final int grownLength = (int) (pendingChars.length * 1.25);
+         final char[] grown = new char[grownLength];
+         System.arraycopy(pendingChars, 0, grown, 0, pendingCharCount);
+         pendingChars = grown;
+     }
+     pendingChars[pendingCharCount] = ch;
+     pendingCharCount++;
+ }
+
+ // Entry point: shallow-parses the whole document read from 'in', writing
+ // the text to this.out and the properties to this.metadata.
+ public void extract(InputStream in) throws IOException, SAXException, TikaException {
+     // The parser looks ahead by at most two bytes (group start followed by
+     // a backslash and one more byte), hence a pushback capacity of 2.
+     final PushbackInputStream pushback = new PushbackInputStream(in, 2);
+     extract(pushback);
+ }
+
+ private void extract(PushbackInputStream in) throws IOException, SAXException, TikaException {
+ out.startDocument();
+
+ while (true) {
+ final int b = in.read();
+ if (b == -1) {
+ break;
+ } else if (b == '\\') {
+ parseControlToken(in);
+ } else if (b == '{') {
+ pushText();
+ processGroupStart(in);
+ } else if (b == '}') {
+ pushText();
+ processGroupEnd();
+ if (groupStates.isEmpty()) {
+ // parsed document closing brace
+ break;
+ }
+ } else if (groupState.objdata == true ||
+ groupState.pictDepth == 1) {
+ embObjHandler.writeHexChar(b);
+ } else if (b != '\r' && b != '\n'
+ && (!groupState.ignore || nextMetaData != null ||
+ groupState.sn == true || groupState.sv == true)) {
+ // Linefeed and carriage return are not
+ // significant
+ if (ansiSkip != 0) {
+ ansiSkip--;
+ } else {
+ addOutputByte(b);
+ }
+ }
+ }
+
+ endParagraph(false);
+ out.endDocument();
+ }
+
+ private void parseControlToken(PushbackInputStream in) throws IOException, SAXException, TikaException {
+ int b = in.read();
+ if (b == '\'') {
+ // escaped hex char
+ parseHexChar(in);
+ } else if (isAlpha(b)) {
+ // control word
+ parseControlWord((char) b, in);
+ } else if (b == '{' || b == '}' || b == '\\' || b == '\r' || b == '\n') {
+ // escaped char
+ addOutputByte(b);
+ } else if (b != -1) {
+ // control symbol, eg \* or \~
+ processControlSymbol((char) b);
+ }
+ }
+
+ private void parseHexChar(PushbackInputStream in) throws IOException, SAXException, TikaException {
+ int hex1 = in.read();
+ if (!isHexChar(hex1)) {
+ // DOC ERROR (malformed hex escape): ignore
+ in.unread(hex1);
+ return;
+ }
+
+ int hex2 = in.read();
+ if (!isHexChar(hex2)) {
+ // TODO: log a warning here, somehow?
+ // DOC ERROR (malformed hex escape):
+ // ignore
+ in.unread(hex2);
+ return;
+ }
+
+ if (ansiSkip != 0) {
+ // Skip this ansi char since we are
+ // still in the shadow of a unicode
+ // escape:
+ ansiSkip--;
+ } else {
+ // Unescape:
+ addOutputByte(16 * hexValue(hex1) + hexValue(hex2));
+ }
+ }
+
+ // Tokenizes one control word: its letters, an optional (possibly negative)
+ // decimal parameter and the optional single space that terminates it, then
+ // dispatches to the matching processControlWord overload.
+ private void parseControlWord(int firstChar, PushbackInputStream in) throws IOException, SAXException, TikaException {
+     addControl(firstChar);
+
+     int b = in.read();
+     while (isAlpha(b)) {
+         addControl(b);
+         b = in.read();
+     }
+
+     boolean hasParam = false;
+     boolean negParam = false;
+     if (b == '-') {
+         negParam = true;
+         hasParam = true;
+         b = in.read();
+     }
+
+     int param = 0;
+     while (isDigit(b)) {
+         param *= 10;
+         param += (b - '0');
+         hasParam = true;
+         b = in.read();
+     }
+
+     // space is consumed as part of the
+     // control word, but is not added to the
+     // control word
+     // At end of stream (-1) nothing is pushed back: unread(-1) would store
+     // the byte 0xFF, which would later be read as document data.
+     if (b != ' ' && b != -1) {
+         in.unread(b);
+     }
+
+     if (hasParam) {
+         if (negParam) {
+             param = -param;
+         }
+         processControlWord(param, in);
+     } else {
+         processControlWord();
+     }
+
+     // The control word has been handled; start the next one from scratch.
+     pendingControlCount = 0;
+ }
+
+ // Opens a paragraph (or list item) if none is open yet. Open style tags are
+ // closed before and re-opened after the paragraph tag so that the output is
+ // always nested as <p><b><i>; list tags are closed/opened as needed.
+ private void lazyStartParagraph() throws IOException, SAXException, TikaException {
+     if (inParagraph) {
+         return;
+     }
+
+     // Ensure </i></b> order
+     if (groupState.italic) {
+         end("i");
+     }
+     if (groupState.bold) {
+         end("b");
+     }
+
+     // Close a list whose end was deferred, unless we are still in that list.
+     if (pendingListEnd != 0 && groupState.list != pendingListEnd) {
+         endList(pendingListEnd);
+         pendingListEnd = 0;
+     }
+     // Open a new list unless we are continuing the deferred one.
+     if (inList() && pendingListEnd != groupState.list) {
+         startList(groupState.list);
+     }
+     out.startElement(inList() ? "li" : "p");
+
+     // Ensure <b><i> order
+     if (groupState.bold) {
+         start("b");
+     }
+     if (groupState.italic) {
+         start("i");
+     }
+     inParagraph = true;
+ }
+
+ private void endParagraph(boolean preserveStyles) throws IOException, SAXException, TikaException {
+ pushText();
+ //maintain consecutive new lines
+ if (!inParagraph) {
+ lazyStartParagraph();
+ }
+ if (inParagraph) {
+ if (groupState.italic) {
+ end("i");
+ groupState.italic = preserveStyles;
+ }
+ if (groupState.bold) {
+ end("b");
+ groupState.bold = preserveStyles;
+ }
+ if (inList()) {
+ out.endElement("li");
+ } else {
+ out.endElement("p");
+ }
+
+ if (preserveStyles && (groupState.bold || groupState.italic)) {
+ start("p");
+ if (groupState.bold) {
+ start("b");
+ }
+ if (groupState.italic) {
+ start("i");
+ }
+ inParagraph = true;
+ } else {
+ inParagraph = false;
+ }
+ }
+
+ // Ensure closing the list at document end
+ if (!preserveStyles && pendingListEnd != 0) {
+ endList(pendingListEnd);
+ pendingListEnd = 0;
+ }
+ }
+
+ // Push pending UTF16 units to out ContentHandler
+ private void pushChars() throws IOException, SAXException, TikaException {
+ if (pendingCharCount != 0) {
+ lazyStartParagraph();
+ out.characters(pendingChars, 0, pendingCharCount);
+ pendingCharCount = 0;
+ }
+ }
+
+ // Decodes the buffered bytes in pendingBytes
+ // into UTF16 code units, and sends the characters
+ // to the out ContentHandler, if we are in the body,
+ // else appends the characters to the pendingBuffer
+ private void pushBytes() throws IOException, SAXException, TikaException {
+ if (pendingByteCount > 0 && (!groupState.ignore || nextMetaData != null)) {
+
+ final CharsetDecoder decoder = getDecoder();
+ pendingByteBuffer.limit(pendingByteCount);
+ assert pendingByteBuffer.position() == 0;
+ assert outputBuffer.position() == 0;
+
+ while (true) {
+ // We pass true for endOfInput because, when
+ // we are called, we should have seen a
+ // complete sequence of characters for this
+ // charset:
+ final CoderResult result = decoder.decode(pendingByteBuffer, outputBuffer, true);
+
+ final int pos = outputBuffer.position();
+ if (pos > 0) {
+ if (inHeader || fieldState == 1) {
+ pendingBuffer.append(outputArray, 0, pos);
+ } else {
+ lazyStartParagraph();
+ out.characters(outputArray, 0, pos);
+ }
+ outputBuffer.position(0);
+ }
+
+ if (result == CoderResult.UNDERFLOW) {
+ break;
+ }
+ }
+
+ while (true) {
+ final CoderResult result = decoder.flush(outputBuffer);
+
+ final int pos = outputBuffer.position();
+ if (pos > 0) {
+ if (inHeader || fieldState == 1) {
+ pendingBuffer.append(outputArray, 0, pos);
+ } else {
+ lazyStartParagraph();
+ out.characters(outputArray, 0, pos);
+ }
+ outputBuffer.position(0);
+ }
+
+ if (result == CoderResult.UNDERFLOW) {
+ break;
+ }
+ }
+
+ // Reset for next decode
+ decoder.reset();
+ pendingByteBuffer.position(0);
+ }
+
+ pendingByteCount = 0;
+ }
+
+ // NOTE: s must be ascii alpha only
+ private boolean equals(String s) {
+ if (pendingControlCount != s.length()) {
+ return false;
+ }
+ for (int idx = 0; idx < pendingControlCount; idx++) {
+ assert isAlpha(s.charAt(idx));
+ if (((byte) s.charAt(idx)) != pendingControl[idx]) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ private void processControlSymbol(char ch) throws IOException, SAXException, TikaException {
+ switch (ch) {
+ case '~':
+ // Non-breaking space -> unicode NON-BREAKING SPACE
+ addOutputChar('\u00a0');
+ break;
+ case '*':
+ // Ignorable destination (control words defined after
+ // the 1987 RTF spec). These are already handled by
+ // processGroupStart()
+ break;
+ case '-':
+ // Optional hyphen -> unicode SOFT HYPHEN
+ addOutputChar('\u00ad');
+ break;
+ case '_':
+ // Non-breaking hyphen -> unicode NON-BREAKING HYPHEN
+ addOutputChar('\u2011');
+ break;
+ default:
+ break;
+ }
+ }
+
+ private CharsetDecoder getDecoder() throws TikaException {
+ Charset charset = getCharset();
+
+ // Common case: charset is same as last time, so
+ // just reuse it:
+ if (lastCharset == null || !charset.equals(lastCharset)) {
+ decoder = charset.newDecoder();
+ decoder.onMalformedInput(CodingErrorAction.REPLACE);
+ decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
+ lastCharset = charset;
+ }
+
+ return decoder;
+ }
+
+ // Return current charset in-use
+ private Charset getCharset() throws TikaException {
+ // If a specific font (fN) was set, use its charset
+ if (groupState.fontCharset != null) {
+ return groupState.fontCharset;
+ }
+
+ // Else, if global default font (defN) was set, use that one
+ if (globalDefaultFont != -1 && !inHeader) {
+ Charset cs = fontToCharset.get(globalDefaultFont);
+ if (cs != null) {
+ return cs;
+ }
+ }
+
+ /
<TRUNCATED>
[13/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java
index 6336258..c60f955 100644
--- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java
+++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java
@@ -1,174 +1,174 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.iwork;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-class KeynoteContentHandler extends DefaultHandler {
-
- public final static String PRESENTATION_WIDTH = "slides-width";
- public final static String PRESENTATION_HEIGHT = "slides-height";
-
- private final XHTMLContentHandler xhtml;
- private final Metadata metadata;
-
- private boolean inSlide = false;
- private boolean inTheme = false;
- private boolean inTitle = false;
- private boolean inBody = false;
- private String tableId;
- private Integer numberOfColumns = null;
- private Integer currentColumn = null;
-
- private boolean inMetadata = false;
- private boolean inMetaDataTitle = false;
- private boolean inMetaDataAuthors = false;
-
- private boolean inParsableText = false;
-
- private int numberOfSlides = 0;
-
- KeynoteContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
- this.xhtml = xhtml;
- this.metadata = metadata;
- }
-
- @Override
- public void endDocument() throws SAXException {
- metadata.set(Metadata.SLIDE_COUNT, String.valueOf(numberOfSlides));
- }
-
- @Override
- public void startElement(
- String uri, String localName, String qName, Attributes attributes)
- throws SAXException {
- if ("key:theme".equals(qName)) {
- inTheme = true;
- } else if ("key:slide".equals(qName)) {
- inSlide = true;
- numberOfSlides++;
- xhtml.startElement("div");
- } else if ("key:master-slide".equals(qName)) {
- inSlide = true;
- xhtml.startElement("div");
- } else if ("key:title-placeholder".equals(qName) && inSlide) {
- inTitle = true;
- xhtml.startElement("h1");
- } else if ("sf:sticky-note".equals(qName) && inSlide) {
- xhtml.startElement("p");
- } else if ("key:notes".equals(qName) && inSlide) {
- xhtml.startElement("p");
- } else if ("key:body-placeholder".equals(qName) && inSlide) {
- xhtml.startElement("p");
- inBody = true;
- } else if ("key:size".equals(qName) && !inTheme) {
- String width = attributes.getValue("sfa:w");
- String height = attributes.getValue("sfa:h");
- metadata.set(PRESENTATION_WIDTH, width);
- metadata.set(PRESENTATION_HEIGHT, height);
- } else if ("sf:text-body".equals(qName)) {
- inParsableText = true;
- } else if ("key:metadata".equals(qName)) {
- inMetadata = true;
- } else if (inMetadata && "key:title".equals(qName)) {
- inMetaDataTitle = true;
- } else if (inMetadata && "key:authors".equals(qName)) {
- inMetaDataAuthors = true;
- } else if (inMetaDataTitle && "key:string".equals(qName)) {
- metadata.set(TikaCoreProperties.TITLE, attributes.getValue("sfa:string"));
- } else if (inMetaDataAuthors && "key:string".equals(qName)) {
- metadata.add(TikaCoreProperties.CREATOR, attributes.getValue("sfa:string"));
- } else if (inSlide && "sf:tabular-model".equals(qName)) {
- tableId = attributes.getValue("sfa:ID");
- xhtml.startElement("table");
- } else if (tableId != null && "sf:columns".equals(qName)) {
- numberOfColumns = Integer.parseInt(attributes.getValue("sf:count"));
- currentColumn = 0;
- } else if (tableId != null && "sf:ct".equals(qName)) {
- parseTableData(attributes.getValue("sfa:s"));
- } else if (tableId != null && "sf:n".equals(qName)) {
- parseTableData(attributes.getValue("sf:v"));
- } else if ("sf:p".equals(qName)) {
- xhtml.startElement("p");
- }
- }
-
- @Override
- public void endElement(String uri, String localName, String qName)
- throws SAXException {
- if ("key:theme".equals(qName)) {
- inTheme = false;
- } else if ("key:slide".equals(qName)) {
- inSlide = false;
- xhtml.endElement("div");
- } else if ("key:master-slide".equals(qName)) {
- inSlide = false;
- xhtml.endElement("div");
- } else if ("key:title-placeholder".equals(qName) && inSlide) {
- inTitle = false;
- xhtml.endElement("h1");
- } else if ("sf:sticky-note".equals(qName) && inSlide) {
- xhtml.endElement("p");
- } else if ("key:notes".equals(qName) && inSlide) {
- xhtml.endElement("p");
- } else if ("key:body-placeholder".equals(qName) && inSlide) {
- xhtml.endElement("p");
- inBody = false;
- } else if ("sf:text-body".equals(qName)) {
- inParsableText = false;
- } else if ("key:metadata".equals(qName)) {
- inMetadata = false;
- } else if (inMetadata && "key:title".equals(qName)) {
- inMetaDataTitle = false;
- } else if (inMetadata && "key:authors".equals(qName)) {
- inMetaDataAuthors = false;
- } else if (inSlide && "sf:tabular-model".equals(qName)) {
- xhtml.endElement("table");
- tableId = null;
- numberOfColumns = null;
- currentColumn = null;
- } else if ("sf:p".equals(qName)) {
- xhtml.endElement("p");
- }
- }
-
- @Override
- public void characters(char[] ch, int start, int length)
- throws SAXException {
- if (inParsableText && inSlide && length != 0) {
- xhtml.characters(ch, start, length);
- }
- }
-
- private void parseTableData(String value) throws SAXException {
- if (currentColumn == 0) {
- xhtml.startElement("tr");
- }
-
- xhtml.element("td", value);
-
- if (currentColumn.equals(numberOfColumns)) {
- xhtml.endElement("tr");
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+class KeynoteContentHandler extends DefaultHandler {
+
+ public final static String PRESENTATION_WIDTH = "slides-width";
+ public final static String PRESENTATION_HEIGHT = "slides-height";
+
+ private final XHTMLContentHandler xhtml;
+ private final Metadata metadata;
+
+ private boolean inSlide = false;
+ private boolean inTheme = false;
+ private boolean inTitle = false;
+ private boolean inBody = false;
+ private String tableId;
+ private Integer numberOfColumns = null;
+ private Integer currentColumn = null;
+
+ private boolean inMetadata = false;
+ private boolean inMetaDataTitle = false;
+ private boolean inMetaDataAuthors = false;
+
+ private boolean inParsableText = false;
+
+ private int numberOfSlides = 0;
+
+ KeynoteContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
+ this.xhtml = xhtml;
+ this.metadata = metadata;
+ }
+
+ @Override
+ public void endDocument() throws SAXException {
+ metadata.set(Metadata.SLIDE_COUNT, String.valueOf(numberOfSlides));
+ }
+
+ @Override
+ public void startElement(
+ String uri, String localName, String qName, Attributes attributes)
+ throws SAXException {
+ if ("key:theme".equals(qName)) {
+ inTheme = true;
+ } else if ("key:slide".equals(qName)) {
+ inSlide = true;
+ numberOfSlides++;
+ xhtml.startElement("div");
+ } else if ("key:master-slide".equals(qName)) {
+ inSlide = true;
+ xhtml.startElement("div");
+ } else if ("key:title-placeholder".equals(qName) && inSlide) {
+ inTitle = true;
+ xhtml.startElement("h1");
+ } else if ("sf:sticky-note".equals(qName) && inSlide) {
+ xhtml.startElement("p");
+ } else if ("key:notes".equals(qName) && inSlide) {
+ xhtml.startElement("p");
+ } else if ("key:body-placeholder".equals(qName) && inSlide) {
+ xhtml.startElement("p");
+ inBody = true;
+ } else if ("key:size".equals(qName) && !inTheme) {
+ String width = attributes.getValue("sfa:w");
+ String height = attributes.getValue("sfa:h");
+ metadata.set(PRESENTATION_WIDTH, width);
+ metadata.set(PRESENTATION_HEIGHT, height);
+ } else if ("sf:text-body".equals(qName)) {
+ inParsableText = true;
+ } else if ("key:metadata".equals(qName)) {
+ inMetadata = true;
+ } else if (inMetadata && "key:title".equals(qName)) {
+ inMetaDataTitle = true;
+ } else if (inMetadata && "key:authors".equals(qName)) {
+ inMetaDataAuthors = true;
+ } else if (inMetaDataTitle && "key:string".equals(qName)) {
+ metadata.set(TikaCoreProperties.TITLE, attributes.getValue("sfa:string"));
+ } else if (inMetaDataAuthors && "key:string".equals(qName)) {
+ metadata.add(TikaCoreProperties.CREATOR, attributes.getValue("sfa:string"));
+ } else if (inSlide && "sf:tabular-model".equals(qName)) {
+ tableId = attributes.getValue("sfa:ID");
+ xhtml.startElement("table");
+ } else if (tableId != null && "sf:columns".equals(qName)) {
+ numberOfColumns = Integer.parseInt(attributes.getValue("sf:count"));
+ currentColumn = 0;
+ } else if (tableId != null && "sf:ct".equals(qName)) {
+ parseTableData(attributes.getValue("sfa:s"));
+ } else if (tableId != null && "sf:n".equals(qName)) {
+ parseTableData(attributes.getValue("sf:v"));
+ } else if ("sf:p".equals(qName)) {
+ xhtml.startElement("p");
+ }
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String qName)
+ throws SAXException {
+ if ("key:theme".equals(qName)) {
+ inTheme = false;
+ } else if ("key:slide".equals(qName)) {
+ inSlide = false;
+ xhtml.endElement("div");
+ } else if ("key:master-slide".equals(qName)) {
+ inSlide = false;
+ xhtml.endElement("div");
+ } else if ("key:title-placeholder".equals(qName) && inSlide) {
+ inTitle = false;
+ xhtml.endElement("h1");
+ } else if ("sf:sticky-note".equals(qName) && inSlide) {
+ xhtml.endElement("p");
+ } else if ("key:notes".equals(qName) && inSlide) {
+ xhtml.endElement("p");
+ } else if ("key:body-placeholder".equals(qName) && inSlide) {
+ xhtml.endElement("p");
+ inBody = false;
+ } else if ("sf:text-body".equals(qName)) {
+ inParsableText = false;
+ } else if ("key:metadata".equals(qName)) {
+ inMetadata = false;
+ } else if (inMetadata && "key:title".equals(qName)) {
+ inMetaDataTitle = false;
+ } else if (inMetadata && "key:authors".equals(qName)) {
+ inMetaDataAuthors = false;
+ } else if (inSlide && "sf:tabular-model".equals(qName)) {
+ xhtml.endElement("table");
+ tableId = null;
+ numberOfColumns = null;
+ currentColumn = null;
+ } else if ("sf:p".equals(qName)) {
+ xhtml.endElement("p");
+ }
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length)
+ throws SAXException {
+ if (inParsableText && inSlide && length != 0) {
+ xhtml.characters(ch, start, length);
+ }
+ }
+
+ private void parseTableData(String value) throws SAXException {
+ if (currentColumn == 0) {
+ xhtml.startElement("tr");
+ }
+
+ xhtml.element("td", value);
+
+ if (currentColumn.equals(numberOfColumns)) {
+ xhtml.endElement("tr");
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java
index 5dc57ae..0d3dfd1 100644
--- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java
+++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java
@@ -1,231 +1,231 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.iwork;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-import java.util.HashMap;
-import java.util.Map;
-
-class NumbersContentHandler extends DefaultHandler {
-
- private final XHTMLContentHandler xhtml;
- private final Metadata metadata;
-
- private boolean inSheet = false;
-
- private boolean inText = false;
- private boolean parseText = false;
-
- private boolean inMetadata = false;
- private Property metadataKey;
- private String metadataPropertyQName;
-
- private boolean inTable = false;
- private int numberOfSheets = 0;
- private int numberOfColumns = -1;
- private int currentColumn = 0;
-
- private Map<String, String> menuItems = new HashMap<String, String>();
- private String currentMenuItemId;
-
- NumbersContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
- this.xhtml = xhtml;
- this.metadata = metadata;
- }
-
- @Override
- public void endDocument() throws SAXException {
- metadata.set(Metadata.PAGE_COUNT, String.valueOf(numberOfSheets));
- }
-
- @Override
- public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
- if ("ls:workspace".equals(qName)) {
- inSheet = true;
- numberOfSheets++;
- xhtml.startElement("div");
- String sheetName = attributes.getValue("ls:workspace-name");
- metadata.add("sheetNames", sheetName);
- }
-
- if ("sf:text".equals(qName)) {
- inText = true;
- xhtml.startElement("p");
- }
-
- if ("sf:p".equals(qName)) {
- parseText = true;
- }
-
- if ("sf:metadata".equals(qName)) {
- inMetadata = true;
- return;
- }
-
- if (inMetadata && metadataKey == null) {
- metadataKey = resolveMetadataKey(localName);
- metadataPropertyQName = qName;
- }
-
- if (inMetadata && metadataKey != null && "sf:string".equals(qName)) {
- metadata.add(metadataKey, attributes.getValue("sfa:string"));
- }
-
- if (!inSheet) {
- return;
- }
-
- if ("sf:tabular-model".equals(qName)) {
- String tableName = attributes.getValue("sf:name");
- xhtml.startElement("div");
- xhtml.characters(tableName);
- xhtml.endElement("div");
- inTable = true;
- xhtml.startElement("table");
- xhtml.startElement("tr");
- currentColumn = 0;
- }
-
- if ("sf:menu-choices".equals(qName)) {
- menuItems = new HashMap<String, String>();
- }
-
- if (inTable && "sf:grid".equals(qName)) {
- numberOfColumns = Integer.parseInt(attributes.getValue("sf:numcols"));
- }
-
- if (menuItems != null && "sf:t".equals(qName)) {
- currentMenuItemId = attributes.getValue("sfa:ID");
- }
-
- if (currentMenuItemId != null && "sf:ct".equals(qName)) {
- menuItems.put(currentMenuItemId, attributes.getValue("sfa:s"));
- }
-
- if (inTable && "sf:ct".equals(qName)) {
- if (currentColumn >= numberOfColumns) {
- currentColumn = 0;
- xhtml.endElement("tr");
- xhtml.startElement("tr");
- }
-
- xhtml.element("td", attributes.getValue("sfa:s"));
- currentColumn++;
- }
-
- if (inTable && ("sf:n".equals(qName) || "sf:rn".equals(qName))) {
- if (currentColumn >= numberOfColumns) {
- currentColumn = 0;
- xhtml.endElement("tr");
- xhtml.startElement("tr");
- }
-
- xhtml.element("td", attributes.getValue("sf:v"));
- currentColumn++;
- }
-
- if (inTable && "sf:proxied-cell-ref".equals(qName)) {
- if (currentColumn >= numberOfColumns) {
- currentColumn = 0;
- xhtml.endElement("tr");
- xhtml.startElement("tr");
- }
-
- xhtml.element("td", menuItems.get(attributes.getValue("sfa:IDREF")));
- currentColumn++;
- }
-
- if ("sf:chart-name".equals(qName)) {
- // Extract chart name:
- xhtml.startElement("div", "class", "chart");
- xhtml.startElement("h1");
- xhtml.characters(attributes.getValue("sfa:string"));
- xhtml.endElement("h1");
- xhtml.endElement("div");
- }
- }
-
- @Override
- public void characters(char[] ch, int start, int length) throws SAXException {
- if (parseText && length > 0) {
- xhtml.characters(ch, start, length);
- }
- }
-
- @Override
- public void endElement(String uri, String localName, String qName) throws SAXException {
- if ("ls:workspace".equals(qName)) {
- inSheet = false;
- xhtml.endElement("div");
- }
-
- if ("sf:text".equals(qName)) {
- inText = false;
- xhtml.endElement("p");
- }
-
- if ("sf:p".equals(qName)) {
- parseText = false;
- }
-
- if ("sf:metadata".equals(qName)) {
- inMetadata = false;
- }
-
- if (inMetadata && qName.equals(metadataPropertyQName)) {
- metadataPropertyQName = null;
- metadataKey = null;
- }
-
- if (!inSheet) {
- return;
- }
-
- if ("sf:menu-choices".equals(qName)) {
- }
-
- if ("sf:tabular-model".equals(qName)) {
- inTable = false;
- xhtml.endElement("tr");
- xhtml.endElement("table");
- }
-
- if (currentMenuItemId != null && "sf:t".equals(qName)) {
- currentMenuItemId = null;
- }
- }
-
- private Property resolveMetadataKey(String localName) {
- if ("authors".equals(localName)) {
- return TikaCoreProperties.CREATOR;
- }
- if ("title".equals(localName)) {
- return TikaCoreProperties.TITLE;
- }
- if ("comment".equals(localName)) {
- return TikaCoreProperties.COMMENTS;
- }
- return Property.internalText(localName);
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.util.HashMap;
+import java.util.Map;
+
+class NumbersContentHandler extends DefaultHandler {
+
+ private final XHTMLContentHandler xhtml;
+ private final Metadata metadata;
+
+ private boolean inSheet = false;
+
+ private boolean inText = false;
+ private boolean parseText = false;
+
+ private boolean inMetadata = false;
+ private Property metadataKey;
+ private String metadataPropertyQName;
+
+ private boolean inTable = false;
+ private int numberOfSheets = 0;
+ private int numberOfColumns = -1;
+ private int currentColumn = 0;
+
+ private Map<String, String> menuItems = new HashMap<String, String>();
+ private String currentMenuItemId;
+
+ NumbersContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
+ this.xhtml = xhtml;
+ this.metadata = metadata;
+ }
+
+ @Override
+ public void endDocument() throws SAXException {
+ metadata.set(Metadata.PAGE_COUNT, String.valueOf(numberOfSheets));
+ }
+
+ @Override
+ public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
+ if ("ls:workspace".equals(qName)) {
+ inSheet = true;
+ numberOfSheets++;
+ xhtml.startElement("div");
+ String sheetName = attributes.getValue("ls:workspace-name");
+ metadata.add("sheetNames", sheetName);
+ }
+
+ if ("sf:text".equals(qName)) {
+ inText = true;
+ xhtml.startElement("p");
+ }
+
+ if ("sf:p".equals(qName)) {
+ parseText = true;
+ }
+
+ if ("sf:metadata".equals(qName)) {
+ inMetadata = true;
+ return;
+ }
+
+ if (inMetadata && metadataKey == null) {
+ metadataKey = resolveMetadataKey(localName);
+ metadataPropertyQName = qName;
+ }
+
+ if (inMetadata && metadataKey != null && "sf:string".equals(qName)) {
+ metadata.add(metadataKey, attributes.getValue("sfa:string"));
+ }
+
+ if (!inSheet) {
+ return;
+ }
+
+ if ("sf:tabular-model".equals(qName)) {
+ String tableName = attributes.getValue("sf:name");
+ xhtml.startElement("div");
+ xhtml.characters(tableName);
+ xhtml.endElement("div");
+ inTable = true;
+ xhtml.startElement("table");
+ xhtml.startElement("tr");
+ currentColumn = 0;
+ }
+
+ if ("sf:menu-choices".equals(qName)) {
+ menuItems = new HashMap<String, String>();
+ }
+
+ if (inTable && "sf:grid".equals(qName)) {
+ numberOfColumns = Integer.parseInt(attributes.getValue("sf:numcols"));
+ }
+
+ if (menuItems != null && "sf:t".equals(qName)) {
+ currentMenuItemId = attributes.getValue("sfa:ID");
+ }
+
+ if (currentMenuItemId != null && "sf:ct".equals(qName)) {
+ menuItems.put(currentMenuItemId, attributes.getValue("sfa:s"));
+ }
+
+ if (inTable && "sf:ct".equals(qName)) {
+ if (currentColumn >= numberOfColumns) {
+ currentColumn = 0;
+ xhtml.endElement("tr");
+ xhtml.startElement("tr");
+ }
+
+ xhtml.element("td", attributes.getValue("sfa:s"));
+ currentColumn++;
+ }
+
+ if (inTable && ("sf:n".equals(qName) || "sf:rn".equals(qName))) {
+ if (currentColumn >= numberOfColumns) {
+ currentColumn = 0;
+ xhtml.endElement("tr");
+ xhtml.startElement("tr");
+ }
+
+ xhtml.element("td", attributes.getValue("sf:v"));
+ currentColumn++;
+ }
+
+ if (inTable && "sf:proxied-cell-ref".equals(qName)) {
+ if (currentColumn >= numberOfColumns) {
+ currentColumn = 0;
+ xhtml.endElement("tr");
+ xhtml.startElement("tr");
+ }
+
+ xhtml.element("td", menuItems.get(attributes.getValue("sfa:IDREF")));
+ currentColumn++;
+ }
+
+ if ("sf:chart-name".equals(qName)) {
+ // Extract chart name:
+ xhtml.startElement("div", "class", "chart");
+ xhtml.startElement("h1");
+ xhtml.characters(attributes.getValue("sfa:string"));
+ xhtml.endElement("h1");
+ xhtml.endElement("div");
+ }
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length) throws SAXException {
+ if (parseText && length > 0) {
+ xhtml.characters(ch, start, length);
+ }
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String qName) throws SAXException {
+ if ("ls:workspace".equals(qName)) {
+ inSheet = false;
+ xhtml.endElement("div");
+ }
+
+ if ("sf:text".equals(qName)) {
+ inText = false;
+ xhtml.endElement("p");
+ }
+
+ if ("sf:p".equals(qName)) {
+ parseText = false;
+ }
+
+ if ("sf:metadata".equals(qName)) {
+ inMetadata = false;
+ }
+
+ if (inMetadata && qName.equals(metadataPropertyQName)) {
+ metadataPropertyQName = null;
+ metadataKey = null;
+ }
+
+ if (!inSheet) {
+ return;
+ }
+
+ if ("sf:menu-choices".equals(qName)) {
+ }
+
+ if ("sf:tabular-model".equals(qName)) {
+ inTable = false;
+ xhtml.endElement("tr");
+ xhtml.endElement("table");
+ }
+
+ if (currentMenuItemId != null && "sf:t".equals(qName)) {
+ currentMenuItemId = null;
+ }
+ }
+
+ private Property resolveMetadataKey(String localName) {
+ if ("authors".equals(localName)) {
+ return TikaCoreProperties.CREATOR;
+ }
+ if ("title".equals(localName)) {
+ return TikaCoreProperties.TITLE;
+ }
+ if ("comment".equals(localName)) {
+ return TikaCoreProperties.COMMENTS;
+ }
+ return Property.internalText(localName);
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
index b09b36f..9b45769 100644
--- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
+++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
@@ -1,448 +1,448 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.iwork;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-class PagesContentHandler extends DefaultHandler {
-
- private final XHTMLContentHandler xhtml;
- private final Metadata metadata;
-
- /** The (interesting) part of the document we're in. Should be more structured... */
- private enum DocumentPart {
- METADATA, PARSABLE_TEXT,
- HEADERS, HEADER_ODD, HEADER_EVEN, HEADER_FIRST,
- FOOTERS, FOOTER_ODD, FOOTER_EVEN, FOOTER_FIRST,
- FOOTNOTES, ANNOTATIONS;
- }
- private DocumentPart inPart = null;
- private boolean ghostText;
-
- private static String alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
-
- private boolean parseProperty = false;
- private int pageCount = 0;
- private int slPageCount = 0;
-
- private HeaderFooter headers = null;
- private HeaderFooter footers = null;
- private Footnotes footnotes = null;
- private Annotations annotations = null;
-
- private Map<String, List<List<String>>> tableData =
- new HashMap<String, List<List<String>>>();
- private String activeTableId;
- private int numberOfColumns = 0;
- private List<String> activeRow = new ArrayList<String>();
-
- private String metaDataLocalName;
- private String metaDataQName;
-
- PagesContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
- this.xhtml = xhtml;
- this.metadata = metadata;
- }
-
- @Override
- public void endDocument() throws SAXException {
- metadata.set(Metadata.PAGE_COUNT, String.valueOf(pageCount));
- if (pageCount > 0) {
- doFooter();
- xhtml.endElement("div");
- }
- }
-
- @Override
- public void startElement(
- String uri, String localName, String qName, Attributes attributes)
- throws SAXException {
- if (parseProperty) {
- String value = parsePrimitiveElementValue(qName, attributes);
- if (value != null) {
- Object metaDataKey = resolveMetaDataKey(metaDataLocalName);
- if(metaDataKey instanceof Property) {
- metadata.set((Property)metaDataKey, value);
- } else {
- metadata.add((String)metaDataKey, value);
- }
- }
- }
-
- if ("sl:publication-info".equals(qName)) {
- inPart = DocumentPart.METADATA;
- } else if ("sf:metadata".equals(qName)) {
- inPart = DocumentPart.METADATA;
- } else if ("sf:page-start".equals(qName) || "sl:page-group".equals(qName)) {
- if (pageCount > 0) {
- doFooter();
- xhtml.endElement("div");
- }
- xhtml.startElement("div");
- if ("sl:page-group".equals(qName)) {
- slPageCount++;
- } else {
- pageCount++;
- }
- doHeader();
- } else if ("sf:p".equals(qName)) {
- if (pageCount+slPageCount > 0) {
- inPart = DocumentPart.PARSABLE_TEXT;
- xhtml.startElement("p");
- }
- } else if ("sf:attachment".equals(qName)) {
- String kind = attributes.getValue("sf:kind");
- if ("tabular-attachment".equals(kind)) {
- activeTableId = attributes.getValue("sfa:ID");
- tableData.put(activeTableId, new ArrayList<List<String>>());
- }
- } else if ("sf:attachment-ref".equals(qName)) {
- String idRef = attributes.getValue("sfa:IDREF");
- outputTable(idRef);
- } else if ("sf:headers".equals(qName)) {
- headers = new HeaderFooter(qName);
- inPart = DocumentPart.HEADERS;
- } else if ("sf:footers".equals(qName)) {
- footers = new HeaderFooter(qName);
- inPart = DocumentPart.FOOTERS;
- } else if ("sf:header".equals(qName)) {
- inPart = headers.identifyPart(attributes.getValue("sf:name"));
- } else if ("sf:footer".equals(qName)) {
- inPart = footers.identifyPart(attributes.getValue("sf:name"));
- } else if ("sf:page-number".equals(qName)) {
- if (inPart == DocumentPart.FOOTER_ODD
- || inPart == DocumentPart.FOOTER_FIRST
- || inPart == DocumentPart.FOOTER_EVEN) {
- // We are in a footer
- footers.hasAutoPageNumber = true;
- footers.autoPageNumberFormat = attributes.getValue("sf:format");
- } else {
- headers.hasAutoPageNumber = true;
- headers.autoPageNumberFormat = attributes.getValue("sf:format");
- }
-
- xhtml.characters(Integer.toString(this.pageCount));
- } else if ("sf:footnotes".equals(qName)) {
- footnotes = new Footnotes();
- inPart = DocumentPart.FOOTNOTES;
- } else if ("sf:footnote-mark".equals(qName)) {
- footnotes.recordMark(attributes.getValue("sf:mark"));
- } else if ("sf:footnote".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
- // What about non auto-numbered?
- String footnoteMark = attributes.getValue("sf:autonumber");
- if (footnotes != null) {
- String footnoteText = footnotes.footnotes.get(footnoteMark);
- if (footnoteText != null) {
- xhtml.startElement("div", "style", "footnote");
- xhtml.characters("Footnote:" ); // As shown in Pages
- xhtml.characters(footnoteText);
- xhtml.endElement("div");
- }
- }
- } else if ("sf:annotations".equals(qName)) {
- annotations = new Annotations();
- inPart = DocumentPart.ANNOTATIONS;
- } else if ("sf:annotation".equals(qName) && inPart == DocumentPart.ANNOTATIONS) {
- annotations.start(attributes.getValue("sf:target"));
- } else if ("sf:annotation-field".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
- xhtml.startElement("div", "style", "annotated");
-
- String annotationText = annotations.annotations.get(attributes.getValue("sfa:ID"));
- if (annotationText != null) {
- xhtml.startElement("div", "style", "annotation");
- xhtml.characters(annotationText);
- xhtml.endElement("div");
- }
- } else if ("sf:ghost-text".equals(qName)) {
- ghostText = true;
- }
-
- if (activeTableId != null) {
- parseTableData(qName, attributes);
- }
-
- if (inPart == DocumentPart.METADATA) {
- metaDataLocalName = localName;
- metaDataQName = qName;
- parseProperty = true;
- }
- }
-
- @Override
- public void endElement(String uri, String localName, String qName)
- throws SAXException {
- if (metaDataLocalName != null && metaDataLocalName.equals(localName)) {
- metaDataLocalName = null;
- parseProperty = false;
- }
-
- if ("sl:publication-info".equals(qName)) {
- inPart = null;
- } else if ("sf:metadata".equals(qName)) {
- inPart = null;
- } else if ("sf:p".equals(qName) && (pageCount+slPageCount) > 0) {
- inPart = null;
- xhtml.endElement("p");
- } else if ("sf:attachment".equals(qName)) {
- activeTableId = null;
- } else if ("sf:annotation".equals(qName) && inPart == DocumentPart.ANNOTATIONS) {
- annotations.end();
- } else if ("sf:annotation-field".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
- xhtml.endElement("div");
- } else if ("sf:ghost-text".equals(qName)) {
- ghostText = false;
- }
- }
-
- @Override
- public void characters(char[] ch, int start, int length) throws SAXException {
- if (length > 0) {
- if (inPart == DocumentPart.PARSABLE_TEXT) {
- if (!ghostText) {
- xhtml.characters(ch, start, length);
- }
- } else if(inPart != null) {
- String str = new String(ch, start, length);
- if (inPart == DocumentPart.HEADER_FIRST) headers.defaultFirst = str;
- if (inPart == DocumentPart.HEADER_EVEN) headers.defaultEven = str;
- if (inPart == DocumentPart.HEADER_ODD) headers.defaultOdd = str;
- if (inPart == DocumentPart.FOOTER_FIRST) footers.defaultFirst = str;
- if (inPart == DocumentPart.FOOTER_EVEN) footers.defaultEven = str;
- if (inPart == DocumentPart.FOOTER_ODD) footers.defaultOdd = str;
- if (inPart == DocumentPart.FOOTNOTES) footnotes.text(str);
- if (inPart == DocumentPart.ANNOTATIONS) annotations.text(str);
- }
- }
- }
-
- private void parseTableData(String qName, Attributes attributes) {
- if ("sf:grid".equals(qName)) {
- String numberOfColumns = attributes.getValue("sf:numcols");
- this.numberOfColumns = Integer.parseInt(numberOfColumns);
- } else if ("sf:ct".equals(qName)) {
- activeRow.add(attributes.getValue("sfa:s"));
-
- if (activeRow.size() >= 3) {
- tableData.get(activeTableId).add(activeRow);
- activeRow = new ArrayList<String>();
- }
- }
- }
-
- private void outputTable(String idRef) throws SAXException {
- List<List<String>> tableData = this.tableData.get(idRef);
- if (tableData != null) {
- xhtml.startElement("table");
- for (List<String> row : tableData) {
- xhtml.startElement("tr");
- for (String cell : row) {
- xhtml.element("td", cell);
- }
- xhtml.endElement("tr");
- }
- xhtml.endElement("table");
- }
- }
-
- /**
- * Returns a resolved key that is common in other document types or
- * returns the specified metaDataLocalName if no common key could be found.
- * The key could be a simple String key, or could be a {@link Property}
- *
- * @param metaDataLocalName The localname of the element containing metadata
- * @return a resolved key that is common in other document types
- */
- private Object resolveMetaDataKey(String metaDataLocalName) {
- Object metaDataKey = metaDataLocalName;
- if ("sf:authors".equals(metaDataQName)) {
- metaDataKey = TikaCoreProperties.CREATOR;
- } else if ("sf:title".equals(metaDataQName)) {
- metaDataKey = TikaCoreProperties.TITLE;
- } else if ("sl:SLCreationDateProperty".equals(metaDataQName)) {
- metaDataKey = TikaCoreProperties.CREATED;
- } else if ("sl:SLLastModifiedDateProperty".equals(metaDataQName)) {
- metaDataKey = Metadata.LAST_MODIFIED;
- } else if ("sl:language".equals(metaDataQName)) {
- metaDataKey = TikaCoreProperties.LANGUAGE;
- }
- return metaDataKey;
- }
-
- /**
- * Returns the value of a primitive element e.g.:
- * <sl:number sfa:number="0" sfa:type="f"/> - the number attribute
- * <sl:string sfa:string="en"/> = the string attribute
- * <p>
- * Returns <code>null</code> if the value could not be extracted from
- * the list of attributes.
- *
- * @param qName The fully qualified name of the element containing
- * the value to extract
- * @param attributes The list of attributes of which one contains the
- * value to be extracted
- * @return the value of a primitive element
- */
- private String parsePrimitiveElementValue(
- String qName, Attributes attributes) {
- if ("sl:string".equals(qName) || "sf:string".equals(qName)) {
- return attributes.getValue("sfa:string");
- } else if ("sl:number".equals(qName)) {
- return attributes.getValue("sfa:number");
- } else if ("sl:date".equals(qName)) {
- return attributes.getValue("sf:val");
- }
-
- return null;
- }
-
- private void doHeader() throws SAXException {
- if (headers != null) {
- headers.output("header");
- }
- }
- private void doFooter() throws SAXException {
- if (footers != null) {
- footers.output("footer");
- }
- }
-
- /**
- * Represents the Headers or Footers in a document
- */
- private class HeaderFooter {
- private String type; // sf:headers or sf:footers
- private String defaultOdd;
- private String defaultEven;
- private String defaultFirst;
- private boolean hasAutoPageNumber;
- private String autoPageNumberFormat;
- // TODO Can there be custom ones?
-
- private HeaderFooter(String type) {
- this.type = type;
- }
- private DocumentPart identifyPart(String name) {
- if("SFWPDefaultOddHeaderIdentifier".equals(name))
- return DocumentPart.HEADER_ODD;
- if("SFWPDefaultEvenHeaderIdentifier".equals(name))
- return DocumentPart.HEADER_EVEN;
- if("SFWPDefaultFirstHeaderIdentifier".equals(name))
- return DocumentPart.HEADER_FIRST;
-
- if("SFWPDefaultOddFooterIdentifier".equals(name))
- return DocumentPart.FOOTER_ODD;
- if("SFWPDefaultEvenFooterIdentifier".equals(name))
- return DocumentPart.FOOTER_EVEN;
- if("SFWPDefaultFirstFooterIdentifier".equals(name))
- return DocumentPart.FOOTER_FIRST;
-
- return null;
- }
- private void output(String what) throws SAXException {
- String text = null;
- if (pageCount == 1 && defaultFirst != null) {
- text = defaultFirst;
- } else if (pageCount % 2 == 0 && defaultEven != null) {
- text = defaultEven;
- } else {
- text = defaultOdd;
- }
-
- if (text != null) {
- xhtml.startElement("div", "class", "header");
- xhtml.characters(text);
- if (hasAutoPageNumber) {
- if (autoPageNumberFormat == null) { // raw number
- xhtml.characters("\t" + pageCount);
- } else if (autoPageNumberFormat.equals("upper-roman")){
- xhtml.characters("\t" + AutoPageNumberUtils.asRomanNumerals(pageCount));
- } else if (autoPageNumberFormat.equals("lower-roman")){
- xhtml.characters("\t" + AutoPageNumberUtils.asRomanNumeralsLower(pageCount));
- } else if (autoPageNumberFormat.equals("upper-alpha")){
- xhtml.characters("\t" + AutoPageNumberUtils.asAlphaNumeric(pageCount));
- } else if (autoPageNumberFormat.equals("lower-alpha")){
- xhtml.characters("\t" + AutoPageNumberUtils.asAlphaNumericLower(pageCount));
- }
- }
- xhtml.endElement("div");
- }
- }
- }
- /**
- * Represents Footnotes in a document. The way these work
- * in the file format isn't very clean...
- */
- private static class Footnotes {
- /** Mark -> Text */
- Map<String,String> footnotes = new HashMap<String, String>();
- String lastSeenMark = null;
-
- /**
- * Normally happens before the text of the mark
- */
- private void recordMark(String mark) {
- lastSeenMark = mark;
- }
- private void text(String text) {
- if (lastSeenMark != null) {
- if (footnotes.containsKey(lastSeenMark)) {
- text = footnotes.get(lastSeenMark) + text;
- }
- footnotes.put(lastSeenMark, text);
- }
- }
- }
- /**
- * Represents Annotations in a document. We currently
- * just grab all the sf:p text in each one
- */
- private class Annotations {
- /** ID -> Text */
- Map<String,String> annotations = new HashMap<String, String>();
- String currentID = null;
- StringBuffer currentText = null;
-
- private void start(String id) {
- currentID = id;
- currentText = new StringBuffer();
- }
- private void text(String text) {
- if (text != null && text.length() > 0 && currentText != null) {
- currentText.append(text);
- }
- }
- private void end() {
- if (currentText.length() > 0) {
- annotations.put(currentID, currentText.toString());
- currentID = null;
- currentText = null;
- }
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+class PagesContentHandler extends DefaultHandler {
+
+ private final XHTMLContentHandler xhtml;
+ private final Metadata metadata;
+
+ /** The (interesting) part of the document we're in. Should be more structured... */
+ private enum DocumentPart {
+ METADATA, PARSABLE_TEXT,
+ HEADERS, HEADER_ODD, HEADER_EVEN, HEADER_FIRST,
+ FOOTERS, FOOTER_ODD, FOOTER_EVEN, FOOTER_FIRST,
+ FOOTNOTES, ANNOTATIONS;
+ }
+ private DocumentPart inPart = null;
+ private boolean ghostText;
+
+ private static String alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+
+ private boolean parseProperty = false;
+ private int pageCount = 0;
+ private int slPageCount = 0;
+
+ private HeaderFooter headers = null;
+ private HeaderFooter footers = null;
+ private Footnotes footnotes = null;
+ private Annotations annotations = null;
+
+ private Map<String, List<List<String>>> tableData =
+ new HashMap<String, List<List<String>>>();
+ private String activeTableId;
+ private int numberOfColumns = 0;
+ private List<String> activeRow = new ArrayList<String>();
+
+ private String metaDataLocalName;
+ private String metaDataQName;
+
+ PagesContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
+ this.xhtml = xhtml;
+ this.metadata = metadata;
+ }
+
+ @Override
+ public void endDocument() throws SAXException {
+ metadata.set(Metadata.PAGE_COUNT, String.valueOf(pageCount));
+ if (pageCount > 0) {
+ doFooter();
+ xhtml.endElement("div");
+ }
+ }
+
+ @Override
+ public void startElement(
+ String uri, String localName, String qName, Attributes attributes)
+ throws SAXException {
+ if (parseProperty) {
+ String value = parsePrimitiveElementValue(qName, attributes);
+ if (value != null) {
+ Object metaDataKey = resolveMetaDataKey(metaDataLocalName);
+ if(metaDataKey instanceof Property) {
+ metadata.set((Property)metaDataKey, value);
+ } else {
+ metadata.add((String)metaDataKey, value);
+ }
+ }
+ }
+
+ if ("sl:publication-info".equals(qName)) {
+ inPart = DocumentPart.METADATA;
+ } else if ("sf:metadata".equals(qName)) {
+ inPart = DocumentPart.METADATA;
+ } else if ("sf:page-start".equals(qName) || "sl:page-group".equals(qName)) {
+ if (pageCount > 0) {
+ doFooter();
+ xhtml.endElement("div");
+ }
+ xhtml.startElement("div");
+ if ("sl:page-group".equals(qName)) {
+ slPageCount++;
+ } else {
+ pageCount++;
+ }
+ doHeader();
+ } else if ("sf:p".equals(qName)) {
+ if (pageCount+slPageCount > 0) {
+ inPart = DocumentPart.PARSABLE_TEXT;
+ xhtml.startElement("p");
+ }
+ } else if ("sf:attachment".equals(qName)) {
+ String kind = attributes.getValue("sf:kind");
+ if ("tabular-attachment".equals(kind)) {
+ activeTableId = attributes.getValue("sfa:ID");
+ tableData.put(activeTableId, new ArrayList<List<String>>());
+ }
+ } else if ("sf:attachment-ref".equals(qName)) {
+ String idRef = attributes.getValue("sfa:IDREF");
+ outputTable(idRef);
+ } else if ("sf:headers".equals(qName)) {
+ headers = new HeaderFooter(qName);
+ inPart = DocumentPart.HEADERS;
+ } else if ("sf:footers".equals(qName)) {
+ footers = new HeaderFooter(qName);
+ inPart = DocumentPart.FOOTERS;
+ } else if ("sf:header".equals(qName)) {
+ inPart = headers.identifyPart(attributes.getValue("sf:name"));
+ } else if ("sf:footer".equals(qName)) {
+ inPart = footers.identifyPart(attributes.getValue("sf:name"));
+ } else if ("sf:page-number".equals(qName)) {
+ if (inPart == DocumentPart.FOOTER_ODD
+ || inPart == DocumentPart.FOOTER_FIRST
+ || inPart == DocumentPart.FOOTER_EVEN) {
+ // We are in a footer
+ footers.hasAutoPageNumber = true;
+ footers.autoPageNumberFormat = attributes.getValue("sf:format");
+ } else {
+ headers.hasAutoPageNumber = true;
+ headers.autoPageNumberFormat = attributes.getValue("sf:format");
+ }
+
+ xhtml.characters(Integer.toString(this.pageCount));
+ } else if ("sf:footnotes".equals(qName)) {
+ footnotes = new Footnotes();
+ inPart = DocumentPart.FOOTNOTES;
+ } else if ("sf:footnote-mark".equals(qName)) {
+ footnotes.recordMark(attributes.getValue("sf:mark"));
+ } else if ("sf:footnote".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
+ // What about non auto-numbered?
+ String footnoteMark = attributes.getValue("sf:autonumber");
+ if (footnotes != null) {
+ String footnoteText = footnotes.footnotes.get(footnoteMark);
+ if (footnoteText != null) {
+ xhtml.startElement("div", "style", "footnote");
+ xhtml.characters("Footnote:" ); // As shown in Pages
+ xhtml.characters(footnoteText);
+ xhtml.endElement("div");
+ }
+ }
+ } else if ("sf:annotations".equals(qName)) {
+ annotations = new Annotations();
+ inPart = DocumentPart.ANNOTATIONS;
+ } else if ("sf:annotation".equals(qName) && inPart == DocumentPart.ANNOTATIONS) {
+ annotations.start(attributes.getValue("sf:target"));
+ } else if ("sf:annotation-field".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
+ xhtml.startElement("div", "style", "annotated");
+
+ String annotationText = annotations.annotations.get(attributes.getValue("sfa:ID"));
+ if (annotationText != null) {
+ xhtml.startElement("div", "style", "annotation");
+ xhtml.characters(annotationText);
+ xhtml.endElement("div");
+ }
+ } else if ("sf:ghost-text".equals(qName)) {
+ ghostText = true;
+ }
+
+ if (activeTableId != null) {
+ parseTableData(qName, attributes);
+ }
+
+ if (inPart == DocumentPart.METADATA) {
+ metaDataLocalName = localName;
+ metaDataQName = qName;
+ parseProperty = true;
+ }
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String qName)
+ throws SAXException {
+ if (metaDataLocalName != null && metaDataLocalName.equals(localName)) {
+ metaDataLocalName = null;
+ parseProperty = false;
+ }
+
+ if ("sl:publication-info".equals(qName)) {
+ inPart = null;
+ } else if ("sf:metadata".equals(qName)) {
+ inPart = null;
+ } else if ("sf:p".equals(qName) && (pageCount+slPageCount) > 0) {
+ inPart = null;
+ xhtml.endElement("p");
+ } else if ("sf:attachment".equals(qName)) {
+ activeTableId = null;
+ } else if ("sf:annotation".equals(qName) && inPart == DocumentPart.ANNOTATIONS) {
+ annotations.end();
+ } else if ("sf:annotation-field".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
+ xhtml.endElement("div");
+ } else if ("sf:ghost-text".equals(qName)) {
+ ghostText = false;
+ }
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length) throws SAXException {
+ if (length > 0) {
+ if (inPart == DocumentPart.PARSABLE_TEXT) {
+ if (!ghostText) {
+ xhtml.characters(ch, start, length);
+ }
+ } else if(inPart != null) {
+ String str = new String(ch, start, length);
+ if (inPart == DocumentPart.HEADER_FIRST) headers.defaultFirst = str;
+ if (inPart == DocumentPart.HEADER_EVEN) headers.defaultEven = str;
+ if (inPart == DocumentPart.HEADER_ODD) headers.defaultOdd = str;
+ if (inPart == DocumentPart.FOOTER_FIRST) footers.defaultFirst = str;
+ if (inPart == DocumentPart.FOOTER_EVEN) footers.defaultEven = str;
+ if (inPart == DocumentPart.FOOTER_ODD) footers.defaultOdd = str;
+ if (inPart == DocumentPart.FOOTNOTES) footnotes.text(str);
+ if (inPart == DocumentPart.ANNOTATIONS) annotations.text(str);
+ }
+ }
+ }
+
+ private void parseTableData(String qName, Attributes attributes) {
+ if ("sf:grid".equals(qName)) {
+ String numberOfColumns = attributes.getValue("sf:numcols");
+ this.numberOfColumns = Integer.parseInt(numberOfColumns);
+ } else if ("sf:ct".equals(qName)) {
+ activeRow.add(attributes.getValue("sfa:s"));
+
+ if (activeRow.size() >= 3) {
+ tableData.get(activeTableId).add(activeRow);
+ activeRow = new ArrayList<String>();
+ }
+ }
+ }
+
+ private void outputTable(String idRef) throws SAXException {
+ List<List<String>> tableData = this.tableData.get(idRef);
+ if (tableData != null) {
+ xhtml.startElement("table");
+ for (List<String> row : tableData) {
+ xhtml.startElement("tr");
+ for (String cell : row) {
+ xhtml.element("td", cell);
+ }
+ xhtml.endElement("tr");
+ }
+ xhtml.endElement("table");
+ }
+ }
+
+ /**
+ * Returns a resolved key that is common in other document types or
+ * returns the specified metaDataLocalName if no common key could be found.
+ * The key could be a simple String key, or could be a {@link Property}
+ *
+ * @param metaDataLocalName The localname of the element containing metadata
+ * @return a resolved key that is common in other document types
+ */
+ private Object resolveMetaDataKey(String metaDataLocalName) {
+ Object metaDataKey = metaDataLocalName;
+ if ("sf:authors".equals(metaDataQName)) {
+ metaDataKey = TikaCoreProperties.CREATOR;
+ } else if ("sf:title".equals(metaDataQName)) {
+ metaDataKey = TikaCoreProperties.TITLE;
+ } else if ("sl:SLCreationDateProperty".equals(metaDataQName)) {
+ metaDataKey = TikaCoreProperties.CREATED;
+ } else if ("sl:SLLastModifiedDateProperty".equals(metaDataQName)) {
+ metaDataKey = Metadata.LAST_MODIFIED;
+ } else if ("sl:language".equals(metaDataQName)) {
+ metaDataKey = TikaCoreProperties.LANGUAGE;
+ }
+ return metaDataKey;
+ }
+
+ /**
+ * Returns the value of a primitive element e.g.:
+ * <sl:number sfa:number="0" sfa:type="f"/> - the number attribute
+ * <sl:string sfa:string="en"/> = the string attribute
+ * <p>
+ * Returns <code>null</code> if the value could not be extracted from
+ * the list of attributes.
+ *
+ * @param qName The fully qualified name of the element containing
+ * the value to extract
+ * @param attributes The list of attributes of which one contains the
+ * value to be extracted
+ * @return the value of a primitive element
+ */
+ private String parsePrimitiveElementValue(
+ String qName, Attributes attributes) {
+ if ("sl:string".equals(qName) || "sf:string".equals(qName)) {
+ return attributes.getValue("sfa:string");
+ } else if ("sl:number".equals(qName)) {
+ return attributes.getValue("sfa:number");
+ } else if ("sl:date".equals(qName)) {
+ return attributes.getValue("sf:val");
+ }
+
+ return null;
+ }
+
+ private void doHeader() throws SAXException {
+ if (headers != null) {
+ headers.output("header");
+ }
+ }
+ private void doFooter() throws SAXException {
+ if (footers != null) {
+ footers.output("footer");
+ }
+ }
+
+ /**
+ * Represents the Headers or Footers in a document
+ */
+ private class HeaderFooter {
+ private String type; // sf:headers or sf:footers
+ private String defaultOdd;
+ private String defaultEven;
+ private String defaultFirst;
+ private boolean hasAutoPageNumber;
+ private String autoPageNumberFormat;
+ // TODO Can there be custom ones?
+
+ private HeaderFooter(String type) {
+ this.type = type;
+ }
+ private DocumentPart identifyPart(String name) {
+ if("SFWPDefaultOddHeaderIdentifier".equals(name))
+ return DocumentPart.HEADER_ODD;
+ if("SFWPDefaultEvenHeaderIdentifier".equals(name))
+ return DocumentPart.HEADER_EVEN;
+ if("SFWPDefaultFirstHeaderIdentifier".equals(name))
+ return DocumentPart.HEADER_FIRST;
+
+ if("SFWPDefaultOddFooterIdentifier".equals(name))
+ return DocumentPart.FOOTER_ODD;
+ if("SFWPDefaultEvenFooterIdentifier".equals(name))
+ return DocumentPart.FOOTER_EVEN;
+ if("SFWPDefaultFirstFooterIdentifier".equals(name))
+ return DocumentPart.FOOTER_FIRST;
+
+ return null;
+ }
+ private void output(String what) throws SAXException {
+ String text = null;
+ if (pageCount == 1 && defaultFirst != null) {
+ text = defaultFirst;
+ } else if (pageCount % 2 == 0 && defaultEven != null) {
+ text = defaultEven;
+ } else {
+ text = defaultOdd;
+ }
+
+ if (text != null) {
+ xhtml.startElement("div", "class", "header");
+ xhtml.characters(text);
+ if (hasAutoPageNumber) {
+ if (autoPageNumberFormat == null) { // raw number
+ xhtml.characters("\t" + pageCount);
+ } else if (autoPageNumberFormat.equals("upper-roman")){
+ xhtml.characters("\t" + AutoPageNumberUtils.asRomanNumerals(pageCount));
+ } else if (autoPageNumberFormat.equals("lower-roman")){
+ xhtml.characters("\t" + AutoPageNumberUtils.asRomanNumeralsLower(pageCount));
+ } else if (autoPageNumberFormat.equals("upper-alpha")){
+ xhtml.characters("\t" + AutoPageNumberUtils.asAlphaNumeric(pageCount));
+ } else if (autoPageNumberFormat.equals("lower-alpha")){
+ xhtml.characters("\t" + AutoPageNumberUtils.asAlphaNumericLower(pageCount));
+ }
+ }
+ xhtml.endElement("div");
+ }
+ }
+ }
+ /**
+ * Represents Footnotes in a document. The way these work
+ * in the file format isn't very clean...
+ */
+ private static class Footnotes {
+ /** Mark -> Text */
+ Map<String,String> footnotes = new HashMap<String, String>();
+ String lastSeenMark = null;
+
+ /**
+ * Normally happens before the text of the mark
+ */
+ private void recordMark(String mark) {
+ lastSeenMark = mark;
+ }
+ private void text(String text) {
+ if (lastSeenMark != null) {
+ if (footnotes.containsKey(lastSeenMark)) {
+ text = footnotes.get(lastSeenMark) + text;
+ }
+ footnotes.put(lastSeenMark, text);
+ }
+ }
+ }
+ /**
+ * Represents Annotations in a document. We currently
+ * just grab all the sf:p text in each one
+ */
+ private class Annotations {
+ /** ID -> Text */
+ Map<String,String> annotations = new HashMap<String, String>();
+ String currentID = null;
+ StringBuffer currentText = null;
+
+ private void start(String id) {
+ currentID = id;
+ currentText = new StringBuffer();
+ }
+ private void text(String text) {
+ if (text != null && text.length() > 0 && currentText != null) {
+ currentText.append(text);
+ }
+ }
+ private void end() {
+ if (currentText.length() > 0) {
+ annotations.put(currentID, currentText.toString());
+ currentID = null;
+ currentText = null;
+ }
+ }
+ }
+
+}
[30/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
index 98970d9..83d72c9 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
@@ -1,162 +1,162 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.image;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.junit.Test;
-import org.xml.sax.helpers.DefaultHandler;
-
-public class ImageParserTest {
-
- private final Parser parser = new ImageParser();
-
- @Test
- public void testBMP() throws Exception {
- Metadata metadata = new Metadata();
- metadata.set(Metadata.CONTENT_TYPE, "image/bmp");
- InputStream stream =
- getClass().getResourceAsStream("/test-documents/testBMP.bmp");
- parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
- assertEquals("75", metadata.get("height"));
- assertEquals("100", metadata.get("width"));
- assertEquals("8 8 8", metadata.get("Data BitsPerSample"));
- assertEquals("1.0", metadata.get("Dimension PixelAspectRatio"));
- //TODO: figure out why we're getting 0.35273367 in Ubuntu, but not Windows
- //assertEquals("0", metadata.get("Dimension VerticalPhysicalPixelSpacing"));
- //assertEquals("0", metadata.get("Dimension HorizontalPhysicalPixelSpacing"));
- assertEquals("BI_RGB", metadata.get("Compression CompressionTypeName"));
- assertEquals("image/bmp", metadata.get("Content-Type"));
-
- assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
- assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
- assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE));
- }
-
- @Test
- public void testGIF() throws Exception {
- Metadata metadata = new Metadata();
- metadata.set(Metadata.CONTENT_TYPE, "image/gif");
- InputStream stream =
- getClass().getResourceAsStream("/test-documents/testGIF.gif");
- parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
- assertEquals("75", metadata.get("height"));
- assertEquals("100", metadata.get("width"));
- assertEquals("true", metadata.get("Compression Lossless"));
- assertEquals("Normal", metadata.get("Dimension ImageOrientation"));
- assertEquals("lzw", metadata.get("Compression CompressionTypeName"));
- assertEquals("0", metadata.get("Dimension HorizontalPixelOffset"));
- assertEquals("imageLeftPosition=0, imageTopPosition=0, imageWidth=100, imageHeight=75, interlaceFlag=false", metadata.get("ImageDescriptor"));
- assertEquals("Index", metadata.get("Data SampleFormat"));
- assertEquals("3", metadata.get("Chroma NumChannels"));
- assertEquals("1", metadata.get("Compression NumProgressiveScans"));
- assertEquals("RGB", metadata.get("Chroma ColorSpaceType"));
- assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("CommentExtensions CommentExtension"));
- assertEquals("value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership., encoding=ISO-8859-1, compression=none", metadata.get("Text TextEntry"));
- assertEquals("true", metadata.get("Chroma BlackIsZero"));
- assertEquals("disposalMethod=none, userInputFlag=false, transparentColorFlag=false, delayTime=0, transparentColorIndex=0", metadata.get("GraphicControlExtension"));
- assertEquals("0", metadata.get("Dimension VerticalPixelOffset"));
- assertEquals("image/gif", metadata.get("Content-Type"));
-
- assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
- assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
- assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get(TikaCoreProperties.COMMENTS));
- }
-
- @Test
- public void testJPEG() throws Exception {
- Metadata metadata = new Metadata();
- metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
- InputStream stream =
- getClass().getResourceAsStream("/test-documents/testJPEG.jpg");
- parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
- assertEquals("75", metadata.get("height"));
- assertEquals("100", metadata.get("width"));
- assertEquals("0.35277778", metadata.get("Dimension VerticalPixelSize"));
- assertEquals("false", metadata.get("Compression Lossless"));
- assertEquals("class=0, htableId=0", metadata.get("markerSequence dht dhtable"));
- assertEquals("majorVersion=1, minorVersion=1, resUnits=1, Xdensity=72, Ydensity=72, thumbWidth=0, thumbHeight=0", metadata.get("JPEGvariety app0JFIF"));
- assertEquals("225", metadata.get("markerSequence unknown"));
- assertEquals("componentSelector=1, dcHuffTable=0, acHuffTable=0", metadata.get("markerSequence sos scanComponentSpec"));
- assertEquals("normal", metadata.get("Dimension ImageOrientation"));
- assertEquals("1.0", metadata.get("Dimension PixelAspectRatio"));
- assertEquals("elementPrecision=0, qtableId=0", metadata.get("markerSequence dqt dqtable"));
- assertEquals("numScanComponents=3, startSpectralSelection=0, endSpectralSelection=63, approxHigh=0, approxLow=0", metadata.get("markerSequence sos"));
- assertEquals("componentId=1, HsamplingFactor=1, VsamplingFactor=1, QtableSelector=0", metadata.get("markerSequence sof componentSpec"));
- assertEquals("JPEG", metadata.get("Compression CompressionTypeName"));
- assertEquals("0.35277778", metadata.get("Dimension HorizontalPixelSize"));
- assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("markerSequence com"));
- assertEquals("3", metadata.get("Chroma NumChannels"));
- assertEquals("1", metadata.get("Compression NumProgressiveScans"));
- assertEquals("YCbCr", metadata.get("Chroma ColorSpaceType"));
- assertEquals("keyword=comment, value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("Text TextEntry"));
- assertEquals("image/jpeg", metadata.get("Content-Type"));
- assertEquals("process=0, samplePrecision=8, numLines=75, samplesPerLine=100, numFrameComponents=3", metadata.get("markerSequence sof"));
-
- assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
- assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
- assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get(TikaCoreProperties.COMMENTS));
- }
-
- @Test
- public void testPNG() throws Exception {
- Metadata metadata = new Metadata();
- metadata.set(Metadata.CONTENT_TYPE, "image/png");
- InputStream stream =
- getClass().getResourceAsStream("/test-documents/testPNG.png");
- parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
- assertEquals("75", metadata.get("height"));
- assertEquals("100", metadata.get("width"));
- assertEquals("0.35273367", metadata.get("Dimension VerticalPixelSize"));
- assertEquals("8 8 8", metadata.get("Data BitsPerSample"));
- assertEquals("Perceptual", metadata.get("sRGB"));
- assertEquals("true", metadata.get("Compression Lossless"));
- assertEquals("year=2008, month=5, day=6, hour=6, minute=18, second=47", metadata.get("tIME"));
- assertEquals("Normal", metadata.get("Dimension ImageOrientation"));
- assertEquals("1.0", metadata.get("Dimension PixelAspectRatio"));
- assertEquals("keyword=Comment, value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("tEXt tEXtEntry"));
- assertEquals("deflate", metadata.get("Compression CompressionTypeName"));
- assertEquals("UnsignedIntegral", metadata.get("Data SampleFormat"));
- assertEquals("0.35273367", metadata.get("Dimension HorizontalPixelSize"));
- assertEquals("none", metadata.get("Transparency Alpha"));
- assertEquals("pixelsPerUnitXAxis=2835, pixelsPerUnitYAxis=2835, unitSpecifier=meter", metadata.get("pHYs"));
- assertEquals("3", metadata.get("Chroma NumChannels"));
- assertEquals("1", metadata.get("Compression NumProgressiveScans"));
- assertEquals("RGB", metadata.get("Chroma ColorSpaceType"));
- assertEquals("keyword=Comment, value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership., encoding=ISO-8859-1, compression=none", metadata.get("Text TextEntry"));
- assertEquals("PixelInterleaved", metadata.get("Data PlanarConfiguration"));
- assertEquals("width=100, height=75, bitDepth=8, colorType=RGB, compressionMethod=deflate, filterMethod=adaptive, interlaceMethod=none", metadata.get("IHDR"));
- assertEquals("true", metadata.get("Chroma BlackIsZero"));
- assertEquals("year=2008, month=5, day=6, hour=6, minute=18, second=47", metadata.get("Document ImageModificationTime"));
- assertEquals("image/png", metadata.get("Content-Type"));
-
- assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
- assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
- assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE));
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.junit.Test;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class ImageParserTest {
+
+ private final Parser parser = new ImageParser();
+
+ @Test
+ public void testBMP() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "image/bmp");
+ InputStream stream =
+ getClass().getResourceAsStream("/test-documents/testBMP.bmp");
+ parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+ assertEquals("75", metadata.get("height"));
+ assertEquals("100", metadata.get("width"));
+ assertEquals("8 8 8", metadata.get("Data BitsPerSample"));
+ assertEquals("1.0", metadata.get("Dimension PixelAspectRatio"));
+ //TODO: figure out why we're getting 0.35273367 in Ubuntu, but not Windows
+ //assertEquals("0", metadata.get("Dimension VerticalPhysicalPixelSpacing"));
+ //assertEquals("0", metadata.get("Dimension HorizontalPhysicalPixelSpacing"));
+ assertEquals("BI_RGB", metadata.get("Compression CompressionTypeName"));
+ assertEquals("image/bmp", metadata.get("Content-Type"));
+
+ assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+ assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+ assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE));
+ }
+
+ @Test
+ public void testGIF() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "image/gif");
+ InputStream stream =
+ getClass().getResourceAsStream("/test-documents/testGIF.gif");
+ parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+ assertEquals("75", metadata.get("height"));
+ assertEquals("100", metadata.get("width"));
+ assertEquals("true", metadata.get("Compression Lossless"));
+ assertEquals("Normal", metadata.get("Dimension ImageOrientation"));
+ assertEquals("lzw", metadata.get("Compression CompressionTypeName"));
+ assertEquals("0", metadata.get("Dimension HorizontalPixelOffset"));
+ assertEquals("imageLeftPosition=0, imageTopPosition=0, imageWidth=100, imageHeight=75, interlaceFlag=false", metadata.get("ImageDescriptor"));
+ assertEquals("Index", metadata.get("Data SampleFormat"));
+ assertEquals("3", metadata.get("Chroma NumChannels"));
+ assertEquals("1", metadata.get("Compression NumProgressiveScans"));
+ assertEquals("RGB", metadata.get("Chroma ColorSpaceType"));
+ assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("CommentExtensions CommentExtension"));
+ assertEquals("value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership., encoding=ISO-8859-1, compression=none", metadata.get("Text TextEntry"));
+ assertEquals("true", metadata.get("Chroma BlackIsZero"));
+ assertEquals("disposalMethod=none, userInputFlag=false, transparentColorFlag=false, delayTime=0, transparentColorIndex=0", metadata.get("GraphicControlExtension"));
+ assertEquals("0", metadata.get("Dimension VerticalPixelOffset"));
+ assertEquals("image/gif", metadata.get("Content-Type"));
+
+ assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+ assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+ assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get(TikaCoreProperties.COMMENTS));
+ }
+
+ @Test
+ public void testJPEG() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+ InputStream stream =
+ getClass().getResourceAsStream("/test-documents/testJPEG.jpg");
+ parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+ assertEquals("75", metadata.get("height"));
+ assertEquals("100", metadata.get("width"));
+ assertEquals("0.35277778", metadata.get("Dimension VerticalPixelSize"));
+ assertEquals("false", metadata.get("Compression Lossless"));
+ assertEquals("class=0, htableId=0", metadata.get("markerSequence dht dhtable"));
+ assertEquals("majorVersion=1, minorVersion=1, resUnits=1, Xdensity=72, Ydensity=72, thumbWidth=0, thumbHeight=0", metadata.get("JPEGvariety app0JFIF"));
+ assertEquals("225", metadata.get("markerSequence unknown"));
+ assertEquals("componentSelector=1, dcHuffTable=0, acHuffTable=0", metadata.get("markerSequence sos scanComponentSpec"));
+ assertEquals("normal", metadata.get("Dimension ImageOrientation"));
+ assertEquals("1.0", metadata.get("Dimension PixelAspectRatio"));
+ assertEquals("elementPrecision=0, qtableId=0", metadata.get("markerSequence dqt dqtable"));
+ assertEquals("numScanComponents=3, startSpectralSelection=0, endSpectralSelection=63, approxHigh=0, approxLow=0", metadata.get("markerSequence sos"));
+ assertEquals("componentId=1, HsamplingFactor=1, VsamplingFactor=1, QtableSelector=0", metadata.get("markerSequence sof componentSpec"));
+ assertEquals("JPEG", metadata.get("Compression CompressionTypeName"));
+ assertEquals("0.35277778", metadata.get("Dimension HorizontalPixelSize"));
+ assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("markerSequence com"));
+ assertEquals("3", metadata.get("Chroma NumChannels"));
+ assertEquals("1", metadata.get("Compression NumProgressiveScans"));
+ assertEquals("YCbCr", metadata.get("Chroma ColorSpaceType"));
+ assertEquals("keyword=comment, value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("Text TextEntry"));
+ assertEquals("image/jpeg", metadata.get("Content-Type"));
+ assertEquals("process=0, samplePrecision=8, numLines=75, samplesPerLine=100, numFrameComponents=3", metadata.get("markerSequence sof"));
+
+ assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+ assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+ assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get(TikaCoreProperties.COMMENTS));
+ }
+
+ @Test
+ public void testPNG() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "image/png");
+ InputStream stream =
+ getClass().getResourceAsStream("/test-documents/testPNG.png");
+ parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+ assertEquals("75", metadata.get("height"));
+ assertEquals("100", metadata.get("width"));
+ assertEquals("0.35273367", metadata.get("Dimension VerticalPixelSize"));
+ assertEquals("8 8 8", metadata.get("Data BitsPerSample"));
+ assertEquals("Perceptual", metadata.get("sRGB"));
+ assertEquals("true", metadata.get("Compression Lossless"));
+ assertEquals("year=2008, month=5, day=6, hour=6, minute=18, second=47", metadata.get("tIME"));
+ assertEquals("Normal", metadata.get("Dimension ImageOrientation"));
+ assertEquals("1.0", metadata.get("Dimension PixelAspectRatio"));
+ assertEquals("keyword=Comment, value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("tEXt tEXtEntry"));
+ assertEquals("deflate", metadata.get("Compression CompressionTypeName"));
+ assertEquals("UnsignedIntegral", metadata.get("Data SampleFormat"));
+ assertEquals("0.35273367", metadata.get("Dimension HorizontalPixelSize"));
+ assertEquals("none", metadata.get("Transparency Alpha"));
+ assertEquals("pixelsPerUnitXAxis=2835, pixelsPerUnitYAxis=2835, unitSpecifier=meter", metadata.get("pHYs"));
+ assertEquals("3", metadata.get("Chroma NumChannels"));
+ assertEquals("1", metadata.get("Compression NumProgressiveScans"));
+ assertEquals("RGB", metadata.get("Chroma ColorSpaceType"));
+ assertEquals("keyword=Comment, value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership., encoding=ISO-8859-1, compression=none", metadata.get("Text TextEntry"));
+ assertEquals("PixelInterleaved", metadata.get("Data PlanarConfiguration"));
+ assertEquals("width=100, height=75, bitDepth=8, colorType=RGB, compressionMethod=deflate, filterMethod=adaptive, interlaceMethod=none", metadata.get("IHDR"));
+ assertEquals("true", metadata.get("Chroma BlackIsZero"));
+ assertEquals("year=2008, month=5, day=6, hour=6, minute=18, second=47", metadata.get("Document ImageModificationTime"));
+ assertEquals("image/png", metadata.get("Content-Type"));
+
+ assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+ assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+ assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/MetadataFieldsTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/MetadataFieldsTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/MetadataFieldsTest.java
index b78a831..7e3a123 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/MetadataFieldsTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/MetadataFieldsTest.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.image;
-
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
-import org.apache.tika.metadata.TIFF;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.junit.Test;
-
-public class MetadataFieldsTest {
-
- @Test
- public void testIsMetadataField() {
- assertFalse(MetadataFields.isMetadataField("random string that is not a field"));
- assertFalse(MetadataFields.isMetadataField("xyz"));
- assertTrue(MetadataFields.isMetadataField(TikaCoreProperties.KEYWORDS));
- assertTrue(MetadataFields.isMetadataField(TIFF.F_NUMBER.getName()));
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.metadata.TIFF;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Test;
+
+public class MetadataFieldsTest {
+
+ @Test
+ public void testIsMetadataField() {
+ assertFalse(MetadataFields.isMetadataField("random string that is not a field"));
+ assertFalse(MetadataFields.isMetadataField("xyz"));
+ assertTrue(MetadataFields.isMetadataField(TikaCoreProperties.KEYWORDS));
+ assertTrue(MetadataFields.isMetadataField(TIFF.F_NUMBER.getName()));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/TiffParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
index 239c160..d506c33 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
@@ -1,66 +1,66 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.image;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.junit.Test;
-import org.xml.sax.helpers.DefaultHandler;
-
-public class TiffParserTest {
- private final Parser parser = new TiffParser();
-
- @Test
- public void testTIFF() throws Exception {
- Metadata metadata = new Metadata();
- metadata.set(Metadata.CONTENT_TYPE, "image/tiff");
- InputStream stream =
- getClass().getResourceAsStream("/test-documents/testTIFF.tif");
- parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
- assertEquals("Licensed to the Apache Software Foundation (ASF) under one or " +
- "more contributor license agreements. See the NOTICE file " +
- "distributed with this work for additional information regarding " +
- "copyright ownership.", metadata.get(TikaCoreProperties.DESCRIPTION));
-
- // All EXIF/TIFF tags
- assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
-
- // Core EXIF/TIFF tags
- assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
- assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
- assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
- assertEquals("3", metadata.get(Metadata.SAMPLES_PER_PIXEL));
-
- // Embedded XMP
- List<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
- assertTrue("got " + keywords, keywords.contains("cat"));
- assertTrue("got " + keywords, keywords.contains("garden"));
- List<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
- assertTrue("got " + subject, subject.contains("cat"));
- assertTrue("got " + subject, subject.contains("garden"));
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.junit.Test;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class TiffParserTest {
+ private final Parser parser = new TiffParser();
+
+ @Test
+ public void testTIFF() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "image/tiff");
+ InputStream stream =
+ getClass().getResourceAsStream("/test-documents/testTIFF.tif");
+ parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+ assertEquals("Licensed to the Apache Software Foundation (ASF) under one or " +
+ "more contributor license agreements. See the NOTICE file " +
+ "distributed with this work for additional information regarding " +
+ "copyright ownership.", metadata.get(TikaCoreProperties.DESCRIPTION));
+
+ // All EXIF/TIFF tags
+ assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
+
+ // Core EXIF/TIFF tags
+ assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+ assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+ assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
+ assertEquals("3", metadata.get(Metadata.SAMPLES_PER_PIXEL));
+
+ // Embedded XMP
+ List<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
+ assertTrue("got " + keywords, keywords.contains("cat"));
+ assertTrue("got " + keywords, keywords.contains("garden"));
+ List<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
+ assertTrue("got " + subject, subject.contains("cat"));
+ assertTrue("got " + subject, subject.contains("garden"));
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
index f65c797..b189fd7 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
@@ -1,284 +1,284 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.jpeg;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.List;
-import java.util.TimeZone;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TIFF;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.XMPMM;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.junit.AfterClass;
-import org.junit.BeforeClass;
-import org.junit.Test;
-import org.xml.sax.helpers.DefaultHandler;
-
-public class JpegParserTest {
-
- private final Parser parser = new JpegParser();
- static TimeZone CURR_TIME_ZONE = TimeZone.getDefault();
-
- //As of Drew Noakes' metadata-extractor 2.8.1,
- //unspecified timezones appear to be set to
- //TimeZone.getDefault(). We need to normalize this
- //for testing across different time zones.
- //We also appear to have to specify it in the surefire config:
- //<argLine>-Duser.timezone=UTC</argLine>
- @BeforeClass
- public static void setDefaultTimeZone() {
- TimeZone.setDefault(TimeZone.getTimeZone("UTC"));
- }
- @AfterClass
- public static void resetDefaultTimeZone() {
- TimeZone.setDefault(CURR_TIME_ZONE);
- }
- @Test
- public void testJPEG() throws Exception {
- Metadata metadata = new Metadata();
- metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
- InputStream stream =
- getClass().getResourceAsStream("/test-documents/testJPEG_EXIF.jpg");
- parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
- // Core EXIF/TIFF tags
- assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
- assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH));
- assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
- assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
-
- assertEquals("6.25E-4", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1600
- assertEquals("5.6", metadata.get(Metadata.F_NUMBER));
- assertEquals("false", metadata.get(Metadata.FLASH_FIRED));
- assertEquals("194.0", metadata.get(Metadata.FOCAL_LENGTH));
- assertEquals("400", metadata.get(Metadata.ISO_SPEED_RATINGS));
- assertEquals("Canon", metadata.get(Metadata.EQUIPMENT_MAKE));
- assertEquals("Canon EOS 40D", metadata.get(Metadata.EQUIPMENT_MODEL));
- assertEquals("Adobe Photoshop CS3 Macintosh", metadata.get(Metadata.SOFTWARE));
- assertEquals(null, metadata.get(Metadata.ORIENTATION)); // Not present
- assertEquals("240.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
- assertEquals("240.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
- assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
-
- // Check that EXIF/TIFF tags come through with their raw values too
- // (This may be removed for Tika 1.0, as we support more of them
- // with explicit Metadata entries)
- assertEquals("Canon EOS 40D", metadata.get("Model"));
-
- // Common tags
- assertEquals("2009-10-02T23:02:49", metadata.get(Metadata.LAST_MODIFIED));
- assertEquals("Date/Time Original for when the photo was taken, unspecified time zone",
- "2009-08-11T09:09:45", metadata.get(TikaCoreProperties.CREATED));
- List<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
- assertTrue("'canon-55-250' expected in " + keywords, keywords.contains("canon-55-250"));
- assertTrue("'moscow-birds' expected in " + keywords, keywords.contains("moscow-birds"));
- assertTrue("'serbor' expected in " + keywords, keywords.contains("serbor"));
- assertFalse(keywords.contains("canon-55-250 moscow-birds serbor"));
- List<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
- assertTrue("'canon-55-250' expected in " + subject, subject.contains("canon-55-250"));
- assertTrue("'moscow-birds' expected in " + subject, subject.contains("moscow-birds"));
- assertTrue("'serbor' expected in " + subject, subject.contains("serbor"));
- assertFalse(subject.contains("canon-55-250 moscow-birds serbor"));
- }
-
- /**
- * Test for a file with Geographic information (lat, long etc) in it
- */
- @Test
- public void testJPEGGeo() throws Exception {
- Metadata metadata = new Metadata();
- metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
- InputStream stream =
- getClass().getResourceAsStream("/test-documents/testJPEG_GEO.jpg");
- parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
- // Geo tags
- assertEquals("12.54321", metadata.get(Metadata.LATITUDE));
- assertEquals("-54.1234", metadata.get(Metadata.LONGITUDE));
-
- // Core EXIF/TIFF tags
- assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
- assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH));
- assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
- assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
-
- assertEquals("6.25E-4", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1600
- assertEquals("5.6", metadata.get(Metadata.F_NUMBER));
- assertEquals("false", metadata.get(Metadata.FLASH_FIRED));
- assertEquals("194.0", metadata.get(Metadata.FOCAL_LENGTH));
- assertEquals("400", metadata.get(Metadata.ISO_SPEED_RATINGS));
- assertEquals("Canon", metadata.get(Metadata.EQUIPMENT_MAKE));
- assertEquals("Canon EOS 40D", metadata.get(Metadata.EQUIPMENT_MODEL));
- assertEquals("Adobe Photoshop CS3 Macintosh", metadata.get(Metadata.SOFTWARE));
- assertEquals(null, metadata.get(Metadata.ORIENTATION)); // Not present
- assertEquals("240.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
- assertEquals("240.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
- assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
-
- // Common tags
- assertEquals("Date/Time Original for when the photo was taken, unspecified time zone",
- "2009-08-11T09:09:45", metadata.get(TikaCoreProperties.CREATED));
- assertEquals("This image has different Date/Time than Date/Time Original, so it is probably modification date",
- "2009-10-02T23:02:49", metadata.get(Metadata.LAST_MODIFIED));
- assertEquals("Date/Time Original should be stored in EXIF field too",
- "2009-08-11T09:09:45", metadata.get(TIFF.ORIGINAL_DATE));
- assertEquals("canon-55-250", metadata.getValues(TikaCoreProperties.KEYWORDS)[0]);
- assertEquals("canon-55-250", metadata.getValues(Metadata.KEYWORDS)[0]);
- }
-
- /**
- * Test for an image with the geographic information stored in a slightly
- * different way, see TIKA-915 for details
- * Disabled for now, pending a fix to the underlying library
- */
- @Test
- public void testJPEGGeo2() throws Exception {
- Metadata metadata = new Metadata();
- metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
- InputStream stream =
- getClass().getResourceAsStream("/test-documents/testJPEG_GEO_2.jpg");
- parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
- // Geo tags should be there with 5dp, and not rounded
- assertEquals("51.575762", metadata.get(Metadata.LATITUDE));
- assertEquals("-1.567886", metadata.get(Metadata.LONGITUDE));
- }
-
- @Test
- public void testJPEGTitleAndDescription() throws Exception {
- Metadata metadata = new Metadata();
- metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
- InputStream stream =
- getClass().getResourceAsStream("/test-documents/testJPEG_commented.jpg");
- parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
- // embedded comments with non-ascii characters
- assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
- assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR)); // Dublin Core
- // xmp handles spaces in keywords, returns "bird watching, nature reserve, coast, grazelands"
- // but we have to replace them with underscore
-
- List<String> keywords = Arrays.asList(metadata.getValues(Metadata.KEYWORDS));
- assertTrue(keywords.contains("coast"));
- assertTrue(keywords.contains("bird watching"));
- assertEquals(keywords, Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)));
-
- // Core EXIF/TIFF tags
- assertEquals("103", metadata.get(Metadata.IMAGE_WIDTH));
- assertEquals("77", metadata.get(Metadata.IMAGE_LENGTH));
- assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
- assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
-
- assertEquals("1.0E-6", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1000000
- assertEquals("2.8", metadata.get(Metadata.F_NUMBER));
- assertEquals("4.6", metadata.get(Metadata.FOCAL_LENGTH));
- assertEquals("114", metadata.get(Metadata.ISO_SPEED_RATINGS));
- assertEquals(null, metadata.get(Metadata.EQUIPMENT_MAKE));
- assertEquals(null, metadata.get(Metadata.EQUIPMENT_MODEL));
- assertEquals(null, metadata.get(Metadata.SOFTWARE));
- assertEquals("1", metadata.get(Metadata.ORIENTATION)); // Not present
- assertEquals("300.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
- assertEquals("300.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
- assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
- }
-
- @Test
- public void testJPEGTitleAndDescriptionPhotoshop() throws Exception {
- Metadata metadata = new Metadata();
- metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
- InputStream stream =
- getClass().getResourceAsStream("/test-documents/testJPEG_commented_pspcs2mac.jpg");
- parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
- // embedded comments with non-ascii characters
- assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
- assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR));
- List<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
- assertTrue("got " + keywords, keywords.contains("bird watching"));
- List<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
- assertTrue("got " + subject, subject.contains("bird watching"));
- }
-
- @Test
- public void testJPEGTitleAndDescriptionXnviewmp() throws Exception {
- Metadata metadata = new Metadata();
- metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
- InputStream stream =
- getClass().getResourceAsStream("/test-documents/testJPEG_commented_xnviewmp026.jpg");
- parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
- // XnViewMp's default comment dialog has only comment, not headline.
- // Comment is embedded only if "Write comments in XMP" is enabled in settings
- assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
- // xmp handles spaces in keywords, returns "bird watching, nature reserve, coast, grazelands"
- // but we have to replace them with underscore
- String[] subject = metadata.getValues(TikaCoreProperties.KEYWORDS);
- List<String> keywords = Arrays.asList(subject);
- assertTrue("'coast'" + " not in " + keywords, keywords.contains("coast"));
- assertTrue("'nature reserve'" + " not in " + keywords, keywords.contains("nature reserve"));
- }
-
- @Test
- public void testJPEGoddTagComponent() throws Exception {
- Metadata metadata = new Metadata();
- metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
- InputStream stream =
- getClass().getResourceAsStream("/test-documents/testJPEG_oddTagComponent.jpg");
- parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
- assertEquals(null, metadata.get(TikaCoreProperties.TITLE));
- assertEquals(null, metadata.get(TikaCoreProperties.DESCRIPTION));
- assertEquals("251", metadata.get(Metadata.IMAGE_WIDTH));
- assertEquals("384", metadata.get(Metadata.IMAGE_LENGTH));
- }
-
- @Test
- public void testJPEGEmptyEXIFDateTime() throws Exception {
- Metadata metadata = new Metadata();
- metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
- InputStream stream =
- getClass().getResourceAsStream("/test-documents/testJPEG_EXIF_emptyDateTime.jpg");
- parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
- assertEquals("300.0", metadata.get(TIFF.RESOLUTION_HORIZONTAL));
- assertEquals("300.0", metadata.get(TIFF.RESOLUTION_VERTICAL));
- }
-
- @Test
- public void testJPEGXMPMM() throws Exception {
- Metadata metadata = new Metadata();
- metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
- InputStream stream =
- getClass().getResourceAsStream("/test-documents/testJPEG_EXIF_emptyDateTime.jpg");
- parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
- //TODO: when jempbox is fixed/xmpbox is used
- //add tests for history...currently not extracted
- assertEquals("xmp.did:49E997348D4911E1AB62EBF9B374B234",
- metadata.get(XMPMM.DOCUMENTID));
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.jpeg;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+import java.util.TimeZone;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TIFF;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMPMM;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class JpegParserTest {
+
+ private final Parser parser = new JpegParser();
+ static TimeZone CURR_TIME_ZONE = TimeZone.getDefault();
+
+ //As of Drew Noakes' metadata-extractor 2.8.1,
+ //unspecified timezones appear to be set to
+ //TimeZone.getDefault(). We need to normalize this
+ //for testing across different time zones.
+ //We also appear to have to specify it in the surefire config:
+ //<argLine>-Duser.timezone=UTC</argLine>
+ @BeforeClass
+ public static void setDefaultTimeZone() {
+ TimeZone.setDefault(TimeZone.getTimeZone("UTC"));
+ }
+ @AfterClass
+ public static void resetDefaultTimeZone() {
+ TimeZone.setDefault(CURR_TIME_ZONE);
+ }
+ @Test
+ public void testJPEG() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+ InputStream stream =
+ getClass().getResourceAsStream("/test-documents/testJPEG_EXIF.jpg");
+ parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+ // Core EXIF/TIFF tags
+ assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+ assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH));
+ assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
+ assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
+
+ assertEquals("6.25E-4", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1600
+ assertEquals("5.6", metadata.get(Metadata.F_NUMBER));
+ assertEquals("false", metadata.get(Metadata.FLASH_FIRED));
+ assertEquals("194.0", metadata.get(Metadata.FOCAL_LENGTH));
+ assertEquals("400", metadata.get(Metadata.ISO_SPEED_RATINGS));
+ assertEquals("Canon", metadata.get(Metadata.EQUIPMENT_MAKE));
+ assertEquals("Canon EOS 40D", metadata.get(Metadata.EQUIPMENT_MODEL));
+ assertEquals("Adobe Photoshop CS3 Macintosh", metadata.get(Metadata.SOFTWARE));
+ assertEquals(null, metadata.get(Metadata.ORIENTATION)); // Not present
+ assertEquals("240.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
+ assertEquals("240.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
+ assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
+
+ // Check that EXIF/TIFF tags come through with their raw values too
+ // (This may be removed for Tika 1.0, as we support more of them
+ // with explicit Metadata entries)
+ assertEquals("Canon EOS 40D", metadata.get("Model"));
+
+ // Common tags
+ assertEquals("2009-10-02T23:02:49", metadata.get(Metadata.LAST_MODIFIED));
+ assertEquals("Date/Time Original for when the photo was taken, unspecified time zone",
+ "2009-08-11T09:09:45", metadata.get(TikaCoreProperties.CREATED));
+ List<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
+ assertTrue("'canon-55-250' expected in " + keywords, keywords.contains("canon-55-250"));
+ assertTrue("'moscow-birds' expected in " + keywords, keywords.contains("moscow-birds"));
+ assertTrue("'serbor' expected in " + keywords, keywords.contains("serbor"));
+ assertFalse(keywords.contains("canon-55-250 moscow-birds serbor"));
+ List<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
+ assertTrue("'canon-55-250' expected in " + subject, subject.contains("canon-55-250"));
+ assertTrue("'moscow-birds' expected in " + subject, subject.contains("moscow-birds"));
+ assertTrue("'serbor' expected in " + subject, subject.contains("serbor"));
+ assertFalse(subject.contains("canon-55-250 moscow-birds serbor"));
+ }
+
+ /**
+ * Test for a file with Geographic information (lat, long etc) in it
+ */
+ @Test
+ public void testJPEGGeo() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+ InputStream stream =
+ getClass().getResourceAsStream("/test-documents/testJPEG_GEO.jpg");
+ parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+ // Geo tags
+ assertEquals("12.54321", metadata.get(Metadata.LATITUDE));
+ assertEquals("-54.1234", metadata.get(Metadata.LONGITUDE));
+
+ // Core EXIF/TIFF tags
+ assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+ assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH));
+ assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
+ assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
+
+ assertEquals("6.25E-4", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1600
+ assertEquals("5.6", metadata.get(Metadata.F_NUMBER));
+ assertEquals("false", metadata.get(Metadata.FLASH_FIRED));
+ assertEquals("194.0", metadata.get(Metadata.FOCAL_LENGTH));
+ assertEquals("400", metadata.get(Metadata.ISO_SPEED_RATINGS));
+ assertEquals("Canon", metadata.get(Metadata.EQUIPMENT_MAKE));
+ assertEquals("Canon EOS 40D", metadata.get(Metadata.EQUIPMENT_MODEL));
+ assertEquals("Adobe Photoshop CS3 Macintosh", metadata.get(Metadata.SOFTWARE));
+ assertEquals(null, metadata.get(Metadata.ORIENTATION)); // Not present
+ assertEquals("240.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
+ assertEquals("240.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
+ assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
+
+ // Common tags
+ assertEquals("Date/Time Original for when the photo was taken, unspecified time zone",
+ "2009-08-11T09:09:45", metadata.get(TikaCoreProperties.CREATED));
+ assertEquals("This image has different Date/Time than Date/Time Original, so it is probably modification date",
+ "2009-10-02T23:02:49", metadata.get(Metadata.LAST_MODIFIED));
+ assertEquals("Date/Time Original should be stored in EXIF field too",
+ "2009-08-11T09:09:45", metadata.get(TIFF.ORIGINAL_DATE));
+ assertEquals("canon-55-250", metadata.getValues(TikaCoreProperties.KEYWORDS)[0]);
+ assertEquals("canon-55-250", metadata.getValues(Metadata.KEYWORDS)[0]);
+ }
+
+ /**
+ * Test for an image with the geographic information stored in a slightly
+ * different way, see TIKA-915 for details
+ * Disabled for now, pending a fix to the underlying library
+ */
+ @Test
+ public void testJPEGGeo2() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+ InputStream stream =
+ getClass().getResourceAsStream("/test-documents/testJPEG_GEO_2.jpg");
+ parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+ // Geo tags should be there with 5dp, and not rounded
+ assertEquals("51.575762", metadata.get(Metadata.LATITUDE));
+ assertEquals("-1.567886", metadata.get(Metadata.LONGITUDE));
+ }
+
+ @Test
+ public void testJPEGTitleAndDescription() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+ InputStream stream =
+ getClass().getResourceAsStream("/test-documents/testJPEG_commented.jpg");
+ parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+ // embedded comments with non-ascii characters
+ assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
+ assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR)); // Dublin Core
+ // xmp handles spaces in keywords, returns "bird watching, nature reserve, coast, grazelands"
+ // but we have to replace them with underscore
+
+ List<String> keywords = Arrays.asList(metadata.getValues(Metadata.KEYWORDS));
+ assertTrue(keywords.contains("coast"));
+ assertTrue(keywords.contains("bird watching"));
+ assertEquals(keywords, Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)));
+
+ // Core EXIF/TIFF tags
+ assertEquals("103", metadata.get(Metadata.IMAGE_WIDTH));
+ assertEquals("77", metadata.get(Metadata.IMAGE_LENGTH));
+ assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
+ assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
+
+ assertEquals("1.0E-6", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1000000
+ assertEquals("2.8", metadata.get(Metadata.F_NUMBER));
+ assertEquals("4.6", metadata.get(Metadata.FOCAL_LENGTH));
+ assertEquals("114", metadata.get(Metadata.ISO_SPEED_RATINGS));
+ assertEquals(null, metadata.get(Metadata.EQUIPMENT_MAKE));
+ assertEquals(null, metadata.get(Metadata.EQUIPMENT_MODEL));
+ assertEquals(null, metadata.get(Metadata.SOFTWARE));
+ assertEquals("1", metadata.get(Metadata.ORIENTATION)); // Not present
+ assertEquals("300.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
+ assertEquals("300.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
+ assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
+ }
+
+ @Test
+ public void testJPEGTitleAndDescriptionPhotoshop() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+ InputStream stream =
+ getClass().getResourceAsStream("/test-documents/testJPEG_commented_pspcs2mac.jpg");
+ parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+ // embedded comments with non-ascii characters
+ assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
+ assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR));
+ List<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
+ assertTrue("got " + keywords, keywords.contains("bird watching"));
+ List<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
+ assertTrue("got " + subject, subject.contains("bird watching"));
+ }
+
+ @Test
+ public void testJPEGTitleAndDescriptionXnviewmp() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+ InputStream stream =
+ getClass().getResourceAsStream("/test-documents/testJPEG_commented_xnviewmp026.jpg");
+ parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+ // XnViewMp's default comment dialog has only comment, not headline.
+ // Comment is embedded only if "Write comments in XMP" is enabled in settings
+ assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
+ // xmp handles spaces in keywords, returns "bird watching, nature reserve, coast, grazelands"
+ // but we have to replace them with underscore
+ String[] subject = metadata.getValues(TikaCoreProperties.KEYWORDS);
+ List<String> keywords = Arrays.asList(subject);
+ assertTrue("'coast'" + " not in " + keywords, keywords.contains("coast"));
+ assertTrue("'nature reserve'" + " not in " + keywords, keywords.contains("nature reserve"));
+ }
+
+ @Test
+ public void testJPEGoddTagComponent() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+ InputStream stream =
+ getClass().getResourceAsStream("/test-documents/testJPEG_oddTagComponent.jpg");
+ parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+ assertEquals(null, metadata.get(TikaCoreProperties.TITLE));
+ assertEquals(null, metadata.get(TikaCoreProperties.DESCRIPTION));
+ assertEquals("251", metadata.get(Metadata.IMAGE_WIDTH));
+ assertEquals("384", metadata.get(Metadata.IMAGE_LENGTH));
+ }
+
+ @Test
+ public void testJPEGEmptyEXIFDateTime() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+ InputStream stream =
+ getClass().getResourceAsStream("/test-documents/testJPEG_EXIF_emptyDateTime.jpg");
+ parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+ assertEquals("300.0", metadata.get(TIFF.RESOLUTION_HORIZONTAL));
+ assertEquals("300.0", metadata.get(TIFF.RESOLUTION_VERTICAL));
+ }
+
+ @Test
+ public void testJPEGXMPMM() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+ InputStream stream =
+ getClass().getResourceAsStream("/test-documents/testJPEG_EXIF_emptyDateTime.jpg");
+ parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+ //TODO: when jempbox is fixed/xmpbox is used
+ //add tests for history...currently not extracted
+ assertEquals("xmp.did:49E997348D4911E1AB62EBF9B374B234",
+ metadata.get(XMPMM.DOCUMENTID));
+ }
+
+}
[18/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestDirectoryListingEntry.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestDirectoryListingEntry.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestDirectoryListingEntry.java
index 65894e3..e337c15 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestDirectoryListingEntry.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestDirectoryListingEntry.java
@@ -1,85 +1,85 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-
-import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
-import org.junit.Before;
-import org.junit.Test;
-
-/**
- * Tests public methods of the DirectoryListingEntry class
- *
- * @author olegt
- *
- */
-public class TestDirectoryListingEntry {
- private DirectoryListingEntry dle = null;
-
- @Before
- public void setUp() throws Exception {
- dle = new DirectoryListingEntry(TestParameters.nameLength,
- TestParameters.entryName, TestParameters.entryType,
- TestParameters.offset, TestParameters.length);
- }
-
- @Test
- public void testDefaultConstructor() {
- assertNotNull(dle);
- }
-
- @Test
- public void testParamConstructor() {
- assertEquals(TestParameters.nameLength, dle.getNameLength());
- assertEquals(TestParameters.entryName, dle.getName());
- assertEquals(TestParameters.entryType, dle.getEntryType());
- assertEquals(TestParameters.offset, dle.getOffset());
- assertEquals(TestParameters.length, dle.getLength());
- }
-
- @Test
- public void testToString() {
- assertNotNull(dle.toString());
- }
-
- @Test
- public void testGetNameLength() {
- assertEquals(TestParameters.nameLength, dle.getNameLength());
- }
-
- @Test
- public void testGetName() {
- assertEquals(TestParameters.entryName, dle.getName());
- }
-
- @Test
- public void testGetEntryType() {
- assertEquals(TestParameters.entryType, dle.getEntryType());
- }
-
- @Test
- public void testGetOffset() {
- assertEquals(TestParameters.offset, dle.getOffset());
- }
-
- @Test
- public void testGetLength() {
- assertEquals(TestParameters.length, dle.getLength());
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Tests public methods of the DirectoryListingEntry class
+ *
+ * @author olegt
+ *
+ */
+public class TestDirectoryListingEntry {
+ private DirectoryListingEntry dle = null;
+
+ @Before
+ public void setUp() throws Exception {
+ dle = new DirectoryListingEntry(TestParameters.nameLength,
+ TestParameters.entryName, TestParameters.entryType,
+ TestParameters.offset, TestParameters.length);
+ }
+
+ @Test
+ public void testDefaultConstructor() {
+ assertNotNull(dle);
+ }
+
+ @Test
+ public void testParamConstructor() {
+ assertEquals(TestParameters.nameLength, dle.getNameLength());
+ assertEquals(TestParameters.entryName, dle.getName());
+ assertEquals(TestParameters.entryType, dle.getEntryType());
+ assertEquals(TestParameters.offset, dle.getOffset());
+ assertEquals(TestParameters.length, dle.getLength());
+ }
+
+ @Test
+ public void testToString() {
+ assertNotNull(dle.toString());
+ }
+
+ @Test
+ public void testGetNameLength() {
+ assertEquals(TestParameters.nameLength, dle.getNameLength());
+ }
+
+ @Test
+ public void testGetName() {
+ assertEquals(TestParameters.entryName, dle.getName());
+ }
+
+ @Test
+ public void testGetEntryType() {
+ assertEquals(TestParameters.entryType, dle.getEntryType());
+ }
+
+ @Test
+ public void testGetOffset() {
+ assertEquals(TestParameters.offset, dle.getOffset());
+ }
+
+ @Test
+ public void testGetLength() {
+ assertEquals(TestParameters.length, dle.getLength());
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestParameters.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestParameters.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestParameters.java
index 2512e85..5937d18 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestParameters.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestParameters.java
@@ -1,104 +1,104 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.parser.chm.core.ChmCommons.EntryType;
-
-/**
- * Holds test parameters such as verification points
- */
-public class TestParameters {
- /* Prevents initialization */
- private TestParameters() {
- }
-
- /* Tests values */
- static final int nameLength = 5;
- static final String entryName = TestParameters.class.getName();
- static EntryType entryType = EntryType.COMPRESSED;
- static final int offset = 3;
- static final int length = 20;
- static final int NTHREADS = 2;
-
- static final int BUFFER_SIZE = 16384;
-
- static final byte[] chmData = readResource("/test-documents/testChm.chm");
-
- private static byte[] readResource(String name) {
- try {
- try (InputStream stream = TestParameters.class.getResourceAsStream(name)) {
- return IOUtils.toByteArray(stream);
- }
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
-
- /* Verification points */
- static final String VP_CHM_MIME_TYPE = "Content-Type=application/x-chm";
- static final String VP_EXTRACTED_TEXT = "The TCard method accepts only numeric arguments";
- static final String VP_ISTF_SIGNATURE = "ITSF";
- static final String VP_ISTP_SIGNATURE = "ITSP";
- static final String VP_PMGL_SIGNATURE = "PMGL";
- static final String VP_CONTROL_DATA_SIGNATURE = "LZXC";
-
- static final int VP_DIRECTORY_LENGTH = 4180;
- static final int VP_DATA_OFFSET_LENGTH = 4300;
- static final int VP_DIRECTORY_OFFSET = 120;
- static final int VP_ITSF_HEADER_LENGTH = 96;
- static final int VP_LANGUAGE_ID = 1033;
- static final int VP_LAST_MODIFIED = 1042357880;
- static final int VP_UNKNOWN_000C = 1;
- static final int VP_UNKNOWN_LEN = 24;
- static final int VP_UNKNOWN_OFFSET = 96;
- static final int VP_VERSION = 3;
- static final int VP_BLOCK_LENGTH = 4096;
- static final int VP_BLOCK_INDEX_INTERVAL = 2;
- static final int VP_ITSP_HEADER_LENGTH = 84;
- static final int VP_INDEX_DEPTH = 1;
- static final int VP_INDEX_HEAD = 0;
- static final int VP_INDEX_ROOT = -1;
- static final int VP_UNKNOWN_NUM_BLOCKS = -1;
- static final int VP_ITSP_UNKNOWN_000C = 10;
- static final int VP_ITSP_UNKNOWN_0024 = 0;
- static final int VP_ITSP_UNKNOWN_002C = 1;
- static final int VP_ITSP_BYTEARR_LEN = 16;
- static final int VP_ITSP_VERSION = 1;
- static final int VP_RESET_INTERVAL = 2;
- static final int VP_CONTROL_DATA_SIZE = 6;
- static final int VP_UNKNOWN_18 = 0;
- static final int VP_CONTROL_DATA_VERSION = 2;
- static final int VP_WINDOW_SIZE = 65536;
- static final int VP_WINDOWS_PER_RESET = 1;
- static final int VP_CHM_ENTITIES_NUMBER = 100; //updated by Hawking
- static final int VP_PMGI_FREE_SPACE = 3;
- static final int VP_PMGL_BLOCK_NEXT = -1;
- static final int VP_PMGL_BLOCK_PREV = -1;
- static final int VP_PMGL_FREE_SPACE = 1644;
- static final int VP_PMGL_UNKNOWN_008 = 0;
- static final int VP_RESET_TABLE_BA = 12;
- static final int VP_RES_TBL_BLOCK_LENGTH = 32768;
- static final int VP_RES_TBL_COMPR_LENGTH = 177408;
- static final int VP_RES_TBL_UNCOMP_LENGTH = 383786;
- static final int VP_TBL_OFFSET = 40;
- static final int VP_RES_TBL_UNKNOWN = 8;
- static final int VP_RES_TBL_VERSION = 2;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.parser.chm.core.ChmCommons.EntryType;
+
+/**
+ * Holds test parameters such as verification points.
+ *
+ * <p>The sample CHM document is read from the classpath when this class is
+ * initialized and exposed through {@link #chmData}.
+ */
+public class TestParameters {
+    /* Prevents initialization */
+    private TestParameters() {
+    }
+
+    /* Tests values */
+    static final int nameLength = 5;
+    static final String entryName = TestParameters.class.getName();
+    static EntryType entryType = EntryType.COMPRESSED;
+    static final int offset = 3;
+    static final int length = 20;
+    static final int NTHREADS = 2;
+
+    static final int BUFFER_SIZE = 16384;
+
+    /* Raw bytes of the sample CHM document used by the header tests */
+    static final byte[] chmData = readResource("/test-documents/testChm.chm");
+
+    /**
+     * Reads a classpath resource completely into a byte array.
+     *
+     * @param name resource name, resolved with
+     *             {@link Class#getResourceAsStream(String)} on this class
+     * @return the full content of the resource
+     * @throws RuntimeException wrapping an {@link IOException} if the resource
+     *                          is missing or cannot be read
+     */
+    private static byte[] readResource(String name) {
+        try (InputStream stream = TestParameters.class.getResourceAsStream(name)) {
+            if (stream == null) {
+                // getResourceAsStream reports "not found" by returning null; fail
+                // with the resource name instead of an anonymous NullPointerException.
+                throw new IOException("Test resource not found on classpath: " + name);
+            }
+            return IOUtils.toByteArray(stream);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /* Verification points */
+    static final String VP_CHM_MIME_TYPE = "Content-Type=application/x-chm";
+    static final String VP_EXTRACTED_TEXT = "The TCard method accepts only numeric arguments";
+    static final String VP_ISTF_SIGNATURE = "ITSF";
+    static final String VP_ISTP_SIGNATURE = "ITSP";
+    static final String VP_PMGL_SIGNATURE = "PMGL";
+    static final String VP_CONTROL_DATA_SIGNATURE = "LZXC";
+
+    static final int VP_DIRECTORY_LENGTH = 4180;
+    static final int VP_DATA_OFFSET_LENGTH = 4300;
+    static final int VP_DIRECTORY_OFFSET = 120;
+    static final int VP_ITSF_HEADER_LENGTH = 96;
+    static final int VP_LANGUAGE_ID = 1033;
+    static final int VP_LAST_MODIFIED = 1042357880;
+    static final int VP_UNKNOWN_000C = 1;
+    static final int VP_UNKNOWN_LEN = 24;
+    static final int VP_UNKNOWN_OFFSET = 96;
+    static final int VP_VERSION = 3;
+    static final int VP_BLOCK_LENGTH = 4096;
+    static final int VP_BLOCK_INDEX_INTERVAL = 2;
+    static final int VP_ITSP_HEADER_LENGTH = 84;
+    static final int VP_INDEX_DEPTH = 1;
+    static final int VP_INDEX_HEAD = 0;
+    static final int VP_INDEX_ROOT = -1;
+    static final int VP_UNKNOWN_NUM_BLOCKS = -1;
+    static final int VP_ITSP_UNKNOWN_000C = 10;
+    static final int VP_ITSP_UNKNOWN_0024 = 0;
+    static final int VP_ITSP_UNKNOWN_002C = 1;
+    static final int VP_ITSP_BYTEARR_LEN = 16;
+    static final int VP_ITSP_VERSION = 1;
+    static final int VP_RESET_INTERVAL = 2;
+    static final int VP_CONTROL_DATA_SIZE = 6;
+    static final int VP_UNKNOWN_18 = 0;
+    static final int VP_CONTROL_DATA_VERSION = 2;
+    static final int VP_WINDOW_SIZE = 65536;
+    static final int VP_WINDOWS_PER_RESET = 1;
+    static final int VP_CHM_ENTITIES_NUMBER = 100; //updated by Hawking
+    static final int VP_PMGI_FREE_SPACE = 3;
+    static final int VP_PMGL_BLOCK_NEXT = -1;
+    static final int VP_PMGL_BLOCK_PREV = -1;
+    static final int VP_PMGL_FREE_SPACE = 1644;
+    static final int VP_PMGL_UNKNOWN_008 = 0;
+    static final int VP_RESET_TABLE_BA = 12;
+    static final int VP_RES_TBL_BLOCK_LENGTH = 32768;
+    static final int VP_RES_TBL_COMPR_LENGTH = 177408;
+    static final int VP_RES_TBL_UNCOMP_LENGTH = 383786;
+    static final int VP_TBL_OFFSET = 40;
+    static final int VP_RES_TBL_UNKNOWN = 8;
+    static final int VP_RES_TBL_VERSION = 2;
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestPmgiHeader.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestPmgiHeader.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestPmgiHeader.java
index 493c03e..070583b 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestPmgiHeader.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestPmgiHeader.java
@@ -1,45 +1,45 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import org.apache.tika.parser.chm.accessor.ChmPmgiHeader;
-import org.junit.Before;
-import org.junit.Test;
-
-public class TestPmgiHeader {
- ChmPmgiHeader chmPmgiHeader = null;
-
- @Before
- public void setUp() throws Exception {
- byte[] data = TestParameters.chmData;
- chmPmgiHeader = new ChmPmgiHeader();
- chmPmgiHeader.parse(data, chmPmgiHeader);
- }
-
- @Test
- public void testToString() {
- assertTrue((chmPmgiHeader != null) && (chmPmgiHeader.toString().length() > 0));
- }
-
- @Test
- public void testGetFreeSpace() {
- assertEquals(TestParameters.VP_PMGI_FREE_SPACE, chmPmgiHeader.getFreeSpace());
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.parser.chm.accessor.ChmPmgiHeader;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestPmgiHeader {
+    /** Header under test; re-created and parsed in {@link #setUp()}. */
+    ChmPmgiHeader chmPmgiHeader;
+
+    /** Creates a fresh header and parses the shared sample CHM bytes into it. */
+    @Before
+    public void setUp() throws Exception {
+        chmPmgiHeader = new ChmPmgiHeader();
+        chmPmgiHeader.parse(TestParameters.chmData, chmPmgiHeader);
+    }
+
+    /** The parsed header must exist and have a non-empty string form. */
+    @Test
+    public void testToString() {
+        boolean hasText = chmPmgiHeader != null && !chmPmgiHeader.toString().isEmpty();
+        assertTrue(hasText);
+    }
+
+    /** The free-space value must match the expected verification point. */
+    @Test
+    public void testGetFreeSpace() {
+        assertEquals(
+                TestParameters.VP_PMGI_FREE_SPACE,
+                chmPmgiHeader.getFreeSpace());
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestPmglHeader.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestPmglHeader.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestPmglHeader.java
index f8652da..55c08f2 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestPmglHeader.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestPmglHeader.java
@@ -1,76 +1,76 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import org.apache.tika.parser.chm.accessor.ChmPmglHeader;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.junit.Before;
-import org.junit.Test;
-
-public class TestPmglHeader {
- ChmPmglHeader chmPmglHeader = null;
-
- @Before
- public void setUp() throws Exception {
- byte[] data = TestParameters.chmData;
- chmPmglHeader = new ChmPmglHeader();
- chmPmglHeader.parse(ChmCommons.copyOfRange(data,
- ChmConstants.START_PMGL, ChmConstants.START_PMGL
- + ChmConstants.CHM_PMGL_LEN + 10), chmPmglHeader);
- }
-
- @Test
- public void testToString() {
- assertTrue((chmPmglHeader != null)
- && chmPmglHeader.toString().length() > 0);
- }
-
- @Test
- public void testChmPmglHeaderGet() {
- assertEquals(TestParameters.VP_PMGL_SIGNATURE, new String(
- chmPmglHeader.getSignature(), UTF_8));
- }
-
- @Test
- public void testGetBlockNext() {
- assertEquals(TestParameters.VP_PMGL_BLOCK_NEXT,
- chmPmglHeader.getBlockNext());
- }
-
- @Test
- public void testGetBlockPrev() {
- assertEquals(TestParameters.VP_PMGL_BLOCK_PREV,
- chmPmglHeader.getBlockPrev());
- }
-
- @Test
- public void testGetFreeSpace() {
- assertEquals(TestParameters.VP_PMGL_FREE_SPACE,
- chmPmglHeader.getFreeSpace());
- }
-
- @Test
- public void testGetUnknown0008() {
- assertEquals(TestParameters.VP_PMGL_UNKNOWN_008,
- chmPmglHeader.getUnknown0008());
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.parser.chm.accessor.ChmPmglHeader;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestPmglHeader {
+ ChmPmglHeader chmPmglHeader = null;
+
+ @Before
+ public void setUp() throws Exception {
+ byte[] data = TestParameters.chmData;
+ chmPmglHeader = new ChmPmglHeader();
+ chmPmglHeader.parse(ChmCommons.copyOfRange(data,
+ ChmConstants.START_PMGL, ChmConstants.START_PMGL
+ + ChmConstants.CHM_PMGL_LEN + 10), chmPmglHeader);
+ }
+
+ @Test
+ public void testToString() {
+ assertTrue((chmPmglHeader != null)
+ && chmPmglHeader.toString().length() > 0);
+ }
+
+ @Test
+ public void testChmPmglHeaderGet() {
+ assertEquals(TestParameters.VP_PMGL_SIGNATURE, new String(
+ chmPmglHeader.getSignature(), UTF_8));
+ }
+
+ @Test
+ public void testGetBlockNext() {
+ assertEquals(TestParameters.VP_PMGL_BLOCK_NEXT,
+ chmPmglHeader.getBlockNext());
+ }
+
+ @Test
+ public void testGetBlockPrev() {
+ assertEquals(TestParameters.VP_PMGL_BLOCK_PREV,
+ chmPmglHeader.getBlockPrev());
+ }
+
+ @Test
+ public void testGetFreeSpace() {
+ assertEquals(TestParameters.VP_PMGL_FREE_SPACE,
+ chmPmglHeader.getFreeSpace());
+ }
+
+ @Test
+ public void testGetUnknown0008() {
+ assertEquals(TestParameters.VP_PMGL_UNKNOWN_008,
+ chmPmglHeader.getUnknown0008());
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
index 78761fe..6ef803d 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
@@ -1,156 +1,156 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mbox;
-
-import static org.apache.tika.TikaTest.assertContains;
-import static org.junit.Assert.assertEquals;
-
-import java.io.InputStream;
-import java.util.Map;
-
-import org.apache.tika.detect.TypeDetector;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Before;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class MboxParserTest {
-
- protected ParseContext recursingContext;
- private Parser autoDetectParser;
- private TypeDetector typeDetector;
- private MboxParser mboxParser;
-
- private static InputStream getStream(String name) {
- return MboxParserTest.class.getClass().getResourceAsStream(name);
- }
-
- @Before
- public void setUp() throws Exception {
- typeDetector = new TypeDetector();
- autoDetectParser = new AutoDetectParser(typeDetector);
- recursingContext = new ParseContext();
- recursingContext.set(Parser.class, autoDetectParser);
-
- mboxParser = new MboxParser();
- mboxParser.setTracking(true);
- }
-
- @Test
- public void testSimple() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = getStream("/test-documents/simple.mbox")) {
- mboxParser.parse(stream, handler, metadata, recursingContext);
- }
-
- String content = handler.toString();
- assertContains("Test content 1", content);
- assertContains("Test content 2", content);
- assertEquals("application/mbox", metadata.get(Metadata.CONTENT_TYPE));
-
- Map<Integer, Metadata> mailsMetadata = mboxParser.getTrackingMetadata();
- assertEquals("Nb. Of mails", 2, mailsMetadata.size());
-
- Metadata mail1 = mailsMetadata.get(0);
- assertEquals("message/rfc822", mail1.get(Metadata.CONTENT_TYPE));
- assertEquals("envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009", mail1.get("MboxParser-from"));
-
- Metadata mail2 = mailsMetadata.get(1);
- assertEquals("message/rfc822", mail2.get(Metadata.CONTENT_TYPE));
- assertEquals("envelope-sender-mailbox-name Mon Jun 01 11:00:00 2010", mail2.get("MboxParser-from"));
- }
-
- @Test
- public void testHeaders() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = getStream("/test-documents/headers.mbox")) {
- mboxParser.parse(stream, handler, metadata, recursingContext);
- }
-
- assertContains("Test content", handler.toString());
- assertEquals("Nb. Of mails", 1, mboxParser.getTrackingMetadata().size());
-
- Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0);
-
- assertEquals("2009-06-10T03:58:45Z", mailMetadata.get(TikaCoreProperties.CREATED));
- assertEquals("<au...@domain.com>", mailMetadata.get(TikaCoreProperties.CREATOR));
- assertEquals("subject", mailMetadata.get(Metadata.SUBJECT));
- assertEquals("<au...@domain.com>", mailMetadata.get(Metadata.AUTHOR));
- assertEquals("message/rfc822", mailMetadata.get(Metadata.CONTENT_TYPE));
- assertEquals("author@domain.com", mailMetadata.get("Message-From"));
- assertEquals("<na...@domain.com>", mailMetadata.get("MboxParser-return-path"));
- }
-
- @Test
- public void testMultilineHeader() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = getStream("/test-documents/multiline.mbox")) {
- mboxParser.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals("Nb. Of mails", 1, mboxParser.getTrackingMetadata().size());
-
- Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0);
- assertEquals("from xxx by xxx with xxx; date", mailMetadata.get("MboxParser-received"));
- }
-
- @Test
- public void testQuoted() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = getStream("/test-documents/quoted.mbox")) {
- mboxParser.parse(stream, handler, metadata, recursingContext);
- }
-
- assertContains("Test content", handler.toString());
- assertContains("> quoted stuff", handler.toString());
- }
-
- @Test
- public void testComplex() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = getStream("/test-documents/complex.mbox")) {
- mboxParser.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals("Nb. Of mails", 3, mboxParser.getTrackingMetadata().size());
-
- Metadata firstMail = mboxParser.getTrackingMetadata().get(0);
- assertEquals("Re: question about when shuffle/sort start working", firstMail.get(Metadata.SUBJECT));
- assertEquals("Re: question about when shuffle/sort start working", firstMail.get(TikaCoreProperties.TITLE));
- assertEquals("Jothi Padmanabhan <jo...@yahoo-inc.com>", firstMail.get(Metadata.AUTHOR));
- assertEquals("Jothi Padmanabhan <jo...@yahoo-inc.com>", firstMail.get(TikaCoreProperties.CREATOR));
- assertEquals("core-user@hadoop.apache.org", firstMail.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
-
- assertContains("When a Mapper completes", handler.toString());
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+import java.util.Map;
+
+import org.apache.tika.detect.TypeDetector;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Before;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Tests for {@link MboxParser} run against the sample mbox files under
+ * {@code /test-documents}.
+ */
+public class MboxParserTest {
+
+    /** Parse context carrying the {@link AutoDetectParser} registered under {@code Parser.class}. */
+    protected ParseContext recursingContext;
+    private Parser autoDetectParser;
+    private TypeDetector typeDetector;
+    /** Parser under test; tracking is switched on in {@link #setUp()}. */
+    private MboxParser mboxParser;
+
+    /**
+     * Opens a test resource through this test class.
+     *
+     * @param name resource name as accepted by {@link Class#getResourceAsStream(String)}
+     * @return the resource stream, or {@code null} if the resource is not found
+     */
+    private static InputStream getStream(String name) {
+        // Use the class literal itself. Calling getClass() on a class literal yields
+        // java.lang.Class, so the lookup would be resolved against java.lang.Class
+        // rather than against this test class and its resources.
+        return MboxParserTest.class.getResourceAsStream(name);
+    }
+
+    /** Builds the recursing parse context and a tracking-enabled {@link MboxParser}. */
+    @Before
+    public void setUp() throws Exception {
+        typeDetector = new TypeDetector();
+        autoDetectParser = new AutoDetectParser(typeDetector);
+        recursingContext = new ParseContext();
+        recursingContext.set(Parser.class, autoDetectParser);
+
+        mboxParser = new MboxParser();
+        mboxParser.setTracking(true);
+    }
+
+    /** Two-message mailbox: checks body text, content types and the tracked "from" lines. */
+    @Test
+    public void testSimple() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = getStream("/test-documents/simple.mbox")) {
+            mboxParser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        String content = handler.toString();
+        assertContains("Test content 1", content);
+        assertContains("Test content 2", content);
+        assertEquals("application/mbox", metadata.get(Metadata.CONTENT_TYPE));
+
+        Map<Integer, Metadata> mailsMetadata = mboxParser.getTrackingMetadata();
+        assertEquals("Nb. Of mails", 2, mailsMetadata.size());
+
+        Metadata mail1 = mailsMetadata.get(0);
+        assertEquals("message/rfc822", mail1.get(Metadata.CONTENT_TYPE));
+        assertEquals("envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009", mail1.get("MboxParser-from"));
+
+        Metadata mail2 = mailsMetadata.get(1);
+        assertEquals("message/rfc822", mail2.get(Metadata.CONTENT_TYPE));
+        assertEquals("envelope-sender-mailbox-name Mon Jun 01 11:00:00 2010", mail2.get("MboxParser-from"));
+    }
+
+    /** Single-message mailbox: checks the metadata extracted from the message headers. */
+    @Test
+    public void testHeaders() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = getStream("/test-documents/headers.mbox")) {
+            mboxParser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertContains("Test content", handler.toString());
+        assertEquals("Nb. Of mails", 1, mboxParser.getTrackingMetadata().size());
+
+        Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0);
+
+        assertEquals("2009-06-10T03:58:45Z", mailMetadata.get(TikaCoreProperties.CREATED));
+        assertEquals("<au...@domain.com>", mailMetadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("subject", mailMetadata.get(Metadata.SUBJECT));
+        assertEquals("<au...@domain.com>", mailMetadata.get(Metadata.AUTHOR));
+        assertEquals("message/rfc822", mailMetadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("author@domain.com", mailMetadata.get("Message-From"));
+        assertEquals("<na...@domain.com>", mailMetadata.get("MboxParser-return-path"));
+    }
+
+    /** Checks the tracked "received" value for the multi-line header sample. */
+    @Test
+    public void testMultilineHeader() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = getStream("/test-documents/multiline.mbox")) {
+            mboxParser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertEquals("Nb. Of mails", 1, mboxParser.getTrackingMetadata().size());
+
+        Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0);
+        assertEquals("from xxx by xxx with xxx; date", mailMetadata.get("MboxParser-received"));
+    }
+
+    /** Checks that both plain and "&gt; "-quoted body lines appear in the extracted text. */
+    @Test
+    public void testQuoted() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = getStream("/test-documents/quoted.mbox")) {
+            mboxParser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertContains("Test content", handler.toString());
+        assertContains("> quoted stuff", handler.toString());
+    }
+
+    /** Three-message mailbox: checks the first message's metadata and the body text. */
+    @Test
+    public void testComplex() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = getStream("/test-documents/complex.mbox")) {
+            mboxParser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertEquals("Nb. Of mails", 3, mboxParser.getTrackingMetadata().size());
+
+        Metadata firstMail = mboxParser.getTrackingMetadata().get(0);
+        assertEquals("Re: question about when shuffle/sort start working", firstMail.get(Metadata.SUBJECT));
+        assertEquals("Re: question about when shuffle/sort start working", firstMail.get(TikaCoreProperties.TITLE));
+        assertEquals("Jothi Padmanabhan <jo...@yahoo-inc.com>", firstMail.get(Metadata.AUTHOR));
+        assertEquals("Jothi Padmanabhan <jo...@yahoo-inc.com>", firstMail.get(TikaCoreProperties.CREATOR));
+        assertEquals("core-user@hadoop.apache.org", firstMail.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
+
+        assertContains("When a Mapper completes", handler.toString());
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java
index 1d2904c..89a1b86 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java
@@ -1,110 +1,110 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mbox;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.ToHTMLContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-public class OutlookPSTParserTest extends TikaTest {
-
- private Parser parser = new OutlookPSTParser();
-
- @Test
- public void testAccept() throws Exception {
- assertTrue((parser.getSupportedTypes(null).contains(MediaType.application("vnd.ms-outlook-pst"))));
- }
-
- @Test
- public void testParse() throws Exception {
- Parser pstParser = new AutoDetectParser();
- Metadata metadata = new Metadata();
- ContentHandler handler = new ToHTMLContentHandler();
-
- ParseContext context = new ParseContext();
- EmbeddedTrackingExtrator trackingExtrator = new EmbeddedTrackingExtrator(context);
- context.set(EmbeddedDocumentExtractor.class, trackingExtrator);
- context.set(Parser.class, new AutoDetectParser());
-
- pstParser.parse(getResourceAsStream("/test-documents/testPST.pst"), handler, metadata, context);
-
- String output = handler.toString();
-
- assertFalse(output.isEmpty());
- assertTrue(output.contains("<meta name=\"Content-Length\" content=\"271360\">"));
- assertTrue(output.contains("<meta name=\"Content-Type\" content=\"application/vnd.ms-outlook-pst\">"));
-
- assertTrue(output.contains("<body><div class=\"email-folder\"><h1>"));
- assertTrue(output.contains("<div class=\"embedded\" id=\"<530D9CAC.5080901@gmail.com>\"><h1>Re: Feature Generators</h1>"));
- assertTrue(output.contains("<div class=\"embedded\" id=\"<1393363252.28814.YahooMailNeo@web140906.mail.bf1.yahoo.com>\"><h1>Re: init tokenizer fails: \"Bad type in putfield/putstatic\"</h1>"));
- assertTrue(output.contains("Gary Murphy commented on TIKA-1250:"));
-
- assertTrue(output.contains("<div class=\"email-folder\"><h1>Racine (pour la recherche)</h1>"));
-
-
- List<Metadata> metaList = trackingExtrator.trackingMetadata;
- assertEquals(6, metaList.size());
-
- Metadata firstMail = metaList.get(0);
- assertEquals("Jörn Kottmann", firstMail.get(TikaCoreProperties.CREATOR));
- assertEquals("Re: Feature Generators", firstMail.get(TikaCoreProperties.TITLE));
- assertEquals("kottmann@gmail.com", firstMail.get("senderEmailAddress"));
- assertEquals("users@opennlp.apache.org", firstMail.get("displayTo"));
- assertEquals("", firstMail.get("displayCC"));
- assertEquals("", firstMail.get("displayBCC"));
- }
-
-
- private class EmbeddedTrackingExtrator extends ParsingEmbeddedDocumentExtractor {
- List<Metadata> trackingMetadata = new ArrayList<Metadata>();
-
- public EmbeddedTrackingExtrator(ParseContext context) {
- super(context);
- }
-
- @Override
- public boolean shouldParseEmbedded(Metadata metadata) {
- return true;
- }
-
- @Override
- public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml) throws SAXException, IOException {
- this.trackingMetadata.add(metadata);
- super.parseEmbedded(stream, handler, metadata, outputHtml);
- }
-
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.ToHTMLContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class OutlookPSTParserTest extends TikaTest {
+
+ private Parser parser = new OutlookPSTParser();
+
+ @Test
+ public void testAccept() throws Exception {
+ assertTrue((parser.getSupportedTypes(null).contains(MediaType.application("vnd.ms-outlook-pst"))));
+ }
+
+ @Test
+ public void testParse() throws Exception {
+ Parser pstParser = new AutoDetectParser();
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new ToHTMLContentHandler();
+
+ ParseContext context = new ParseContext();
+ EmbeddedTrackingExtrator trackingExtrator = new EmbeddedTrackingExtrator(context);
+ context.set(EmbeddedDocumentExtractor.class, trackingExtrator);
+ context.set(Parser.class, new AutoDetectParser());
+
+ pstParser.parse(getResourceAsStream("/test-documents/testPST.pst"), handler, metadata, context);
+
+ String output = handler.toString();
+
+ assertFalse(output.isEmpty());
+ assertTrue(output.contains("<meta name=\"Content-Length\" content=\"271360\">"));
+ assertTrue(output.contains("<meta name=\"Content-Type\" content=\"application/vnd.ms-outlook-pst\">"));
+
+ assertTrue(output.contains("<body><div class=\"email-folder\"><h1>"));
+ assertTrue(output.contains("<div class=\"embedded\" id=\"<530D9CAC.5080901@gmail.com>\"><h1>Re: Feature Generators</h1>"));
+ assertTrue(output.contains("<div class=\"embedded\" id=\"<1393363252.28814.YahooMailNeo@web140906.mail.bf1.yahoo.com>\"><h1>Re: init tokenizer fails: \"Bad type in putfield/putstatic\"</h1>"));
+ assertTrue(output.contains("Gary Murphy commented on TIKA-1250:"));
+
+ assertTrue(output.contains("<div class=\"email-folder\"><h1>Racine (pour la recherche)</h1>"));
+
+
+ List<Metadata> metaList = trackingExtrator.trackingMetadata;
+ assertEquals(6, metaList.size());
+
+ Metadata firstMail = metaList.get(0);
+ assertEquals("Jörn Kottmann", firstMail.get(TikaCoreProperties.CREATOR));
+ assertEquals("Re: Feature Generators", firstMail.get(TikaCoreProperties.TITLE));
+ assertEquals("kottmann@gmail.com", firstMail.get("senderEmailAddress"));
+ assertEquals("users@opennlp.apache.org", firstMail.get("displayTo"));
+ assertEquals("", firstMail.get("displayCC"));
+ assertEquals("", firstMail.get("displayBCC"));
+ }
+
+
+ private class EmbeddedTrackingExtrator extends ParsingEmbeddedDocumentExtractor {
+ List<Metadata> trackingMetadata = new ArrayList<Metadata>();
+
+ public EmbeddedTrackingExtrator(ParseContext context) {
+ super(context);
+ }
+
+ @Override
+ public boolean shouldParseEmbedded(Metadata metadata) {
+ return true;
+ }
+
+ @Override
+ public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml) throws SAXException, IOException {
+ this.trackingMetadata.add(metadata);
+ super.parseEmbedded(stream, handler, metadata, outputHtml);
+ }
+
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java
index 2b3d141..f454446 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java
@@ -1,75 +1,75 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-
-import java.net.URL;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.extractor.ContainerExtractor;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.mime.MediaType;
-
-/**
- * Parent class of tests that the various POI powered parsers are
- * able to extract their embedded contents.
- */
-public abstract class AbstractPOIContainerExtractionTest extends TikaTest {
- public static final MediaType TYPE_DOC = MediaType.application("msword");
- public static final MediaType TYPE_PPT = MediaType.application("vnd.ms-powerpoint");
- public static final MediaType TYPE_XLS = MediaType.application("vnd.ms-excel");
- public static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
- public static final MediaType TYPE_PPTX = MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation");
- public static final MediaType TYPE_XLSX = MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet");
- public static final MediaType TYPE_MSG = MediaType.application("vnd.ms-outlook");
-
- public static final MediaType TYPE_TXT = MediaType.text("plain");
- public static final MediaType TYPE_PDF = MediaType.application("pdf");
-
- public static final MediaType TYPE_JPG = MediaType.image("jpeg");
- public static final MediaType TYPE_GIF = MediaType.image("gif");
- public static final MediaType TYPE_PNG = MediaType.image("png");
- public static final MediaType TYPE_EMF = MediaType.application("x-emf");
- public static final MediaType TYPE_WMF = MediaType.application("x-msmetafile");
-
- protected static TikaInputStream getTestFile(String filename) throws Exception {
- URL input = AbstractPOIContainerExtractionTest.class.getResource(
- "/test-documents/" + filename);
- assertNotNull(filename + " not found", input);
-
- return TikaInputStream.get(input);
- }
-
- protected TrackingHandler process(String filename, ContainerExtractor extractor, boolean recurse) throws Exception {
- try (TikaInputStream stream = getTestFile(filename)) {
- assertEquals(true, extractor.isSupported(stream));
-
- // Process it
- TrackingHandler handler = new TrackingHandler();
- if (recurse) {
- extractor.extract(stream, extractor, handler);
- } else {
- extractor.extract(stream, null, handler);
- }
-
- // So they can check what happened
- return handler;
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+import java.net.URL;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Parent class of tests that the various POI powered parsers are
+ * able to extract their embedded contents.
+ */
+public abstract class AbstractPOIContainerExtractionTest extends TikaTest {
+ public static final MediaType TYPE_DOC = MediaType.application("msword");
+ public static final MediaType TYPE_PPT = MediaType.application("vnd.ms-powerpoint");
+ public static final MediaType TYPE_XLS = MediaType.application("vnd.ms-excel");
+ public static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
+ public static final MediaType TYPE_PPTX = MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation");
+ public static final MediaType TYPE_XLSX = MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+ public static final MediaType TYPE_MSG = MediaType.application("vnd.ms-outlook");
+
+ public static final MediaType TYPE_TXT = MediaType.text("plain");
+ public static final MediaType TYPE_PDF = MediaType.application("pdf");
+
+ public static final MediaType TYPE_JPG = MediaType.image("jpeg");
+ public static final MediaType TYPE_GIF = MediaType.image("gif");
+ public static final MediaType TYPE_PNG = MediaType.image("png");
+ public static final MediaType TYPE_EMF = MediaType.application("x-emf");
+ public static final MediaType TYPE_WMF = MediaType.application("x-msmetafile");
+
+ protected static TikaInputStream getTestFile(String filename) throws Exception {
+ URL input = AbstractPOIContainerExtractionTest.class.getResource(
+ "/test-documents/" + filename);
+ assertNotNull(filename + " not found", input);
+
+ return TikaInputStream.get(input);
+ }
+
+ protected TrackingHandler process(String filename, ContainerExtractor extractor, boolean recurse) throws Exception {
+ try (TikaInputStream stream = getTestFile(filename)) {
+ assertEquals(true, extractor.isSupported(stream));
+
+ // Process it
+ TrackingHandler handler = new TrackingHandler();
+ if (recurse) {
+ extractor.extract(stream, extractor, handler);
+ } else {
+ extractor.extract(stream, null, handler);
+ }
+
+ // So they can check what happened
+ return handler;
+ }
+ }
+}
[34/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/audio/AudioParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/audio/AudioParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/audio/AudioParser.java
index 16dd37f..a6c2e9d 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/audio/AudioParser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/audio/AudioParser.java
@@ -1,139 +1,139 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.audio;
-
-import java.io.BufferedInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Map.Entry;
-import java.util.Set;
-
-import javax.sound.sampled.AudioFileFormat;
-import javax.sound.sampled.AudioFileFormat.Type;
-import javax.sound.sampled.AudioFormat;
-import javax.sound.sampled.AudioSystem;
-import javax.sound.sampled.UnsupportedAudioFileException;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.XMPDM;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-public class AudioParser extends AbstractParser {
-
- /** Serial version UID */
- private static final long serialVersionUID = -6015684081240882695L;
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- MediaType.audio("basic"),
- MediaType.audio("x-wav"),
- MediaType.audio("x-aiff"))));
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- // AudioSystem expects the stream to support the mark feature
- if (!stream.markSupported()) {
- stream = new BufferedInputStream(stream);
- }
- try {
- AudioFileFormat fileFormat = AudioSystem.getAudioFileFormat(stream);
- Type type = fileFormat.getType();
- if (type == Type.AIFC || type == Type.AIFF) {
- metadata.set(Metadata.CONTENT_TYPE, "audio/x-aiff");
- } else if (type == Type.AU || type == Type.SND) {
- metadata.set(Metadata.CONTENT_TYPE, "audio/basic");
- } else if (type == Type.WAVE) {
- metadata.set(Metadata.CONTENT_TYPE, "audio/x-wav");
- }
-
- AudioFormat audioFormat = fileFormat.getFormat();
- int channels = audioFormat.getChannels();
- if (channels != AudioSystem.NOT_SPECIFIED) {
- metadata.set("channels", String.valueOf(channels));
- // TODO: Use XMPDM.TRACKS? (see also frame rate in AudioFormat)
- }
- float rate = audioFormat.getSampleRate();
- if (rate != AudioSystem.NOT_SPECIFIED) {
- metadata.set("samplerate", String.valueOf(rate));
- metadata.set(
- XMPDM.AUDIO_SAMPLE_RATE,
- Integer.toString((int) rate));
- }
- int bits = audioFormat.getSampleSizeInBits();
- if (bits != AudioSystem.NOT_SPECIFIED) {
- metadata.set("bits", String.valueOf(bits));
- if (bits == 8) {
- metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, "8Int");
- } else if (bits == 16) {
- metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, "16Int");
- } else if (bits == 32) {
- metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, "32Int");
- }
- }
- metadata.set("encoding", audioFormat.getEncoding().toString());
-
- // Javadoc suggests that some of the following properties might
- // be available, but I had no success in finding any:
-
- // "duration" Long playback duration of the file in microseconds
- // "author" String name of the author of this file
- // "title" String title of this file
- // "copyright" String copyright message
- // "date" Date date of the recording or release
- // "comment" String an arbitrary text
-
- addMetadata(metadata, fileFormat.properties());
- addMetadata(metadata, audioFormat.properties());
- } catch (UnsupportedAudioFileException e) {
- // There is no way to know whether this exception was
- // caused by the document being corrupted or by the format
- // just being unsupported. So we do nothing.
- }
-
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
- xhtml.endDocument();
- }
-
- private void addMetadata(Metadata metadata, Map<String, Object> properties) {
- if (properties != null) {
- for (Entry<String, Object> entry : properties.entrySet()) {
- Object value = entry.getValue();
- if (value != null) {
- metadata.set(entry.getKey(), value.toString());
- }
- }
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.audio;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import javax.sound.sampled.AudioFileFormat;
+import javax.sound.sampled.AudioFileFormat.Type;
+import javax.sound.sampled.AudioFormat;
+import javax.sound.sampled.AudioSystem;
+import javax.sound.sampled.UnsupportedAudioFileException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.XMPDM;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class AudioParser extends AbstractParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = -6015684081240882695L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.audio("basic"),
+ MediaType.audio("x-wav"),
+ MediaType.audio("x-aiff"))));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ // AudioSystem expects the stream to support the mark feature
+ if (!stream.markSupported()) {
+ stream = new BufferedInputStream(stream);
+ }
+ try {
+ AudioFileFormat fileFormat = AudioSystem.getAudioFileFormat(stream);
+ Type type = fileFormat.getType();
+ if (type == Type.AIFC || type == Type.AIFF) {
+ metadata.set(Metadata.CONTENT_TYPE, "audio/x-aiff");
+ } else if (type == Type.AU || type == Type.SND) {
+ metadata.set(Metadata.CONTENT_TYPE, "audio/basic");
+ } else if (type == Type.WAVE) {
+ metadata.set(Metadata.CONTENT_TYPE, "audio/x-wav");
+ }
+
+ AudioFormat audioFormat = fileFormat.getFormat();
+ int channels = audioFormat.getChannels();
+ if (channels != AudioSystem.NOT_SPECIFIED) {
+ metadata.set("channels", String.valueOf(channels));
+ // TODO: Use XMPDM.TRACKS? (see also frame rate in AudioFormat)
+ }
+ float rate = audioFormat.getSampleRate();
+ if (rate != AudioSystem.NOT_SPECIFIED) {
+ metadata.set("samplerate", String.valueOf(rate));
+ metadata.set(
+ XMPDM.AUDIO_SAMPLE_RATE,
+ Integer.toString((int) rate));
+ }
+ int bits = audioFormat.getSampleSizeInBits();
+ if (bits != AudioSystem.NOT_SPECIFIED) {
+ metadata.set("bits", String.valueOf(bits));
+ if (bits == 8) {
+ metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, "8Int");
+ } else if (bits == 16) {
+ metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, "16Int");
+ } else if (bits == 32) {
+ metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, "32Int");
+ }
+ }
+ metadata.set("encoding", audioFormat.getEncoding().toString());
+
+ // Javadoc suggests that some of the following properties might
+ // be available, but I had no success in finding any:
+
+ // "duration" Long playback duration of the file in microseconds
+ // "author" String name of the author of this file
+ // "title" String title of this file
+ // "copyright" String copyright message
+ // "date" Date date of the recording or release
+ // "comment" String an arbitrary text
+
+ addMetadata(metadata, fileFormat.properties());
+ addMetadata(metadata, audioFormat.properties());
+ } catch (UnsupportedAudioFileException e) {
+ // There is no way to know whether this exception was
+ // caused by the document being corrupted or by the format
+ // just being unsupported. So we do nothing.
+ }
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.endDocument();
+ }
+
+ private void addMetadata(Metadata metadata, Map<String, Object> properties) {
+ if (properties != null) {
+ for (Entry<String, Object> entry : properties.entrySet()) {
+ Object value = entry.getValue();
+ if (value != null) {
+ metadata.set(entry.getKey(), value.toString());
+ }
+ }
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/audio/MidiParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/audio/MidiParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/audio/MidiParser.java
index c777287..656d1aa 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/audio/MidiParser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/audio/MidiParser.java
@@ -1,121 +1,121 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.audio;
-
-import java.io.BufferedInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Set;
-
-import javax.sound.midi.InvalidMidiDataException;
-import javax.sound.midi.MetaMessage;
-import javax.sound.midi.MidiMessage;
-import javax.sound.midi.MidiSystem;
-import javax.sound.midi.Patch;
-import javax.sound.midi.Sequence;
-import javax.sound.midi.Track;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import static java.nio.charset.StandardCharsets.ISO_8859_1;
-
-public class MidiParser extends AbstractParser {
-
- /** Serial version UID */
- private static final long serialVersionUID = 6343278584336189432L;
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- MediaType.application("x-midi"),
- MediaType.audio("midi"))));
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- metadata.set(Metadata.CONTENT_TYPE, "audio/midi");
-
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
-
- // MidiSystem expects the stream to support the mark feature
- InputStream buffered = new BufferedInputStream(stream);
- try {
- Sequence sequence = MidiSystem.getSequence(buffered);
-
- Track[] tracks = sequence.getTracks();
- metadata.set("tracks", String.valueOf(tracks.length));
- // TODO: Use XMPDM.TRACKS?
-
- Patch[] patches = sequence.getPatchList();
- metadata.set("patches", String.valueOf(patches.length));
-
- float type = sequence.getDivisionType();
- if (type == Sequence.PPQ) {
- metadata.set("divisionType", "PPQ");
- } else if (type == Sequence.SMPTE_24) {
- metadata.set("divisionType", "SMPTE_24");
- } else if (type == Sequence.SMPTE_25) {
- metadata.set("divisionType", "SMPTE_25");
- } else if (type == Sequence.SMPTE_30) {
- metadata.set("divisionType", "SMPTE_30");
- } else if (type == Sequence.SMPTE_30DROP) {
- metadata.set("divisionType", "SMPTE_30DROP");
- } else if (type == Sequence.SMPTE_24) {
- metadata.set("divisionType", String.valueOf(type));
- }
-
- for (Track track : tracks) {
- xhtml.startElement("p");
- for (int i = 0; i < track.size(); i++) {
- MidiMessage message = track.get(i).getMessage();
- if (message instanceof MetaMessage) {
- MetaMessage meta = (MetaMessage) message;
- // Types 1-15 are reserved for text events
- if (meta.getType() >= 1 && meta.getType() <= 15) {
- // FIXME: What's the encoding?
- xhtml.characters(
- new String(meta.getData(), ISO_8859_1));
- }
- }
- }
- xhtml.endElement("p");
- }
- } catch (InvalidMidiDataException ignore) {
- // There is no way to know whether this exception was
- // caused by the document being corrupted or by the format
- // just being unsupported. So we do nothing.
- }
-
- xhtml.endDocument();
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.audio;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import javax.sound.midi.InvalidMidiDataException;
+import javax.sound.midi.MetaMessage;
+import javax.sound.midi.MidiMessage;
+import javax.sound.midi.MidiSystem;
+import javax.sound.midi.Patch;
+import javax.sound.midi.Sequence;
+import javax.sound.midi.Track;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+
+public class MidiParser extends AbstractParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = 6343278584336189432L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.application("x-midi"),
+ MediaType.audio("midi"))));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ metadata.set(Metadata.CONTENT_TYPE, "audio/midi");
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+ // MidiSystem expects the stream to support the mark feature
+ InputStream buffered = new BufferedInputStream(stream);
+ try {
+ Sequence sequence = MidiSystem.getSequence(buffered);
+
+ Track[] tracks = sequence.getTracks();
+ metadata.set("tracks", String.valueOf(tracks.length));
+ // TODO: Use XMPDM.TRACKS?
+
+ Patch[] patches = sequence.getPatchList();
+ metadata.set("patches", String.valueOf(patches.length));
+
+ float type = sequence.getDivisionType();
+ if (type == Sequence.PPQ) {
+ metadata.set("divisionType", "PPQ");
+ } else if (type == Sequence.SMPTE_24) {
+ metadata.set("divisionType", "SMPTE_24");
+ } else if (type == Sequence.SMPTE_25) {
+ metadata.set("divisionType", "SMPTE_25");
+ } else if (type == Sequence.SMPTE_30) {
+ metadata.set("divisionType", "SMPTE_30");
+ } else if (type == Sequence.SMPTE_30DROP) {
+ metadata.set("divisionType", "SMPTE_30DROP");
+ } else if (type == Sequence.SMPTE_24) {
+ metadata.set("divisionType", String.valueOf(type));
+ }
+
+ for (Track track : tracks) {
+ xhtml.startElement("p");
+ for (int i = 0; i < track.size(); i++) {
+ MidiMessage message = track.get(i).getMessage();
+ if (message instanceof MetaMessage) {
+ MetaMessage meta = (MetaMessage) message;
+ // Types 1-15 are reserved for text events
+ if (meta.getType() >= 1 && meta.getType() <= 15) {
+ // FIXME: What's the encoding?
+ xhtml.characters(
+ new String(meta.getData(), ISO_8859_1));
+ }
+ }
+ }
+ xhtml.endElement("p");
+ }
+ } catch (InvalidMidiDataException ignore) {
+ // There is no way to know whether this exception was
+ // caused by the document being corrupted or by the format
+ // just being unsupported. So we do nothing.
+ }
+
+ xhtml.endDocument();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java
index d8a6539..c207e0b 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java
@@ -1,111 +1,111 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.font;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.fontbox.ttf.NameRecord;
-import org.apache.fontbox.ttf.NamingTable;
-import org.apache.fontbox.ttf.TTFParser;
-import org.apache.fontbox.ttf.TrueTypeFont;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Parser for TrueType font files (TTF).
- */
-public class TrueTypeParser extends AbstractParser {
-
- /** Serial version UID */
- private static final long serialVersionUID = 44788554612243032L;
-
- private static final MediaType TYPE =
- MediaType.application("x-font-ttf");
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.singleton(TYPE);
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- TikaInputStream tis = TikaInputStream.cast(stream);
-
- // Ask FontBox to parse the file for us
- TrueTypeFont font;
- TTFParser parser = new TTFParser();
- if (tis != null && tis.hasFile()) {
- font = parser.parse(tis.getFile());
- } else {
- font = parser.parse(stream);
- }
-
- // Report the details of the font
- metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
- metadata.set(TikaCoreProperties.CREATED,
- font.getHeader().getCreated());
- metadata.set(TikaCoreProperties.MODIFIED,
- font.getHeader().getModified());
- metadata.set(AdobeFontMetricParser.MET_DOC_VERSION,
- Float.toString(font.getHeader().getVersion()));
-
- // Pull out the naming info
- NamingTable fontNaming = font.getNaming();
- for (NameRecord nr : fontNaming.getNameRecords()) {
- if (nr.getNameId() == NameRecord.NAME_FONT_FAMILY_NAME) {
- metadata.set(AdobeFontMetricParser.MET_FONT_FAMILY_NAME, nr.getString());
- }
- if (nr.getNameId() == NameRecord.NAME_FONT_SUB_FAMILY_NAME) {
- metadata.set(AdobeFontMetricParser.MET_FONT_SUB_FAMILY_NAME, nr.getString());
- }
- if (nr.getNameId() == NameRecord.NAME_FULL_FONT_NAME) {
- metadata.set(AdobeFontMetricParser.MET_FONT_NAME, nr.getString());
- metadata.set(TikaCoreProperties.TITLE, nr.getString());
- }
- if (nr.getNameId() == NameRecord.NAME_POSTSCRIPT_NAME) {
- metadata.set(AdobeFontMetricParser.MET_PS_NAME, nr.getString());
- }
- if (nr.getNameId() == NameRecord.NAME_COPYRIGHT) {
- metadata.set("Copyright", nr.getString());
- }
- if (nr.getNameId() == NameRecord.NAME_TRADEMARK) {
- metadata.set("Trademark", nr.getString());
- }
- }
-
- // For now, we only output metadata, no textual contents
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
- xhtml.endDocument();
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.font;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.fontbox.ttf.NameRecord;
+import org.apache.fontbox.ttf.NamingTable;
+import org.apache.fontbox.ttf.TTFParser;
+import org.apache.fontbox.ttf.TrueTypeFont;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for TrueType font files (TTF).
+ */
+public class TrueTypeParser extends AbstractParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = 44788554612243032L;
+
+ private static final MediaType TYPE =
+ MediaType.application("x-font-ttf");
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.singleton(TYPE);
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ TikaInputStream tis = TikaInputStream.cast(stream);
+
+ // Ask FontBox to parse the file for us
+ TrueTypeFont font;
+ TTFParser parser = new TTFParser();
+ if (tis != null && tis.hasFile()) {
+ font = parser.parse(tis.getFile());
+ } else {
+ font = parser.parse(stream);
+ }
+
+ // Report the details of the font
+ metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+ metadata.set(TikaCoreProperties.CREATED,
+ font.getHeader().getCreated());
+ metadata.set(TikaCoreProperties.MODIFIED,
+ font.getHeader().getModified());
+ metadata.set(AdobeFontMetricParser.MET_DOC_VERSION,
+ Float.toString(font.getHeader().getVersion()));
+
+ // Pull out the naming info
+ NamingTable fontNaming = font.getNaming();
+ for (NameRecord nr : fontNaming.getNameRecords()) {
+ if (nr.getNameId() == NameRecord.NAME_FONT_FAMILY_NAME) {
+ metadata.set(AdobeFontMetricParser.MET_FONT_FAMILY_NAME, nr.getString());
+ }
+ if (nr.getNameId() == NameRecord.NAME_FONT_SUB_FAMILY_NAME) {
+ metadata.set(AdobeFontMetricParser.MET_FONT_SUB_FAMILY_NAME, nr.getString());
+ }
+ if (nr.getNameId() == NameRecord.NAME_FULL_FONT_NAME) {
+ metadata.set(AdobeFontMetricParser.MET_FONT_NAME, nr.getString());
+ metadata.set(TikaCoreProperties.TITLE, nr.getString());
+ }
+ if (nr.getNameId() == NameRecord.NAME_POSTSCRIPT_NAME) {
+ metadata.set(AdobeFontMetricParser.MET_PS_NAME, nr.getString());
+ }
+ if (nr.getNameId() == NameRecord.NAME_COPYRIGHT) {
+ metadata.set("Copyright", nr.getString());
+ }
+ if (nr.getNameId() == NameRecord.NAME_TRADEMARK) {
+ metadata.set("Trademark", nr.getString());
+ }
+ }
+
+ // For now, we only output metadata, no textual contents
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.endDocument();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
index 39044d3..bf29d0b 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
@@ -1,562 +1,562 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.image;
-
-import java.io.ByteArrayInputStream;
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.text.DecimalFormat;
-import java.text.DecimalFormatSymbols;
-import java.text.SimpleDateFormat;
-import java.util.Date;
-import java.util.Iterator;
-import java.util.Locale;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import com.drew.imaging.jpeg.JpegMetadataReader;
-import com.drew.imaging.jpeg.JpegProcessingException;
-import com.drew.imaging.riff.RiffProcessingException;
-import com.drew.imaging.tiff.TiffMetadataReader;
-import com.drew.imaging.tiff.TiffProcessingException;
-import com.drew.imaging.webp.WebpMetadataReader;
-import com.drew.lang.ByteArrayReader;
-import com.drew.lang.GeoLocation;
-import com.drew.lang.Rational;
-import com.drew.metadata.Directory;
-import com.drew.metadata.MetadataException;
-import com.drew.metadata.Tag;
-import com.drew.metadata.exif.ExifIFD0Directory;
-import com.drew.metadata.exif.ExifReader;
-import com.drew.metadata.exif.ExifSubIFDDirectory;
-import com.drew.metadata.exif.ExifThumbnailDirectory;
-import com.drew.metadata.exif.GpsDirectory;
-import com.drew.metadata.iptc.IptcDirectory;
-import com.drew.metadata.jpeg.JpegCommentDirectory;
-import com.drew.metadata.jpeg.JpegDirectory;
-import org.apache.commons.io.IOUtils;
-import org.apache.jempbox.xmp.XMPMetadata;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.IPTC;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.xmp.JempboxExtractor;
-import org.w3c.dom.Document;
-import org.xml.sax.SAXException;
-
-/**
- * Uses the <a href="http://www.drewnoakes.com/code/exif/">Metadata Extractor</a> library
- * to read EXIF and IPTC image metadata and map to Tika fields.
- * <p/>
- * As of 2.4.0 the library supports jpeg and tiff.
- * As of 2.8.0 the library supports webp.
- */
-public class ImageMetadataExtractor {
-
- private static final String GEO_DECIMAL_FORMAT_STRING = "#.######"; // 6 dp seems to be reasonable
- private final Metadata metadata;
- private DirectoryHandler[] handlers;
-
- /**
- * @param metadata to extract to, using default directory handlers
- */
- public ImageMetadataExtractor(Metadata metadata) {
- this(metadata,
- new CopyUnknownFieldsHandler(),
- new JpegCommentHandler(),
- new ExifHandler(),
- new DimensionsHandler(),
- new GeotagHandler(),
- new IptcHandler()
- );
- }
-
- /**
- * @param metadata to extract to
- * @param handlers handlers in order, note that handlers may override values from earlier handlers
- */
- public ImageMetadataExtractor(Metadata metadata, DirectoryHandler... handlers) {
- this.metadata = metadata;
- this.handlers = handlers;
- }
-
- private static String trimPixels(String s) {
- //if height/width appears as "100 pixels", trim " pixels"
- if (s != null) {
- int i = s.lastIndexOf(" pixels");
- s = s.substring(0, i);
- }
- return s;
- }
-
- public void parseJpeg(File file)
- throws IOException, SAXException, TikaException {
- try {
- com.drew.metadata.Metadata jpegMetadata = JpegMetadataReader.readMetadata(file);
- handle(jpegMetadata);
- } catch (JpegProcessingException e) {
- throw new TikaException("Can't read JPEG metadata", e);
- } catch (MetadataException e) {
- throw new TikaException("Can't read JPEG metadata", e);
- }
- }
-
- public void parseTiff(File file)
- throws IOException, SAXException, TikaException {
- try {
- com.drew.metadata.Metadata tiffMetadata = TiffMetadataReader.readMetadata(file);
- handle(tiffMetadata);
- } catch (MetadataException e) {
- throw new TikaException("Can't read TIFF metadata", e);
- } catch (TiffProcessingException e) {
- throw new TikaException("Can't read TIFF metadata", e);
- }
- }
-
- public void parseWebP(File file) throws IOException, TikaException {
-
- try {
- com.drew.metadata.Metadata webPMetadata = new com.drew.metadata.Metadata();
- webPMetadata = WebpMetadataReader.readMetadata(file);
- handle(webPMetadata);
- } catch (IOException e) {
- throw e;
- } catch (RiffProcessingException e) {
- throw new TikaException("Can't process Riff data", e);
- } catch (MetadataException e) {
- throw new TikaException("Can't process Riff data", e);
- }
- }
-
- public void parseRawExif(InputStream stream, int length, boolean needsExifHeader)
- throws IOException, SAXException, TikaException {
- byte[] exif;
- if (needsExifHeader) {
- exif = new byte[length + 6];
- exif[0] = (byte) 'E';
- exif[1] = (byte) 'x';
- exif[2] = (byte) 'i';
- exif[3] = (byte) 'f';
- IOUtils.readFully(stream, exif, 6, length);
- } else {
- exif = new byte[length];
- IOUtils.readFully(stream, exif, 0, length);
- }
- parseRawExif(exif);
- }
-
- public void parseRawExif(byte[] exifData)
- throws IOException, SAXException, TikaException {
- com.drew.metadata.Metadata metadata = new com.drew.metadata.Metadata();
- ExifReader reader = new ExifReader();
- reader.extract(new ByteArrayReader(exifData), metadata, ExifReader.JPEG_SEGMENT_PREAMBLE.length());
-
- try {
- handle(metadata);
- } catch (MetadataException e) {
- throw new TikaException("Can't process the EXIF Data", e);
- }
- }
-
- public void parseRawXMP(byte[] xmpData)
- throws IOException, SAXException, TikaException {
- XMPMetadata xmp = null;
- try (InputStream decoded =
- new ByteArrayInputStream(xmpData)
- ) {
- Document dom = new ParseContext().getDocumentBuilder().parse(decoded);
- if (dom != null) {
- xmp = new XMPMetadata(dom);
- }
- } catch (IOException|SAXException e) {
- //
- }
- if (xmp != null) {
- JempboxExtractor.extractDublinCore(xmp, metadata);
- JempboxExtractor.extractXMPMM(xmp, metadata);
- }
- }
-
- /**
- * Copies extracted tags to tika metadata using registered handlers.
- *
- * @param metadataExtractor Tag directories from a Metadata Extractor "reader"
- * @throws MetadataException This method does not handle exceptions from Metadata Extractor
- */
- protected void handle(com.drew.metadata.Metadata metadataExtractor)
- throws MetadataException {
- handle(metadataExtractor.getDirectories().iterator());
- }
-
- /**
- * Copies extracted tags to tika metadata using registered handlers.
- *
- * @param directories Metadata Extractor {@link com.drew.metadata.Directory} instances.
- * @throws MetadataException This method does not handle exceptions from Metadata Extractor
- */
- protected void handle(Iterator<Directory> directories) throws MetadataException {
- while (directories.hasNext()) {
- Directory directory = directories.next();
- for (DirectoryHandler handler : handlers) {
- if (handler.supports(directory.getClass())) {
- handler.handle(directory, metadata);
- }
- }
- }
- }
-
- /**
- * Reads one or more type of Metadata Extractor fields.
- */
- static interface DirectoryHandler {
- /**
- * @param directoryType A Metadata Extractor directory class
- * @return true if the directory type is supported by this handler
- */
- boolean supports(Class<? extends Directory> directoryType);
-
- /**
- * @param directory extracted tags
- * @param metadata current tika metadata
- * @throws MetadataException typically field extraction error, aborts all further extraction
- */
- void handle(Directory directory, Metadata metadata)
- throws MetadataException;
- }
-
- /**
- * Mimics the behavior from TIKA-314 of copying all extracted tags
- * to tika metadata using field names from Metadata Extractor.
- */
- static class CopyAllFieldsHandler implements DirectoryHandler {
- public boolean supports(Class<? extends Directory> directoryType) {
- return true;
- }
-
- public void handle(Directory directory, Metadata metadata)
- throws MetadataException {
- if (directory.getTags() != null) {
- for (Tag tag : directory.getTags()) {
- metadata.set(tag.getTagName(), tag.getDescription());
- }
- }
- }
- }
-
- /**
- * Copies all fields regardless of directory, if the tag name
- * is not identical to a known Metadata field name.
- * This leads to more predictable behavior than {@link CopyAllFieldsHandler}.
- */
- static class CopyUnknownFieldsHandler implements DirectoryHandler {
- public boolean supports(Class<? extends Directory> directoryType) {
- return true;
- }
-
- public void handle(Directory directory, Metadata metadata)
- throws MetadataException {
- if (directory.getTags() != null) {
- for (Tag tag : directory.getTags()) {
- String name = tag.getTagName();
- if (!MetadataFields.isMetadataField(name) && tag.getDescription() != null) {
- String value = tag.getDescription().trim();
- if (Boolean.TRUE.toString().equalsIgnoreCase(value)) {
- value = Boolean.TRUE.toString();
- } else if (Boolean.FALSE.toString().equalsIgnoreCase(value)) {
- value = Boolean.FALSE.toString();
- }
- metadata.set(name, value);
- }
- }
- }
- }
- }
-
- /**
- * Basic image properties for TIFF and JPEG, at least.
- */
- static class DimensionsHandler implements DirectoryHandler {
- private final Pattern LEADING_NUMBERS = Pattern.compile("(\\d+)\\s*.*");
-
- public boolean supports(Class<? extends Directory> directoryType) {
- return directoryType == JpegDirectory.class ||
- directoryType == ExifSubIFDDirectory.class ||
- directoryType == ExifThumbnailDirectory.class ||
- directoryType == ExifIFD0Directory.class;
- }
-
- public void handle(Directory directory, Metadata metadata) throws MetadataException {
- // The test TIFF has width and height stored as follows according to exiv2
- //Exif.Image.ImageWidth Short 1 100
- //Exif.Image.ImageLength Short 1 75
- // and the values are found in "Thumbnail Image Width" (and Height) from Metadata Extractor
- set(directory, metadata, JpegDirectory.TAG_IMAGE_WIDTH, Metadata.IMAGE_WIDTH);
- set(directory, metadata, JpegDirectory.TAG_IMAGE_HEIGHT, Metadata.IMAGE_LENGTH);
- // Bits per sample, two methods of extracting, exif overrides jpeg
- set(directory, metadata, JpegDirectory.TAG_DATA_PRECISION, Metadata.BITS_PER_SAMPLE);
- set(directory, metadata, ExifSubIFDDirectory.TAG_BITS_PER_SAMPLE, Metadata.BITS_PER_SAMPLE);
- // Straightforward
- set(directory, metadata, ExifSubIFDDirectory.TAG_SAMPLES_PER_PIXEL, Metadata.SAMPLES_PER_PIXEL);
- }
-
- private void set(Directory directory, Metadata metadata, int extractTag, Property metadataField) {
- if (directory.containsTag(extractTag)) {
- Matcher m = LEADING_NUMBERS.matcher(directory.getString(extractTag));
- if (m.matches()) {
- metadata.set(metadataField, m.group(1));
- }
- }
- }
- }
-
- static class JpegCommentHandler implements DirectoryHandler {
- public boolean supports(Class<? extends Directory> directoryType) {
- return directoryType == JpegCommentDirectory.class;
- }
-
- public void handle(Directory directory, Metadata metadata) throws MetadataException {
- if (directory.containsTag(JpegCommentDirectory.TAG_COMMENT)) {
- metadata.add(TikaCoreProperties.COMMENTS, directory.getString(JpegCommentDirectory.TAG_COMMENT));
- }
- }
- }
-
- static class ExifHandler implements DirectoryHandler {
- // There's a new ExifHandler for each file processed, so this is thread safe
- private static final ThreadLocal<SimpleDateFormat> DATE_UNSPECIFIED_TZ = new ThreadLocal<SimpleDateFormat>() {
- @Override
- protected SimpleDateFormat initialValue() {
- return new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.US);
- }
- };
-
- public boolean supports(Class<? extends Directory> directoryType) {
- return directoryType == ExifIFD0Directory.class ||
- directoryType == ExifSubIFDDirectory.class;
- }
-
- public void handle(Directory directory, Metadata metadata) {
- try {
- handleDateTags(directory, metadata);
- handlePhotoTags(directory, metadata);
- handleCommentTags(directory, metadata);
- } catch (MetadataException e) {
- // ignore date parse errors and proceed with other tags
- }
- }
-
- /**
- * EXIF may contain image description, although with undefined encoding.
- * Use IPTC for other annotation fields, and XMP for unicode support.
- */
- public void handleCommentTags(Directory directory, Metadata metadata) {
- if (metadata.get(TikaCoreProperties.DESCRIPTION) == null &&
- directory.containsTag(ExifIFD0Directory.TAG_IMAGE_DESCRIPTION)) {
- metadata.set(TikaCoreProperties.DESCRIPTION,
- directory.getString(ExifIFD0Directory.TAG_IMAGE_DESCRIPTION));
- }
- }
-
- /**
- * Maps common TIFF and EXIF tags onto the Tika
- * TIFF image metadata namespace.
- */
- public void handlePhotoTags(Directory directory, Metadata metadata) {
- if (directory.containsTag(ExifSubIFDDirectory.TAG_EXPOSURE_TIME)) {
- Object exposure = directory.getObject(ExifSubIFDDirectory.TAG_EXPOSURE_TIME);
- if (exposure instanceof Rational) {
- metadata.set(Metadata.EXPOSURE_TIME, ((Rational) exposure).doubleValue());
- } else {
- metadata.set(Metadata.EXPOSURE_TIME, directory.getString(ExifSubIFDDirectory.TAG_EXPOSURE_TIME));
- }
- }
-
- if (directory.containsTag(ExifSubIFDDirectory.TAG_FLASH)) {
- String flash = directory.getDescription(ExifSubIFDDirectory.TAG_FLASH);
- if (flash != null) {
- if (flash.contains("Flash fired")) {
- metadata.set(Metadata.FLASH_FIRED, Boolean.TRUE.toString());
- } else if (flash.contains("Flash did not fire")) {
- metadata.set(Metadata.FLASH_FIRED, Boolean.FALSE.toString());
- } else {
- metadata.set(Metadata.FLASH_FIRED, flash);
- }
- }
- }
-
- if (directory.containsTag(ExifSubIFDDirectory.TAG_FNUMBER)) {
- Object fnumber = directory.getObject(ExifSubIFDDirectory.TAG_FNUMBER);
- if (fnumber instanceof Rational) {
- metadata.set(Metadata.F_NUMBER, ((Rational) fnumber).doubleValue());
- } else {
- metadata.set(Metadata.F_NUMBER, directory.getString(ExifSubIFDDirectory.TAG_FNUMBER));
- }
- }
-
- if (directory.containsTag(ExifSubIFDDirectory.TAG_FOCAL_LENGTH)) {
- Object length = directory.getObject(ExifSubIFDDirectory.TAG_FOCAL_LENGTH);
- if (length instanceof Rational) {
- metadata.set(Metadata.FOCAL_LENGTH, ((Rational) length).doubleValue());
- } else {
- metadata.set(Metadata.FOCAL_LENGTH, directory.getString(ExifSubIFDDirectory.TAG_FOCAL_LENGTH));
- }
- }
-
- if (directory.containsTag(ExifSubIFDDirectory.TAG_ISO_EQUIVALENT)) {
- metadata.set(Metadata.ISO_SPEED_RATINGS, directory.getString(ExifSubIFDDirectory.TAG_ISO_EQUIVALENT));
- }
-
- if (directory.containsTag(ExifIFD0Directory.TAG_MAKE)) {
- metadata.set(Metadata.EQUIPMENT_MAKE, directory.getString(ExifIFD0Directory.TAG_MAKE));
- }
- if (directory.containsTag(ExifIFD0Directory.TAG_MODEL)) {
- metadata.set(Metadata.EQUIPMENT_MODEL, directory.getString(ExifIFD0Directory.TAG_MODEL));
- }
-
- if (directory.containsTag(ExifIFD0Directory.TAG_ORIENTATION)) {
- Object length = directory.getObject(ExifIFD0Directory.TAG_ORIENTATION);
- if (length instanceof Integer) {
- metadata.set(Metadata.ORIENTATION, Integer.toString((Integer) length));
- } else {
- metadata.set(Metadata.ORIENTATION, directory.getString(ExifIFD0Directory.TAG_ORIENTATION));
- }
- }
-
- if (directory.containsTag(ExifIFD0Directory.TAG_SOFTWARE)) {
- metadata.set(Metadata.SOFTWARE, directory.getString(ExifIFD0Directory.TAG_SOFTWARE));
- }
-
- if (directory.containsTag(ExifIFD0Directory.TAG_X_RESOLUTION)) {
- Object resolution = directory.getObject(ExifIFD0Directory.TAG_X_RESOLUTION);
- if (resolution instanceof Rational) {
- metadata.set(Metadata.RESOLUTION_HORIZONTAL, ((Rational) resolution).doubleValue());
- } else {
- metadata.set(Metadata.RESOLUTION_HORIZONTAL, directory.getString(ExifIFD0Directory.TAG_X_RESOLUTION));
- }
- }
- if (directory.containsTag(ExifIFD0Directory.TAG_Y_RESOLUTION)) {
- Object resolution = directory.getObject(ExifIFD0Directory.TAG_Y_RESOLUTION);
- if (resolution instanceof Rational) {
- metadata.set(Metadata.RESOLUTION_VERTICAL, ((Rational) resolution).doubleValue());
- } else {
- metadata.set(Metadata.RESOLUTION_VERTICAL, directory.getString(ExifIFD0Directory.TAG_Y_RESOLUTION));
- }
- }
- if (directory.containsTag(ExifIFD0Directory.TAG_RESOLUTION_UNIT)) {
- metadata.set(Metadata.RESOLUTION_UNIT, directory.getDescription(ExifIFD0Directory.TAG_RESOLUTION_UNIT));
- }
- if (directory.containsTag(ExifThumbnailDirectory.TAG_IMAGE_WIDTH)) {
- metadata.set(Metadata.IMAGE_WIDTH,
- trimPixels(directory.getDescription(ExifThumbnailDirectory.TAG_IMAGE_WIDTH)));
- }
- if (directory.containsTag(ExifThumbnailDirectory.TAG_IMAGE_HEIGHT)) {
- metadata.set(Metadata.IMAGE_LENGTH,
- trimPixels(directory.getDescription(ExifThumbnailDirectory.TAG_IMAGE_HEIGHT)));
- }
- }
-
- /**
- * Maps exif dates to metadata fields.
- */
- public void handleDateTags(Directory directory, Metadata metadata)
- throws MetadataException {
- //TODO: should we try to process ExifSubIFDDirectory.TAG_TIME_ZONE_OFFSET
- //if it exists?
- // Date/Time Original overrides value from ExifDirectory.TAG_DATETIME
- Date original = null;
- if (directory.containsTag(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)) {
- original = directory.getDate(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL);
- // Unless we have GPS time we don't know the time zone so date must be set
- // as ISO 8601 datetime without timezone suffix (no Z or +/-)
- if (original != null) {
- String datetimeNoTimeZone = DATE_UNSPECIFIED_TZ.get().format(original); // Same time zone as Metadata Extractor uses
- metadata.set(TikaCoreProperties.CREATED, datetimeNoTimeZone);
- metadata.set(Metadata.ORIGINAL_DATE, datetimeNoTimeZone);
- }
- }
- if (directory.containsTag(ExifIFD0Directory.TAG_DATETIME)) {
- Date datetime = directory.getDate(ExifIFD0Directory.TAG_DATETIME);
- if (datetime != null) {
- String datetimeNoTimeZone = DATE_UNSPECIFIED_TZ.get().format(datetime);
- metadata.set(TikaCoreProperties.MODIFIED, datetimeNoTimeZone);
- // If Date/Time Original does not exist this might be creation date
- if (metadata.get(TikaCoreProperties.CREATED) == null) {
- metadata.set(TikaCoreProperties.CREATED, datetimeNoTimeZone);
- }
- }
- }
- }
- }
-
- /**
- * Reads image comments, originally TIKA-472.
- * Metadata Extractor does not read XMP so we need to use the values from Iptc or EXIF
- */
- static class IptcHandler implements DirectoryHandler {
- public boolean supports(Class<? extends Directory> directoryType) {
- return directoryType == IptcDirectory.class;
- }
-
- public void handle(Directory directory, Metadata metadata)
- throws MetadataException {
- if (directory.containsTag(IptcDirectory.TAG_KEYWORDS)) {
- String[] keywords = directory.getStringArray(IptcDirectory.TAG_KEYWORDS);
- for (String k : keywords) {
- metadata.add(TikaCoreProperties.KEYWORDS, k);
- }
- }
- if (directory.containsTag(IptcDirectory.TAG_HEADLINE)) {
- metadata.set(TikaCoreProperties.TITLE, directory.getString(IptcDirectory.TAG_HEADLINE));
- } else if (directory.containsTag(IptcDirectory.TAG_OBJECT_NAME)) {
- metadata.set(TikaCoreProperties.TITLE, directory.getString(IptcDirectory.TAG_OBJECT_NAME));
- }
- if (directory.containsTag(IptcDirectory.TAG_BY_LINE)) {
- metadata.set(TikaCoreProperties.CREATOR, directory.getString(IptcDirectory.TAG_BY_LINE));
- metadata.set(IPTC.CREATOR, directory.getString(IptcDirectory.TAG_BY_LINE));
- }
- if (directory.containsTag(IptcDirectory.TAG_CAPTION)) {
- metadata.set(TikaCoreProperties.DESCRIPTION,
- // Looks like metadata extractor returns IPTC newlines as a single carriage return,
- // but the exiv2 command does not so we change to line feed here because that is less surprising to users
- directory.getString(IptcDirectory.TAG_CAPTION).replaceAll("\r\n?", "\n"));
- }
- }
- }
-
- /**
- * Maps EXIF Geo Tags onto the Tika Geo metadata namespace.
- */
- static class GeotagHandler implements DirectoryHandler {
- public boolean supports(Class<? extends Directory> directoryType) {
- return directoryType == GpsDirectory.class;
- }
-
- public void handle(Directory directory, Metadata metadata) throws MetadataException {
- GeoLocation geoLocation = ((GpsDirectory) directory).getGeoLocation();
- if (geoLocation != null) {
- DecimalFormat geoDecimalFormat = new DecimalFormat(GEO_DECIMAL_FORMAT_STRING,
- new DecimalFormatSymbols(Locale.ENGLISH));
- metadata.set(TikaCoreProperties.LATITUDE, geoDecimalFormat.format(geoLocation.getLatitude()));
- metadata.set(TikaCoreProperties.LONGITUDE, geoDecimalFormat.format(geoLocation.getLongitude()));
- }
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.DecimalFormat;
+import java.text.DecimalFormatSymbols;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import com.drew.imaging.jpeg.JpegMetadataReader;
+import com.drew.imaging.jpeg.JpegProcessingException;
+import com.drew.imaging.riff.RiffProcessingException;
+import com.drew.imaging.tiff.TiffMetadataReader;
+import com.drew.imaging.tiff.TiffProcessingException;
+import com.drew.imaging.webp.WebpMetadataReader;
+import com.drew.lang.ByteArrayReader;
+import com.drew.lang.GeoLocation;
+import com.drew.lang.Rational;
+import com.drew.metadata.Directory;
+import com.drew.metadata.MetadataException;
+import com.drew.metadata.Tag;
+import com.drew.metadata.exif.ExifIFD0Directory;
+import com.drew.metadata.exif.ExifReader;
+import com.drew.metadata.exif.ExifSubIFDDirectory;
+import com.drew.metadata.exif.ExifThumbnailDirectory;
+import com.drew.metadata.exif.GpsDirectory;
+import com.drew.metadata.iptc.IptcDirectory;
+import com.drew.metadata.jpeg.JpegCommentDirectory;
+import com.drew.metadata.jpeg.JpegDirectory;
+import org.apache.commons.io.IOUtils;
+import org.apache.jempbox.xmp.XMPMetadata;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.IPTC;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.xmp.JempboxExtractor;
+import org.w3c.dom.Document;
+import org.xml.sax.SAXException;
+
+/**
+ * Uses the <a href="http://www.drewnoakes.com/code/exif/">Metadata Extractor</a> library
+ * to read EXIF and IPTC image metadata and map to Tika fields.
+ * <p/>
+ * As of 2.4.0 the library supports jpeg and tiff.
+ * As of 2.8.0 the library supports webp.
+ */
+public class ImageMetadataExtractor {
+
+ private static final String GEO_DECIMAL_FORMAT_STRING = "#.######"; // 6 dp seems to be reasonable
+ private final Metadata metadata;
+ private DirectoryHandler[] handlers;
+
+ /**
+ * @param metadata to extract to, using default directory handlers
+ */
+ public ImageMetadataExtractor(Metadata metadata) {
+ this(metadata,
+ new CopyUnknownFieldsHandler(),
+ new JpegCommentHandler(),
+ new ExifHandler(),
+ new DimensionsHandler(),
+ new GeotagHandler(),
+ new IptcHandler()
+ );
+ }
+
+ /**
+ * @param metadata to extract to
+ * @param handlers handlers in order, note that handlers may override values from earlier handlers
+ */
+ public ImageMetadataExtractor(Metadata metadata, DirectoryHandler... handlers) {
+ this.metadata = metadata;
+ this.handlers = handlers;
+ }
+
+ private static String trimPixels(String s) {
+ //if height/width appears as "100 pixels", trim " pixels"
+ if (s != null) {
+ int i = s.lastIndexOf(" pixels");
+ s = s.substring(0, i);
+ }
+ return s;
+ }
+
+ public void parseJpeg(File file)
+ throws IOException, SAXException, TikaException {
+ try {
+ com.drew.metadata.Metadata jpegMetadata = JpegMetadataReader.readMetadata(file);
+ handle(jpegMetadata);
+ } catch (JpegProcessingException e) {
+ throw new TikaException("Can't read JPEG metadata", e);
+ } catch (MetadataException e) {
+ throw new TikaException("Can't read JPEG metadata", e);
+ }
+ }
+
+ public void parseTiff(File file)
+ throws IOException, SAXException, TikaException {
+ try {
+ com.drew.metadata.Metadata tiffMetadata = TiffMetadataReader.readMetadata(file);
+ handle(tiffMetadata);
+ } catch (MetadataException e) {
+ throw new TikaException("Can't read TIFF metadata", e);
+ } catch (TiffProcessingException e) {
+ throw new TikaException("Can't read TIFF metadata", e);
+ }
+ }
+
+ public void parseWebP(File file) throws IOException, TikaException {
+
+ try {
+ com.drew.metadata.Metadata webPMetadata = new com.drew.metadata.Metadata();
+ webPMetadata = WebpMetadataReader.readMetadata(file);
+ handle(webPMetadata);
+ } catch (IOException e) {
+ throw e;
+ } catch (RiffProcessingException e) {
+ throw new TikaException("Can't process Riff data", e);
+ } catch (MetadataException e) {
+ throw new TikaException("Can't process Riff data", e);
+ }
+ }
+
+ public void parseRawExif(InputStream stream, int length, boolean needsExifHeader)
+ throws IOException, SAXException, TikaException {
+ byte[] exif;
+ if (needsExifHeader) {
+ exif = new byte[length + 6];
+ exif[0] = (byte) 'E';
+ exif[1] = (byte) 'x';
+ exif[2] = (byte) 'i';
+ exif[3] = (byte) 'f';
+ IOUtils.readFully(stream, exif, 6, length);
+ } else {
+ exif = new byte[length];
+ IOUtils.readFully(stream, exif, 0, length);
+ }
+ parseRawExif(exif);
+ }
+
+ public void parseRawExif(byte[] exifData)
+ throws IOException, SAXException, TikaException {
+ com.drew.metadata.Metadata metadata = new com.drew.metadata.Metadata();
+ ExifReader reader = new ExifReader();
+ reader.extract(new ByteArrayReader(exifData), metadata, ExifReader.JPEG_SEGMENT_PREAMBLE.length());
+
+ try {
+ handle(metadata);
+ } catch (MetadataException e) {
+ throw new TikaException("Can't process the EXIF Data", e);
+ }
+ }
+
+ public void parseRawXMP(byte[] xmpData)
+ throws IOException, SAXException, TikaException {
+ XMPMetadata xmp = null;
+ try (InputStream decoded =
+ new ByteArrayInputStream(xmpData)
+ ) {
+ Document dom = new ParseContext().getDocumentBuilder().parse(decoded);
+ if (dom != null) {
+ xmp = new XMPMetadata(dom);
+ }
+ } catch (IOException|SAXException e) {
+ //
+ }
+ if (xmp != null) {
+ JempboxExtractor.extractDublinCore(xmp, metadata);
+ JempboxExtractor.extractXMPMM(xmp, metadata);
+ }
+ }
+
+ /**
+ * Copies extracted tags to tika metadata using registered handlers.
+ *
+ * @param metadataExtractor Tag directories from a Metadata Extractor "reader"
+ * @throws MetadataException This method does not handle exceptions from Metadata Extractor
+ */
+ protected void handle(com.drew.metadata.Metadata metadataExtractor)
+ throws MetadataException {
+ handle(metadataExtractor.getDirectories().iterator());
+ }
+
+ /**
+ * Copies extracted tags to tika metadata using registered handlers.
+ *
+ * @param directories Metadata Extractor {@link com.drew.metadata.Directory} instances.
+ * @throws MetadataException This method does not handle exceptions from Metadata Extractor
+ */
+ protected void handle(Iterator<Directory> directories) throws MetadataException {
+ while (directories.hasNext()) {
+ Directory directory = directories.next();
+ for (DirectoryHandler handler : handlers) {
+ if (handler.supports(directory.getClass())) {
+ handler.handle(directory, metadata);
+ }
+ }
+ }
+ }
+
+ /**
+ * Reads one or more type of Metadata Extractor fields.
+ */
+ static interface DirectoryHandler {
+ /**
+ * @param directoryType A Metadata Extractor directory class
+ * @return true if the directory type is supported by this handler
+ */
+ boolean supports(Class<? extends Directory> directoryType);
+
+ /**
+ * @param directory extracted tags
+ * @param metadata current tika metadata
+ * @throws MetadataException typically field extraction error, aborts all further extraction
+ */
+ void handle(Directory directory, Metadata metadata)
+ throws MetadataException;
+ }
+
+ /**
+ * Mimics the behavior from TIKA-314 of copying all extracted tags
+ * to tika metadata using field names from Metadata Extractor.
+ */
+ static class CopyAllFieldsHandler implements DirectoryHandler {
+ public boolean supports(Class<? extends Directory> directoryType) {
+ return true;
+ }
+
+ public void handle(Directory directory, Metadata metadata)
+ throws MetadataException {
+ if (directory.getTags() != null) {
+ for (Tag tag : directory.getTags()) {
+ metadata.set(tag.getTagName(), tag.getDescription());
+ }
+ }
+ }
+ }
+
+ /**
+ * Copies all fields regardless of directory, if the tag name
+ * is not identical to a known Metadata field name.
+ * This leads to more predictable behavior than {@link CopyAllFieldsHandler}.
+ */
+ static class CopyUnknownFieldsHandler implements DirectoryHandler {
+ public boolean supports(Class<? extends Directory> directoryType) {
+ return true;
+ }
+
+ public void handle(Directory directory, Metadata metadata)
+ throws MetadataException {
+ if (directory.getTags() != null) {
+ for (Tag tag : directory.getTags()) {
+ String name = tag.getTagName();
+ if (!MetadataFields.isMetadataField(name) && tag.getDescription() != null) {
+ String value = tag.getDescription().trim();
+ if (Boolean.TRUE.toString().equalsIgnoreCase(value)) {
+ value = Boolean.TRUE.toString();
+ } else if (Boolean.FALSE.toString().equalsIgnoreCase(value)) {
+ value = Boolean.FALSE.toString();
+ }
+ metadata.set(name, value);
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Basic image properties for TIFF and JPEG, at least.
+ */
+ static class DimensionsHandler implements DirectoryHandler {
+ private final Pattern LEADING_NUMBERS = Pattern.compile("(\\d+)\\s*.*");
+
+ public boolean supports(Class<? extends Directory> directoryType) {
+ return directoryType == JpegDirectory.class ||
+ directoryType == ExifSubIFDDirectory.class ||
+ directoryType == ExifThumbnailDirectory.class ||
+ directoryType == ExifIFD0Directory.class;
+ }
+
+ public void handle(Directory directory, Metadata metadata) throws MetadataException {
+ // The test TIFF has width and height stored as follows according to exiv2
+ //Exif.Image.ImageWidth Short 1 100
+ //Exif.Image.ImageLength Short 1 75
+ // and the values are found in "Thumbnail Image Width" (and Height) from Metadata Extractor
+ set(directory, metadata, JpegDirectory.TAG_IMAGE_WIDTH, Metadata.IMAGE_WIDTH);
+ set(directory, metadata, JpegDirectory.TAG_IMAGE_HEIGHT, Metadata.IMAGE_LENGTH);
+ // Bits per sample, two methods of extracting, exif overrides jpeg
+ set(directory, metadata, JpegDirectory.TAG_DATA_PRECISION, Metadata.BITS_PER_SAMPLE);
+ set(directory, metadata, ExifSubIFDDirectory.TAG_BITS_PER_SAMPLE, Metadata.BITS_PER_SAMPLE);
+ // Straightforward
+ set(directory, metadata, ExifSubIFDDirectory.TAG_SAMPLES_PER_PIXEL, Metadata.SAMPLES_PER_PIXEL);
+ }
+
+ private void set(Directory directory, Metadata metadata, int extractTag, Property metadataField) {
+ if (directory.containsTag(extractTag)) {
+ Matcher m = LEADING_NUMBERS.matcher(directory.getString(extractTag));
+ if (m.matches()) {
+ metadata.set(metadataField, m.group(1));
+ }
+ }
+ }
+ }
+
+ static class JpegCommentHandler implements DirectoryHandler {
+ public boolean supports(Class<? extends Directory> directoryType) {
+ return directoryType == JpegCommentDirectory.class;
+ }
+
+ public void handle(Directory directory, Metadata metadata) throws MetadataException {
+ if (directory.containsTag(JpegCommentDirectory.TAG_COMMENT)) {
+ metadata.add(TikaCoreProperties.COMMENTS, directory.getString(JpegCommentDirectory.TAG_COMMENT));
+ }
+ }
+ }
+
+ static class ExifHandler implements DirectoryHandler {
+ // There's a new ExifHandler for each file processed, so this is thread safe
+ private static final ThreadLocal<SimpleDateFormat> DATE_UNSPECIFIED_TZ = new ThreadLocal<SimpleDateFormat>() {
+ @Override
+ protected SimpleDateFormat initialValue() {
+ return new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.US);
+ }
+ };
+
+ public boolean supports(Class<? extends Directory> directoryType) {
+ return directoryType == ExifIFD0Directory.class ||
+ directoryType == ExifSubIFDDirectory.class;
+ }
+
+ public void handle(Directory directory, Metadata metadata) {
+ try {
+ handleDateTags(directory, metadata);
+ handlePhotoTags(directory, metadata);
+ handleCommentTags(directory, metadata);
+ } catch (MetadataException e) {
+ // ignore date parse errors and proceed with other tags
+ }
+ }
+
+ /**
+ * EXIF may contain image description, although with undefined encoding.
+ * Use IPTC for other annotation fields, and XMP for unicode support.
+ */
+ public void handleCommentTags(Directory directory, Metadata metadata) {
+ if (metadata.get(TikaCoreProperties.DESCRIPTION) == null &&
+ directory.containsTag(ExifIFD0Directory.TAG_IMAGE_DESCRIPTION)) {
+ metadata.set(TikaCoreProperties.DESCRIPTION,
+ directory.getString(ExifIFD0Directory.TAG_IMAGE_DESCRIPTION));
+ }
+ }
+
+ /**
+ * Maps common TIFF and EXIF tags onto the Tika
+ * TIFF image metadata namespace.
+ */
+ public void handlePhotoTags(Directory directory, Metadata metadata) {
+ if (directory.containsTag(ExifSubIFDDirectory.TAG_EXPOSURE_TIME)) {
+ Object exposure = directory.getObject(ExifSubIFDDirectory.TAG_EXPOSURE_TIME);
+ if (exposure instanceof Rational) {
+ metadata.set(Metadata.EXPOSURE_TIME, ((Rational) exposure).doubleValue());
+ } else {
+ metadata.set(Metadata.EXPOSURE_TIME, directory.getString(ExifSubIFDDirectory.TAG_EXPOSURE_TIME));
+ }
+ }
+
+ if (directory.containsTag(ExifSubIFDDirectory.TAG_FLASH)) {
+ String flash = directory.getDescription(ExifSubIFDDirectory.TAG_FLASH);
+ if (flash != null) {
+ if (flash.contains("Flash fired")) {
+ metadata.set(Metadata.FLASH_FIRED, Boolean.TRUE.toString());
+ } else if (flash.contains("Flash did not fire")) {
+ metadata.set(Metadata.FLASH_FIRED, Boolean.FALSE.toString());
+ } else {
+ metadata.set(Metadata.FLASH_FIRED, flash);
+ }
+ }
+ }
+
+ if (directory.containsTag(ExifSubIFDDirectory.TAG_FNUMBER)) {
+ Object fnumber = directory.getObject(ExifSubIFDDirectory.TAG_FNUMBER);
+ if (fnumber instanceof Rational) {
+ metadata.set(Metadata.F_NUMBER, ((Rational) fnumber).doubleValue());
+ } else {
+ metadata.set(Metadata.F_NUMBER, directory.getString(ExifSubIFDDirectory.TAG_FNUMBER));
+ }
+ }
+
+ if (directory.containsTag(ExifSubIFDDirectory.TAG_FOCAL_LENGTH)) {
+ Object length = directory.getObject(ExifSubIFDDirectory.TAG_FOCAL_LENGTH);
+ if (length instanceof Rational) {
+ metadata.set(Metadata.FOCAL_LENGTH, ((Rational) length).doubleValue());
+ } else {
+ metadata.set(Metadata.FOCAL_LENGTH, directory.getString(ExifSubIFDDirectory.TAG_FOCAL_LENGTH));
+ }
+ }
+
+ if (directory.containsTag(ExifSubIFDDirectory.TAG_ISO_EQUIVALENT)) {
+ metadata.set(Metadata.ISO_SPEED_RATINGS, directory.getString(ExifSubIFDDirectory.TAG_ISO_EQUIVALENT));
+ }
+
+ if (directory.containsTag(ExifIFD0Directory.TAG_MAKE)) {
+ metadata.set(Metadata.EQUIPMENT_MAKE, directory.getString(ExifIFD0Directory.TAG_MAKE));
+ }
+ if (directory.containsTag(ExifIFD0Directory.TAG_MODEL)) {
+ metadata.set(Metadata.EQUIPMENT_MODEL, directory.getString(ExifIFD0Directory.TAG_MODEL));
+ }
+
+ if (directory.containsTag(ExifIFD0Directory.TAG_ORIENTATION)) {
+ Object length = directory.getObject(ExifIFD0Directory.TAG_ORIENTATION);
+ if (length instanceof Integer) {
+ metadata.set(Metadata.ORIENTATION, Integer.toString((Integer) length));
+ } else {
+ metadata.set(Metadata.ORIENTATION, directory.getString(ExifIFD0Directory.TAG_ORIENTATION));
+ }
+ }
+
+ if (directory.containsTag(ExifIFD0Directory.TAG_SOFTWARE)) {
+ metadata.set(Metadata.SOFTWARE, directory.getString(ExifIFD0Directory.TAG_SOFTWARE));
+ }
+
+ if (directory.containsTag(ExifIFD0Directory.TAG_X_RESOLUTION)) {
+ Object resolution = directory.getObject(ExifIFD0Directory.TAG_X_RESOLUTION);
+ if (resolution instanceof Rational) {
+ metadata.set(Metadata.RESOLUTION_HORIZONTAL, ((Rational) resolution).doubleValue());
+ } else {
+ metadata.set(Metadata.RESOLUTION_HORIZONTAL, directory.getString(ExifIFD0Directory.TAG_X_RESOLUTION));
+ }
+ }
+ if (directory.containsTag(ExifIFD0Directory.TAG_Y_RESOLUTION)) {
+ Object resolution = directory.getObject(ExifIFD0Directory.TAG_Y_RESOLUTION);
+ if (resolution instanceof Rational) {
+ metadata.set(Metadata.RESOLUTION_VERTICAL, ((Rational) resolution).doubleValue());
+ } else {
+ metadata.set(Metadata.RESOLUTION_VERTICAL, directory.getString(ExifIFD0Directory.TAG_Y_RESOLUTION));
+ }
+ }
+ if (directory.containsTag(ExifIFD0Directory.TAG_RESOLUTION_UNIT)) {
+ metadata.set(Metadata.RESOLUTION_UNIT, directory.getDescription(ExifIFD0Directory.TAG_RESOLUTION_UNIT));
+ }
+ if (directory.containsTag(ExifThumbnailDirectory.TAG_IMAGE_WIDTH)) {
+ metadata.set(Metadata.IMAGE_WIDTH,
+ trimPixels(directory.getDescription(ExifThumbnailDirectory.TAG_IMAGE_WIDTH)));
+ }
+ if (directory.containsTag(ExifThumbnailDirectory.TAG_IMAGE_HEIGHT)) {
+ metadata.set(Metadata.IMAGE_LENGTH,
+ trimPixels(directory.getDescription(ExifThumbnailDirectory.TAG_IMAGE_HEIGHT)));
+ }
+ }
+
+ /**
+ * Maps exif dates to metadata fields.
+ */
+ public void handleDateTags(Directory directory, Metadata metadata)
+ throws MetadataException {
+ //TODO: should we try to process ExifSubIFDDirectory.TAG_TIME_ZONE_OFFSET
+ //if it exists?
+ // Date/Time Original overrides value from ExifDirectory.TAG_DATETIME
+ Date original = null;
+ if (directory.containsTag(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)) {
+ original = directory.getDate(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL);
+ // Unless we have GPS time we don't know the time zone so date must be set
+ // as ISO 8601 datetime without timezone suffix (no Z or +/-)
+ if (original != null) {
+ String datetimeNoTimeZone = DATE_UNSPECIFIED_TZ.get().format(original); // Same time zone as Metadata Extractor uses
+ metadata.set(TikaCoreProperties.CREATED, datetimeNoTimeZone);
+ metadata.set(Metadata.ORIGINAL_DATE, datetimeNoTimeZone);
+ }
+ }
+ if (directory.containsTag(ExifIFD0Directory.TAG_DATETIME)) {
+ Date datetime = directory.getDate(ExifIFD0Directory.TAG_DATETIME);
+ if (datetime != null) {
+ String datetimeNoTimeZone = DATE_UNSPECIFIED_TZ.get().format(datetime);
+ metadata.set(TikaCoreProperties.MODIFIED, datetimeNoTimeZone);
+ // If Date/Time Original does not exist this might be creation date
+ if (metadata.get(TikaCoreProperties.CREATED) == null) {
+ metadata.set(TikaCoreProperties.CREATED, datetimeNoTimeZone);
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Reads image comments, originally TIKA-472.
+ * Metadata Extractor does not read XMP so we need to use the values from Iptc or EXIF
+ */
+ static class IptcHandler implements DirectoryHandler {
+ public boolean supports(Class<? extends Directory> directoryType) {
+ return directoryType == IptcDirectory.class;
+ }
+
+ public void handle(Directory directory, Metadata metadata)
+ throws MetadataException {
+ if (directory.containsTag(IptcDirectory.TAG_KEYWORDS)) {
+ String[] keywords = directory.getStringArray(IptcDirectory.TAG_KEYWORDS);
+ for (String k : keywords) {
+ metadata.add(TikaCoreProperties.KEYWORDS, k);
+ }
+ }
+ if (directory.containsTag(IptcDirectory.TAG_HEADLINE)) {
+ metadata.set(TikaCoreProperties.TITLE, directory.getString(IptcDirectory.TAG_HEADLINE));
+ } else if (directory.containsTag(IptcDirectory.TAG_OBJECT_NAME)) {
+ metadata.set(TikaCoreProperties.TITLE, directory.getString(IptcDirectory.TAG_OBJECT_NAME));
+ }
+ if (directory.containsTag(IptcDirectory.TAG_BY_LINE)) {
+ metadata.set(TikaCoreProperties.CREATOR, directory.getString(IptcDirectory.TAG_BY_LINE));
+ metadata.set(IPTC.CREATOR, directory.getString(IptcDirectory.TAG_BY_LINE));
+ }
+ if (directory.containsTag(IptcDirectory.TAG_CAPTION)) {
+ metadata.set(TikaCoreProperties.DESCRIPTION,
+ // Looks like metadata extractor returns IPTC newlines as a single carriage return,
+ // but the exiv2 command does not so we change to line feed here because that is less surprising to users
+ directory.getString(IptcDirectory.TAG_CAPTION).replaceAll("\r\n?", "\n"));
+ }
+ }
+ }
+
+ /**
+ * Maps EXIF Geo Tags onto the Tika Geo metadata namespace.
+ */
+ static class GeotagHandler implements DirectoryHandler {
+ public boolean supports(Class<? extends Directory> directoryType) {
+ return directoryType == GpsDirectory.class;
+ }
+
+ public void handle(Directory directory, Metadata metadata) throws MetadataException {
+ GeoLocation geoLocation = ((GpsDirectory) directory).getGeoLocation();
+ if (geoLocation != null) {
+ DecimalFormat geoDecimalFormat = new DecimalFormat(GEO_DECIMAL_FORMAT_STRING,
+ new DecimalFormatSymbols(Locale.ENGLISH));
+ metadata.set(TikaCoreProperties.LATITUDE, geoDecimalFormat.format(geoLocation.getLatitude()));
+ metadata.set(TikaCoreProperties.LONGITUDE, geoDecimalFormat.format(geoLocation.getLongitude()));
+ }
+ }
+ }
+
+}
[33/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageParser.java
index e42f542..8fd23eb 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageParser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageParser.java
@@ -1,203 +1,203 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.image;
-
-import javax.imageio.IIOException;
-import javax.imageio.ImageIO;
-import javax.imageio.ImageReader;
-import javax.imageio.metadata.IIOMetadata;
-import javax.imageio.stream.ImageInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Set;
-
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.w3c.dom.NamedNodeMap;
-import org.w3c.dom.Node;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-public class ImageParser extends AbstractParser {
-
- /**
- * Serial version UID
- */
- private static final long serialVersionUID = 7852529269245520335L;
-
- private static final MediaType CANONICAL_BMP_TYPE = MediaType.image("x-ms-bmp");
- private static final MediaType JAVA_BMP_TYPE = MediaType.image("bmp");
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- CANONICAL_BMP_TYPE,
- JAVA_BMP_TYPE,
- MediaType.image("gif"),
- MediaType.image("png"),
- MediaType.image("vnd.wap.wbmp"),
- MediaType.image("x-icon"),
- MediaType.image("x-xcf"))));
-
- private static void setIfPresent(Metadata metadata, String imageIOkey, String tikaKey) {
- if (metadata.get(imageIOkey) != null) {
- metadata.set(tikaKey, metadata.get(imageIOkey));
- }
- }
-
- private static void setIfPresent(Metadata metadata, String imageIOkey, Property tikaProp) {
- if (metadata.get(imageIOkey) != null) {
- String v = metadata.get(imageIOkey);
- if (v.endsWith(" ")) {
- v = v.substring(0, v.lastIndexOf(' '));
- }
- metadata.set(tikaProp, v);
- }
- }
-
- private static void loadMetadata(IIOMetadata imageMetadata, Metadata metadata) {
- String[] names = imageMetadata.getMetadataFormatNames();
- if (names == null) {
- return;
- }
- for (String name : names) {
- loadNode(metadata, imageMetadata.getAsTree(name), "", false);
- }
- }
-
- private static void loadNode(
- Metadata metadata, Node node, String parents,
- boolean addThisNodeName) {
- if (addThisNodeName) {
- if (parents.length() > 0) {
- parents += " ";
- }
- parents += node.getNodeName();
- }
- NamedNodeMap map = node.getAttributes();
- if (map != null) {
-
- int length = map.getLength();
- if (length == 1) {
- metadata.add(parents, normalize(map.item(0).getNodeValue()));
- } else if (length > 1) {
- StringBuilder value = new StringBuilder();
- for (int i = 0; i < length; i++) {
- if (i > 0) {
- value.append(", ");
- }
- Node attr = map.item(i);
- value.append(attr.getNodeName());
- value.append("=");
- value.append(normalize(attr.getNodeValue()));
- }
- metadata.add(parents, value.toString());
- }
- }
-
- Node child = node.getFirstChild();
- while (child != null) {
- // print children recursively
- loadNode(metadata, child, parents, true);
- child = child.getNextSibling();
- }
- }
-
- private static String normalize(String value) {
- if (value != null) {
- value = value.trim();
- } else {
- value = "";
- }
- if (Boolean.TRUE.toString().equalsIgnoreCase(value)) {
- return Boolean.TRUE.toString();
- } else if (Boolean.FALSE.toString().equalsIgnoreCase(value)) {
- return Boolean.FALSE.toString();
- }
- return value;
- }
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- String type = metadata.get(Metadata.CONTENT_TYPE);
- if (type != null) {
- // Java has a different idea of the BMP mime type to
- // what the canonical one is, fix this up.
- if (CANONICAL_BMP_TYPE.toString().equals(type)) {
- type = JAVA_BMP_TYPE.toString();
- }
-
- try {
- Iterator<ImageReader> iterator =
- ImageIO.getImageReadersByMIMEType(type);
- if (iterator.hasNext()) {
- ImageReader reader = iterator.next();
- try {
- try (ImageInputStream imageStream = ImageIO.createImageInputStream(
- new CloseShieldInputStream(stream))) {
- reader.setInput(imageStream);
-
- metadata.set(Metadata.IMAGE_WIDTH, Integer.toString(reader.getWidth(0)));
- metadata.set(Metadata.IMAGE_LENGTH, Integer.toString(reader.getHeight(0)));
- metadata.set("height", Integer.toString(reader.getHeight(0)));
- metadata.set("width", Integer.toString(reader.getWidth(0)));
-
- loadMetadata(reader.getImageMetadata(0), metadata);
- }
- } finally {
- reader.dispose();
- }
- }
-
- // Translate certain Metadata tags from the ImageIO
- // specific namespace into the general Tika one
- setIfPresent(metadata, "CommentExtensions CommentExtension", TikaCoreProperties.COMMENTS);
- setIfPresent(metadata, "markerSequence com", TikaCoreProperties.COMMENTS);
- setIfPresent(metadata, "Data BitsPerSample", Metadata.BITS_PER_SAMPLE);
- } catch (IIOException e) {
- // TIKA-619: There is a known bug in the Sun API when dealing with GIF images
- // which Tika will just ignore.
- if (!(e.getMessage() != null &&
- e.getMessage().equals("Unexpected block type 0!") &&
- type.equals("image/gif"))) {
- throw new TikaException(type + " parse error", e);
- }
- }
- }
-
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
- xhtml.endDocument();
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import javax.imageio.IIOException;
+import javax.imageio.ImageIO;
+import javax.imageio.ImageReader;
+import javax.imageio.metadata.IIOMetadata;
+import javax.imageio.stream.ImageInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class ImageParser extends AbstractParser {
+
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = 7852529269245520335L;
+
+ private static final MediaType CANONICAL_BMP_TYPE = MediaType.image("x-ms-bmp");
+ private static final MediaType JAVA_BMP_TYPE = MediaType.image("bmp");
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ CANONICAL_BMP_TYPE,
+ JAVA_BMP_TYPE,
+ MediaType.image("gif"),
+ MediaType.image("png"),
+ MediaType.image("vnd.wap.wbmp"),
+ MediaType.image("x-icon"),
+ MediaType.image("x-xcf"))));
+
+ private static void setIfPresent(Metadata metadata, String imageIOkey, String tikaKey) {
+ if (metadata.get(imageIOkey) != null) {
+ metadata.set(tikaKey, metadata.get(imageIOkey));
+ }
+ }
+
+ private static void setIfPresent(Metadata metadata, String imageIOkey, Property tikaProp) {
+ if (metadata.get(imageIOkey) != null) {
+ String v = metadata.get(imageIOkey);
+ if (v.endsWith(" ")) {
+ v = v.substring(0, v.lastIndexOf(' '));
+ }
+ metadata.set(tikaProp, v);
+ }
+ }
+
+ private static void loadMetadata(IIOMetadata imageMetadata, Metadata metadata) {
+ String[] names = imageMetadata.getMetadataFormatNames();
+ if (names == null) {
+ return;
+ }
+ for (String name : names) {
+ loadNode(metadata, imageMetadata.getAsTree(name), "", false);
+ }
+ }
+
+ private static void loadNode(
+ Metadata metadata, Node node, String parents,
+ boolean addThisNodeName) {
+ if (addThisNodeName) {
+ if (parents.length() > 0) {
+ parents += " ";
+ }
+ parents += node.getNodeName();
+ }
+ NamedNodeMap map = node.getAttributes();
+ if (map != null) {
+
+ int length = map.getLength();
+ if (length == 1) {
+ metadata.add(parents, normalize(map.item(0).getNodeValue()));
+ } else if (length > 1) {
+ StringBuilder value = new StringBuilder();
+ for (int i = 0; i < length; i++) {
+ if (i > 0) {
+ value.append(", ");
+ }
+ Node attr = map.item(i);
+ value.append(attr.getNodeName());
+ value.append("=");
+ value.append(normalize(attr.getNodeValue()));
+ }
+ metadata.add(parents, value.toString());
+ }
+ }
+
+ Node child = node.getFirstChild();
+ while (child != null) {
+ // print children recursively
+ loadNode(metadata, child, parents, true);
+ child = child.getNextSibling();
+ }
+ }
+
+ private static String normalize(String value) {
+ if (value != null) {
+ value = value.trim();
+ } else {
+ value = "";
+ }
+ if (Boolean.TRUE.toString().equalsIgnoreCase(value)) {
+ return Boolean.TRUE.toString();
+ } else if (Boolean.FALSE.toString().equalsIgnoreCase(value)) {
+ return Boolean.FALSE.toString();
+ }
+ return value;
+ }
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ String type = metadata.get(Metadata.CONTENT_TYPE);
+ if (type != null) {
+ // Java has a different idea of the BMP mime type to
+ // what the canonical one is, fix this up.
+ if (CANONICAL_BMP_TYPE.toString().equals(type)) {
+ type = JAVA_BMP_TYPE.toString();
+ }
+
+ try {
+ Iterator<ImageReader> iterator =
+ ImageIO.getImageReadersByMIMEType(type);
+ if (iterator.hasNext()) {
+ ImageReader reader = iterator.next();
+ try {
+ try (ImageInputStream imageStream = ImageIO.createImageInputStream(
+ new CloseShieldInputStream(stream))) {
+ reader.setInput(imageStream);
+
+ metadata.set(Metadata.IMAGE_WIDTH, Integer.toString(reader.getWidth(0)));
+ metadata.set(Metadata.IMAGE_LENGTH, Integer.toString(reader.getHeight(0)));
+ metadata.set("height", Integer.toString(reader.getHeight(0)));
+ metadata.set("width", Integer.toString(reader.getWidth(0)));
+
+ loadMetadata(reader.getImageMetadata(0), metadata);
+ }
+ } finally {
+ reader.dispose();
+ }
+ }
+
+ // Translate certain Metadata tags from the ImageIO
+ // specific namespace into the general Tika one
+ setIfPresent(metadata, "CommentExtensions CommentExtension", TikaCoreProperties.COMMENTS);
+ setIfPresent(metadata, "markerSequence com", TikaCoreProperties.COMMENTS);
+ setIfPresent(metadata, "Data BitsPerSample", Metadata.BITS_PER_SAMPLE);
+ } catch (IIOException e) {
+ // TIKA-619: There is a known bug in the Sun API when dealing with GIF images
+ // which Tika will just ignore.
+ if (!(e.getMessage() != null &&
+ e.getMessage().equals("Unexpected block type 0!") &&
+ type.equals("image/gif"))) {
+ throw new TikaException(type + " parse error", e);
+ }
+ }
+ }
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.endDocument();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/MetadataFields.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/MetadataFields.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/MetadataFields.java
index c3b0fce..5238751 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/MetadataFields.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/MetadataFields.java
@@ -1,84 +1,84 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.image;
-
-import java.lang.reflect.Field;
-import java.lang.reflect.Modifier;
-import java.util.HashSet;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-
-/**
- * Knowns about all declared {@link Metadata} fields.
- * Didn't find this functionality anywhere so it was added for
- * ImageMetadataExtractor, but it can be generalized.
- */
-public abstract class MetadataFields {
-
- private static HashSet<String> known;
-
- static {
- known = new HashSet<String>();
- setKnownForClass(TikaCoreProperties.class);
- setKnownForClass(Metadata.class);
- }
-
- private static void setKnownForClass(Class<?> clazz) {
- Field[] fields = clazz.getFields();
- for (Field f : fields) {
- int mod = f.getModifiers();
- if (Modifier.isPublic(mod) && Modifier.isStatic(mod) && Modifier.isFinal(mod)) {
- Class<?> c = f.getType();
- if (String.class.equals(c)) {
- try {
- String p = (String) f.get(null);
- if (p != null) {
- known.add(p);
- }
- } catch (IllegalArgumentException e) {
- e.printStackTrace();
- } catch (IllegalAccessException e) {
- e.printStackTrace();
- }
- }
- if (Property.class.isAssignableFrom(c)) {
- try {
- Property p = (Property) f.get(null);
- if (p != null) {
- known.add(p.getName());
- }
- } catch (IllegalArgumentException e) {
- e.printStackTrace();
- } catch (IllegalAccessException e) {
- e.printStackTrace();
- }
- }
- }
- }
- }
-
- public static boolean isMetadataField(String name) {
- return known.contains(name);
- }
-
- public static boolean isMetadataField(Property property) {
- return known.contains(property.getName());
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import java.lang.reflect.Field;
+import java.lang.reflect.Modifier;
+import java.util.HashSet;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+
+/**
+ * Knowns about all declared {@link Metadata} fields.
+ * Didn't find this functionality anywhere so it was added for
+ * ImageMetadataExtractor, but it can be generalized.
+ */
+public abstract class MetadataFields {
+
+ private static HashSet<String> known;
+
+ static {
+ known = new HashSet<String>();
+ setKnownForClass(TikaCoreProperties.class);
+ setKnownForClass(Metadata.class);
+ }
+
+ private static void setKnownForClass(Class<?> clazz) {
+ Field[] fields = clazz.getFields();
+ for (Field f : fields) {
+ int mod = f.getModifiers();
+ if (Modifier.isPublic(mod) && Modifier.isStatic(mod) && Modifier.isFinal(mod)) {
+ Class<?> c = f.getType();
+ if (String.class.equals(c)) {
+ try {
+ String p = (String) f.get(null);
+ if (p != null) {
+ known.add(p);
+ }
+ } catch (IllegalArgumentException e) {
+ e.printStackTrace();
+ } catch (IllegalAccessException e) {
+ e.printStackTrace();
+ }
+ }
+ if (Property.class.isAssignableFrom(c)) {
+ try {
+ Property p = (Property) f.get(null);
+ if (p != null) {
+ known.add(p.getName());
+ }
+ } catch (IllegalArgumentException e) {
+ e.printStackTrace();
+ } catch (IllegalAccessException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+ }
+
+ public static boolean isMetadataField(String name) {
+ return known.contains(name);
+ }
+
+ public static boolean isMetadataField(Property property) {
+ return known.contains(property.getName());
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java
index c98ce69..05dee1f 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java
@@ -1,68 +1,68 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.image;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.xmp.JempboxExtractor;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-public class TiffParser extends AbstractParser {
-
- /**
- * Serial version UID
- */
- private static final long serialVersionUID = -3941143576535464926L;
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.singleton(MediaType.image("tiff"));
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- TemporaryResources tmp = new TemporaryResources();
- try {
- TikaInputStream tis = TikaInputStream.get(stream, tmp);
- new ImageMetadataExtractor(metadata).parseTiff(tis.getFile());
- new JempboxExtractor(metadata).parse(tis);
- } finally {
- tmp.dispose();
- }
-
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
- xhtml.endDocument();
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.xmp.JempboxExtractor;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class TiffParser extends AbstractParser {
+
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = -3941143576535464926L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.singleton(MediaType.image("tiff"));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ TemporaryResources tmp = new TemporaryResources();
+ try {
+ TikaInputStream tis = TikaInputStream.get(stream, tmp);
+ new ImageMetadataExtractor(metadata).parseTiff(tis.getFile());
+ new JempboxExtractor(metadata).parse(tis);
+ } finally {
+ tmp.dispose();
+ }
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.endDocument();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
index 247194e..7ec666c 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
@@ -1,69 +1,69 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.jpeg;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.image.ImageMetadataExtractor;
-import org.apache.tika.parser.xmp.JempboxExtractor;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-public class JpegParser extends AbstractParser {
-
- /**
- * Serial version UID
- */
- private static final long serialVersionUID = -1355028253756234603L;
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.singleton(MediaType.image("jpeg"));
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- TemporaryResources tmp = new TemporaryResources();
- try {
- TikaInputStream tis = TikaInputStream.get(stream, tmp);
- new ImageMetadataExtractor(metadata).parseJpeg(tis.getFile());
- new JempboxExtractor(metadata).parse(tis);
- } finally {
- tmp.dispose();
- }
-
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
- xhtml.endDocument();
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.jpeg;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.image.ImageMetadataExtractor;
+import org.apache.tika.parser.xmp.JempboxExtractor;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class JpegParser extends AbstractParser {
+
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = -1355028253756234603L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.singleton(MediaType.image("jpeg"));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ TemporaryResources tmp = new TemporaryResources();
+ try {
+ TikaInputStream tis = TikaInputStream.get(stream, tmp);
+ new ImageMetadataExtractor(metadata).parseJpeg(tis.getFile());
+ new JempboxExtractor(metadata).parse(tis);
+ } finally {
+ tmp.dispose();
+ }
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.endDocument();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java
index 03dc833..abc4235 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java
@@ -1,252 +1,252 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mp3;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.tika.exception.TikaException;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * An Audio Frame in an MP3 file. These come after the ID3v2 tags in the file.
- * Currently, only the header is processed, not the raw audio data.
- */
-public class AudioFrame implements MP3Frame {
- /** Constant for the MPEG version 1. */
- public static final int MPEG_V1 = 3;
-
- /** Constant for the MPEG version 2. */
- public static final int MPEG_V2 = 2;
-
- /** Constant for the MPEG version 2.5. */
- public static final int MPEG_V2_5 = 0;
-
- /** Constant for audio layer 1. */
- public static final int LAYER_1 = 3;
-
- /** Constant for audio layer 2. */
- public static final int LAYER_2 = 2;
-
- /** Constant for audio layer 3. */
- public static final int LAYER_3 = 1;
-
- private final String version;
- private final int versionCode;
- private final int layer;
- private final int sampleRate;
- private final int channels;
- private final int bitRate;
- private final int length;
- private final float duration;
-
- public String getVersion() {
- return version;
- }
-
- /**
- * Get the sampling rate, in Hz
- */
- public int getSampleRate() {
- return sampleRate;
- }
-
- /**
- * Get the number of channels (1=mono, 2=stereo)
- */
- public int getChannels() {
- return channels;
- }
-
- /**
- * Get the version code.
- * @return the version code (one of the {@code MPEG} constants)
- */
- public int getVersionCode()
- {
- return versionCode;
- }
-
- /**
- * Get the audio layer code.
- * @return the audio layer (one of the {@code LAYER} constants)
- */
- public int getLayer()
- {
- return layer;
- }
-
- /**
- * Get the bit rate in bit per second.
- * @return the bit rate
- */
- public int getBitRate()
- {
- return bitRate;
- }
-
- /**
- * Returns the frame length in bytes.
- * @return the frame length
- */
- public int getLength()
- {
- return length;
- }
-
- /**
- * Returns the duration in milliseconds.
- * @return the duration
- */
- public float getDuration()
- {
- return duration;
- }
-
- /**
- * Does this appear to be a 4 byte audio frame header?
- */
- public static boolean isAudioHeader(int h1, int h2, int h3, int h4) {
- if (h1 == -1 || h2 == -1 || h3 == -1 || h4 == -1) {
- return false;
- }
- // Check for the magic 11 bits set at the start
- // Note - doesn't do a CRC check
- if (h1 == 0xff && (h2 & 0x60) == 0x60) {
- return true;
- }
- return false;
- }
-
- /**
- * @deprecated Use the constructor which is passed all values directly.
- */
- @Deprecated
- public AudioFrame(InputStream stream, ContentHandler handler)
- throws IOException, SAXException, TikaException {
- this(-2, -2, -2, -2, stream);
- }
-
- /**
- * @deprecated Use the constructor which is passed all values directly.
- */
- @Deprecated
- public AudioFrame(int h1, int h2, int h3, int h4, InputStream in)
- throws IOException {
- if (h1 == -2 && h2 == -2 && h3 == -2 && h4 == -2) {
- h1 = in.read();
- h2 = in.read();
- h3 = in.read();
- h4 = in.read();
- }
-
- if (isAudioHeader(h1, h2, h3, h4)) {
- layer = (h2 >> 1) & 0x03;
- versionCode = (h2 >> 3) & 0x03;
- version = generateVersionStr(versionCode, layer);
-
- int rateCode = (h3 >> 2) & 0x03;
- int rate;
- switch (rateCode) {
- case 0:
- rate = 11025;
- break;
- case 1:
- rate = 12000;
- break;
- default:
- rate = 8000;
- }
- if (versionCode == MPEG_V2) {
- rate *= 2;
- } else if(versionCode == MPEG_V1) {
- rate *= 4;
- }
- sampleRate = rate;
-
- int chans = h4 & 0x192;
- if (chans < 3) {
- // Stereo, joint stereo, dual channel
- channels = 2;
- } else {
- channels = 1;
- }
- bitRate = 0;
- duration = 0;
- length = 0;
- } else {
- throw new IllegalArgumentException("Magic Audio Frame Header not found");
- }
- }
-
- /**
- *
- * Creates a new instance of {@code AudioFrame} and initializes all properties.
- * @param mpegVersion the code for the MPEG version
- * @param layer the code for the layer
- * @param bitRate the bit rate (in bps)
- * @param sampleRate the sample rate (in samples per second)
- * @param channels the number of channels
- * @param length the frame length (in bytes)
- * @param duration the duration of this frame (in milliseconds)
- */
- public AudioFrame(int mpegVersion, int layer, int bitRate, int sampleRate,
- int channels, int length, float duration) {
- versionCode = mpegVersion;
- this.layer = layer;
- this.bitRate = bitRate;
- this.sampleRate = sampleRate;
- this.channels = channels;
- this.length = length;
- this.duration = duration;
- version = generateVersionStr(mpegVersion, layer);
- }
-
- /**
- * Generates a string for the version of this audio frame.
- * @param version the code for the MPEG version
- * @param layer the code for the layer
- * @return a string for the version
- */
- private static String generateVersionStr(int version, int layer) {
- StringBuilder buf = new StringBuilder(64);
- buf.append("MPEG 3 Layer ");
- if (layer == LAYER_3) {
- buf.append("III");
- } else if (layer == LAYER_2) {
- buf.append("II");
- } else if (layer == LAYER_1) {
- buf.append("I");
- } else {
- buf.append("(reserved)");
- }
-
- buf.append(" Version ");
- if (version == MPEG_V2_5) {
- buf.append("2.5");
- } else if(version == MPEG_V2) {
- buf.append("2");
- } else if(version == MPEG_V1) {
- buf.append("1");
- } else {
- buf.append("(reseved)");
- }
-
- return buf.toString();
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * An Audio Frame in an MP3 file. These come after the ID3v2 tags in the file.
+ * Currently, only the header is processed, not the raw audio data.
+ */
+public class AudioFrame implements MP3Frame {
+    /** Constant for the MPEG version 1. */
+    public static final int MPEG_V1 = 3;
+
+    /** Constant for the MPEG version 2. */
+    public static final int MPEG_V2 = 2;
+
+    /** Constant for the MPEG version 2.5. */
+    public static final int MPEG_V2_5 = 0;
+
+    /** Constant for audio layer 1. */
+    public static final int LAYER_1 = 3;
+
+    /** Constant for audio layer 2. */
+    public static final int LAYER_2 = 2;
+
+    /** Constant for audio layer 3. */
+    public static final int LAYER_3 = 1;
+
+    /** Channel mode code (top two bits of the fourth header byte) meaning mono. */
+    private static final int CHANNEL_MODE_MONO = 3;
+
+    private final String version;
+    private final int versionCode;
+    private final int layer;
+    private final int sampleRate;
+    private final int channels;
+    private final int bitRate;
+    private final int length;
+    private final float duration;
+
+    /**
+     * Get the human readable version string built from the layer and
+     * MPEG version codes.
+     */
+    public String getVersion() {
+        return version;
+    }
+
+    /**
+     * Get the sampling rate, in Hz
+     */
+    public int getSampleRate() {
+        return sampleRate;
+    }
+
+    /**
+     * Get the number of channels (1=mono, 2=stereo)
+     */
+    public int getChannels() {
+        return channels;
+    }
+
+    /**
+     * Get the version code.
+     * @return the version code (one of the {@code MPEG} constants)
+     */
+    public int getVersionCode() {
+        return versionCode;
+    }
+
+    /**
+     * Get the audio layer code.
+     * @return the audio layer (one of the {@code LAYER} constants)
+     */
+    public int getLayer() {
+        return layer;
+    }
+
+    /**
+     * Get the bit rate in bit per second.
+     * @return the bit rate
+     */
+    public int getBitRate() {
+        return bitRate;
+    }
+
+    /**
+     * Returns the frame length in bytes.
+     * @return the frame length
+     */
+    public int getLength() {
+        return length;
+    }
+
+    /**
+     * Returns the duration in milliseconds.
+     * @return the duration
+     */
+    public float getDuration() {
+        return duration;
+    }
+
+    /**
+     * Does this appear to be a 4 byte audio frame header?
+     * <p>
+     * The bytes are rejected if any of them is the end-of-stream value
+     * {@code -1}. Otherwise the 11 frame sync bits must all be set: the whole
+     * of {@code h1} plus the top three bits of {@code h2}. No CRC check is done.
+     *
+     * @param h1 first header byte, or -1
+     * @param h2 second header byte, or -1
+     * @param h3 third header byte, or -1
+     * @param h4 fourth header byte, or -1
+     * @return {@code true} if the sync bits are present
+     */
+    public static boolean isAudioHeader(int h1, int h2, int h3, int h4) {
+        if (h1 == -1 || h2 == -1 || h3 == -1 || h4 == -1) {
+            return false;
+        }
+        // The mask has to include bit 7 of h2 (0xe0, not 0x60) so that all
+        // 11 sync bits are tested rather than just 10 of them.
+        return h1 == 0xff && (h2 & 0xe0) == 0xe0;
+    }
+
+    /**
+     * Reads the four header bytes from the stream and parses them.
+     * The {@code handler} argument is not used.
+     *
+     * @throws IllegalArgumentException if the bytes read are not a frame header
+     * @deprecated Use the constructor which is passed all values directly.
+     */
+    @Deprecated
+    public AudioFrame(InputStream stream, ContentHandler handler)
+            throws IOException, SAXException, TikaException {
+        this(-2, -2, -2, -2, stream);
+    }
+
+    /**
+     * Parses a frame header from four header bytes. If all four values are
+     * {@code -2} the bytes are read from {@code in} instead. Bit rate, length
+     * and duration are not derived by this constructor and are set to 0.
+     *
+     * @param h1 first header byte, or -2
+     * @param h2 second header byte, or -2
+     * @param h3 third header byte, or -2
+     * @param h4 fourth header byte, or -2
+     * @param in stream to read the header from when all bytes are -2
+     * @throws IOException if reading from {@code in} fails
+     * @throws IllegalArgumentException if the bytes are not a frame header
+     * @deprecated Use the constructor which is passed all values directly.
+     */
+    @Deprecated
+    public AudioFrame(int h1, int h2, int h3, int h4, InputStream in)
+            throws IOException {
+        if (h1 == -2 && h2 == -2 && h3 == -2 && h4 == -2) {
+            h1 = in.read();
+            h2 = in.read();
+            h3 = in.read();
+            h4 = in.read();
+        }
+
+        if (!isAudioHeader(h1, h2, h3, h4)) {
+            throw new IllegalArgumentException("Magic Audio Frame Header not found");
+        }
+
+        layer = (h2 >> 1) & 0x03;
+        versionCode = (h2 >> 3) & 0x03;
+        version = generateVersionStr(versionCode, layer);
+
+        // Base rates are those used as-is for version 2.5; versions 2 and 1
+        // scale them by 2 and 4 respectively.
+        int rateCode = (h3 >> 2) & 0x03;
+        int rate;
+        switch (rateCode) {
+            case 0:
+                rate = 11025;
+                break;
+            case 1:
+                rate = 12000;
+                break;
+            default:
+                rate = 8000;
+        }
+        if (versionCode == MPEG_V2) {
+            rate *= 2;
+        } else if (versionCode == MPEG_V1) {
+            rate *= 4;
+        }
+        sampleRate = rate;
+
+        // The channel mode lives in the top two bits of the fourth byte:
+        // stereo, joint stereo and dual channel all carry two channels,
+        // only the last mode is single channel.
+        int channelMode = (h4 >> 6) & 0x03;
+        channels = (channelMode == CHANNEL_MODE_MONO) ? 1 : 2;
+
+        bitRate = 0;
+        duration = 0;
+        length = 0;
+    }
+
+    /**
+     *
+     * Creates a new instance of {@code AudioFrame} and initializes all properties.
+     * @param mpegVersion the code for the MPEG version
+     * @param layer the code for the layer
+     * @param bitRate the bit rate (in bps)
+     * @param sampleRate the sample rate (in samples per second)
+     * @param channels the number of channels
+     * @param length the frame length (in bytes)
+     * @param duration the duration of this frame (in milliseconds)
+     */
+    public AudioFrame(int mpegVersion, int layer, int bitRate, int sampleRate,
+            int channels, int length, float duration) {
+        versionCode = mpegVersion;
+        this.layer = layer;
+        this.bitRate = bitRate;
+        this.sampleRate = sampleRate;
+        this.channels = channels;
+        this.length = length;
+        this.duration = duration;
+        version = generateVersionStr(mpegVersion, layer);
+    }
+
+    /**
+     * Generates a string for the version of this audio frame.
+     * @param version the code for the MPEG version
+     * @param layer the code for the layer
+     * @return a string for the version
+     */
+    private static String generateVersionStr(int version, int layer) {
+        StringBuilder buf = new StringBuilder(64);
+        buf.append("MPEG 3 Layer ");
+        if (layer == LAYER_3) {
+            buf.append("III");
+        } else if (layer == LAYER_2) {
+            buf.append("II");
+        } else if (layer == LAYER_1) {
+            buf.append("I");
+        } else {
+            buf.append("(reserved)");
+        }
+
+        buf.append(" Version ");
+        if (version == MPEG_V2_5) {
+            buf.append("2.5");
+        } else if (version == MPEG_V2) {
+            buf.append("2");
+        } else if (version == MPEG_V1) {
+            buf.append("1");
+        } else {
+            buf.append("(reserved)");
+        }
+
+        return buf.toString();
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java
index 6f20c3c..b7d2d75 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java
@@ -1,142 +1,142 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mp3;
-
-import java.util.Collections;
-import java.util.List;
-
-/**
- * Takes an array of {@link ID3Tags} in preference order, and when asked for
- * a given tag, will return it from the first {@link ID3Tags} that has it.
- */
-public class CompositeTagHandler implements ID3Tags {
-
- private ID3Tags[] tags;
-
- public CompositeTagHandler(ID3Tags[] tags) {
- this.tags = tags;
- }
-
- public boolean getTagsPresent() {
- for (ID3Tags tag : tags) {
- if (tag.getTagsPresent()) {
- return true;
- }
- }
- return false;
- }
-
- public String getTitle() {
- for (ID3Tags tag : tags) {
- if (tag.getTitle() != null) {
- return tag.getTitle();
- }
- }
- return null;
- }
-
- public String getArtist() {
- for (ID3Tags tag : tags) {
- if (tag.getArtist() != null) {
- return tag.getArtist();
- }
- }
- return null;
- }
-
- public String getAlbum() {
- for (ID3Tags tag : tags) {
- if (tag.getAlbum() != null) {
- return tag.getAlbum();
- }
- }
- return null;
- }
-
- public String getComposer() {
- for (ID3Tags tag : tags) {
- if (tag.getComposer() != null) {
- return tag.getComposer();
- }
- }
- return null;
- }
-
- public String getYear() {
- for (ID3Tags tag : tags) {
- if (tag.getYear() != null) {
- return tag.getYear();
- }
- }
- return null;
- }
-
- public List<ID3Comment> getComments() {
- for (ID3Tags tag : tags) {
- List<ID3Comment> comments = tag.getComments();
- if (comments != null && comments.size() > 0) {
- return comments;
- }
- }
- return Collections.emptyList();
- }
-
- public String getGenre() {
- for (ID3Tags tag : tags) {
- if (tag.getGenre() != null) {
- return tag.getGenre();
- }
- }
- return null;
- }
-
- public String getTrackNumber() {
- for (ID3Tags tag : tags) {
- if (tag.getTrackNumber() != null) {
- return tag.getTrackNumber();
- }
- }
- return null;
- }
-
- public String getAlbumArtist() {
- for (ID3Tags tag : tags) {
- if (tag.getAlbumArtist() != null) {
- return tag.getAlbumArtist();
- }
- }
- return null;
- }
-
- public String getDisc() {
- for (ID3Tags tag : tags) {
- if (tag.getDisc() != null) {
- return tag.getDisc();
- }
- }
- return null;
- }
-
- public String getCompilation() {
- for (ID3Tags tag : tags) {
- if (tag.getCompilation() != null) {
- return tag.getCompilation();
- }
- }
- return null;
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Takes an array of {@link ID3Tags} in preference order, and when asked for
+ * a given tag, will return it from the first {@link ID3Tags} that has it.
+ */
+public class CompositeTagHandler implements ID3Tags {
+
+ private ID3Tags[] tags;
+
+ public CompositeTagHandler(ID3Tags[] tags) {
+ this.tags = tags;
+ }
+
+ public boolean getTagsPresent() {
+ for (ID3Tags tag : tags) {
+ if (tag.getTagsPresent()) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ public String getTitle() {
+ for (ID3Tags tag : tags) {
+ if (tag.getTitle() != null) {
+ return tag.getTitle();
+ }
+ }
+ return null;
+ }
+
+ public String getArtist() {
+ for (ID3Tags tag : tags) {
+ if (tag.getArtist() != null) {
+ return tag.getArtist();
+ }
+ }
+ return null;
+ }
+
+ public String getAlbum() {
+ for (ID3Tags tag : tags) {
+ if (tag.getAlbum() != null) {
+ return tag.getAlbum();
+ }
+ }
+ return null;
+ }
+
+ public String getComposer() {
+ for (ID3Tags tag : tags) {
+ if (tag.getComposer() != null) {
+ return tag.getComposer();
+ }
+ }
+ return null;
+ }
+
+ public String getYear() {
+ for (ID3Tags tag : tags) {
+ if (tag.getYear() != null) {
+ return tag.getYear();
+ }
+ }
+ return null;
+ }
+
+ public List<ID3Comment> getComments() {
+ for (ID3Tags tag : tags) {
+ List<ID3Comment> comments = tag.getComments();
+ if (comments != null && comments.size() > 0) {
+ return comments;
+ }
+ }
+ return Collections.emptyList();
+ }
+
+ public String getGenre() {
+ for (ID3Tags tag : tags) {
+ if (tag.getGenre() != null) {
+ return tag.getGenre();
+ }
+ }
+ return null;
+ }
+
+ public String getTrackNumber() {
+ for (ID3Tags tag : tags) {
+ if (tag.getTrackNumber() != null) {
+ return tag.getTrackNumber();
+ }
+ }
+ return null;
+ }
+
+ public String getAlbumArtist() {
+ for (ID3Tags tag : tags) {
+ if (tag.getAlbumArtist() != null) {
+ return tag.getAlbumArtist();
+ }
+ }
+ return null;
+ }
+
+ public String getDisc() {
+ for (ID3Tags tag : tags) {
+ if (tag.getDisc() != null) {
+ return tag.getDisc();
+ }
+ }
+ return null;
+ }
+
+ public String getCompilation() {
+ for (ID3Tags tag : tags) {
+ if (tag.getCompilation() != null) {
+ return tag.getCompilation();
+ }
+ }
+ return null;
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java
index 6ee19db..98ef504 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java
@@ -1,254 +1,254 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mp3;
-
-import java.util.List;
-
-/**
- * Interface that defines the common interface for ID3 tag parsers,
- * such as ID3v1 and ID3v2.3.
- * Implementations should return NULL if the file lacks a given
- * tag, or if the tag isn't defined for the version.
- *
- * Note that so far, only the ID3v1 core tags are listed here. In
- * future, we may wish to add more to cover the extra tags that
- * our ID3v2 handlers can produce.
- */
-public interface ID3Tags {
- /**
- * List of predefined genres.
- *
- * @see http://www.id3.org/id3v2-00
- */
- String[] GENRES = new String[] {
- /* 0 */ "Blues",
- /* 1 */ "Classic Rock",
- /* 2 */ "Country",
- /* 3 */ "Dance",
- /* 4 */ "Disco",
- /* 5 */ "Funk",
- /* 6 */ "Grunge",
- /* 7 */ "Hip-Hop",
- /* 8 */ "Jazz",
- /* 9 */ "Metal",
- /* 10 */ "New Age",
- /* 11 */ "Oldies",
- /* 12 */ "Other",
- /* 13 */ "Pop",
- /* 14 */ "R&B",
- /* 15 */ "Rap",
- /* 16 */ "Reggae",
- /* 17 */ "Rock",
- /* 18 */ "Techno",
- /* 19 */ "Industrial",
- /* 20 */ "Alternative",
- /* 21 */ "Ska",
- /* 22 */ "Death Metal",
- /* 23 */ "Pranks",
- /* 24 */ "Soundtrack",
- /* 25 */ "Euro-Techno",
- /* 26 */ "Ambient",
- /* 27 */ "Trip-Hop",
- /* 28 */ "Vocal",
- /* 29 */ "Jazz+Funk",
- /* 30 */ "Fusion",
- /* 31 */ "Trance",
- /* 32 */ "Classical",
- /* 33 */ "Instrumental",
- /* 34 */ "Acid",
- /* 35 */ "House",
- /* 36 */ "Game",
- /* 37 */ "Sound Clip",
- /* 38 */ "Gospel",
- /* 39 */ "Noise",
- /* 40 */ "AlternRock",
- /* 41 */ "Bass",
- /* 42 */ "Soul",
- /* 43 */ "Punk",
- /* 44 */ "Space",
- /* 45 */ "Meditative",
- /* 46 */ "Instrumental Pop",
- /* 47 */ "Instrumental Rock",
- /* 48 */ "Ethnic",
- /* 49 */ "Gothic",
- /* 50 */ "Darkwave",
- /* 51 */ "Techno-Industrial",
- /* 52 */ "Electronic",
- /* 53 */ "Pop-Folk",
- /* 54 */ "Eurodance",
- /* 55 */ "Dream",
- /* 56 */ "Southern Rock",
- /* 57 */ "Comedy",
- /* 58 */ "Cult",
- /* 59 */ "Gangsta",
- /* 60 */ "Top 40",
- /* 61 */ "Christian Rap",
- /* 62 */ "Pop/Funk",
- /* 63 */ "Jungle",
- /* 64 */ "Native American",
- /* 65 */ "Cabaret",
- /* 66 */ "New Wave",
- /* 67 */ "Psychadelic",
- /* 68 */ "Rave",
- /* 69 */ "Showtunes",
- /* 70 */ "Trailer",
- /* 71 */ "Lo-Fi",
- /* 72 */ "Tribal",
- /* 73 */ "Acid Punk",
- /* 74 */ "Acid Jazz",
- /* 75 */ "Polka",
- /* 76 */ "Retro",
- /* 77 */ "Musical",
- /* 78 */ "Rock & Roll",
- /* 79 */ "Hard Rock",
- /* 80 */ "Folk",
- /* 81 */ "Folk-Rock",
- /* 82 */ "National Folk",
- /* 83 */ "Swing",
- /* 84 */ "Fast Fusion",
- /* 85 */ "Bebob",
- /* 86 */ "Latin",
- /* 87 */ "Revival",
- /* 88 */ "Celtic",
- /* 89 */ "Bluegrass",
- /* 90 */ "Avantgarde",
- /* 91 */ "Gothic Rock",
- /* 92 */ "Progressive Rock",
- /* 93 */ "Psychedelic Rock",
- /* 94 */ "Symphonic Rock",
- /* 95 */ "Slow Rock",
- /* 96 */ "Big Band",
- /* 97 */ "Chorus",
- /* 98 */ "Easy Listening",
- /* 99 */ "Acoustic",
- /* 100 */ "Humour",
- /* 101 */ "Speech",
- /* 102 */ "Chanson",
- /* 103 */ "Opera",
- /* 104 */ "Chamber Music",
- /* 105 */ "Sonata",
- /* 106 */ "Symphony",
- /* 107 */ "Booty Bass",
- /* 108 */ "Primus",
- /* 109 */ "Porn Groove",
- /* 110 */ "Satire",
- /* 111 */ "Slow Jam",
- /* 112 */ "Club",
- /* 113 */ "Tango",
- /* 114 */ "Samba",
- /* 115 */ "Folklore",
- /* 116 */ "Ballad",
- /* 117 */ "Power Ballad",
- /* 118 */ "Rhythmic Soul",
- /* 119 */ "Freestyle",
- /* 120 */ "Duet",
- /* 121 */ "Punk Rock",
- /* 122 */ "Drum Solo",
- /* 123 */ "A capella",
- /* 124 */ "Euro-House",
- /* 125 */ "Dance Hall",
- /* sentinel */ ""
- };
-
- /**
- * Does the file contain this kind of tags?
- */
- boolean getTagsPresent();
-
- String getTitle();
-
- /**
- * The Artist for the track
- */
- String getArtist();
-
- /**
- * The Artist for the overall album / compilation of albums
- */
- String getAlbumArtist();
-
- String getAlbum();
-
- String getComposer();
-
- String getCompilation();
-
- /**
- * Retrieves the comments, if any.
- * Files may have more than one comment, but normally only
- * one with any language/description pair.
- */
- List<ID3Comment> getComments();
-
- String getGenre();
-
- String getYear();
-
- /**
- * The number of the track within the album / recording
- */
- String getTrackNumber();
-
- /**
- * The number of the disc this belongs to, within the set
- */
- String getDisc();
-
- /**
- * Represents a comments in ID3 (especially ID3 v2), where are
- * made up of several parts
- */
- public static class ID3Comment {
- private String language;
- private String description;
- private String text;
-
- /**
- * Creates an ID3 v1 style comment tag
- */
- public ID3Comment(String id3v1Text) {
- this.text = id3v1Text;
- }
- /**
- * Creates an ID3 v2 style comment tag
- */
- public ID3Comment(String language, String description, String text) {
- this.language = language;
- this.description = description;
- this.text = text;
- }
-
- /**
- * Gets the language, if present
- */
- public String getLanguage() {
- return language;
- }
- /**
- * Gets the description, if present
- */
- public String getDescription() {
- return description;
- }
- /**
- * Gets the text, if present
- */
- public String getText() {
- return text;
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.util.List;
+
+/**
+ * Interface that defines the common interface for ID3 tag parsers,
+ * such as ID3v1 and ID3v2.3.
+ * Implementations should return {@code null} if the file lacks a given
+ * tag, or if the tag isn't defined for the version.
+ *
+ * Note that so far, only the ID3v1 core tags are listed here. In
+ * future, we may wish to add more to cover the extra tags that
+ * our ID3v2 handlers can produce.
+ */
+public interface ID3Tags {
+ /**
+ * List of predefined genres. The inline comments give the array
+ * index of each entry; the final element is an empty-string sentinel.
+ *
+ * @see <a href="http://www.id3.org/id3v2-00">http://www.id3.org/id3v2-00</a>
+ */
+ String[] GENRES = new String[] {
+ /* 0 */ "Blues",
+ /* 1 */ "Classic Rock",
+ /* 2 */ "Country",
+ /* 3 */ "Dance",
+ /* 4 */ "Disco",
+ /* 5 */ "Funk",
+ /* 6 */ "Grunge",
+ /* 7 */ "Hip-Hop",
+ /* 8 */ "Jazz",
+ /* 9 */ "Metal",
+ /* 10 */ "New Age",
+ /* 11 */ "Oldies",
+ /* 12 */ "Other",
+ /* 13 */ "Pop",
+ /* 14 */ "R&B",
+ /* 15 */ "Rap",
+ /* 16 */ "Reggae",
+ /* 17 */ "Rock",
+ /* 18 */ "Techno",
+ /* 19 */ "Industrial",
+ /* 20 */ "Alternative",
+ /* 21 */ "Ska",
+ /* 22 */ "Death Metal",
+ /* 23 */ "Pranks",
+ /* 24 */ "Soundtrack",
+ /* 25 */ "Euro-Techno",
+ /* 26 */ "Ambient",
+ /* 27 */ "Trip-Hop",
+ /* 28 */ "Vocal",
+ /* 29 */ "Jazz+Funk",
+ /* 30 */ "Fusion",
+ /* 31 */ "Trance",
+ /* 32 */ "Classical",
+ /* 33 */ "Instrumental",
+ /* 34 */ "Acid",
+ /* 35 */ "House",
+ /* 36 */ "Game",
+ /* 37 */ "Sound Clip",
+ /* 38 */ "Gospel",
+ /* 39 */ "Noise",
+ /* 40 */ "AlternRock",
+ /* 41 */ "Bass",
+ /* 42 */ "Soul",
+ /* 43 */ "Punk",
+ /* 44 */ "Space",
+ /* 45 */ "Meditative",
+ /* 46 */ "Instrumental Pop",
+ /* 47 */ "Instrumental Rock",
+ /* 48 */ "Ethnic",
+ /* 49 */ "Gothic",
+ /* 50 */ "Darkwave",
+ /* 51 */ "Techno-Industrial",
+ /* 52 */ "Electronic",
+ /* 53 */ "Pop-Folk",
+ /* 54 */ "Eurodance",
+ /* 55 */ "Dream",
+ /* 56 */ "Southern Rock",
+ /* 57 */ "Comedy",
+ /* 58 */ "Cult",
+ /* 59 */ "Gangsta",
+ /* 60 */ "Top 40",
+ /* 61 */ "Christian Rap",
+ /* 62 */ "Pop/Funk",
+ /* 63 */ "Jungle",
+ /* 64 */ "Native American",
+ /* 65 */ "Cabaret",
+ /* 66 */ "New Wave",
+ /* 67 */ "Psychadelic",
+ /* 68 */ "Rave",
+ /* 69 */ "Showtunes",
+ /* 70 */ "Trailer",
+ /* 71 */ "Lo-Fi",
+ /* 72 */ "Tribal",
+ /* 73 */ "Acid Punk",
+ /* 74 */ "Acid Jazz",
+ /* 75 */ "Polka",
+ /* 76 */ "Retro",
+ /* 77 */ "Musical",
+ /* 78 */ "Rock & Roll",
+ /* 79 */ "Hard Rock",
+ /* 80 */ "Folk",
+ /* 81 */ "Folk-Rock",
+ /* 82 */ "National Folk",
+ /* 83 */ "Swing",
+ /* 84 */ "Fast Fusion",
+ /* 85 */ "Bebob",
+ /* 86 */ "Latin",
+ /* 87 */ "Revival",
+ /* 88 */ "Celtic",
+ /* 89 */ "Bluegrass",
+ /* 90 */ "Avantgarde",
+ /* 91 */ "Gothic Rock",
+ /* 92 */ "Progressive Rock",
+ /* 93 */ "Psychedelic Rock",
+ /* 94 */ "Symphonic Rock",
+ /* 95 */ "Slow Rock",
+ /* 96 */ "Big Band",
+ /* 97 */ "Chorus",
+ /* 98 */ "Easy Listening",
+ /* 99 */ "Acoustic",
+ /* 100 */ "Humour",
+ /* 101 */ "Speech",
+ /* 102 */ "Chanson",
+ /* 103 */ "Opera",
+ /* 104 */ "Chamber Music",
+ /* 105 */ "Sonata",
+ /* 106 */ "Symphony",
+ /* 107 */ "Booty Bass",
+ /* 108 */ "Primus",
+ /* 109 */ "Porn Groove",
+ /* 110 */ "Satire",
+ /* 111 */ "Slow Jam",
+ /* 112 */ "Club",
+ /* 113 */ "Tango",
+ /* 114 */ "Samba",
+ /* 115 */ "Folklore",
+ /* 116 */ "Ballad",
+ /* 117 */ "Power Ballad",
+ /* 118 */ "Rhythmic Soul",
+ /* 119 */ "Freestyle",
+ /* 120 */ "Duet",
+ /* 121 */ "Punk Rock",
+ /* 122 */ "Drum Solo",
+ /* 123 */ "A capella",
+ /* 124 */ "Euro-House",
+ /* 125 */ "Dance Hall",
+ /* sentinel */ ""
+ };
+
+ /**
+ * Does the file contain this kind of tags?
+ */
+ boolean getTagsPresent();
+
+ /**
+ * The title of the track
+ */
+ String getTitle();
+
+ /**
+ * The Artist for the track
+ */
+ String getArtist();
+
+ /**
+ * The Artist for the overall album / compilation of albums
+ */
+ String getAlbumArtist();
+
+ /**
+ * The album the track belongs to
+ */
+ String getAlbum();
+
+ /**
+ * The composer of the track
+ */
+ String getComposer();
+
+ /**
+ * The compilation tag value, if any.
+ * NOTE(review): the format of this value is not visible from this
+ * interface - confirm against the implementations.
+ */
+ String getCompilation();
+
+ /**
+ * Retrieves the comments, if any.
+ * Files may have more than one comment, but normally only
+ * one with any language/description pair.
+ */
+ List<ID3Comment> getComments();
+
+ /**
+ * The genre of the track
+ */
+ String getGenre();
+
+ /**
+ * The year tag, as a string
+ */
+ String getYear();
+
+ /**
+ * The number of the track within the album / recording
+ */
+ String getTrackNumber();
+
+ /**
+ * The number of the disc this belongs to, within the set
+ */
+ String getDisc();
+
+ /**
+ * Represents a comment in ID3 (especially ID3 v2), which is
+ * made up of several parts: a language, a description and the text.
+ */
+ public static class ID3Comment {
+ private String language;
+ private String description;
+ private String text;
+
+ /**
+ * Creates an ID3 v1 style comment tag, which carries only text;
+ * the language and description are left unset ({@code null}).
+ */
+ public ID3Comment(String id3v1Text) {
+ this.text = id3v1Text;
+ }
+ /**
+ * Creates an ID3 v2 style comment tag from its three parts
+ */
+ public ID3Comment(String language, String description, String text) {
+ this.language = language;
+ this.description = description;
+ this.text = text;
+ }
+
+ /**
+ * Gets the language, if present
+ */
+ public String getLanguage() {
+ return language;
+ }
+ /**
+ * Gets the description, if present
+ */
+ public String getDescription() {
+ return description;
+ }
+ /**
+ * Gets the text, if present
+ */
+ public String getText() {
+ return text;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
index 4d41fa3..2111356 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
@@ -1,183 +1,183 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mp3;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.tika.exception.TikaException;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import static java.nio.charset.StandardCharsets.ISO_8859_1;
-
-/**
- * This is used to parse ID3 Version 1 Tag information from an MP3 file,
- * if available.
- *
- * @see <a href="http://www.id3.org/ID3v1">MP3 ID3 Version 1 specification</a>
- */
-public class ID3v1Handler implements ID3Tags {
- private String title;
- private String artist;
- private String album;
- private String year;
- private ID3Comment comment;
- private String genre;
- private String trackNumber;
-
- boolean found = false;
-
- public ID3v1Handler(InputStream stream, ContentHandler handler)
- throws IOException, SAXException, TikaException {
- this(LyricsHandler.getSuffix(stream, 128));
- }
-
- /**
- * Creates from the last 128 bytes of a stream.
- * @param tagData Must be the last 128 bytes
- */
- protected ID3v1Handler(byte[] tagData)
- throws IOException, SAXException, TikaException {
- if (tagData.length == 128
- && tagData[0] == 'T' && tagData[1] == 'A' && tagData[2] == 'G') {
- found = true;
-
- title = getString(tagData, 3, 33);
- artist = getString(tagData, 33, 63);
- album = getString(tagData, 63, 93);
- year = getString(tagData, 93, 97);
-
- String commentStr = getString(tagData, 97, 127);
- comment = new ID3Comment(commentStr);
-
- int genreID = (int) tagData[127] & 0xff; // unsigned byte
- genre = GENRES[Math.min(genreID, GENRES.length - 1)];
-
- // ID3v1.1 Track addition
- // If the last two bytes of the comment field are zero and
- // non-zero, then the last byte is the track number
- if (tagData[125] == 0 && tagData[126] != 0) {
- int trackNum = (int) tagData[126] & 0xff;
- trackNumber = Integer.toString(trackNum);
- }
- }
- }
-
-
- public boolean getTagsPresent() {
- return found;
- }
-
- public String getTitle() {
- return title;
- }
-
- public String getArtist() {
- return artist;
- }
-
- public String getAlbum() {
- return album;
- }
-
- public String getYear() {
- return year;
- }
-
- public List<ID3Comment> getComments() {
- return Arrays.asList(comment);
- }
-
- public String getGenre() {
- return genre;
- }
-
- public String getTrackNumber() {
- return trackNumber;
- }
-
- /**
- * ID3v1 doesn't have composers,
- * so returns null;
- */
- public String getComposer() {
- return null;
- }
-
- /**
- * ID3v1 doesn't have album-wide artists,
- * so returns null;
- */
- public String getAlbumArtist() {
- return null;
- }
-
- /**
- * ID3v1 doesn't have disc numbers,
- * so returns null;
- */
- public String getDisc() {
- return null;
- }
-
- /**
- * ID3v1 doesn't have compilations,
- * so returns null;
- */
- public String getCompilation() {
- return null;
- }
-
- /**
- * Returns the identified ISO-8859-1 substring from the given byte buffer.
- * The return value is the zero-terminated substring retrieved from
- * between the given start and end positions in the given byte buffer.
- * Extra whitespace (and control characters) from the beginning and the
- * end of the substring is removed.
- *
- * @param buffer byte buffer
- * @param start start index of the substring
- * @param end end index of the substring
- * @return the identified substring
- * @throws TikaException if the ISO-8859-1 encoding is not available
- */
- private static String getString(byte[] buffer, int start, int end)
- throws TikaException {
- // Find the zero byte that marks the end of the string
- int zero = start;
- while (zero < end && buffer[zero] != 0) {
- zero++;
- }
-
- // Skip trailing whitespace
- end = zero;
- while (start < end && buffer[end - 1] <= ' ') {
- end--;
- }
-
- // Skip leading whitespace
- while (start < end && buffer[start] <= ' ') {
- start++;
- }
-
- // Return the remaining substring
- return new String(buffer, start, end - start, ISO_8859_1);
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+
+/**
+ * This is used to parse ID3 Version 1 Tag information from an MP3 file,
+ * if available.
+ *
+ * @see <a href="http://www.id3.org/ID3v1">MP3 ID3 Version 1 specification</a>
+ */
+public class ID3v1Handler implements ID3Tags {
+ private String title;
+ private String artist;
+ private String album;
+ private String year;
+ private ID3Comment comment;
+ private String genre;
+ private String trackNumber;
+
+ boolean found = false;
+
+ public ID3v1Handler(InputStream stream, ContentHandler handler)
+ throws IOException, SAXException, TikaException {
+ this(LyricsHandler.getSuffix(stream, 128));
+ }
+
+ /**
+ * Creates from the last 128 bytes of a stream.
+ * @param tagData Must be the last 128 bytes
+ */
+ protected ID3v1Handler(byte[] tagData)
+ throws IOException, SAXException, TikaException {
+ if (tagData.length == 128
+ && tagData[0] == 'T' && tagData[1] == 'A' && tagData[2] == 'G') {
+ found = true;
+
+ title = getString(tagData, 3, 33);
+ artist = getString(tagData, 33, 63);
+ album = getString(tagData, 63, 93);
+ year = getString(tagData, 93, 97);
+
+ String commentStr = getString(tagData, 97, 127);
+ comment = new ID3Comment(commentStr);
+
+ int genreID = (int) tagData[127] & 0xff; // unsigned byte
+ genre = GENRES[Math.min(genreID, GENRES.length - 1)];
+
+ // ID3v1.1 Track addition
+ // If the last two bytes of the comment field are zero and
+ // non-zero, then the last byte is the track number
+ if (tagData[125] == 0 && tagData[126] != 0) {
+ int trackNum = (int) tagData[126] & 0xff;
+ trackNumber = Integer.toString(trackNum);
+ }
+ }
+ }
+
+
+ public boolean getTagsPresent() {
+ return found;
+ }
+
+ public String getTitle() {
+ return title;
+ }
+
+ public String getArtist() {
+ return artist;
+ }
+
+ public String getAlbum() {
+ return album;
+ }
+
+ public String getYear() {
+ return year;
+ }
+
+ public List<ID3Comment> getComments() {
+ return Arrays.asList(comment);
+ }
+
+ public String getGenre() {
+ return genre;
+ }
+
+ public String getTrackNumber() {
+ return trackNumber;
+ }
+
+ /**
+ * ID3v1 doesn't have composers,
+ * so returns null;
+ */
+ public String getComposer() {
+ return null;
+ }
+
+ /**
+ * ID3v1 doesn't have album-wide artists,
+ * so returns null;
+ */
+ public String getAlbumArtist() {
+ return null;
+ }
+
+ /**
+ * ID3v1 doesn't have disc numbers,
+ * so returns null;
+ */
+ public String getDisc() {
+ return null;
+ }
+
+ /**
+ * ID3v1 doesn't have compilations,
+ * so returns null;
+ */
+ public String getCompilation() {
+ return null;
+ }
+
+ /**
+ * Returns the identified ISO-8859-1 substring from the given byte buffer.
+ * The return value is the zero-terminated substring retrieved from
+ * between the given start and end positions in the given byte buffer.
+ * Extra whitespace (and control characters) from the beginning and the
+ * end of the substring is removed.
+ *
+ * @param buffer byte buffer
+ * @param start start index of the substring
+ * @param end end index of the substring
+ * @return the identified substring
+ * @throws TikaException if the ISO-8859-1 encoding is not available
+ */
+ private static String getString(byte[] buffer, int start, int end)
+ throws TikaException {
+ // Find the zero byte that marks the end of the string
+ int zero = start;
+ while (zero < end && buffer[zero] != 0) {
+ zero++;
+ }
+
+ // Skip trailing whitespace
+ end = zero;
+ while (start < end && buffer[end - 1] <= ' ') {
+ end--;
+ }
+
+ // Skip leading whitespace
+ while (start < end && buffer[start] <= ' ') {
+ start++;
+ }
+
+ // Return the remaining substring
+ return new String(buffer, start, end - start, ISO_8859_1);
+ }
+}
[39/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
Convert new lines from windows to unix
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/c7a6bcac
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/c7a6bcac
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/c7a6bcac
Branch: refs/heads/2.x
Commit: c7a6bcac422f10d130399f6ff5446e24c3f50ac5
Parents: dd3c2a4
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 29 07:10:47 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 29 07:10:53 2016 -0400
----------------------------------------------------------------------
.../ConfigurableThreadPoolExecutor.java | 64 +-
.../concurrent/SimpleThreadPoolExecutor.java | 80 +-
.../apache/tika/detect/AbstractDetector.java | 86 +-
.../org/apache/tika/detect/DetectorProxy.java | 134 +-
.../tika/detect/EncodingDetectorProxy.java | 82 +-
.../java/org/apache/tika/io/StringUtil.java | 242 +-
.../tika/osgi/TikaAbstractBundleActivator.java | 142 +-
.../java/org/apache/tika/osgi/TikaService.java | 50 +-
.../tika/osgi/internal/TikaServiceImpl.java | 162 +-
.../org/apache/tika/parser/ParserProxy.java | 148 +-
.../org/apache/tika/utils/ConcurrentUtils.java | 114 +-
.../org/apache/tika/config/DummyExecutor.java | 60 +-
.../apache/tika/detect/DetectorProxyTest.java | 112 +-
.../apache/tika/detect/DummyProxyDetector.java | 62 +-
.../apache/tika/parser/DummyProxyParser.java | 88 +-
.../org/apache/tika/parser/ParserProxyTest.java | 130 +-
.../apache/tika/utils/ConcurrentUtilsTest.java | 126 +-
.../services/org.apache.tika.parser.Parser | 34 +-
.../apache/tika/config/TIKA-1762-executors.xml | 56 +-
tika-parser-bundles/pom.xml | 350 +--
.../tika-parser-advanced-bundle/pom.xml | 162 +-
.../tika-parser-cad-bundle/pom.xml | 144 +-
.../tika-parser-code-bundle/pom.xml | 148 +-
.../tika-parser-crypto-bundle/pom.xml | 156 +-
.../tika-parser-database-bundle/pom.xml | 134 +-
.../tika-parser-ebook-bundle/pom.xml | 142 +-
.../tika-parser-journal-bundle/pom.xml | 158 +-
.../tika-parser-multimedia-bundle/pom.xml | 168 +-
.../tika-parser-office-bundle/pom.xml | 280 +-
.../tika-parser-package-bundle/pom.xml | 158 +-
.../tika-parser-pdf-bundle/pom.xml | 216 +-
.../tika-parser-scientific-bundle/pom.xml | 402 +--
.../tika-parser-text-bundle/pom.xml | 156 +-
.../tika-parser-web-bundle/pom.xml | 184 +-
tika-parser-modules/pom.xml | 410 +--
.../tika-parser-advanced-module/pom.xml | 136 +-
.../module/advanced/internal/Activator.java | 72 +-
.../tika-parser-cad-module/pom.xml | 110 +-
.../tika/module/cad/internal/Activator.java | 72 +-
.../org/apache/tika/parser/dwg/DWGParser.java | 712 ++---
.../tika-parser-code-module/pom.xml | 136 +-
.../tika/module/code/internal/Activator.java | 72 +-
.../org/apache/tika/parser/asm/ClassParser.java | 108 +-
.../tika/parser/asm/XHTMLClassVisitor.java | 646 ++--
.../tika/parser/code/SourceCodeParser.java | 284 +-
.../apache/tika/parser/asm/ClassParserTest.java | 118 +-
.../tika/parser/code/SourceCodeParserTest.java | 202 +-
.../tika-parser-crypto-module/pom.xml | 104 +-
.../tika/module/crypto/internal/Activator.java | 72 +-
.../tika/parser/crypto/Pkcs7ParserTest.java | 94 +-
.../tika-parser-database-module/pom.xml | 132 +-
.../module/database/internal/Activator.java | 72 +-
.../tika-parser-ebook-module/pom.xml | 94 +-
.../tika/module/ebook/internal/Activator.java | 72 +-
.../tika/parser/epub/EpubContentParser.java | 118 +-
.../org/apache/tika/parser/epub/EpubParser.java | 238 +-
.../apache/tika/parser/epub/EpubParserTest.java | 116 +-
.../tika-parser-journal-module/pom.xml | 134 +-
.../tika/module/journal/internal/Activator.java | 72 +-
.../tika-parser-multimedia-module/pom.xml | 200 +-
.../module/multimedia/internal/Activator.java | 72 +-
.../apache/tika/parser/audio/AudioParser.java | 278 +-
.../apache/tika/parser/audio/MidiParser.java | 242 +-
.../apache/tika/parser/font/TrueTypeParser.java | 222 +-
.../parser/image/ImageMetadataExtractor.java | 1124 +++----
.../apache/tika/parser/image/ImageParser.java | 406 +--
.../tika/parser/image/MetadataFields.java | 168 +-
.../apache/tika/parser/image/TiffParser.java | 136 +-
.../org/apache/tika/parser/jpeg/JpegParser.java | 138 +-
.../org/apache/tika/parser/mp3/AudioFrame.java | 504 ++--
.../tika/parser/mp3/CompositeTagHandler.java | 284 +-
.../org/apache/tika/parser/mp3/ID3Tags.java | 508 ++--
.../apache/tika/parser/mp3/ID3v1Handler.java | 366 +--
.../apache/tika/parser/mp3/ID3v22Handler.java | 318 +-
.../apache/tika/parser/mp3/ID3v23Handler.java | 276 +-
.../apache/tika/parser/mp3/ID3v24Handler.java | 286 +-
.../org/apache/tika/parser/mp3/ID3v2Frame.java | 848 +++---
.../apache/tika/parser/mp3/LyricsHandler.java | 312 +-
.../org/apache/tika/parser/mp3/MP3Frame.java | 50 +-
.../org/apache/tika/parser/mp3/Mp3Parser.java | 492 +--
.../org/apache/tika/parser/video/FLVParser.java | 536 ++--
.../parser/ocr/TesseractOCRConfig.properties | 40 +-
.../tika/parser/audio/AudioParserTest.java | 150 +-
.../tika/parser/audio/MidiParserTest.java | 84 +-
.../image/ImageMetadataExtractorTest.java | 278 +-
.../tika/parser/image/ImageParserTest.java | 324 +-
.../tika/parser/image/MetadataFieldsTest.java | 72 +-
.../tika/parser/image/TiffParserTest.java | 132 +-
.../apache/tika/parser/jpeg/JpegParserTest.java | 568 ++--
.../apache/tika/parser/mp3/Mp3ParserTest.java | 828 ++---
.../tika/parser/ocr/TesseractOCRConfigTest.java | 184 +-
.../apache/tika/parser/video/FLVParserTest.java | 88 +-
.../tika-parser-office-module/pom.xml | 250 +-
.../tika/module/office/internal/Activator.java | 72 +-
.../org/apache/tika/parser/chm/ChmParser.java | 224 +-
.../tika/parser/chm/accessor/ChmAccessor.java | 78 +-
.../chm/accessor/ChmDirectoryListingSet.java | 796 ++---
.../tika/parser/chm/accessor/ChmItsfHeader.java | 984 +++---
.../tika/parser/chm/accessor/ChmItspHeader.java | 1096 +++----
.../parser/chm/accessor/ChmLzxcControlData.java | 638 ++--
.../parser/chm/accessor/ChmLzxcResetTable.java | 682 ++---
.../tika/parser/chm/accessor/ChmPmgiHeader.java | 352 +--
.../tika/parser/chm/accessor/ChmPmglHeader.java | 412 +--
.../chm/accessor/DirectoryListingEntry.java | 302 +-
.../tika/parser/chm/assertion/ChmAssert.java | 338 +--
.../apache/tika/parser/chm/core/ChmCommons.java | 722 ++---
.../tika/parser/chm/core/ChmConstants.java | 204 +-
.../tika/parser/chm/core/ChmExtractor.java | 784 ++---
.../apache/tika/parser/chm/core/ChmWrapper.java | 294 +-
.../chm/exception/ChmParsingException.java | 54 +-
.../tika/parser/chm/lzx/ChmBlockInfo.java | 470 +--
.../apache/tika/parser/chm/lzx/ChmLzxBlock.java | 1826 +++++------
.../apache/tika/parser/chm/lzx/ChmLzxState.java | 654 ++--
.../apache/tika/parser/chm/lzx/ChmSection.java | 444 +--
.../org/apache/tika/parser/mbox/MboxParser.java | 418 +--
.../tika/parser/mbox/OutlookPSTParser.java | 406 +--
.../parser/odf/NSNormalizerContentHandler.java | 198 +-
.../parser/odf/OpenDocumentContentParser.java | 992 +++---
.../tika/parser/odf/OpenDocumentMetaParser.java | 398 +--
.../tika/parser/odf/OpenDocumentParser.java | 450 +--
.../org/apache/tika/parser/opc/OPCDetector.java | 310 +-
.../parser/opendocument/OpenOfficeParser.java | 56 +-
.../org/apache/tika/parser/rtf/GroupState.java | 134 +-
.../apache/tika/parser/rtf/ListDescriptor.java | 70 +-
.../org/apache/tika/parser/rtf/RTFParser.java | 186 +-
.../apache/tika/parser/rtf/TextExtractor.java | 2846 +++++++++---------
.../tika/parser/chm/TestChmBlockInfo.java | 250 +-
.../tika/parser/chm/TestChmExtraction.java | 424 +--
.../tika/parser/chm/TestChmExtractor.java | 126 +-
.../tika/parser/chm/TestChmItsfHeader.java | 244 +-
.../tika/parser/chm/TestChmItspHeader.java | 320 +-
.../apache/tika/parser/chm/TestChmLzxState.java | 202 +-
.../tika/parser/chm/TestChmLzxcControlData.java | 288 +-
.../tika/parser/chm/TestChmLzxcResetTable.java | 312 +-
.../parser/chm/TestDirectoryListingEntry.java | 170 +-
.../apache/tika/parser/chm/TestParameters.java | 208 +-
.../apache/tika/parser/chm/TestPmgiHeader.java | 90 +-
.../apache/tika/parser/chm/TestPmglHeader.java | 152 +-
.../apache/tika/parser/mbox/MboxParserTest.java | 312 +-
.../tika/parser/mbox/OutlookPSTParserTest.java | 220 +-
.../AbstractPOIContainerExtractionTest.java | 150 +-
.../tika/parser/microsoft/ExcelParserTest.java | 824 ++---
.../tika/parser/microsoft/OfficeParserTest.java | 92 +-
.../parser/microsoft/OutlookParserTest.java | 478 +--
.../microsoft/POIContainerExtractionTest.java | 764 ++---
.../parser/microsoft/PowerPointParserTest.java | 502 +--
.../parser/microsoft/PublisherParserTest.java | 106 +-
.../tika/parser/microsoft/TNEFParserTest.java | 196 +-
.../tika/parser/microsoft/VisioParserTest.java | 102 +-
.../tika/parser/microsoft/WordParserTest.java | 1012 +++----
.../apache/tika/parser/odf/ODFParserTest.java | 680 ++---
.../apache/tika/parser/rtf/RTFParserTest.java | 1020 +++----
.../tika-parser-package-module/pom.xml | 150 +-
.../tika/module/pkg/internal/Activator.java | 72 +-
.../tika/parser/iwork/AutoPageNumberUtils.java | 224 +-
.../tika/parser/iwork/IWorkPackageParser.java | 438 +--
.../parser/iwork/KeynoteContentHandler.java | 348 +--
.../parser/iwork/NumbersContentHandler.java | 462 +--
.../tika/parser/iwork/PagesContentHandler.java | 896 +++---
.../apache/tika/parser/pkg/PackageParser.java | 574 ++--
.../tika/parser/pkg/ZipContainerDetector.java | 648 ++--
.../parser/iwork/AutoPageNumberUtilsTest.java | 156 +-
.../tika/parser/iwork/IWorkParserTest.java | 932 +++---
.../apache/tika/parser/pkg/AbstractPkgTest.java | 186 +-
.../apache/tika/parser/pkg/Bzip2ParserTest.java | 178 +-
.../apache/tika/parser/pkg/GzipParserTest.java | 204 +-
.../apache/tika/parser/pkg/TarParserTest.java | 210 +-
.../apache/tika/parser/pkg/ZipParserTest.java | 384 +--
.../tika-parser-pdf-module/pom.xml | 250 +-
.../tika/module/pdf/internal/Activator.java | 72 +-
.../tika-parser-scientific-module/pom.xml | 270 +-
.../module/scientific/internal/Activator.java | 72 +-
.../org/apache/tika/parser/hdf/HDFParser.java | 244 +-
.../apache/tika/parser/hdf/HDFParserTest.java | 144 +-
.../tika/parser/netcdf/NetCDFParserTest.java | 122 +-
.../tika-parser-text-module/pom.xml | 132 +-
.../tika/module/text/internal/Activator.java | 40 +-
.../apache/tika/parser/txt/CharsetDetector.java | 1088 +++----
.../apache/tika/parser/txt/CharsetMatch.java | 572 ++--
.../tika/parser/txt/CharsetRecog_2022.java | 326 +-
.../tika/parser/txt/CharsetRecog_UTF8.java | 198 +-
.../tika/parser/txt/CharsetRecog_Unicode.java | 278 +-
.../tika/parser/txt/CharsetRecog_mbcs.java | 1064 +++----
.../tika/parser/txt/CharsetRecog_sbcs.java | 2706 ++++++++---------
.../tika/parser/txt/CharsetRecognizer.java | 108 +-
.../org/apache/tika/parser/txt/TXTParser.java | 196 +-
.../parser/xml/AbstractMetadataHandler.java | 186 +-
.../xml/AttributeDependantMetadataHandler.java | 164 +-
.../parser/xml/AttributeMetadataHandler.java | 122 +-
.../org/apache/tika/parser/xml/DcXMLParser.java | 120 +-
.../tika/parser/xml/ElementMetadataHandler.java | 510 ++--
.../tika/parser/xml/FictionBookParser.java | 234 +-
.../apache/tika/parser/xml/MetadataHandler.java | 170 +-
.../org/apache/tika/parser/xml/XMLParser.java | 178 +-
.../apache/tika/parser/txt/TXTParserTest.java | 548 ++--
.../apache/tika/parser/xml/DcXMLParserTest.java | 174 +-
.../EmptyAndDuplicateElementsXMLParserTest.java | 232 +-
.../tika/parser/xml/FictionBookParserTest.java | 108 +-
.../tika-parser-web-module/pom.xml | 178 +-
.../tika/module/web/internal/Activator.java | 72 +-
.../org/apache/tika/parser/feed/FeedParser.java | 254 +-
.../parser/html/BoilerpipeContentHandler.java | 694 ++---
.../tika/parser/html/DefaultHtmlMapper.java | 274 +-
.../apache/tika/parser/html/HtmlHandler.java | 618 ++--
.../org/apache/tika/parser/html/HtmlMapper.java | 138 +-
.../org/apache/tika/parser/html/HtmlParser.java | 388 +--
.../tika/parser/html/IdentityHtmlMapper.java | 86 +-
.../tika/parser/html/XHTMLDowngradeHandler.java | 156 +-
.../tika/parser/mail/MailContentHandler.java | 752 ++---
.../apache/tika/parser/mail/RFC822Parser.java | 190 +-
.../apache/tika/parser/feed/FeedParserTest.java | 150 +-
.../apache/tika/parser/html/HtmlParserTest.java | 2262 +++++++-------
.../tika/parser/mail/RFC822ParserTest.java | 970 +++---
213 files changed, 35548 insertions(+), 35548 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/main/java/org/apache/tika/concurrent/ConfigurableThreadPoolExecutor.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/concurrent/ConfigurableThreadPoolExecutor.java b/tika-core/src/main/java/org/apache/tika/concurrent/ConfigurableThreadPoolExecutor.java
index 86f74a7..1f7c4a0 100644
--- a/tika-core/src/main/java/org/apache/tika/concurrent/ConfigurableThreadPoolExecutor.java
+++ b/tika-core/src/main/java/org/apache/tika/concurrent/ConfigurableThreadPoolExecutor.java
@@ -1,32 +1,32 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.concurrent;
-
-import java.util.concurrent.ExecutorService;
-
-/**
- * Allows Thread Pool to be Configurable.
- *
- * @since Apache Tika 1.11
- */
-public interface ConfigurableThreadPoolExecutor extends ExecutorService {
-
- public void setMaximumPoolSize(int threads);
-
- public void setCorePoolSize(int threads);
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.concurrent;
+
+import java.util.concurrent.ExecutorService;
+
+/**
+ * Allows Thread Pool to be Configurable.
+ *
+ * @since Apache Tika 1.11
+ */
+public interface ConfigurableThreadPoolExecutor extends ExecutorService {
+
+ public void setMaximumPoolSize(int threads);
+
+ public void setCorePoolSize(int threads);
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/main/java/org/apache/tika/concurrent/SimpleThreadPoolExecutor.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/concurrent/SimpleThreadPoolExecutor.java b/tika-core/src/main/java/org/apache/tika/concurrent/SimpleThreadPoolExecutor.java
index a7e443f..0a18e94 100644
--- a/tika-core/src/main/java/org/apache/tika/concurrent/SimpleThreadPoolExecutor.java
+++ b/tika-core/src/main/java/org/apache/tika/concurrent/SimpleThreadPoolExecutor.java
@@ -1,40 +1,40 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.concurrent;
-
-import java.util.concurrent.LinkedBlockingQueue;
-import java.util.concurrent.ThreadFactory;
-import java.util.concurrent.ThreadPoolExecutor;
-import java.util.concurrent.TimeUnit;
-
-/**
- * Simple Thread Pool Executor
- *
- * @since Apache Tika 1.11
- */
-public class SimpleThreadPoolExecutor extends ThreadPoolExecutor implements ConfigurableThreadPoolExecutor {
-
- public SimpleThreadPoolExecutor() {
- super(1, 2, 0L, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>(), new ThreadFactory() {
-
- @Override
- public Thread newThread(Runnable r) {
- return new Thread(r, "Tika Executor Thread");
- }
- });
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.concurrent;
+
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.ThreadFactory;
+import java.util.concurrent.ThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Simple Thread Pool Executor
+ *
+ * @since Apache Tika 1.11
+ */
+public class SimpleThreadPoolExecutor extends ThreadPoolExecutor implements ConfigurableThreadPoolExecutor {
+
+ public SimpleThreadPoolExecutor() {
+ super(1, 2, 0L, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>(), new ThreadFactory() {
+
+ @Override
+ public Thread newThread(Runnable r) {
+ return new Thread(r, "Tika Executor Thread");
+ }
+ });
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/main/java/org/apache/tika/detect/AbstractDetector.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/detect/AbstractDetector.java b/tika-core/src/main/java/org/apache/tika/detect/AbstractDetector.java
index f0d6129..952a089 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/AbstractDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/AbstractDetector.java
@@ -1,43 +1,43 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.detect;
-
-/**
- * Abstract base class for new detectors. This class has a convenience method for
- * creating a DetectorProxy
- *
- * @since Apache Tika 2.0
- */
-public abstract class AbstractDetector implements Detector {
-
- /**
- * Serial version UID.
- */
- private static final long serialVersionUID = -5869078281784941763L;
-
- /**
- * Convenience method for creating DetectorProxy instances
- * with the current class' ClassLoader
- *
- * @param detectorClassName
- * @return
- */
- public Detector createDetectorProxy(String detectorClassName){
- return new DetectorProxy(detectorClassName, getClass().getClassLoader());
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+/**
+ * Abstract base class for new detectors. This class has a convenience method for
+ * creating a DetectorProxy
+ *
+ * @since Apache Tika 2.0
+ */
+public abstract class AbstractDetector implements Detector {
+
+ /**
+ * Serial version UID.
+ */
+ private static final long serialVersionUID = -5869078281784941763L;
+
+ /**
+ * Convenience method for creating DetectorProxy instances
+ * with the current class' ClassLoader
+ *
+ * @param detectorClassName
+ * @return
+ */
+ public Detector createDetectorProxy(String detectorClassName){
+ return new DetectorProxy(detectorClassName, getClass().getClassLoader());
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/main/java/org/apache/tika/detect/DetectorProxy.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/detect/DetectorProxy.java b/tika-core/src/main/java/org/apache/tika/detect/DetectorProxy.java
index ed5e638..404ec0a 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/DetectorProxy.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/DetectorProxy.java
@@ -1,68 +1,68 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.detect;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.tika.config.LoadErrorHandler;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-
-/**
- * This detector is a proxy for another detector
- * this allows modules to use detectors from other modules
- * as optional dependencies since not including the classes
- * simply does nothing rather than throwing a ClassNotFoundException.
- *
- * @since Apache Tika 2.0
- */
-public class DetectorProxy implements Detector
-{
- private static final long serialVersionUID = 4534101565629801667L;
-
- private Detector detector;
-
- public DetectorProxy(String detectorClassName, ClassLoader loader)
- {
- this(detectorClassName, loader, Boolean.getBoolean("org.apache.tika.service.proxy.error.warn")
- ? LoadErrorHandler.WARN:LoadErrorHandler.IGNORE);
- }
-
- public DetectorProxy(String detectorClassName, ClassLoader loader, LoadErrorHandler handler)
- {
- try
- {
- this.detector = (Detector)Class.forName(detectorClassName, true, loader).newInstance();
- }
- catch (InstantiationException | IllegalAccessException | ClassNotFoundException e)
- {
- handler.handleLoadError(detectorClassName, e);
- }
- }
-
- @Override
- public MediaType detect(InputStream input, Metadata metadata) throws IOException
- {
- if(detector != null)
- {
- return detector.detect(input, metadata);
- }
- return null;
- }
-
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.config.LoadErrorHandler;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * This detector is a proxy for another detector
+ * this allows modules to use detectors from other modules
+ * as optional dependencies since not including the classes
+ * simply does nothing rather than throwing a ClassNotFoundException.
+ *
+ * @since Apache Tika 2.0
+ */
+public class DetectorProxy implements Detector
+{
+ private static final long serialVersionUID = 4534101565629801667L;
+
+ private Detector detector;
+
+ public DetectorProxy(String detectorClassName, ClassLoader loader)
+ {
+ this(detectorClassName, loader, Boolean.getBoolean("org.apache.tika.service.proxy.error.warn")
+ ? LoadErrorHandler.WARN:LoadErrorHandler.IGNORE);
+ }
+
+ public DetectorProxy(String detectorClassName, ClassLoader loader, LoadErrorHandler handler)
+ {
+ try
+ {
+ this.detector = (Detector)Class.forName(detectorClassName, true, loader).newInstance();
+ }
+ catch (InstantiationException | IllegalAccessException | ClassNotFoundException e)
+ {
+ handler.handleLoadError(detectorClassName, e);
+ }
+ }
+
+ @Override
+ public MediaType detect(InputStream input, Metadata metadata) throws IOException
+ {
+ if(detector != null)
+ {
+ return detector.detect(input, metadata);
+ }
+ return null;
+ }
+
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorProxy.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorProxy.java b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorProxy.java
index b927597..5e569bd 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorProxy.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorProxy.java
@@ -1,41 +1,41 @@
-package org.apache.tika.detect;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.Charset;
-
-import org.apache.tika.config.LoadErrorHandler;
-import org.apache.tika.metadata.Metadata;
-
-public class EncodingDetectorProxy implements EncodingDetector {
-
-private EncodingDetector detector;
-
- public EncodingDetectorProxy(String encodingDetectorClassName, ClassLoader loader)
- {
- this(encodingDetectorClassName, loader, Boolean.getBoolean("org.apache.tika.service.proxy.error.warn")
- ? LoadErrorHandler.WARN:LoadErrorHandler.IGNORE);
- }
-
- public EncodingDetectorProxy(String encodingDetectorClassName, ClassLoader loader, LoadErrorHandler handler)
- {
- try
- {
- this.detector = (EncodingDetector)Class.forName(encodingDetectorClassName, true, loader).newInstance();
- }
- catch (InstantiationException | IllegalAccessException | ClassNotFoundException e)
- {
- handler.handleLoadError(encodingDetectorClassName, e);
- }
- }
-
- @Override
- public Charset detect(InputStream input, Metadata metadata) throws IOException {
- if(detector != null)
- {
- return detector.detect(input, metadata);
- }
- return null;
- }
-
-}
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+
+import org.apache.tika.config.LoadErrorHandler;
+import org.apache.tika.metadata.Metadata;
+
+public class EncodingDetectorProxy implements EncodingDetector {
+
+private EncodingDetector detector;
+
+ public EncodingDetectorProxy(String encodingDetectorClassName, ClassLoader loader)
+ {
+ this(encodingDetectorClassName, loader, Boolean.getBoolean("org.apache.tika.service.proxy.error.warn")
+ ? LoadErrorHandler.WARN:LoadErrorHandler.IGNORE);
+ }
+
+ public EncodingDetectorProxy(String encodingDetectorClassName, ClassLoader loader, LoadErrorHandler handler)
+ {
+ try
+ {
+ this.detector = (EncodingDetector)Class.forName(encodingDetectorClassName, true, loader).newInstance();
+ }
+ catch (InstantiationException | IllegalAccessException | ClassNotFoundException e)
+ {
+ handler.handleLoadError(encodingDetectorClassName, e);
+ }
+ }
+
+ @Override
+ public Charset detect(InputStream input, Metadata metadata) throws IOException {
+ if(detector != null)
+ {
+ return detector.detect(input, metadata);
+ }
+ return null;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/main/java/org/apache/tika/io/StringUtil.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/io/StringUtil.java b/tika-core/src/main/java/org/apache/tika/io/StringUtil.java
index 164765a..8876a0d 100644
--- a/tika-core/src/main/java/org/apache/tika/io/StringUtil.java
+++ b/tika-core/src/main/java/org/apache/tika/io/StringUtil.java
@@ -1,121 +1,121 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.io;
-
-import java.nio.charset.Charset;
-
-/**
- * General String Related Utilities.
- * <p>
- * This class provides static utility methods for string operations
- * <p>
- * Origin of code: Based on the version in POI
- */
-public class StringUtil {
-
- protected static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
- protected static final Charset UTF16LE = Charset.forName("UTF-16LE");
- public static final Charset UTF8 = Charset.forName("UTF-8");
-
- private StringUtil() {
- // no instances of this class
- }
-
- /**
- * Given a byte array of 16-bit unicode characters in Little Endian
- * format (most important byte last), return a Java String representation
- * of it.
- *
- * { 0x16, 0x00 } -0x16
- *
- * @param string the byte array to be converted
- * @param offset the initial offset into the
- * byte array. it is assumed that string[ offset ] and string[ offset +
- * 1 ] contain the first 16-bit unicode character
- * @param len the length of the final string
- * @return the converted string, never <code>null</code>.
- * @exception ArrayIndexOutOfBoundsException if offset is out of bounds for
- * the byte array (i.e., is negative or is greater than or equal to
- * string.length)
- * @exception IllegalArgumentException if len is too large (i.e.,
- * there is not enough data in string to create a String of that
- * length)
- */
- public static String getFromUnicodeLE(
- final byte[] string,
- final int offset,
- final int len)
- throws ArrayIndexOutOfBoundsException, IllegalArgumentException {
- if ((offset < 0) || (offset >= string.length)) {
- throw new ArrayIndexOutOfBoundsException("Illegal offset " + offset + " (String data is of length " + string.length + ")");
- }
- if ((len < 0) || (((string.length - offset) / 2) < len)) {
- throw new IllegalArgumentException("Illegal length " + len);
- }
-
- return new String(string, offset, len * 2, UTF16LE);
- }
-
- /**
- * Given a byte array of 16-bit unicode characters in little endian
- * format (most important byte last), return a Java String representation
- * of it.
- *
- * { 0x16, 0x00 } -0x16
- *
- * @param string the byte array to be converted
- * @return the converted string, never <code>null</code>
- */
- public static String getFromUnicodeLE(byte[] string) {
- if(string.length == 0) { return ""; }
- return getFromUnicodeLE(string, 0, string.length / 2);
- }
-
- /**
- * Read 8 bit data (in ISO-8859-1 codepage) into a (unicode) Java
- * String and return.
- * (In Excel terms, read compressed 8 bit unicode as a string)
- *
- * @param string byte array to read
- * @param offset offset to read byte array
- * @param len length to read byte array
- * @return String generated String instance by reading byte array
- */
- public static String getFromCompressedUnicode(
- final byte[] string,
- final int offset,
- final int len) {
- int len_to_use = Math.min(len, string.length - offset);
- return new String(string, offset, len_to_use, ISO_8859_1);
- }
-
- /**
- * Takes a unicode (java) string, and returns it as 8 bit data (in ISO-8859-1
- * codepage).
- * (In Excel terms, write compressed 8 bit unicode)
- *
- * @param input the String containing the data to be written
- * @param output the byte array to which the data is to be written
- * @param offset an offset into the byte arrat at which the data is start
- * when written
- */
- public static void putCompressedUnicode(String input, byte[] output, int offset) {
- byte[] bytes = input.getBytes(ISO_8859_1);
- System.arraycopy(bytes, 0, output, offset, bytes.length);
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.io;
+
+import java.nio.charset.Charset;
+
+/**
+ * General String Related Utilities.
+ * <p>
+ * This class provides static utility methods for string operations
+ * <p>
+ * Origin of code: Based on the version in POI
+ */
+public class StringUtil {
+
+ protected static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
+ protected static final Charset UTF16LE = Charset.forName("UTF-16LE");
+ public static final Charset UTF8 = Charset.forName("UTF-8");
+
+ private StringUtil() {
+ // no instances of this class
+ }
+
+ /**
+ * Given a byte array of 16-bit unicode characters in Little Endian
+ * format (most important byte last), return a Java String representation
+ * of it.
+ *
+ * { 0x16, 0x00 } -0x16
+ *
+ * @param string the byte array to be converted
+ * @param offset the initial offset into the
+ * byte array. it is assumed that string[ offset ] and string[ offset +
+ * 1 ] contain the first 16-bit unicode character
+ * @param len the length of the final string
+ * @return the converted string, never <code>null</code>.
+ * @exception ArrayIndexOutOfBoundsException if offset is out of bounds for
+ * the byte array (i.e., is negative or is greater than or equal to
+ * string.length)
+ * @exception IllegalArgumentException if len is too large (i.e.,
+ * there is not enough data in string to create a String of that
+ * length)
+ */
+ public static String getFromUnicodeLE(
+ final byte[] string,
+ final int offset,
+ final int len)
+ throws ArrayIndexOutOfBoundsException, IllegalArgumentException {
+ if ((offset < 0) || (offset >= string.length)) {
+ throw new ArrayIndexOutOfBoundsException("Illegal offset " + offset + " (String data is of length " + string.length + ")");
+ }
+ if ((len < 0) || (((string.length - offset) / 2) < len)) {
+ throw new IllegalArgumentException("Illegal length " + len);
+ }
+
+ return new String(string, offset, len * 2, UTF16LE);
+ }
+
+ /**
+ * Given a byte array of 16-bit unicode characters in little endian
+ * format (most important byte last), return a Java String representation
+ * of it.
+ *
+ * { 0x16, 0x00 } -0x16
+ *
+ * @param string the byte array to be converted
+ * @return the converted string, never <code>null</code>
+ */
+ public static String getFromUnicodeLE(byte[] string) {
+ if(string.length == 0) { return ""; }
+ return getFromUnicodeLE(string, 0, string.length / 2);
+ }
+
+ /**
+ * Read 8 bit data (in ISO-8859-1 codepage) into a (unicode) Java
+ * String and return.
+ * (In Excel terms, read compressed 8 bit unicode as a string)
+ *
+ * @param string byte array to read
+ * @param offset offset to read byte array
+ * @param len length to read byte array
+ * @return String generated String instance by reading byte array
+ */
+ public static String getFromCompressedUnicode(
+ final byte[] string,
+ final int offset,
+ final int len) {
+ int len_to_use = Math.min(len, string.length - offset);
+ return new String(string, offset, len_to_use, ISO_8859_1);
+ }
+
+ /**
+ * Takes a unicode (java) string, and returns it as 8 bit data (in ISO-8859-1
+ * codepage).
+ * (In Excel terms, write compressed 8 bit unicode)
+ *
+ * @param input the String containing the data to be written
+ * @param output the byte array to which the data is to be written
+ * @param offset an offset into the byte arrat at which the data is start
+ * when written
+ */
+ public static void putCompressedUnicode(String input, byte[] output, int offset) {
+ byte[] bytes = input.getBytes(ISO_8859_1);
+ System.arraycopy(bytes, 0, output, offset, bytes.length);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/main/java/org/apache/tika/osgi/TikaAbstractBundleActivator.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/osgi/TikaAbstractBundleActivator.java b/tika-core/src/main/java/org/apache/tika/osgi/TikaAbstractBundleActivator.java
index b959147..52a43dc 100644
--- a/tika-core/src/main/java/org/apache/tika/osgi/TikaAbstractBundleActivator.java
+++ b/tika-core/src/main/java/org/apache/tika/osgi/TikaAbstractBundleActivator.java
@@ -1,71 +1,71 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.osgi;
-
-import java.util.Dictionary;
-import java.util.Enumeration;
-import java.util.Locale;
-import java.util.Properties;
-import java.util.ServiceLoader;
-
-import org.apache.tika.parser.Parser;
-import org.osgi.framework.BundleActivator;
-import org.osgi.framework.BundleContext;
-import org.osgi.framework.Constants;
-
-public abstract class TikaAbstractBundleActivator implements BundleActivator {
-
- Dictionary createServiceRankProperties(String configName, BundleContext context) {
- Dictionary serviceProps = new Properties();
- String serviceRank = context.getProperty(configName);
- if (serviceRank != null) {
- serviceProps.put(Constants.SERVICE_RANKING, Integer.parseInt(serviceRank));
- }
- return serviceProps;
-
- }
-
- public void registerTikaParserServiceLoader(BundleContext context, ClassLoader loader)
- {
- ServiceLoader<Parser> serviceLoader = ServiceLoader.load(Parser.class, loader);
- for(Parser currentParser: serviceLoader)
- {
- registerTikaService(context, currentParser, null);
- }
- }
-
- void registerTikaService(BundleContext context, Parser parserService,
- Dictionary additionalServiceProperties) {
- String parserFullyClassifiedName = parserService.getClass().getCanonicalName().toLowerCase(Locale.US);
-
- String serviceRankingPropName = parserFullyClassifiedName + ".serviceRanking";
-
- Dictionary serviceProperties = createServiceRankProperties(serviceRankingPropName, context);
-
- if (additionalServiceProperties != null) {
- Enumeration keys = additionalServiceProperties.keys();
- while (keys.hasMoreElements()) {
- String currentKey = (String) keys.nextElement();
- serviceProperties.put(currentKey, additionalServiceProperties.get(currentKey));
- }
-
- }
-
- context.registerService(Parser.class, parserService, serviceProperties);
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.osgi;
+
+import java.util.Dictionary;
+import java.util.Enumeration;
+import java.util.Locale;
+import java.util.Properties;
+import java.util.ServiceLoader;
+
+import org.apache.tika.parser.Parser;
+import org.osgi.framework.BundleActivator;
+import org.osgi.framework.BundleContext;
+import org.osgi.framework.Constants;
+
+public abstract class TikaAbstractBundleActivator implements BundleActivator {
+
+ Dictionary createServiceRankProperties(String configName, BundleContext context) {
+ Dictionary serviceProps = new Properties();
+ String serviceRank = context.getProperty(configName);
+ if (serviceRank != null) {
+ serviceProps.put(Constants.SERVICE_RANKING, Integer.parseInt(serviceRank));
+ }
+ return serviceProps;
+
+ }
+
+ public void registerTikaParserServiceLoader(BundleContext context, ClassLoader loader)
+ {
+ ServiceLoader<Parser> serviceLoader = ServiceLoader.load(Parser.class, loader);
+ for(Parser currentParser: serviceLoader)
+ {
+ registerTikaService(context, currentParser, null);
+ }
+ }
+
+ void registerTikaService(BundleContext context, Parser parserService,
+ Dictionary additionalServiceProperties) {
+ String parserFullyClassifiedName = parserService.getClass().getCanonicalName().toLowerCase(Locale.US);
+
+ String serviceRankingPropName = parserFullyClassifiedName + ".serviceRanking";
+
+ Dictionary serviceProperties = createServiceRankProperties(serviceRankingPropName, context);
+
+ if (additionalServiceProperties != null) {
+ Enumeration keys = additionalServiceProperties.keys();
+ while (keys.hasMoreElements()) {
+ String currentKey = (String) keys.nextElement();
+ serviceProperties.put(currentKey, additionalServiceProperties.get(currentKey));
+ }
+
+ }
+
+ context.registerService(Parser.class, parserService, serviceProperties);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/main/java/org/apache/tika/osgi/TikaService.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/osgi/TikaService.java b/tika-core/src/main/java/org/apache/tika/osgi/TikaService.java
index 4ada094..283ae1c 100644
--- a/tika-core/src/main/java/org/apache/tika/osgi/TikaService.java
+++ b/tika-core/src/main/java/org/apache/tika/osgi/TikaService.java
@@ -1,25 +1,25 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.osgi;
-
-import org.apache.tika.detect.Detector;
-import org.apache.tika.language.translate.Translator;
-import org.apache.tika.parser.Parser;
-
-public interface TikaService extends Parser, Detector, Translator {
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.osgi;
+
+import org.apache.tika.detect.Detector;
+import org.apache.tika.language.translate.Translator;
+import org.apache.tika.parser.Parser;
+
+public interface TikaService extends Parser, Detector, Translator {
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/main/java/org/apache/tika/osgi/internal/TikaServiceImpl.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/osgi/internal/TikaServiceImpl.java b/tika-core/src/main/java/org/apache/tika/osgi/internal/TikaServiceImpl.java
index f3b6171..fefa1af 100644
--- a/tika-core/src/main/java/org/apache/tika/osgi/internal/TikaServiceImpl.java
+++ b/tika-core/src/main/java/org/apache/tika/osgi/internal/TikaServiceImpl.java
@@ -1,81 +1,81 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.osgi.internal;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Set;
-
-import org.apache.tika.Tika;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.osgi.TikaService;
-import org.apache.tika.parser.ParseContext;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-public class TikaServiceImpl implements TikaService {
-
- private static final long serialVersionUID = 1L;
-
- private final Tika tika;
-
- public TikaServiceImpl() {
- this.tika = new Tika();
- }
-
- public TikaServiceImpl(TikaConfig config)
- {
- this.tika = new Tika(config);
- }
-
- @Override
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return this.tika.getParser().getSupportedTypes(context);
- }
-
- @Override
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- tika.getParser().parse(stream, handler, metadata, context);
-
- }
-
- @Override
- public MediaType detect(InputStream input, Metadata metadata) throws IOException {
- return tika.getDetector().detect(input, metadata);
- }
-
- @Override
- public String translate(String text, String sourceLanguage, String targetLanguage)
- throws TikaException, IOException {
- return tika.getTranslator().translate(text, sourceLanguage, targetLanguage);
- }
-
- @Override
- public String translate(String text, String targetLanguage) throws TikaException, IOException {
- return tika.getTranslator().translate(text, targetLanguage);
- }
-
- @Override
- public boolean isAvailable() {
- return tika.getTranslator().isAvailable();
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.osgi.internal;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Set;
+
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.osgi.TikaService;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class TikaServiceImpl implements TikaService {
+
+ private static final long serialVersionUID = 1L;
+
+ private final Tika tika;
+
+ public TikaServiceImpl() {
+ this.tika = new Tika();
+ }
+
+ public TikaServiceImpl(TikaConfig config)
+ {
+ this.tika = new Tika(config);
+ }
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return this.tika.getParser().getSupportedTypes(context);
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ tika.getParser().parse(stream, handler, metadata, context);
+
+ }
+
+ @Override
+ public MediaType detect(InputStream input, Metadata metadata) throws IOException {
+ return tika.getDetector().detect(input, metadata);
+ }
+
+ @Override
+ public String translate(String text, String sourceLanguage, String targetLanguage)
+ throws TikaException, IOException {
+ return tika.getTranslator().translate(text, sourceLanguage, targetLanguage);
+ }
+
+ @Override
+ public String translate(String text, String targetLanguage) throws TikaException, IOException {
+ return tika.getTranslator().translate(text, targetLanguage);
+ }
+
+ @Override
+ public boolean isAvailable() {
+ return tika.getTranslator().isAvailable();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/main/java/org/apache/tika/parser/ParserProxy.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParserProxy.java b/tika-core/src/main/java/org/apache/tika/parser/ParserProxy.java
index 9f363f6..8c99d17 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/ParserProxy.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/ParserProxy.java
@@ -1,74 +1,74 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.tika.config.LoadErrorHandler;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * This parser is a proxy for another detector this allows modules to use
- * parsers from other modules as optional dependencies since not including the
- * classes simply does nothing rather than throwing a ClassNotFoundException.
- *
- * @since Apache Tika 2.0
- */
-public class ParserProxy extends AbstractParser {
-
- private static final long serialVersionUID = -4838436708916910179L;
- private Parser parser;
-
- public ParserProxy(String parserClassName, ClassLoader loader) {
-
- this(parserClassName, loader, Boolean.getBoolean("org.apache.tika.service.proxy.error.warn")
- ? LoadErrorHandler.WARN:LoadErrorHandler.IGNORE);
- }
-
- public ParserProxy(String parserClassName, ClassLoader loader, LoadErrorHandler handler) {
- try {
- this.parser = (Parser) Class.forName(parserClassName, true, loader).newInstance();
- } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
- handler.handleLoadError(parserClassName, e);
- }
-
- }
-
- @Override
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- if (parser == null) {
- return Collections.emptySet();
- }
- return parser.getSupportedTypes(context);
- }
-
- @Override
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- if (parser != null) {
- parser.parse(stream, handler, metadata, context);
- }
- // Otherwise do nothing
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.config.LoadErrorHandler;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * This parser is a proxy for another parser; this allows modules to use
+ * parsers from other modules as optional dependencies since not including the
+ * classes simply does nothing rather than throwing a ClassNotFoundException.
+ *
+ * @since Apache Tika 2.0
+ */
+public class ParserProxy extends AbstractParser {
+
+ private static final long serialVersionUID = -4838436708916910179L;
+ private Parser parser;
+
+ public ParserProxy(String parserClassName, ClassLoader loader) {
+
+ this(parserClassName, loader, Boolean.getBoolean("org.apache.tika.service.proxy.error.warn")
+ ? LoadErrorHandler.WARN:LoadErrorHandler.IGNORE);
+ }
+
+ public ParserProxy(String parserClassName, ClassLoader loader, LoadErrorHandler handler) {
+ try {
+ this.parser = (Parser) Class.forName(parserClassName, true, loader).newInstance();
+ } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
+ handler.handleLoadError(parserClassName, e);
+ }
+
+ }
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ if (parser == null) {
+ return Collections.emptySet();
+ }
+ return parser.getSupportedTypes(context);
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ if (parser != null) {
+ parser.parse(stream, handler, metadata, context);
+ }
+ // Otherwise do nothing
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/main/java/org/apache/tika/utils/ConcurrentUtils.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ConcurrentUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ConcurrentUtils.java
index 5f4cd13..a47f747 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ConcurrentUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ConcurrentUtils.java
@@ -1,57 +1,57 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.utils;
-
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Future;
-import java.util.concurrent.FutureTask;
-
-import org.apache.tika.parser.ParseContext;
-
-/**
- * Utility Class for Concurrency in Tika
- *
- * @since Apache Tika 1.11
- */
-public class ConcurrentUtils {
-
- /**
- *
- * Execute a runnable using an ExecutorService from the ParseContext if possible.
- * Otherwise fallback to individual threads.
- *
- * @param context
- * @param runnable
- * @return
- */
- public static Future execute(ParseContext context, Runnable runnable) {
-
- Future future = null;
- ExecutorService executorService = context.get(ExecutorService.class);
- if(executorService == null) {
- FutureTask task = new FutureTask<>(runnable, null);
- Thread thread = new Thread(task, "Tika Thread");
- thread.start();
- future = task;
- }
- else {
- future = executorService.submit(runnable);
- }
-
- return future;
- }
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.utils;
+
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+import java.util.concurrent.FutureTask;
+
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * Utility Class for Concurrency in Tika
+ *
+ * @since Apache Tika 1.11
+ */
+public class ConcurrentUtils {
+
+ /**
+ *
+ * Execute a runnable using an ExecutorService from the ParseContext if possible.
+ * Otherwise fall back to individual threads.
+ *
+ * @param context
+ * @param runnable
+ * @return
+ */
+ public static Future execute(ParseContext context, Runnable runnable) {
+
+ Future future = null;
+ ExecutorService executorService = context.get(ExecutorService.class);
+ if(executorService == null) {
+ FutureTask task = new FutureTask<>(runnable, null);
+ Thread thread = new Thread(task, "Tika Thread");
+ thread.start();
+ future = task;
+ }
+ else {
+ future = executorService.submit(runnable);
+ }
+
+ return future;
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/test/java/org/apache/tika/config/DummyExecutor.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/config/DummyExecutor.java b/tika-core/src/test/java/org/apache/tika/config/DummyExecutor.java
index 849eda3..5da9d0d 100644
--- a/tika-core/src/test/java/org/apache/tika/config/DummyExecutor.java
+++ b/tika-core/src/test/java/org/apache/tika/config/DummyExecutor.java
@@ -1,30 +1,30 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.config;
-
-import java.util.concurrent.LinkedBlockingQueue;
-import java.util.concurrent.ThreadPoolExecutor;
-import java.util.concurrent.TimeUnit;
-
-import org.apache.tika.concurrent.ConfigurableThreadPoolExecutor;
-
-class DummyExecutor extends ThreadPoolExecutor implements ConfigurableThreadPoolExecutor {
- public DummyExecutor()
- {
- super(1,1, 0L, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>());
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config;
+
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.ThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.tika.concurrent.ConfigurableThreadPoolExecutor;
+
+class DummyExecutor extends ThreadPoolExecutor implements ConfigurableThreadPoolExecutor {
+ public DummyExecutor()
+ {
+ super(1,1, 0L, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>());
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/test/java/org/apache/tika/detect/DetectorProxyTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/detect/DetectorProxyTest.java b/tika-core/src/test/java/org/apache/tika/detect/DetectorProxyTest.java
index 060f3d9..33683a9 100644
--- a/tika-core/src/test/java/org/apache/tika/detect/DetectorProxyTest.java
+++ b/tika-core/src/test/java/org/apache/tika/detect/DetectorProxyTest.java
@@ -1,56 +1,56 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.detect;
-
-import static org.junit.Assert.*;
-
-import java.io.IOException;
-
-import org.apache.tika.config.LoadErrorHandler;
-import org.apache.tika.mime.MediaType;
-import org.junit.Test;
-
-public class DetectorProxyTest
-{
- @Test
- public void testDetectorProxyExists() throws IOException
- {
- Detector dummyDetector = new DetectorProxy("org.apache.tika.detect.DummyProxyDetector",
- getClass().getClassLoader(),
- LoadErrorHandler.IGNORE);
-
- MediaType result = dummyDetector.detect(null, null);
-
- assertEquals("Detector being proxied exists so result should not be null",
- MediaType.TEXT_PLAIN, result );
-
- }
-
- @Test
- public void testParserProxyNotExists() throws IOException
- {
- Detector dummyDetector = new DetectorProxy("org.apache.tika.detect.DoesNotExist",
- getClass().getClassLoader(),
- LoadErrorHandler.IGNORE);
-
- MediaType result = dummyDetector.detect(null, null);
-
- assertNull("Detector being proxied does not exists so result should be null", result );
-
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+
+import org.apache.tika.config.LoadErrorHandler;
+import org.apache.tika.mime.MediaType;
+import org.junit.Test;
+
+public class DetectorProxyTest
+{
+ @Test
+ public void testDetectorProxyExists() throws IOException
+ {
+ Detector dummyDetector = new DetectorProxy("org.apache.tika.detect.DummyProxyDetector",
+ getClass().getClassLoader(),
+ LoadErrorHandler.IGNORE);
+
+ MediaType result = dummyDetector.detect(null, null);
+
+ assertEquals("Detector being proxied exists so result should not be null",
+ MediaType.TEXT_PLAIN, result );
+
+ }
+
+ @Test
+ public void testParserProxyNotExists() throws IOException
+ {
+ Detector dummyDetector = new DetectorProxy("org.apache.tika.detect.DoesNotExist",
+ getClass().getClassLoader(),
+ LoadErrorHandler.IGNORE);
+
+ MediaType result = dummyDetector.detect(null, null);
+
+ assertNull("Detector being proxied does not exists so result should be null", result );
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/test/java/org/apache/tika/detect/DummyProxyDetector.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/detect/DummyProxyDetector.java b/tika-core/src/test/java/org/apache/tika/detect/DummyProxyDetector.java
index a11b584..ce1207a 100644
--- a/tika-core/src/test/java/org/apache/tika/detect/DummyProxyDetector.java
+++ b/tika-core/src/test/java/org/apache/tika/detect/DummyProxyDetector.java
@@ -1,31 +1,31 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.detect;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-
-public class DummyProxyDetector implements Detector
-{
- @Override
- public MediaType detect(InputStream input, Metadata metadata) throws IOException {
- return MediaType.TEXT_PLAIN;
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+public class DummyProxyDetector implements Detector
+{
+ @Override
+ public MediaType detect(InputStream input, Metadata metadata) throws IOException {
+ return MediaType.TEXT_PLAIN;
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/test/java/org/apache/tika/parser/DummyProxyParser.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/parser/DummyProxyParser.java b/tika-core/src/test/java/org/apache/tika/parser/DummyProxyParser.java
index ca766c9..4ae7898 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/DummyProxyParser.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/DummyProxyParser.java
@@ -1,44 +1,44 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Set;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-public class DummyProxyParser extends AbstractParser
-{
- @Override
- public Set<MediaType> getSupportedTypes(ParseContext context)
- {
- return null;
- }
-
- @Override
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException
- {
- metadata.add("Test", "value");
-
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class DummyProxyParser extends AbstractParser
+{
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context)
+ {
+ return null;
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException
+ {
+ metadata.add("Test", "value");
+
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-core/src/test/java/org/apache/tika/parser/ParserProxyTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/parser/ParserProxyTest.java b/tika-core/src/test/java/org/apache/tika/parser/ParserProxyTest.java
index 20c6247..9f57965 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/ParserProxyTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/ParserProxyTest.java
@@ -1,65 +1,65 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser;
-
-import static org.junit.Assert.*;
-
-import java.io.IOException;
-
-import org.apache.tika.config.LoadErrorHandler;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.junit.Test;
-import org.xml.sax.SAXException;
-
-public class ParserProxyTest
-{
-
- @Test
- public void testParserProxyExists() throws IOException, SAXException, TikaException
- {
- Parser dummyParser = new ParserProxy("org.apache.tika.parser.DummyProxyParser",
- getClass().getClassLoader(),
- LoadErrorHandler.IGNORE);
-
- Metadata metadata = new Metadata();
-
- dummyParser.parse(null, null, metadata, null);
-
- assertEquals("Parser being proxied exists so metadata should be added",
- 1, metadata.size());
-
- }
-
- @Test
- public void testParserProxyNotExists() throws IOException, SAXException, TikaException
- {
- Parser dummyParser = new ParserProxy("org.apache.tika.parser.NotExists",
- getClass().getClassLoader(),
- LoadErrorHandler.IGNORE);
-
- Metadata metadata = new Metadata();
-
- dummyParser.parse(null, null, metadata, null);
-
- assertEquals("Parser being proxied doesn't exist so metadata not change",
- 0, metadata.size());
-
- }
-
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+
+import org.apache.tika.config.LoadErrorHandler;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Test;
+import org.xml.sax.SAXException;
+
+public class ParserProxyTest
+{
+
+ @Test
+ public void testParserProxyExists() throws IOException, SAXException, TikaException
+ {
+ Parser dummyParser = new ParserProxy("org.apache.tika.parser.DummyProxyParser",
+ getClass().getClassLoader(),
+ LoadErrorHandler.IGNORE);
+
+ Metadata metadata = new Metadata();
+
+ dummyParser.parse(null, null, metadata, null);
+
+ assertEquals("Parser being proxied exists so metadata should be added",
+ 1, metadata.size());
+
+ }
+
+ @Test
+ public void testParserProxyNotExists() throws IOException, SAXException, TikaException
+ {
+ Parser dummyParser = new ParserProxy("org.apache.tika.parser.NotExists",
+ getClass().getClassLoader(),
+ LoadErrorHandler.IGNORE);
+
+ Metadata metadata = new Metadata();
+
+ dummyParser.parse(null, null, metadata, null);
+
+ assertEquals("Parser being proxied doesn't exist so metadata not change",
+ 0, metadata.size());
+
+ }
+
+
+}
[08/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
index 869facf..be6455f 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
@@ -1,139 +1,139 @@
-/*
- *******************************************************************************
- * Copyright (C) 1996-2007, International Business Machines Corporation and *
- * others. All Rights Reserved. *
- *******************************************************************************
- *
- */
-package org.apache.tika.parser.txt;
-
-/**
- * This class matches UTF-16 and UTF-32, both big- and little-endian. The
- * BOM will be used if it is present.
- *
- * @internal
- */
-abstract class CharsetRecog_Unicode extends CharsetRecognizer {
-
- /* (non-Javadoc)
- * @see com.ibm.icu.text.CharsetRecognizer#getName()
- */
- abstract String getName();
-
- /* (non-Javadoc)
- * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
- */
- abstract int match(CharsetDetector det);
-
- static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode {
- String getName() {
- return "UTF-16BE";
- }
-
- int match(CharsetDetector det) {
- byte[] input = det.fRawInput;
-
- if (input.length >= 2 && ((input[0] & 0xFF) == 0xFE && (input[1] & 0xFF) == 0xFF)) {
- return 100;
- }
-
- // TODO: Do some statistics to check for unsigned UTF-16BE
- return 0;
- }
- }
-
- static class CharsetRecog_UTF_16_LE extends CharsetRecog_Unicode {
- String getName() {
- return "UTF-16LE";
- }
-
- int match(CharsetDetector det) {
- byte[] input = det.fRawInput;
-
- if (input.length >= 2 && ((input[0] & 0xFF) == 0xFF && (input[1] & 0xFF) == 0xFE)) {
- // An LE BOM is present.
- if (input.length >= 4 && input[2] == 0x00 && input[3] == 0x00) {
- // It is probably UTF-32 LE, not UTF-16
- return 0;
- }
- return 100;
- }
-
- // TODO: Do some statistics to check for unsigned UTF-16LE
- return 0;
- }
- }
-
- static abstract class CharsetRecog_UTF_32 extends CharsetRecog_Unicode {
- abstract int getChar(byte[] input, int index);
-
- abstract String getName();
-
- int match(CharsetDetector det) {
- byte[] input = det.fRawInput;
- int limit = (det.fRawLength / 4) * 4;
- int numValid = 0;
- int numInvalid = 0;
- boolean hasBOM = false;
- int confidence = 0;
-
- if (limit == 0) {
- return 0;
- }
- if (getChar(input, 0) == 0x0000FEFF) {
- hasBOM = true;
- }
-
- for (int i = 0; i < limit; i += 4) {
- int ch = getChar(input, i);
-
- if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) {
- numInvalid += 1;
- } else {
- numValid += 1;
- }
- }
-
-
- // Cook up some sort of confidence score, based on presence of a BOM
- // and the existence of valid and/or invalid multi-byte sequences.
- if (hasBOM && numInvalid == 0) {
- confidence = 100;
- } else if (hasBOM && numValid > numInvalid * 10) {
- confidence = 80;
- } else if (numValid > 3 && numInvalid == 0) {
- confidence = 100;
- } else if (numValid > 0 && numInvalid == 0) {
- confidence = 80;
- } else if (numValid > numInvalid * 10) {
- // Probably corrupt UTF-32BE data. Valid sequences aren't likely by chance.
- confidence = 25;
- }
-
- return confidence;
- }
- }
-
- static class CharsetRecog_UTF_32_BE extends CharsetRecog_UTF_32 {
- int getChar(byte[] input, int index) {
- return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 |
- (input[index + 2] & 0xFF) << 8 | (input[index + 3] & 0xFF);
- }
-
- String getName() {
- return "UTF-32BE";
- }
- }
-
-
- static class CharsetRecog_UTF_32_LE extends CharsetRecog_UTF_32 {
- int getChar(byte[] input, int index) {
- return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 |
- (input[index + 1] & 0xFF) << 8 | (input[index + 0] & 0xFF);
- }
-
- String getName() {
- return "UTF-32LE";
- }
- }
-}
+/*
+ *******************************************************************************
+ * Copyright (C) 1996-2007, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
+ *******************************************************************************
+ *
+ */
+package org.apache.tika.parser.txt;
+
+/**
+ * This class matches UTF-16 and UTF-32, both big- and little-endian. The
+ * BOM will be used if it is present.
+ *
+ * @internal
+ */
+abstract class CharsetRecog_Unicode extends CharsetRecognizer {
+
+ /* (non-Javadoc)
+ * @see com.ibm.icu.text.CharsetRecognizer#getName()
+ */
+ abstract String getName();
+
+ /* (non-Javadoc)
+ * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
+ */
+ abstract int match(CharsetDetector det);
+
+ static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode {
+ String getName() {
+ return "UTF-16BE";
+ }
+
+ int match(CharsetDetector det) {
+ byte[] input = det.fRawInput;
+
+ if (input.length >= 2 && ((input[0] & 0xFF) == 0xFE && (input[1] & 0xFF) == 0xFF)) {
+ return 100;
+ }
+
+ // TODO: Do some statistics to check for unsigned UTF-16BE
+ return 0;
+ }
+ }
+
+ static class CharsetRecog_UTF_16_LE extends CharsetRecog_Unicode {
+ String getName() {
+ return "UTF-16LE";
+ }
+
+ int match(CharsetDetector det) {
+ byte[] input = det.fRawInput;
+
+ if (input.length >= 2 && ((input[0] & 0xFF) == 0xFF && (input[1] & 0xFF) == 0xFE)) {
+ // An LE BOM is present.
+ if (input.length >= 4 && input[2] == 0x00 && input[3] == 0x00) {
+ // It is probably UTF-32 LE, not UTF-16
+ return 0;
+ }
+ return 100;
+ }
+
+ // TODO: Do some statistics to check for unsigned UTF-16LE
+ return 0;
+ }
+ }
+
+ static abstract class CharsetRecog_UTF_32 extends CharsetRecog_Unicode {
+ abstract int getChar(byte[] input, int index);
+
+ abstract String getName();
+
+ int match(CharsetDetector det) {
+ byte[] input = det.fRawInput;
+ int limit = (det.fRawLength / 4) * 4;
+ int numValid = 0;
+ int numInvalid = 0;
+ boolean hasBOM = false;
+ int confidence = 0;
+
+ if (limit == 0) {
+ return 0;
+ }
+ if (getChar(input, 0) == 0x0000FEFF) {
+ hasBOM = true;
+ }
+
+ for (int i = 0; i < limit; i += 4) {
+ int ch = getChar(input, i);
+
+ if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) {
+ numInvalid += 1;
+ } else {
+ numValid += 1;
+ }
+ }
+
+
+ // Cook up some sort of confidence score, based on presence of a BOM
+ // and the existence of valid and/or invalid multi-byte sequences.
+ if (hasBOM && numInvalid == 0) {
+ confidence = 100;
+ } else if (hasBOM && numValid > numInvalid * 10) {
+ confidence = 80;
+ } else if (numValid > 3 && numInvalid == 0) {
+ confidence = 100;
+ } else if (numValid > 0 && numInvalid == 0) {
+ confidence = 80;
+ } else if (numValid > numInvalid * 10) {
+ // Probably corrupt UTF-32BE data. Valid sequences aren't likely by chance.
+ confidence = 25;
+ }
+
+ return confidence;
+ }
+ }
+
+ static class CharsetRecog_UTF_32_BE extends CharsetRecog_UTF_32 {
+ int getChar(byte[] input, int index) {
+ return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 |
+ (input[index + 2] & 0xFF) << 8 | (input[index + 3] & 0xFF);
+ }
+
+ String getName() {
+ return "UTF-32BE";
+ }
+ }
+
+
+ static class CharsetRecog_UTF_32_LE extends CharsetRecog_UTF_32 {
+ int getChar(byte[] input, int index) {
+ return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 |
+ (input[index + 1] & 0xFF) << 8 | (input[index + 0] & 0xFF);
+ }
+
+ String getName() {
+ return "UTF-32LE";
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
index 1c63f9e..35d2b4f 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
@@ -1,532 +1,532 @@
-/*
- ****************************************************************************
- * Copyright (C) 2005-2008, International Business Machines Corporation and *
- * others. All Rights Reserved. *
- ****************************************************************************
- *
- */
-package org.apache.tika.parser.txt;
-
-import java.util.Arrays;
-
-/**
- * CharsetRecognizer implemenation for Asian - double or multi-byte - charsets.
- * Match is determined mostly by the input data adhering to the
- * encoding scheme for the charset, and, optionally,
- * frequency-of-occurence of characters.
- * <p/>
- * Instances of this class are singletons, one per encoding
- * being recognized. They are created in the main
- * CharsetDetector class and kept in the global list of available
- * encodings to be checked. The specific encoding being recognized
- * is determined by subclass.
- *
- * @internal
- */
-abstract class CharsetRecog_mbcs extends CharsetRecognizer {
-
- /**
- * Get the IANA name of this charset.
- *
- * @return the charset name.
- */
- abstract String getName();
-
-
- /**
- * Test the match of this charset with the input text data
- * which is obtained via the CharsetDetector object.
- *
- * @param det The CharsetDetector, which contains the input text
- * to be checked for being in this charset.
- * @return Two values packed into one int (Damn java, anyhow)
- * <br/>
- * bits 0-7: the match confidence, ranging from 0-100
- * <br/>
- * bits 8-15: The match reason, an enum-like value.
- */
- int match(CharsetDetector det, int[] commonChars) {
- int singleByteCharCount = 0;
- int doubleByteCharCount = 0;
- int commonCharCount = 0;
- int badCharCount = 0;
- int totalCharCount = 0;
- int confidence = 0;
- iteratedChar iter = new iteratedChar();
-
- detectBlock:
- {
- for (iter.reset(); nextChar(iter, det); ) {
- totalCharCount++;
- if (iter.error) {
- badCharCount++;
- } else {
- long cv = iter.charValue & 0xFFFFFFFFL;
-
- if (cv <= 0xff) {
- singleByteCharCount++;
- } else {
- doubleByteCharCount++;
- if (commonChars != null) {
- // NOTE: This assumes that there are no 4-byte common chars.
- if (Arrays.binarySearch(commonChars, (int) cv) >= 0) {
- commonCharCount++;
- }
- }
- }
- }
- if (badCharCount >= 2 && badCharCount * 5 >= doubleByteCharCount) {
- // Bail out early if the byte data is not matching the encoding scheme.
- break detectBlock;
- }
- }
-
- if (doubleByteCharCount <= 10 && badCharCount == 0) {
- // Not many multi-byte chars.
- if (doubleByteCharCount == 0 && totalCharCount < 10) {
- // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
- // We don't have enough data to have any confidence.
- // Statistical analysis of single byte non-ASCII charcters would probably help here.
- confidence = 0;
- } else {
- // ASCII or ISO file? It's probably not our encoding,
- // but is not incompatible with our encoding, so don't give it a zero.
- confidence = 10;
- }
-
- break detectBlock;
- }
-
- //
- // No match if there are too many characters that don't fit the encoding scheme.
- // (should we have zero tolerance for these?)
- //
- if (doubleByteCharCount < 20 * badCharCount) {
- confidence = 0;
- break detectBlock;
- }
-
- if (commonChars == null) {
- // We have no statistics on frequently occuring characters.
- // Assess confidence purely on having a reasonable number of
- // multi-byte characters (the more the better
- confidence = 30 + doubleByteCharCount - 20 * badCharCount;
- if (confidence > 100) {
- confidence = 100;
- }
- } else {
- //
- // Frequency of occurence statistics exist.
- //
- double maxVal = Math.log((float) doubleByteCharCount / 4);
- double scaleFactor = 90.0 / maxVal;
- confidence = (int) (Math.log(commonCharCount + 1) * scaleFactor + 10);
- confidence = Math.min(confidence, 100);
- }
- } // end of detectBlock:
-
- return confidence;
- }
-
- /**
- * Get the next character (however many bytes it is) from the input data
- * Subclasses for specific charset encodings must implement this function
- * to get characters according to the rules of their encoding scheme.
- * <p/>
- * This function is not a method of class iteratedChar only because
- * that would require a lot of extra derived classes, which is awkward.
- *
- * @param it The iteratedChar "struct" into which the returned char is placed.
- * @param det The charset detector, which is needed to get at the input byte data
- * being iterated over.
- * @return True if a character was returned, false at end of input.
- */
- abstract boolean nextChar(iteratedChar it, CharsetDetector det);
-
- // "Character" iterated character class.
- // Recognizers for specific mbcs encodings make their "characters" available
- // by providing a nextChar() function that fills in an instance of iteratedChar
- // with the next char from the input.
- // The returned characters are not converted to Unicode, but remain as the raw
- // bytes (concatenated into an int) from the codepage data.
- //
- // For Asian charsets, use the raw input rather than the input that has been
- // stripped of markup. Detection only considers multi-byte chars, effectively
- // stripping markup anyway, and double byte chars do occur in markup too.
- //
- static class iteratedChar {
- int charValue = 0; // 1-4 bytes from the raw input data
- int index = 0;
- int nextIndex = 0;
- boolean error = false;
- boolean done = false;
-
- void reset() {
- charValue = 0;
- index = -1;
- nextIndex = 0;
- error = false;
- done = false;
- }
-
- int nextByte(CharsetDetector det) {
- if (nextIndex >= det.fRawLength) {
- done = true;
- return -1;
- }
- int byteValue = (int) det.fRawInput[nextIndex++] & 0x00ff;
- return byteValue;
- }
- }
-
- /**
- * Shift-JIS charset recognizer.
- */
- static class CharsetRecog_sjis extends CharsetRecog_mbcs {
- static int[] commonChars =
- // TODO: This set of data comes from the character frequency-
- // of-occurence analysis tool. The data needs to be moved
- // into a resource and loaded from there.
- {0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
- 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
- 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
- 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
- 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
- 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
-
- boolean nextChar(iteratedChar it, CharsetDetector det) {
- it.index = it.nextIndex;
- it.error = false;
- int firstByte;
- firstByte = it.charValue = it.nextByte(det);
- if (firstByte < 0) {
- return false;
- }
-
- if (firstByte <= 0x7f || (firstByte > 0xa0 && firstByte <= 0xdf)) {
- return true;
- }
-
- int secondByte = it.nextByte(det);
- if (secondByte < 0) {
- return false;
- }
- it.charValue = (firstByte << 8) | secondByte;
- if (!((secondByte >= 0x40 && secondByte <= 0x7f) || (secondByte >= 0x80 && secondByte <= 0xff))) {
- // Illegal second byte value.
- it.error = true;
- }
- return true;
- }
-
- int match(CharsetDetector det) {
- return match(det, commonChars);
- }
-
- String getName() {
- return "Shift_JIS";
- }
-
- public String getLanguage() {
- return "ja";
- }
-
-
- }
-
-
- /**
- * Big5 charset recognizer.
- */
- static class CharsetRecog_big5 extends CharsetRecog_mbcs {
- static int[] commonChars =
- // TODO: This set of data comes from the character frequency-
- // of-occurence analysis tool. The data needs to be moved
- // into a resource and loaded from there.
- {0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
- 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
- 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
- 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
- 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
- 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
- 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
- 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
- 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
- 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
-
- boolean nextChar(iteratedChar it, CharsetDetector det) {
- it.index = it.nextIndex;
- it.error = false;
- int firstByte;
- firstByte = it.charValue = it.nextByte(det);
- if (firstByte < 0) {
- return false;
- }
-
- if (firstByte <= 0x7f || firstByte == 0xff) {
- // single byte character.
- return true;
- }
-
- int secondByte = it.nextByte(det);
- if (secondByte < 0) {
- return false;
- }
- it.charValue = (it.charValue << 8) | secondByte;
-
- if (secondByte < 0x40 ||
- secondByte == 0x7f ||
- secondByte == 0xff) {
- it.error = true;
- }
- return true;
- }
-
- int match(CharsetDetector det) {
- return match(det, commonChars);
- }
-
- String getName() {
- return "Big5";
- }
-
-
- public String getLanguage() {
- return "zh";
- }
- }
-
-
- /**
- * EUC charset recognizers. One abstract class that provides the common function
- * for getting the next character according to the EUC encoding scheme,
- * and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
- */
- abstract static class CharsetRecog_euc extends CharsetRecog_mbcs {
-
- /*
- * (non-Javadoc)
- * Get the next character value for EUC based encodings.
- * Character "value" is simply the raw bytes that make up the character
- * packed into an int.
- */
- boolean nextChar(iteratedChar it, CharsetDetector det) {
- it.index = it.nextIndex;
- it.error = false;
- int firstByte = 0;
- int secondByte = 0;
- int thirdByte = 0;
- //int fourthByte = 0;
-
- buildChar:
- {
- firstByte = it.charValue = it.nextByte(det);
- if (firstByte < 0) {
- // Ran off the end of the input data
- it.done = true;
- break buildChar;
- }
- if (firstByte <= 0x8d) {
- // single byte char
- break buildChar;
- }
-
- secondByte = it.nextByte(det);
- it.charValue = (it.charValue << 8) | secondByte;
-
- if (firstByte >= 0xA1 && firstByte <= 0xfe) {
- // Two byte Char
- if (secondByte < 0xa1) {
- it.error = true;
- }
- break buildChar;
- }
- if (firstByte == 0x8e) {
- // Code Set 2.
- // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
- // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
- // We don't know which we've got.
- // Treat it like EUC-JP. If the data really was EUC-TW, the following two
- // bytes will look like a well formed 2 byte char.
- if (secondByte < 0xa1) {
- it.error = true;
- }
- break buildChar;
- }
-
- if (firstByte == 0x8f) {
- // Code set 3.
- // Three byte total char size, two bytes of actual char value.
- thirdByte = it.nextByte(det);
- it.charValue = (it.charValue << 8) | thirdByte;
- if (thirdByte < 0xa1) {
- it.error = true;
- }
- }
- }
-
- return (it.done == false);
- }
-
- /**
- * The charset recognize for EUC-JP. A singleton instance of this class
- * is created and kept by the public CharsetDetector class
- */
- static class CharsetRecog_euc_jp extends CharsetRecog_euc {
- static int[] commonChars =
- // TODO: This set of data comes from the character frequency-
- // of-occurence analysis tool. The data needs to be moved
- // into a resource and loaded from there.
- {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
- 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
- 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
- 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
- 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
- 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
- 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
- 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
- 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
- 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
-
- String getName() {
- return "EUC-JP";
- }
-
- int match(CharsetDetector det) {
- return match(det, commonChars);
- }
-
- public String getLanguage() {
- return "ja";
- }
- }
-
- /**
- * The charset recognize for EUC-KR. A singleton instance of this class
- * is created and kept by the public CharsetDetector class
- */
- static class CharsetRecog_euc_kr extends CharsetRecog_euc {
- static int[] commonChars =
- // TODO: This set of data comes from the character frequency-
- // of-occurence analysis tool. The data needs to be moved
- // into a resource and loaded from there.
- {0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
- 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
- 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
- 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
- 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
- 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
- 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
- 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
- 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
- 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
-
- String getName() {
- return "EUC-KR";
- }
-
- int match(CharsetDetector det) {
- return match(det, commonChars);
- }
-
- public String getLanguage() {
- return "ko";
- }
- }
- }
-
- /**
- * GB-18030 recognizer. Uses simplified Chinese statistics.
- */
- static class CharsetRecog_gb_18030 extends CharsetRecog_mbcs {
-
- static int[] commonChars =
- // TODO: This set of data comes from the character frequency-
- // of-occurence analysis tool. The data needs to be moved
- // into a resource and loaded from there.
- {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
- 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
- 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
- 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
- 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
- 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
- 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
- 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
- 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
- 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
-
- /*
- * (non-Javadoc)
- * Get the next character value for EUC based encodings.
- * Character "value" is simply the raw bytes that make up the character
- * packed into an int.
- */
- boolean nextChar(iteratedChar it, CharsetDetector det) {
- it.index = it.nextIndex;
- it.error = false;
- int firstByte = 0;
- int secondByte = 0;
- int thirdByte = 0;
- int fourthByte = 0;
-
- buildChar:
- {
- firstByte = it.charValue = it.nextByte(det);
-
- if (firstByte < 0) {
- // Ran off the end of the input data
- it.done = true;
- break buildChar;
- }
-
- if (firstByte <= 0x80) {
- // single byte char
- break buildChar;
- }
-
- secondByte = it.nextByte(det);
- it.charValue = (it.charValue << 8) | secondByte;
-
- if (firstByte >= 0x81 && firstByte <= 0xFE) {
- // Two byte Char
- if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >= 80 && secondByte <= 0xFE)) {
- break buildChar;
- }
-
- // Four byte char
- if (secondByte >= 0x30 && secondByte <= 0x39) {
- thirdByte = it.nextByte(det);
-
- if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
- fourthByte = it.nextByte(det);
-
- if (fourthByte >= 0x30 && fourthByte <= 0x39) {
- it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte;
- break buildChar;
- }
- }
- }
-
- it.error = true;
- break buildChar;
- }
- }
-
- return (it.done == false);
- }
-
- String getName() {
- return "GB18030";
- }
-
- int match(CharsetDetector det) {
- return match(det, commonChars);
- }
-
- public String getLanguage() {
- return "zh";
- }
- }
-
-
-}
+/*
+ ****************************************************************************
+ * Copyright (C) 2005-2008, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
+ ****************************************************************************
+ *
+ */
+package org.apache.tika.parser.txt;
+
+import java.util.Arrays;
+
+/**
+ * CharsetRecognizer implemenation for Asian - double or multi-byte - charsets.
+ * Match is determined mostly by the input data adhering to the
+ * encoding scheme for the charset, and, optionally,
+ * frequency-of-occurence of characters.
+ * <p/>
+ * Instances of this class are singletons, one per encoding
+ * being recognized. They are created in the main
+ * CharsetDetector class and kept in the global list of available
+ * encodings to be checked. The specific encoding being recognized
+ * is determined by subclass.
+ *
+ * @internal
+ */
+abstract class CharsetRecog_mbcs extends CharsetRecognizer {
+
+ /**
+ * Get the IANA name of this charset.
+ *
+ * @return the charset name.
+ */
+ abstract String getName();
+
+
+ /**
+ * Test the match of this charset with the input text data
+ * which is obtained via the CharsetDetector object.
+ *
+ * @param det The CharsetDetector, which contains the input text
+ * to be checked for being in this charset.
+ * @return Two values packed into one int (Damn java, anyhow)
+ * <br/>
+ * bits 0-7: the match confidence, ranging from 0-100
+ * <br/>
+ * bits 8-15: The match reason, an enum-like value.
+ */
+ int match(CharsetDetector det, int[] commonChars) {
+ int singleByteCharCount = 0;
+ int doubleByteCharCount = 0;
+ int commonCharCount = 0;
+ int badCharCount = 0;
+ int totalCharCount = 0;
+ int confidence = 0;
+ iteratedChar iter = new iteratedChar();
+
+ detectBlock:
+ {
+ for (iter.reset(); nextChar(iter, det); ) {
+ totalCharCount++;
+ if (iter.error) {
+ badCharCount++;
+ } else {
+ long cv = iter.charValue & 0xFFFFFFFFL;
+
+ if (cv <= 0xff) {
+ singleByteCharCount++;
+ } else {
+ doubleByteCharCount++;
+ if (commonChars != null) {
+ // NOTE: This assumes that there are no 4-byte common chars.
+ if (Arrays.binarySearch(commonChars, (int) cv) >= 0) {
+ commonCharCount++;
+ }
+ }
+ }
+ }
+ if (badCharCount >= 2 && badCharCount * 5 >= doubleByteCharCount) {
+ // Bail out early if the byte data is not matching the encoding scheme.
+ break detectBlock;
+ }
+ }
+
+ if (doubleByteCharCount <= 10 && badCharCount == 0) {
+ // Not many multi-byte chars.
+ if (doubleByteCharCount == 0 && totalCharCount < 10) {
+ // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
+ // We don't have enough data to have any confidence.
+ // Statistical analysis of single byte non-ASCII charcters would probably help here.
+ confidence = 0;
+ } else {
+ // ASCII or ISO file? It's probably not our encoding,
+ // but is not incompatible with our encoding, so don't give it a zero.
+ confidence = 10;
+ }
+
+ break detectBlock;
+ }
+
+ //
+ // No match if there are too many characters that don't fit the encoding scheme.
+ // (should we have zero tolerance for these?)
+ //
+ if (doubleByteCharCount < 20 * badCharCount) {
+ confidence = 0;
+ break detectBlock;
+ }
+
+ if (commonChars == null) {
+ // We have no statistics on frequently occuring characters.
+ // Assess confidence purely on having a reasonable number of
+ // multi-byte characters (the more the better
+ confidence = 30 + doubleByteCharCount - 20 * badCharCount;
+ if (confidence > 100) {
+ confidence = 100;
+ }
+ } else {
+ //
+ // Frequency of occurence statistics exist.
+ //
+ double maxVal = Math.log((float) doubleByteCharCount / 4);
+ double scaleFactor = 90.0 / maxVal;
+ confidence = (int) (Math.log(commonCharCount + 1) * scaleFactor + 10);
+ confidence = Math.min(confidence, 100);
+ }
+ } // end of detectBlock:
+
+ return confidence;
+ }
+
+ /**
+ * Get the next character (however many bytes it is) from the input data
+ * Subclasses for specific charset encodings must implement this function
+ * to get characters according to the rules of their encoding scheme.
+ * <p/>
+ * This function is not a method of class iteratedChar only because
+ * that would require a lot of extra derived classes, which is awkward.
+ *
+ * @param it The iteratedChar "struct" into which the returned char is placed.
+ * @param det The charset detector, which is needed to get at the input byte data
+ * being iterated over.
+ * @return True if a character was returned, false at end of input.
+ */
+ abstract boolean nextChar(iteratedChar it, CharsetDetector det);
+
+ // "Character" iterated character class.
+ // Recognizers for specific mbcs encodings make their "characters" available
+ // by providing a nextChar() function that fills in an instance of iteratedChar
+ // with the next char from the input.
+ // The returned characters are not converted to Unicode, but remain as the raw
+ // bytes (concatenated into an int) from the codepage data.
+ //
+ // For Asian charsets, use the raw input rather than the input that has been
+ // stripped of markup. Detection only considers multi-byte chars, effectively
+ // stripping markup anyway, and double byte chars do occur in markup too.
+ //
+ static class iteratedChar {
+ int charValue = 0; // 1-4 bytes from the raw input data
+ int index = 0;
+ int nextIndex = 0;
+ boolean error = false;
+ boolean done = false;
+
+ void reset() {
+ charValue = 0;
+ index = -1;
+ nextIndex = 0;
+ error = false;
+ done = false;
+ }
+
+ int nextByte(CharsetDetector det) {
+ if (nextIndex >= det.fRawLength) {
+ done = true;
+ return -1;
+ }
+ int byteValue = (int) det.fRawInput[nextIndex++] & 0x00ff;
+ return byteValue;
+ }
+ }
+
+ /**
+ * Shift-JIS charset recognizer.
+ */
+ static class CharsetRecog_sjis extends CharsetRecog_mbcs {
+ static int[] commonChars =
+ // TODO: This set of data comes from the character frequency-
+ // of-occurence analysis tool. The data needs to be moved
+ // into a resource and loaded from there.
+ {0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
+ 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
+ 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
+ 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
+ 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
+ 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
+
+ boolean nextChar(iteratedChar it, CharsetDetector det) {
+ it.index = it.nextIndex;
+ it.error = false;
+ int firstByte;
+ firstByte = it.charValue = it.nextByte(det);
+ if (firstByte < 0) {
+ return false;
+ }
+
+ if (firstByte <= 0x7f || (firstByte > 0xa0 && firstByte <= 0xdf)) {
+ return true;
+ }
+
+ int secondByte = it.nextByte(det);
+ if (secondByte < 0) {
+ return false;
+ }
+ it.charValue = (firstByte << 8) | secondByte;
+ if (!((secondByte >= 0x40 && secondByte <= 0x7f) || (secondByte >= 0x80 && secondByte <= 0xff))) {
+ // Illegal second byte value.
+ it.error = true;
+ }
+ return true;
+ }
+
+ int match(CharsetDetector det) {
+ return match(det, commonChars);
+ }
+
+ String getName() {
+ return "Shift_JIS";
+ }
+
+ public String getLanguage() {
+ return "ja";
+ }
+
+
+ }
+
+
+ /**
+ * Big5 charset recognizer.
+ */
+ static class CharsetRecog_big5 extends CharsetRecog_mbcs {
+ static int[] commonChars =
+ // TODO: This set of data comes from the character frequency-
+ // of-occurence analysis tool. The data needs to be moved
+ // into a resource and loaded from there.
+ {0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
+ 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
+ 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
+ 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
+ 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
+ 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
+ 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
+ 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
+ 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
+ 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
+
+ boolean nextChar(iteratedChar it, CharsetDetector det) {
+ it.index = it.nextIndex;
+ it.error = false;
+ int firstByte;
+ firstByte = it.charValue = it.nextByte(det);
+ if (firstByte < 0) {
+ return false;
+ }
+
+ if (firstByte <= 0x7f || firstByte == 0xff) {
+ // single byte character.
+ return true;
+ }
+
+ int secondByte = it.nextByte(det);
+ if (secondByte < 0) {
+ return false;
+ }
+ it.charValue = (it.charValue << 8) | secondByte;
+
+ if (secondByte < 0x40 ||
+ secondByte == 0x7f ||
+ secondByte == 0xff) {
+ it.error = true;
+ }
+ return true;
+ }
+
+ int match(CharsetDetector det) {
+ return match(det, commonChars);
+ }
+
+ String getName() {
+ return "Big5";
+ }
+
+
+ public String getLanguage() {
+ return "zh";
+ }
+ }
+
+
+ /**
+ * EUC charset recognizers. One abstract class that provides the common function
+ * for getting the next character according to the EUC encoding scheme,
+ * and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
+ */
+ abstract static class CharsetRecog_euc extends CharsetRecog_mbcs {
+
+ /*
+ * (non-Javadoc)
+ * Get the next character value for EUC based encodings.
+ * Character "value" is simply the raw bytes that make up the character
+ * packed into an int.
+ */
+ // Reads one EUC character into 'it': records the start index, clears the error
+ // flag, then consumes 1 to 3 bytes depending on the lead byte. Returns false
+ // once it.done has been set, true otherwise.
+ boolean nextChar(iteratedChar it, CharsetDetector det) {
+ it.index = it.nextIndex;
+ it.error = false;
+ int firstByte = 0;
+ int secondByte = 0;
+ int thirdByte = 0;
+ //int fourthByte = 0;
+
+ // Labeled block: each 'break buildChar' jumps straight to the return below.
+ buildChar:
+ {
+ firstByte = it.charValue = it.nextByte(det);
+ if (firstByte < 0) {
+ // Ran off the end of the input data
+ it.done = true;
+ break buildChar;
+ }
+ if (firstByte <= 0x8d) {
+ // single byte char
+ break buildChar;
+ }
+
+ // Every lead byte above 0x8d consumes a second byte, which is packed below the first.
+ // NOTE(review): secondByte is not checked for a negative (end of input) value here;
+ // presumably nextByte() itself marks it.done in that case - confirm.
+ secondByte = it.nextByte(det);
+ it.charValue = (it.charValue << 8) | secondByte;
+
+ if (firstByte >= 0xA1 && firstByte <= 0xfe) {
+ // Two byte Char
+ if (secondByte < 0xa1) {
+ it.error = true;
+ }
+ break buildChar;
+ }
+ if (firstByte == 0x8e) {
+ // Code Set 2.
+ // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
+ // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
+ // We don't know which we've got.
+ // Treat it like EUC-JP. If the data really was EUC-TW, the following two
+ // bytes will look like a well formed 2 byte char.
+ if (secondByte < 0xa1) {
+ it.error = true;
+ }
+ break buildChar;
+ }
+
+ if (firstByte == 0x8f) {
+ // Code set 3.
+ // Three byte total char size, two bytes of actual char value.
+ thirdByte = it.nextByte(det);
+ it.charValue = (it.charValue << 8) | thirdByte;
+ if (thirdByte < 0xa1) {
+ it.error = true;
+ }
+ }
+ // Lead bytes 0x90..0xa0 and 0xff match none of the branches above: two bytes
+ // have been consumed and no error is flagged for them.
+ }
+
+ return (it.done == false);
+ }
+
+ /**
+ * The charset recognizer for EUC-JP. A singleton instance of this class
+ * is created and kept by the public CharsetDetector class
+ */
+ static class CharsetRecog_euc_jp extends CharsetRecog_euc {
+ static int[] commonChars =
+ // TODO: This set of data comes from the character frequency-
+ // of-occurrence analysis tool. The data needs to be moved
+ // into a resource and loaded from there.
+ {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
+ 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
+ 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
+ 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
+ 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
+ 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
+ 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
+ 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
+ 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
+ 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
+
+ String getName() {
+ return "EUC-JP";
+ }
+
+ int match(CharsetDetector det) {
+ return match(det, commonChars);
+ }
+
+ public String getLanguage() {
+ return "ja";
+ }
+ }
+
+ /**
+ * The charset recognizer for EUC-KR. A singleton instance of this class
+ * is created and kept by the public CharsetDetector class
+ */
+ static class CharsetRecog_euc_kr extends CharsetRecog_euc {
+ static int[] commonChars =
+ // TODO: This set of data comes from the character frequency-
+ // of-occurrence analysis tool. The data needs to be moved
+ // into a resource and loaded from there.
+ {0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
+ 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
+ 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
+ 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
+ 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
+ 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
+ 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
+ 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
+ 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
+ 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
+
+ String getName() {
+ return "EUC-KR";
+ }
+
+ int match(CharsetDetector det) {
+ return match(det, commonChars);
+ }
+
+ public String getLanguage() {
+ return "ko";
+ }
+ }
+ }
+
+ /**
+ * GB-18030 recognizer. Uses simplified Chinese statistics.
+ */
+ static class CharsetRecog_gb_18030 extends CharsetRecog_mbcs {
+
+ static int[] commonChars =
+ // TODO: This set of data comes from the character frequency-
+ // of-occurrence analysis tool. The data needs to be moved
+ // into a resource and loaded from there.
+ {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
+ 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
+ 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
+ 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
+ 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
+ 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
+ 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
+ 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
+ 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
+ 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
+
+ /**
+  * Gets the next character value for the GB18030 encoding.
+  * Character "value" is simply the raw bytes that make up the character
+  * packed into an int. A character is one, two or four bytes long:
+  * <ul>
+  * <li>lead byte up to 0x80: single byte;</li>
+  * <li>lead byte 0x81..0xFE with trail byte 0x40..0x7E or 0x80..0xFE: two bytes;</li>
+  * <li>lead byte 0x81..0xFE, second byte 0x30..0x39, third byte 0x81..0xFE,
+  *     fourth byte 0x30..0x39: four bytes.</li>
+  * </ul>
+  * Any other sequence starting with a lead byte in 0x81..0xFE sets {@code it.error}.
+  *
+  * @param it  iterator state; its index, charValue, error and done fields are updated
+  * @param det detector whose input bytes are read through {@code it.nextByte(det)}
+  * @return {@code false} once {@code it.done} has been set, {@code true} otherwise
+  */
+ boolean nextChar(iteratedChar it, CharsetDetector det) {
+     it.index = it.nextIndex;
+     it.error = false;
+
+     buildChar:
+     {
+         int firstByte = it.charValue = it.nextByte(det);
+
+         if (firstByte < 0) {
+             // Ran off the end of the input data
+             it.done = true;
+             break buildChar;
+         }
+
+         if (firstByte <= 0x80) {
+             // single byte char
+             break buildChar;
+         }
+
+         int secondByte = it.nextByte(det);
+         it.charValue = (it.charValue << 8) | secondByte;
+
+         if (firstByte >= 0x81 && firstByte <= 0xFE) {
+             // Two byte char. The upper trail range starts at 0x80 (hex); the former
+             // decimal literal 80 (== 0x50) made the test accept the illegal trail byte 0x7F.
+             if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >= 0x80 && secondByte <= 0xFE)) {
+                 break buildChar;
+             }
+
+             // Four byte char
+             if (secondByte >= 0x30 && secondByte <= 0x39) {
+                 int thirdByte = it.nextByte(det);
+
+                 if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
+                     int fourthByte = it.nextByte(det);
+
+                     if (fourthByte >= 0x30 && fourthByte <= 0x39) {
+                         // charValue already holds the first two bytes; append the last two.
+                         it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte;
+                         break buildChar;
+                     }
+                 }
+             }
+
+             // Neither a legal two byte nor a legal four byte sequence.
+             it.error = true;
+             break buildChar;
+         }
+     }
+
+     return (it.done == false);
+ }
+
+ String getName() {
+ return "GB18030";
+ }
+
+ int match(CharsetDetector det) {
+ return match(det, commonChars);
+ }
+
+ public String getLanguage() {
+ return "zh";
+ }
+ }
+
+
+}
[21/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opc/OPCDetector.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opc/OPCDetector.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opc/OPCDetector.java
index cc17459..a8fe200 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opc/OPCDetector.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opc/OPCDetector.java
@@ -1,155 +1,155 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.opc;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Locale;
-import java.util.regex.Pattern;
-
-import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
-import org.apache.poi.openxml4j.opc.OPCPackage;
-import org.apache.poi.openxml4j.opc.PackageAccess;
-import org.apache.poi.openxml4j.opc.PackagePart;
-import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
-import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-
-/**
- * Detector that detects OPC Packages
- *
- */
-public class OPCDetector implements Detector {
-
- /**
- *
- */
- private static final long serialVersionUID = -3569622763024617244L;
-
- private static final Pattern MACRO_TEMPLATE_PATTERN = Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE);
-
- // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes
- private static final String VISIO_DOCUMENT =
- "http://schemas.microsoft.com/visio/2010/relationships/document";
-
- // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes
- private static final String STRICT_CORE_DOCUMENT =
- "http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument";
-
- @Override
- public MediaType detect(InputStream input, Metadata metadata) throws IOException {
- TemporaryResources tmp = new TemporaryResources();
- try {
- TikaInputStream stream = TikaInputStream.get(input, tmp);
- // Use POI to open and investigate it for us
- OPCPackage pkg = OPCPackage.open(stream.getFile().getPath(), PackageAccess.READ);
- stream.setOpenContainer(pkg);
-
- // Is at an OOXML format?
- MediaType type = detectOfficeOpenXML(pkg);
- if (type != null) return type;
-
- // Is it XPS format?
- type = detectXPSOPC(pkg);
- if (type != null) return type;
-
- // Is it an AutoCAD format?
- type = detectAutoCADOPC(pkg);
-
- return type;
- } catch (InvalidFormatException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }finally {
- tmp.close();
- }
- return null;
- }
-
- /**
- * Detects the type of an OfficeOpenXML (OOXML) file from
- * opened Package
- */
- public static MediaType detectOfficeOpenXML(OPCPackage pkg) {
- // Check for the normal Office core document
- PackageRelationshipCollection core =
- pkg.getRelationshipsByType(PackageRelationshipTypes.CORE_DOCUMENT);
- // Otherwise check for some other Office core document types
- if (core.size() == 0) {
- core = pkg.getRelationshipsByType(STRICT_CORE_DOCUMENT);
- }
- if (core.size() == 0) {
- core = pkg.getRelationshipsByType(VISIO_DOCUMENT);
- }
-
- // If we didn't find a single core document of any type, skip detection
- if (core.size() != 1) {
- // Invalid OOXML Package received
- return null;
- }
-
- // Get the type of the core document part
- PackagePart corePart = pkg.getPart(core.getRelationship(0));
- String coreType = corePart.getContentType();
-
- // Turn that into the type of the overall document
- String docType = coreType.substring(0, coreType.lastIndexOf('.'));
-
- // The Macro Enabled formats are a little special
- if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabled")) {
- docType = docType.toLowerCase(Locale.ROOT) + ".12";
- }
-
- if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabledtemplate")) {
- docType = MACRO_TEMPLATE_PATTERN.matcher(docType).replaceAll("macroenabled.12");
- }
-
- // Build the MediaType object and return
- return MediaType.parse(docType);
- }
- /**
- * Detects Open XML Paper Specification (XPS)
- */
- private static MediaType detectXPSOPC(OPCPackage pkg) {
- PackageRelationshipCollection xps =
- pkg.getRelationshipsByType("http://schemas.microsoft.com/xps/2005/06/fixedrepresentation");
- if (xps.size() == 1) {
- return MediaType.application("vnd.ms-xpsdocument");
- } else {
- // Non-XPS Package received
- return null;
- }
- }
- /**
- * Detects AutoCAD formats that live in OPC packaging
- */
- private static MediaType detectAutoCADOPC(OPCPackage pkg) {
- PackageRelationshipCollection dwfxSeq =
- pkg.getRelationshipsByType("http://schemas.autodesk.com/dwfx/2007/relationships/documentsequence");
- if (dwfxSeq.size() == 1) {
- return MediaType.parse("model/vnd.dwfx+xps");
- } else {
- // Non-AutoCAD Package received
- return null;
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.opc;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Locale;
+import java.util.regex.Pattern;
+
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackageAccess;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Detector that detects OPC Packages
+ *
+ */
+public class OPCDetector implements Detector {
+
+ /**
+ *
+ */
+ private static final long serialVersionUID = -3569622763024617244L;
+
+ private static final Pattern MACRO_TEMPLATE_PATTERN = Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE);
+
+ // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes
+ private static final String VISIO_DOCUMENT =
+ "http://schemas.microsoft.com/visio/2010/relationships/document";
+
+ // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes
+ private static final String STRICT_CORE_DOCUMENT =
+ "http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument";
+
+ /**
+  * Opens the input as an OPC package with POI and tries, in order, the
+  * OOXML, XPS and AutoCAD (DWFx) checks.
+  *
+  * @param input    stream to inspect; it is spooled to a file through {@link TikaInputStream}
+  * @param metadata not consulted by this detector
+  * @return the first media type recognised, or {@code null} when none of the
+  *         checks match or POI rejects the input as an invalid package
+  * @throws IOException if the stream cannot be read or spooled to a file
+  */
+ @Override
+ public MediaType detect(InputStream input, Metadata metadata) throws IOException {
+     TemporaryResources tmp = new TemporaryResources();
+     try {
+         TikaInputStream stream = TikaInputStream.get(input, tmp);
+         // Use POI to open and investigate it for us
+         OPCPackage pkg = OPCPackage.open(stream.getFile().getPath(), PackageAccess.READ);
+         stream.setOpenContainer(pkg);
+
+         // Is it an OOXML format?
+         MediaType type = detectOfficeOpenXML(pkg);
+         if (type == null) {
+             // Is it XPS format?
+             type = detectXPSOPC(pkg);
+         }
+         if (type == null) {
+             // Is it an AutoCAD format?
+             type = detectAutoCADOPC(pkg);
+         }
+         return type;
+     } catch (InvalidFormatException ignored) {
+         // POI could not read the input as an OPC package, so there is nothing for
+         // this detector to report. Deliberately not printed: a detector is library
+         // code and must not dump stack traces to stderr for unrecognised input.
+         return null;
+     } finally {
+         tmp.close();
+     }
+ }
+
+ /**
+ * Detects the type of an OfficeOpenXML (OOXML) file from
+ * opened Package
+ */
+ public static MediaType detectOfficeOpenXML(OPCPackage pkg) {
+ // Check for the normal Office core document
+ PackageRelationshipCollection core =
+ pkg.getRelationshipsByType(PackageRelationshipTypes.CORE_DOCUMENT);
+ // Otherwise check for some other Office core document types
+ if (core.size() == 0) {
+ core = pkg.getRelationshipsByType(STRICT_CORE_DOCUMENT);
+ }
+ if (core.size() == 0) {
+ core = pkg.getRelationshipsByType(VISIO_DOCUMENT);
+ }
+
+ // If we didn't find a single core document of any type, skip detection
+ if (core.size() != 1) {
+ // Invalid OOXML Package received
+ return null;
+ }
+
+ // Get the type of the core document part
+ PackagePart corePart = pkg.getPart(core.getRelationship(0));
+ String coreType = corePart.getContentType();
+
+ // Turn that into the type of the overall document
+ String docType = coreType.substring(0, coreType.lastIndexOf('.'));
+
+ // The Macro Enabled formats are a little special
+ if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabled")) {
+ docType = docType.toLowerCase(Locale.ROOT) + ".12";
+ }
+
+ if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabledtemplate")) {
+ docType = MACRO_TEMPLATE_PATTERN.matcher(docType).replaceAll("macroenabled.12");
+ }
+
+ // Build the MediaType object and return
+ return MediaType.parse(docType);
+ }
+ /**
+ * Detects Open XML Paper Specification (XPS)
+ */
+ private static MediaType detectXPSOPC(OPCPackage pkg) {
+ PackageRelationshipCollection xps =
+ pkg.getRelationshipsByType("http://schemas.microsoft.com/xps/2005/06/fixedrepresentation");
+ if (xps.size() == 1) {
+ return MediaType.application("vnd.ms-xpsdocument");
+ } else {
+ // Non-XPS Package received
+ return null;
+ }
+ }
+ /**
+ * Detects AutoCAD formats that live in OPC packaging
+ */
+ private static MediaType detectAutoCADOPC(OPCPackage pkg) {
+ PackageRelationshipCollection dwfxSeq =
+ pkg.getRelationshipsByType("http://schemas.autodesk.com/dwfx/2007/relationships/documentsequence");
+ if (dwfxSeq.size() == 1) {
+ return MediaType.parse("model/vnd.dwfx+xps");
+ } else {
+ // Non-AutoCAD Package received
+ return null;
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
index e5beb4b..90f2d2e 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
@@ -1,28 +1,28 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.opendocument;
-
-import org.apache.tika.parser.odf.OpenDocumentParser;
-
-/**
- * OpenOffice parser
- *
- * @deprecated Use the {@link OpenDocumentParser} class instead.
- * This class will be removed in Apache Tika 1.0.
- */
-public class OpenOfficeParser extends OpenDocumentParser {
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.opendocument;
+
+import org.apache.tika.parser.odf.OpenDocumentParser;
+
+/**
+ * OpenOffice parser
+ *
+ * @deprecated Use the {@link OpenDocumentParser} class instead.
+ * This class will be removed in Apache Tika 1.0.
+ */
+public class OpenOfficeParser extends OpenDocumentParser {
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/GroupState.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/GroupState.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/GroupState.java
index eba9d8c..4a9a1d1 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/GroupState.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/GroupState.java
@@ -1,67 +1,67 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.rtf;
-
-import java.nio.charset.Charset;
-
-/* Holds all state associated with current RTF group, ie {
- * ... }. */
-
-class GroupState {
- public int depth;
- public boolean bold;
- public boolean italic;
- // True if we are skipping all text in current group,
- // eg if group leads with a \*:
- public boolean ignore;
- // Default is 1 if no uc control has been seen yet:
- public int ucSkip = 1;
- public int list;
- public int listLevel;
- public Charset fontCharset;
- //in objdata
- public boolean objdata;
- //depth in pict, 1 = at pict level
- public int pictDepth;
- //in picprop key/value pair
- public boolean sp;
- //in picprop's name
- public boolean sn;
- //in picprop's value
- public boolean sv;
- //in embedded object or not
- public boolean object;
-
- // Create default (root) GroupState
- public GroupState() {
- }
-
- // Create new GroupState, inheriting all properties from current one, adding 1 to the depth
- public GroupState(GroupState other) {
- bold = other.bold;
- italic = other.italic;
- ignore = other.ignore;
- ucSkip = other.ucSkip;
- list = other.list;
- listLevel = other.listLevel;
- fontCharset = other.fontCharset;
- depth = 1 + other.depth;
- pictDepth = other.pictDepth > 0 ? other.pictDepth + 1 : 0;
- //do not inherit object, sn, sv or sp
-
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.rtf;
+
+import java.nio.charset.Charset;
+
+ /**
+  * Holds all state associated with the current RTF group, ie { ... }.
+  * A root state is built with the no-arg constructor; each nested group is
+  * built from its parent with the copy constructor.
+  */
+ class GroupState {
+     // Nesting level: 0 for the root state, parent's depth + 1 for a nested one.
+     public int depth;
+     public boolean bold;
+     public boolean italic;
+     // True if we are skipping all text in current group,
+     // eg if group leads with a \*:
+     public boolean ignore;
+     // Default is 1 if no uc control has been seen yet:
+     public int ucSkip = 1;
+     public int list;
+     public int listLevel;
+     public Charset fontCharset;
+     // in objdata
+     public boolean objdata;
+     // depth in pict, 1 = at pict level
+     public int pictDepth;
+     // in picprop key/value pair
+     public boolean sp;
+     // in picprop's name
+     public boolean sn;
+     // in picprop's value
+     public boolean sv;
+     // in embedded object or not
+     public boolean object;
+
+     /** Creates the default (root) GroupState. */
+     public GroupState() {
+     }
+
+     /**
+      * Creates a nested GroupState that inherits the formatting, skip, list and
+      * charset properties of {@code other} and sits one level deeper.
+      * The objdata, object, sp, sn and sv flags are not copied and stay false.
+      */
+     public GroupState(GroupState other) {
+         // Position: one level below the parent; pictDepth only grows while inside a pict.
+         this.depth = other.depth + 1;
+         this.pictDepth = (other.pictDepth > 0) ? (other.pictDepth + 1) : 0;
+
+         // Inherited properties.
+         this.bold = other.bold;
+         this.italic = other.italic;
+         this.ignore = other.ignore;
+         this.ucSkip = other.ucSkip;
+         this.list = other.list;
+         this.listLevel = other.listLevel;
+         this.fontCharset = other.fontCharset;
+     }
+ }
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java
index 1931232..e7142bd 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java
@@ -1,35 +1,35 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.rtf;
-
-/**
- * Contains the information for a single list in the list or list override tables.
- */
-public class ListDescriptor {
- public final static int NUMBER_TYPE_BULLET = 23;
-
- public int id;
- // We record this but don't make use if it today:
- public int templateID;
- // We record this but don't make use if it today:
- public boolean isStyle;
- public int[] numberType = new int[9];
-
- public boolean isUnordered(int level) {
- return numberType[level] == NUMBER_TYPE_BULLET;
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.rtf;
+
+/**
+ * Contains the information for a single list in the list or list override tables.
+ */
+public class ListDescriptor {
+ public final static int NUMBER_TYPE_BULLET = 23;
+
+ public int id;
+ // We record this but don't make use if it today:
+ public int templateID;
+ // We record this but don't make use if it today:
+ public boolean isStyle;
+ public int[] numberType = new int[9];
+
+ public boolean isUnordered(int level) {
+ return numberType[level] == NUMBER_TYPE_BULLET;
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
index ccd7e7f..d2c448b 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
@@ -1,93 +1,93 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.rtf;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.commons.io.input.TaggedInputStream;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * RTF parser
- */
-public class RTFParser extends AbstractParser {
-
- /**
- * Serial version UID
- */
- private static final long serialVersionUID = -4165069489372320313L;
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.singleton(MediaType.application("rtf"));
- /**
- * maximum number of bytes per embedded object/pict (default: 20MB)
- */
- private static int EMB_OBJ_MAX_BYTES = 20 * 1024 * 1024; //20MB
-
- /**
- * See {@link #setMaxBytesForEmbeddedObject(int)}.
- *
- * @return maximum number of bytes allowed for an embedded object.
- */
- public static int getMaxBytesForEmbeddedObject() {
- return EMB_OBJ_MAX_BYTES;
- }
-
- /**
- * Bytes for embedded objects are currently cached in memory.
- * If something goes wrong during the parsing of an embedded object,
- * it is possible that a read length may be crazily too long
- * and cause a heap crash.
- *
- * @param max maximum number of bytes to allow for embedded objects. If
- * the embedded object has more than this number of bytes, skip it.
- */
- public static void setMaxBytesForEmbeddedObject(int max) {
- EMB_OBJ_MAX_BYTES = max;
- }
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- metadata.set(Metadata.CONTENT_TYPE, "application/rtf");
- TaggedInputStream tagged = new TaggedInputStream(stream);
- try {
- XHTMLContentHandler xhtmlHandler = new XHTMLContentHandler(handler, metadata);
- RTFEmbObjHandler embObjHandler = new RTFEmbObjHandler(xhtmlHandler, metadata, context);
- final TextExtractor ert = new TextExtractor(xhtmlHandler, metadata, embObjHandler);
- ert.extract(stream);
- } catch (IOException e) {
- tagged.throwIfCauseOf(e);
- throw new TikaException("Error parsing an RTF document", e);
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.rtf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.io.input.TaggedInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * RTF parser
+ */
+public class RTFParser extends AbstractParser {
+
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = -4165069489372320313L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.singleton(MediaType.application("rtf"));
+ /**
+ * maximum number of bytes per embedded object/pict (default: 20MB)
+ */
+ private static int EMB_OBJ_MAX_BYTES = 20 * 1024 * 1024; //20MB
+
+ /**
+ * See {@link #setMaxBytesForEmbeddedObject(int)}.
+ *
+ * @return maximum number of bytes allowed for an embedded object.
+ */
+ public static int getMaxBytesForEmbeddedObject() {
+ return EMB_OBJ_MAX_BYTES;
+ }
+
+ /**
+ * Bytes for embedded objects are currently cached in memory.
+ * If something goes wrong during the parsing of an embedded object,
+ * it is possible that a read length may be crazily too long
+ * and cause a heap crash.
+ *
+ * @param max maximum number of bytes to allow for embedded objects. If
+ * the embedded object has more than this number of bytes, skip it.
+ */
+ public static void setMaxBytesForEmbeddedObject(int max) {
+ EMB_OBJ_MAX_BYTES = max;
+ }
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ metadata.set(Metadata.CONTENT_TYPE, "application/rtf");
+ TaggedInputStream tagged = new TaggedInputStream(stream);
+ try {
+ XHTMLContentHandler xhtmlHandler = new XHTMLContentHandler(handler, metadata);
+ RTFEmbObjHandler embObjHandler = new RTFEmbObjHandler(xhtmlHandler, metadata, context);
+ final TextExtractor ert = new TextExtractor(xhtmlHandler, metadata, embObjHandler);
+ ert.extract(stream);
+ } catch (IOException e) {
+ tagged.throwIfCauseOf(e);
+ throw new TikaException("Error parsing an RTF document", e);
+ }
+ }
+}
[35/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java b/tika-parser-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java
index 74c3360..3a8a66c 100644
--- a/tika-parser-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java
+++ b/tika-parser-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java
@@ -1,59 +1,59 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.asm;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-import org.apache.tika.Tika;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.junit.Test;
-
-/**
- * Test case for parsing Java class files.
- */
-public class ClassParserTest {
-
- @Test
- public void testClassParsing() throws Exception {
- String path = "/test-documents/AutoDetectParser.class";
- Metadata metadata = new Metadata();
- String content = new Tika().parseToString(
- ClassParserTest.class.getResourceAsStream(path), metadata);
-
- assertEquals("AutoDetectParser", metadata.get(TikaCoreProperties.TITLE));
- assertEquals(
- "AutoDetectParser.class",
- metadata.get(Metadata.RESOURCE_NAME_KEY));
-
- assertTrue(content.contains("package org.apache.tika.parser;"));
- assertTrue(content.contains(
- "class AutoDetectParser extends CompositeParser"));
- assertTrue(content.contains(
- "private org.apache.tika.mime.MimeTypes types"));
- assertTrue(content.contains(
- "public void parse("
- + "java.io.InputStream, org.xml.sax.ContentHandler,"
- + " org.apache.tika.metadata.Metadata) throws"
- + " java.io.IOException, org.xml.sax.SAXException,"
- + " org.apache.tika.exception.TikaException;"));
- assertTrue(content.contains(
- "private byte[] getPrefix(java.io.InputStream, int)"
- + " throws java.io.IOException;"));
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.asm;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import org.apache.tika.Tika;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Test;
+
+/**
+ * Test case for parsing Java class files.
+ */
+public class ClassParserTest {
+
+ @Test
+ public void testClassParsing() throws Exception {
+ String path = "/test-documents/AutoDetectParser.class";
+ Metadata metadata = new Metadata();
+ String content = new Tika().parseToString(
+ ClassParserTest.class.getResourceAsStream(path), metadata);
+
+ assertEquals("AutoDetectParser", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals(
+ "AutoDetectParser.class",
+ metadata.get(Metadata.RESOURCE_NAME_KEY));
+
+ assertTrue(content.contains("package org.apache.tika.parser;"));
+ assertTrue(content.contains(
+ "class AutoDetectParser extends CompositeParser"));
+ assertTrue(content.contains(
+ "private org.apache.tika.mime.MimeTypes types"));
+ assertTrue(content.contains(
+ "public void parse("
+ + "java.io.InputStream, org.xml.sax.ContentHandler,"
+ + " org.apache.tika.metadata.Metadata) throws"
+ + " java.io.IOException, org.xml.sax.SAXException,"
+ + " org.apache.tika.exception.TikaException;"));
+ assertTrue(content.contains(
+ "private byte[] getPrefix(java.io.InputStream, int)"
+ + " throws java.io.IOException;"));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java b/tika-parser-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
index ae762dc..17aca8b 100644
--- a/tika-parser-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
+++ b/tika-parser-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
@@ -1,101 +1,101 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.code;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
-import java.io.ByteArrayInputStream;
-import java.util.Set;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.junit.Test;
-
-public class SourceCodeParserTest extends TikaTest {
-
- private SourceCodeParser sourceCodeParser = new SourceCodeParser();
-
- @Test
- public void testSupportTypes() throws Exception {
- Set<MediaType> supportedTypes = sourceCodeParser.getSupportedTypes(new ParseContext());
- assertTrue(supportedTypes.contains(new MediaType("text", "x-java-source")));
- assertTrue(supportedTypes.contains(new MediaType("text", "x-groovy")));
- assertTrue(supportedTypes.contains(new MediaType("text", "x-c++src")));
-
- assertFalse(sourceCodeParser.getSupportedTypes(new ParseContext()).contains(new MediaType("text", "html")));
- }
-
- @Test
- public void testHTMLRenderWithReturnLine() throws Exception {
- String htmlContent = getXML(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser, createMetadata("text/x-java-source")).xml;
-
- assertTrue(htmlContent.indexOf("<html:html lang=\"en\" xml:lang=\"en\"") == 0);
- assertTrue(htmlContent.indexOf("<html:span class=\"java_keyword\">public</span><html:span class=\"java_plain\">") > 0);
- assertTrue(htmlContent.indexOf("<html:span class=\"java_keyword\">static</span>") > 0);
- assertTrue(htmlContent.indexOf("<html:br clear=\"none\" />") > 0);
- }
-
- @Test
- public void testTextRender() throws Exception {
- String textContent = getText(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser, createMetadata("text/x-java-source"));
-
- assertTrue(textContent.length() > 0);
- assertTrue(textContent.indexOf("html") < 0);
-
- textContent = getText(new ByteArrayInputStream("public class HelloWorld {}".getBytes(UTF_8)), sourceCodeParser, createMetadata("text/x-java-source"));
- assertTrue(textContent.length() > 0);
- assertTrue(textContent.indexOf("html") < 0);
- }
-
- @Test
- public void testLoC() throws Exception {
- Metadata metadata = createMetadata("text/x-groovy");
- getText(getResourceAsStream("/test-documents/testGROOVY.groovy"), sourceCodeParser, metadata);
-
- assertEquals(metadata.get("LoC"), "9");
- }
-
- @Test
- public void testAuthor() throws Exception {
- Metadata metadata = createMetadata("text/x-c++src");
- getText(getResourceAsStream("/test-documents/testCPP.cpp"), sourceCodeParser, metadata);
-
- assertEquals("Hong-Thai Nguyen", metadata.get(TikaCoreProperties.CREATOR));
- }
-
- @Test
- public void testReturnContentAsIsForTextHandler() throws Exception {
- String strContent = getXML(getResourceAsStream("/test-documents/testJAVA.java"), new AutoDetectParser(), createMetadata("text/plain")).xml;
-
- assertTrue(strContent.indexOf("public class HelloWorld {") > 0);
- }
-
- private Metadata createMetadata(String mimeType) {
- Metadata metadata = new Metadata();
- metadata.add(Metadata.RESOURCE_NAME_KEY, "testFile");
- metadata.add(Metadata.CONTENT_TYPE, mimeType);
- return metadata;
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.code;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.util.Set;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.junit.Test;
+
+public class SourceCodeParserTest extends TikaTest {
+
+ private SourceCodeParser sourceCodeParser = new SourceCodeParser();
+
+ @Test
+ public void testSupportTypes() throws Exception {
+ Set<MediaType> supportedTypes = sourceCodeParser.getSupportedTypes(new ParseContext());
+ assertTrue(supportedTypes.contains(new MediaType("text", "x-java-source")));
+ assertTrue(supportedTypes.contains(new MediaType("text", "x-groovy")));
+ assertTrue(supportedTypes.contains(new MediaType("text", "x-c++src")));
+
+ assertFalse(sourceCodeParser.getSupportedTypes(new ParseContext()).contains(new MediaType("text", "html")));
+ }
+
+ @Test
+ public void testHTMLRenderWithReturnLine() throws Exception {
+ String htmlContent = getXML(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser, createMetadata("text/x-java-source")).xml;
+
+ assertTrue(htmlContent.indexOf("<html:html lang=\"en\" xml:lang=\"en\"") == 0);
+ assertTrue(htmlContent.indexOf("<html:span class=\"java_keyword\">public</span><html:span class=\"java_plain\">") > 0);
+ assertTrue(htmlContent.indexOf("<html:span class=\"java_keyword\">static</span>") > 0);
+ assertTrue(htmlContent.indexOf("<html:br clear=\"none\" />") > 0);
+ }
+
+ @Test
+ public void testTextRender() throws Exception {
+ String textContent = getText(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser, createMetadata("text/x-java-source"));
+
+ assertTrue(textContent.length() > 0);
+ assertTrue(textContent.indexOf("html") < 0);
+
+ textContent = getText(new ByteArrayInputStream("public class HelloWorld {}".getBytes(UTF_8)), sourceCodeParser, createMetadata("text/x-java-source"));
+ assertTrue(textContent.length() > 0);
+ assertTrue(textContent.indexOf("html") < 0);
+ }
+
+ @Test
+ public void testLoC() throws Exception {
+ Metadata metadata = createMetadata("text/x-groovy");
+ getText(getResourceAsStream("/test-documents/testGROOVY.groovy"), sourceCodeParser, metadata);
+
+ assertEquals(metadata.get("LoC"), "9");
+ }
+
+ @Test
+ public void testAuthor() throws Exception {
+ Metadata metadata = createMetadata("text/x-c++src");
+ getText(getResourceAsStream("/test-documents/testCPP.cpp"), sourceCodeParser, metadata);
+
+ assertEquals("Hong-Thai Nguyen", metadata.get(TikaCoreProperties.CREATOR));
+ }
+
+ @Test
+ public void testReturnContentAsIsForTextHandler() throws Exception {
+ String strContent = getXML(getResourceAsStream("/test-documents/testJAVA.java"), new AutoDetectParser(), createMetadata("text/plain")).xml;
+
+ assertTrue(strContent.indexOf("public class HelloWorld {") > 0);
+ }
+
+ private Metadata createMetadata(String mimeType) {
+ Metadata metadata = new Metadata();
+ metadata.add(Metadata.RESOURCE_NAME_KEY, "testFile");
+ metadata.add(Metadata.CONTENT_TYPE, mimeType);
+ return metadata;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-crypto-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-crypto-module/pom.xml b/tika-parser-modules/tika-parser-crypto-module/pom.xml
index 69dd7a9..23a5417 100644
--- a/tika-parser-modules/tika-parser-crypto-module/pom.xml
+++ b/tika-parser-modules/tika-parser-crypto-module/pom.xml
@@ -1,53 +1,53 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-modules</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-crypto-module</artifactId>
- <name>Apache Tika parser crypto module</name>
- <url>http://tika.apache.org/</url>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>org.bouncycastle</groupId>
- <artifactId>bcmail-jdk15on</artifactId>
- <version>${bouncycastle.version}</version>
- </dependency>
- <dependency>
- <groupId>commons-io</groupId>
- <artifactId>commons-io</artifactId>
- <version>${commons.io.version}</version>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-crypto-module</artifactId>
+ <name>Apache Tika parser crypto module</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.bouncycastle</groupId>
+ <artifactId>bcmail-jdk15on</artifactId>
+ <version>${bouncycastle.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>${commons.io.version}</version>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/module/crypto/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/module/crypto/internal/Activator.java b/tika-parser-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/module/crypto/internal/Activator.java
index f71fb51..e63c276 100644
--- a/tika-parser-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/module/crypto/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/module/crypto/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.crypto.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
- @Override
- public void start(BundleContext context) throws Exception {
-
- registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
- }
-
- @Override
- public void stop(BundleContext context) throws Exception {
-
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.crypto.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+ @Override
+ public void start(BundleContext context) throws Exception {
+
+ registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+ }
+
+ @Override
+ public void stop(BundleContext context) throws Exception {
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-crypto-module/src/test/java/org/apache/tika/parser/crypto/Pkcs7ParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-crypto-module/src/test/java/org/apache/tika/parser/crypto/Pkcs7ParserTest.java b/tika-parser-modules/tika-parser-crypto-module/src/test/java/org/apache/tika/parser/crypto/Pkcs7ParserTest.java
index 02ff3dd..bc39042 100644
--- a/tika-parser-modules/tika-parser-crypto-module/src/test/java/org/apache/tika/parser/crypto/Pkcs7ParserTest.java
+++ b/tika-parser-modules/tika-parser-crypto-module/src/test/java/org/apache/tika/parser/crypto/Pkcs7ParserTest.java
@@ -1,47 +1,47 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.crypto;
-
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
-import java.io.InputStream;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class Pkcs7ParserTest extends TikaTest {
- @Test
- public void testDetachedSignature() throws Exception {
- try (InputStream input = Pkcs7ParserTest.class.getResourceAsStream(
- "/test-documents/testDetached.p7s")) {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
- new Pkcs7Parser().parse(input, handler, metadata, new ParseContext());
- } catch (NullPointerException npe) {
- fail("should not get NPE");
- } catch (TikaException te) {
- assertTrue(te.toString().contains("cannot parse detached pkcs7 signature"));
- }
- }
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.crypto;
+
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class Pkcs7ParserTest extends TikaTest {
+ @Test
+ public void testDetachedSignature() throws Exception {
+ try (InputStream input = Pkcs7ParserTest.class.getResourceAsStream(
+ "/test-documents/testDetached.p7s")) {
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ new Pkcs7Parser().parse(input, handler, metadata, new ParseContext());
+ } catch (NullPointerException npe) {
+ fail("should not get NPE");
+ } catch (TikaException te) {
+ assertTrue(te.toString().contains("cannot parse detached pkcs7 signature"));
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-database-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-database-module/pom.xml b/tika-parser-modules/tika-parser-database-module/pom.xml
index a60dae3..cdbbaad 100644
--- a/tika-parser-modules/tika-parser-database-module/pom.xml
+++ b/tika-parser-modules/tika-parser-database-module/pom.xml
@@ -1,67 +1,67 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-modules</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-database-module</artifactId>
- <name>Apache Tika parser database module</name>
- <url>http://tika.apache.org/</url>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- </dependency>
- <!-- Provided dependencies -->
- <dependency>
- <groupId>org.xerial</groupId>
- <artifactId>sqlite-jdbc</artifactId>
- <version>3.8.11.2</version>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>commons-io</groupId>
- <artifactId>commons-io</artifactId>
- <version>${commons.io.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-office-module</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-package-module</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-database-module</artifactId>
+ <name>Apache Tika parser database module</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <!-- Provided dependencies -->
+ <dependency>
+ <groupId>org.xerial</groupId>
+ <artifactId>sqlite-jdbc</artifactId>
+ <version>3.8.11.2</version>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>${commons.io.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-office-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-package-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/module/database/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/module/database/internal/Activator.java b/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/module/database/internal/Activator.java
index 4b798fa..e66cab3 100644
--- a/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/module/database/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/module/database/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.database.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
- @Override
- public void start(BundleContext context) throws Exception {
-
- registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
- }
-
- @Override
- public void stop(BundleContext context) throws Exception {
-
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.database.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+/**
+ * OSGi bundle activator for the Tika database parser module.
+ */
+public class Activator extends TikaAbstractBundleActivator {
+
+    /**
+     * Passes the class loader that loaded this class to
+     * {@code registerTikaParserServiceLoader} for the given bundle context.
+     */
+    @Override
+    public void start(BundleContext context) throws Exception {
+        final ClassLoader moduleLoader = Activator.class.getClassLoader();
+        registerTikaParserServiceLoader(context, moduleLoader);
+    }
+
+    /** Does nothing when the bundle stops. */
+    @Override
+    public void stop(BundleContext context) throws Exception {
+        // intentionally empty
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-ebook-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-ebook-module/pom.xml b/tika-parser-modules/tika-parser-ebook-module/pom.xml
index 89bab53..0c21ee9 100644
--- a/tika-parser-modules/tika-parser-ebook-module/pom.xml
+++ b/tika-parser-modules/tika-parser-ebook-module/pom.xml
@@ -1,48 +1,48 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-modules</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-ebook-module</artifactId>
- <name>Apache Tika parser e-Book module</name>
- <url>http://tika.apache.org/</url>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-text-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-ebook-module</artifactId>
+ <name>Apache Tika parser e-Book module</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-text-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/module/ebook/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/module/ebook/internal/Activator.java b/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/module/ebook/internal/Activator.java
index 62e1582..313de08 100644
--- a/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/module/ebook/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/module/ebook/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.ebook.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
- @Override
- public void start(BundleContext context) throws Exception {
-
- registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
- }
-
- @Override
- public void stop(BundleContext context) throws Exception {
-
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.ebook.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+/**
+ * OSGi bundle activator for the Tika e-book parser module.
+ */
+public class Activator extends TikaAbstractBundleActivator {
+
+    /**
+     * Passes the class loader that loaded this class to
+     * {@code registerTikaParserServiceLoader} for the given bundle context.
+     */
+    @Override
+    public void start(BundleContext context) throws Exception {
+        final ClassLoader moduleLoader = Activator.class.getClassLoader();
+        registerTikaParserServiceLoader(context, moduleLoader);
+    }
+
+    /** Does nothing when the bundle stops. */
+    @Override
+    public void stop(BundleContext context) throws Exception {
+        // intentionally empty
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubContentParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubContentParser.java b/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubContentParser.java
index ab55e5e..94b5caa 100644
--- a/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubContentParser.java
+++ b/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubContentParser.java
@@ -1,59 +1,59 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.epub;
-
-import javax.xml.parsers.SAXParser;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.OfflineContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Parser for EPUB OPS <code>*.html</code> files.
- *
- * For the time being, assume XHTML (TODO: DTBook)
- */
-public class EpubContentParser extends AbstractParser {
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return Collections.emptySet(); // not a top-level parser
- }
-
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- final XHTMLContentHandler xhtml =
- new XHTMLContentHandler(handler, metadata);
- SAXParser parser = context.getSAXParser();
- parser.parse(
- new CloseShieldInputStream(stream),
- new OfflineContentHandler(xhtml));
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.epub;
+
+import javax.xml.parsers.SAXParser;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for EPUB OPS <code>*.html</code> files.
+ *
+ * For the time being, assume XHTML (TODO: DTBook)
+ */
+public class EpubContentParser extends AbstractParser {
+
+    /**
+     * Reports no supported media types, because this parser is not meant to
+     * be used as a top-level parser.
+     *
+     * @param context parse context (not consulted)
+     * @return an empty set
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return Collections.emptySet(); // not a top-level parser
+    }
+
+    /**
+     * Runs the stream through the SAX parser supplied by the parse context
+     * and forwards the events to an XHTML handler wrapped around
+     * <code>handler</code>.
+     *
+     * @param stream   content document to read; it is not closed here
+     * @param handler  receiver of the resulting XHTML SAX events
+     * @param metadata metadata handed to the XHTML handler
+     * @param context  parse context that provides the SAX parser
+     * @throws IOException   if the stream cannot be read
+     * @throws SAXException  if the document cannot be parsed or the handler fails
+     * @throws TikaException if the context cannot supply a SAX parser
+     */
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        SAXParser saxParser = context.getSAXParser();
+        // The stream belongs to the caller: shield it so the SAX parser cannot close it.
+        InputStream shielded = new CloseShieldInputStream(stream);
+        saxParser.parse(shielded, new OfflineContentHandler(xhtml));
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java b/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
index 14e6cf8..c4f72de 100644
--- a/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
+++ b/tika-parser-modules/tika-parser-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
@@ -1,119 +1,119 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.epub;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipInputStream;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.xml.DcXMLParser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.EmbeddedContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-/**
- * Epub parser
- */
-public class EpubParser extends AbstractParser {
-
- /** Serial version UID */
- private static final long serialVersionUID = 215176772484050550L;
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- MediaType.application("epub+zip"),
- MediaType.application("x-ibooks+zip")
- )));
-
- private Parser meta = new DcXMLParser();
-
- private Parser content = new EpubContentParser();
-
- public Parser getMetaParser() {
- return meta;
- }
-
- public void setMetaParser(Parser meta) {
- this.meta = meta;
- }
-
- public Parser getContentParser() {
- return content;
- }
-
- public void setContentParser(Parser content) {
- this.content = content;
- }
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- // Because an EPub file is often made up of multiple XHTML files,
- // we need explicit control over the start and end of the document
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
- ContentHandler childHandler = new EmbeddedContentHandler(
- new BodyContentHandler(xhtml));
-
- ZipInputStream zip = new ZipInputStream(stream);
- ZipEntry entry = zip.getNextEntry();
- while (entry != null) {
- if (entry.getName().equals("mimetype")) {
- String type = IOUtils.toString(zip, UTF_8);
- //often has trailing new lines
- if (type != null) {
- type = type.trim();
- }
- metadata.set(Metadata.CONTENT_TYPE, type);
- } else if (entry.getName().equals("metadata.xml")) {
- meta.parse(zip, new DefaultHandler(), metadata, context);
- } else if (entry.getName().endsWith(".opf")) {
- meta.parse(zip, new DefaultHandler(), metadata, context);
- } else if (entry.getName().endsWith(".html") ||
- entry.getName().endsWith(".xhtml")) {
- content.parse(zip, childHandler, metadata, context);
- }
- entry = zip.getNextEntry();
- }
-
- // Finish everything
- xhtml.endDocument();
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.epub;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.xml.DcXMLParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Epub parser.
+ *
+ * Reads the document as a zip stream and handles each entry by name: the
+ * <code>mimetype</code> entry sets the content type, <code>metadata.xml</code>
+ * and <code>*.opf</code> entries go to the metadata parser, and
+ * <code>*.html</code> / <code>*.xhtml</code> entries go to the content parser.
+ */
+public class EpubParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 215176772484050550L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.application("epub+zip"),
+                    MediaType.application("x-ibooks+zip")
+            )));
+
+    /** Parser applied to <code>metadata.xml</code> and <code>*.opf</code> entries. */
+    private Parser meta = new DcXMLParser();
+
+    /** Parser applied to <code>*.html</code> and <code>*.xhtml</code> entries. */
+    private Parser content = new EpubContentParser();
+
+    /** @return the parser used for metadata entries */
+    public Parser getMetaParser() {
+        return meta;
+    }
+
+    /** @param meta the parser to use for metadata entries */
+    public void setMetaParser(Parser meta) {
+        this.meta = meta;
+    }
+
+    /** @return the parser used for (X)HTML content entries */
+    public Parser getContentParser() {
+        return content;
+    }
+
+    /** @param content the parser to use for (X)HTML content entries */
+    public void setContentParser(Parser content) {
+        this.content = content;
+    }
+
+    /** @return the EPUB and iBooks media types handled by this parser */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Walks the zip entries of the stream in order and emits one XHTML
+     * document that covers all content entries.
+     *
+     * @param stream   the EPUB container; it is not closed here
+     * @param handler  receiver of the resulting XHTML SAX events
+     * @param metadata metadata object that is filled in while parsing
+     * @param context  parse context passed on to the delegate parsers
+     * @throws IOException   if the zip stream cannot be read
+     * @throws SAXException  if a handler or a delegate parser reports a SAX error
+     * @throws TikaException if a delegate parser fails
+     */
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // Because an EPub file is often made up of multiple XHTML files,
+        // we need explicit control over the start and end of the document
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        ContentHandler childHandler = new EmbeddedContentHandler(
+                new BodyContentHandler(xhtml));
+
+        // The zip stream is deliberately left open: closing it would also
+        // close the caller's stream.
+        ZipInputStream zip = new ZipInputStream(stream);
+        for (ZipEntry entry = zip.getNextEntry(); entry != null; entry = zip.getNextEntry()) {
+            parseEntry(entry.getName(), zip, childHandler, metadata, context);
+        }
+
+        // Finish everything
+        xhtml.endDocument();
+    }
+
+    /** Sends one zip entry to the handling that matches its name; other entries are ignored. */
+    private void parseEntry(
+            String name, InputStream entryStream, ContentHandler childHandler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        if (name.equals("mimetype")) {
+            // often has trailing new lines; IOUtils.toString never returns null
+            metadata.set(Metadata.CONTENT_TYPE, IOUtils.toString(entryStream, UTF_8).trim());
+        } else if (name.equals("metadata.xml") || name.endsWith(".opf")) {
+            meta.parse(entryStream, new DefaultHandler(), metadata, context);
+        } else if (name.endsWith(".html") || name.endsWith(".xhtml")) {
+            content.parse(entryStream, childHandler, metadata, context);
+        }
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-ebook-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-ebook-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java b/tika-parser-modules/tika-parser-ebook-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
index dcc705e..c9acbeb 100644
--- a/tika-parser-modules/tika-parser-ebook-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
+++ b/tika-parser-modules/tika-parser-ebook-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
@@ -1,58 +1,58 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.epub;
-
-import static org.junit.Assert.assertEquals;
-import static org.apache.tika.TikaTest.assertContains;
-
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class EpubParserTest {
-
- @Test
- public void testXMLParser() throws Exception {
- try (InputStream input = EpubParserTest.class.getResourceAsStream(
- "/test-documents/testEPUB.epub")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- new EpubParser().parse(input, handler, metadata, new ParseContext());
-
- assertEquals("application/epub+zip",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("en",
- metadata.get(TikaCoreProperties.LANGUAGE));
- assertEquals("This is an ePub test publication for Tika.",
- metadata.get(TikaCoreProperties.DESCRIPTION));
- assertEquals("Apache",
- metadata.get(TikaCoreProperties.PUBLISHER));
-
- String content = handler.toString();
- assertContains("Plus a simple div", content);
- assertContains("First item", content);
- assertContains("The previous headings were subchapters", content);
- assertContains("Table data", content);
- }
- }
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.epub;
+
+import static org.junit.Assert.assertEquals;
+import static org.apache.tika.TikaTest.assertContains;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class EpubParserTest {
+
+ @Test
+ public void testXMLParser() throws Exception {
+ try (InputStream input = EpubParserTest.class.getResourceAsStream(
+ "/test-documents/testEPUB.epub")) {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new EpubParser().parse(input, handler, metadata, new ParseContext());
+
+ assertEquals("application/epub+zip",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("en",
+ metadata.get(TikaCoreProperties.LANGUAGE));
+ assertEquals("This is an ePub test publication for Tika.",
+ metadata.get(TikaCoreProperties.DESCRIPTION));
+ assertEquals("Apache",
+ metadata.get(TikaCoreProperties.PUBLISHER));
+
+ String content = handler.toString();
+ assertContains("Plus a simple div", content);
+ assertContains("First item", content);
+ assertContains("The previous headings were subchapters", content);
+ assertContains("Table data", content);
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-journal-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-journal-module/pom.xml b/tika-parser-modules/tika-parser-journal-module/pom.xml
index 1a29605..c45c2a9 100644
--- a/tika-parser-modules/tika-parser-journal-module/pom.xml
+++ b/tika-parser-modules/tika-parser-journal-module/pom.xml
@@ -1,68 +1,68 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-modules</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-journal-module</artifactId>
- <name>Apache Tika parser journal module</name>
- <url>http://tika.apache.org/</url>
-
- <properties>
- <cxf.version>3.0.3</cxf.version>
- </properties>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.cxf</groupId>
- <artifactId>cxf-rt-rs-client</artifactId>
- <version>${cxf.version}</version>
- </dependency>
- <dependency>
- <groupId>org.json</groupId>
- <artifactId>json</artifactId>
- <version>20140107</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-pdf-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-text-module</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-journal-module</artifactId>
+ <name>Apache Tika parser journal module</name>
+ <url>http://tika.apache.org/</url>
+
+ <properties>
+ <cxf.version>3.0.3</cxf.version>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.cxf</groupId>
+ <artifactId>cxf-rt-rs-client</artifactId>
+ <version>${cxf.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.json</groupId>
+ <artifactId>json</artifactId>
+ <version>20140107</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-pdf-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-text-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-journal-module/src/main/java/org/apache/tika/module/journal/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-journal-module/src/main/java/org/apache/tika/module/journal/internal/Activator.java b/tika-parser-modules/tika-parser-journal-module/src/main/java/org/apache/tika/module/journal/internal/Activator.java
index 2f9c36a..dda3e3a 100644
--- a/tika-parser-modules/tika-parser-journal-module/src/main/java/org/apache/tika/module/journal/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-journal-module/src/main/java/org/apache/tika/module/journal/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.journal.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
- @Override
- public void start(BundleContext context) throws Exception {
-
- registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
- }
-
- @Override
- public void stop(BundleContext context) throws Exception {
-
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.journal.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+ @Override
+ public void start(BundleContext context) throws Exception {
+
+ registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+ }
+
+ @Override
+ public void stop(BundleContext context) throws Exception {
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/pom.xml b/tika-parser-modules/tika-parser-multimedia-module/pom.xml
index 7a3a704..74cb504 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/pom.xml
+++ b/tika-parser-modules/tika-parser-multimedia-module/pom.xml
@@ -1,101 +1,101 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-modules</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-multimedia-module</artifactId>
- <name>Apache Tika parser multimedia module</name>
- <url>http://tika.apache.org/</url>
-
- <properties>
- <metadata.extractor.version>2.8.1</metadata.extractor.version>
- <isoparser.version>1.1.18</isoparser.version>
- <commons.logging.version>1.1.3</commons.logging.version>
- </properties>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-xmp-commons</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>com.drewnoakes</groupId>
- <artifactId>metadata-extractor</artifactId>
- <version>${metadata.extractor.version}</version>
- </dependency>
- <dependency>
- <groupId>commons-codec</groupId>
- <artifactId>commons-codec</artifactId>
- <version>${codec.version}</version>
- </dependency>
- <dependency>
- <groupId>commons-io</groupId>
- <artifactId>commons-io</artifactId>
- <version>${commons.io.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-exec</artifactId>
- <version>${commons.exec}</version>
- </dependency>
- <dependency>
- <groupId>com.googlecode.mp4parser</groupId>
- <artifactId>isoparser</artifactId>
- <version>${isoparser.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.pdfbox</groupId>
- <artifactId>fontbox</artifactId>
- <version>${pdfbox.version}</version>
- </dependency>
- <dependency>
- <groupId>commons-logging</groupId>
- <artifactId>commons-logging</artifactId>
- <version>${commons.logging.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-web-module</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-office-module</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-multimedia-module</artifactId>
+ <name>Apache Tika parser multimedia module</name>
+ <url>http://tika.apache.org/</url>
+
+ <properties>
+ <metadata.extractor.version>2.8.1</metadata.extractor.version>
+ <isoparser.version>1.1.18</isoparser.version>
+ <commons.logging.version>1.1.3</commons.logging.version>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-xmp-commons</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.drewnoakes</groupId>
+ <artifactId>metadata-extractor</artifactId>
+ <version>${metadata.extractor.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ <version>${codec.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>${commons.io.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-exec</artifactId>
+ <version>${commons.exec}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.googlecode.mp4parser</groupId>
+ <artifactId>isoparser</artifactId>
+ <version>${isoparser.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.pdfbox</groupId>
+ <artifactId>fontbox</artifactId>
+ <version>${pdfbox.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-logging</groupId>
+ <artifactId>commons-logging</artifactId>
+ <version>${commons.logging.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-web-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-office-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/module/multimedia/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/module/multimedia/internal/Activator.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/module/multimedia/internal/Activator.java
index 7f53312..de4ae01 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/module/multimedia/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/module/multimedia/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.multimedia.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
- @Override
- public void start(BundleContext context) throws Exception {
-
- registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
- }
-
- @Override
- public void stop(BundleContext context) throws Exception {
-
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.multimedia.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+ @Override
+ public void start(BundleContext context) throws Exception {
+
+ registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+ }
+
+ @Override
+ public void stop(BundleContext context) throws Exception {
+
+ }
+
+}
[17/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 196ffa9..4ea3fa1 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -1,412 +1,412 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import java.io.InputStream;
-import java.util.List;
-import java.util.Locale;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.detect.DefaultDetector;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.exception.EncryptedDocumentException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLExtended;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.PasswordProvider;
-import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class ExcelParserTest extends TikaTest {
- @Test
- @SuppressWarnings("deprecation") // Checks legacy Tika-1.0 style metadata keys
- public void testExcelParser() throws Exception {
-
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- XMLResult r = getXML("testEXCEL.xls", new OfficeParser(), new Metadata(), context);
-
- assertEquals(
- "application/vnd.ms-excel",
- r.metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Simple Excel document", r.metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Keith Bennett", r.metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Keith Bennett", r.metadata.get(Metadata.AUTHOR));
-
- // Mon Oct 01 17:13:56 BST 2007
- assertEquals("2007-10-01T16:13:56Z", r.metadata.get(TikaCoreProperties.CREATED));
- assertEquals("2007-10-01T16:13:56Z", r.metadata.get(Metadata.CREATION_DATE));
-
- // Mon Oct 01 17:31:43 BST 2007
- assertEquals("2007-10-01T16:31:43Z", r.metadata.get(TikaCoreProperties.MODIFIED));
- assertEquals("2007-10-01T16:31:43Z", r.metadata.get(Metadata.DATE));
-
- String content = r.xml;
- assertContains("Sample Excel Worksheet", content);
- assertContains("Numbers and their Squares", content);
- assertContains("<tr>\t<td />\t<td>Number</td>\t<td>Square", content);
- assertContains("9", content);
- assertNotContained("9.0", content);
- assertContains("196", content);
- assertNotContained("196.0", content);
-
- }
-
- @Test
- public void testExcelParserFormatting() throws Exception {
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- XMLResult r = getXML("testEXCEL-formats.xls", new OfficeParser(), new Metadata(), context);
-
- assertEquals(
- "application/vnd.ms-excel",
- r.metadata.get(Metadata.CONTENT_TYPE));
-
- String content = r.xml;
-
- // Number #,##0.00
- assertContains("1,599.99", content);
- assertContains("-1,599.99", content);
-
- // Currency $#,##0.00;[Red]($#,##0.00)
- assertContains("$1,599.99", content);
- assertContains("($1,599.99)", content);
-
- // Scientific 0.00E+00
- // poi <=3.8beta1 returns 1.98E08, newer versions return 1.98+E08
- assertTrue(content.contains("1.98E08") || content.contains("1.98E+08"));
- assertTrue(content.contains("-1.98E08") || content.contains("-1.98E+08"));
-
- // Percentage.
- assertContains("2.50%", content);
- // Excel rounds up to 3%, but that requires Java 1.6 or later
- if (System.getProperty("java.version").startsWith("1.5")) {
- assertContains("2%", content);
- } else {
- assertContains("3%", content);
- }
-
- // Time Format: h:mm
- assertContains("6:15", content);
- assertContains("18:15", content);
-
- // Date Format: d-mmm-yy
- assertContains("17-May-07", content);
-
- // Date Format: m/d/yy
- assertContains("10/3/09", content);
-
- // Date/Time Format: m/d/yy h:mm
- assertContains("1/19/08 4:35", content);
-
- // Fraction (2.5): # ?/?
- assertContains("2 1/2", content);
-
-
- // Below assertions represent outstanding formatting issues to be addressed
- // they are included to allow the issues to be progressed with the Apache POI
- // team - See TIKA-103.
-
- /*************************************************************************
- // Custom Number (0 "dollars and" .00 "cents")
- assertContains("19 dollars and .99 cents", content);
-
- // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy)
- assertContains("At 4:20 AM on Thursday May 17, 2007", content);
- **************************************************************************/
-
-
- }
-
- @Test
- public void testExcelParserPassword() throws Exception {
- try {
- XMLResult r = getXML("testEXCEL_protected_passtika.xls");
- fail("Document is encrypted, shouldn't parse");
- } catch (EncryptedDocumentException e) {
- // Good
- }
-
- // Try again, this time with the password
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- context.set(PasswordProvider.class, new PasswordProvider() {
- @Override
- public String getPassword(Metadata metadata) {
- return "tika";
- }
- });
- XMLResult r = getXML("testEXCEL_protected_passtika.xls", new OfficeParser(), new Metadata(), context);
-
- assertEquals(
- "application/vnd.ms-excel",
- r.metadata.get(Metadata.CONTENT_TYPE));
-
- assertEquals(null, r.metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Antoni", r.metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("2011-11-25T09:52:48Z", r.metadata.get(TikaCoreProperties.CREATED));
-
- String content = r.xml;
- assertContains("This is an Encrypted Excel spreadsheet", content);
- assertNotContained("9.0", content);
-
- }
-
- /**
- * TIKA-214 - Ensure we extract labels etc from Charts
- */
- @Test
- public void testExcelParserCharts() throws Exception {
-
- XMLResult r = getXML("testEXCEL-charts.xls", new OfficeParser());
- assertEquals(
- "application/vnd.ms-excel",
- r.metadata.get(Metadata.CONTENT_TYPE));
-
- String content = r.xml;
-
- // The first sheet has a pie chart
- assertContains("charttabyodawg", content);
- assertContains("WhamPuff", content);
-
- // The second sheet has a bar chart and some text
- assertContains("Sheet1", content);
- assertContains("Test Excel Spreasheet", content);
- assertContains("foo", content);
- assertContains("bar", content);
- assertContains("fizzlepuff", content);
- assertContains("whyaxis", content);
- assertContains("eksaxis", content);
-
- // The third sheet has some text
- assertContains("Sheet2", content);
- assertContains("dingdong", content);
-
- }
-
- @Test
- public void testJXL() throws Exception {
-
- XMLResult r = getXML("jxl.xls", new OfficeParser());
- assertEquals(
- "application/vnd.ms-excel",
- r.metadata.get(Metadata.CONTENT_TYPE));
- assertContains("Number Formats", r.xml);
-
- }
-
- @Test
- public void testWorksSpreadsheet70() throws Exception {
- assertContains("Microsoft Works",
- getXML("testWORKSSpreadsheet7.0.xlr", new OfficeParser()).xml);
- }
-
- /**
- * We don't currently support the .xlsb file format
- * (an OOXML container with binary blobs), but we
- * shouldn't break on these files either (TIKA-826)
- */
- @Test
- public void testExcelXLSB() throws Exception {
- Detector detector = new DefaultDetector();
- AutoDetectParser parser = new AutoDetectParser();
-
- Metadata m = new Metadata();
- m.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");
-
- // Should be detected correctly
- MediaType type;
- try (InputStream input = getTestDocumentAsStream("testEXCEL.xlsb")) {
- type = detector.detect(input, m);
- assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString());
- }
-
- // OfficeParser won't handle it
- assertEquals(false, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
-
- // OOXMLParser won't handle it
- assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
-
- // AutoDetectParser doesn't break on it
- assertContains("<body />", getXML("testEXCEL.xlsb").xml);
-
- }
-
- /**
- * Excel 5 and 95 are older formats, and only get basic support
- */
- @Test
- public void testExcel95() throws Exception {
- Detector detector = new DefaultDetector();
- AutoDetectParser parser = new AutoDetectParser();
- MediaType type;
- Metadata m;
-
- // First try detection of Excel 5
- m = new Metadata();
- m.add(Metadata.RESOURCE_NAME_KEY, "excel_5.xls");
- try (InputStream input = getTestDocumentAsStream("testEXCEL_5.xls")) {
- type = detector.detect(input, m);
- assertEquals("application/vnd.ms-excel", type.toString());
- }
-
- // Now Excel 95
- m = new Metadata();
- m.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");
- try (InputStream input = getTestDocumentAsStream("testEXCEL_95.xls")) {
- type = detector.detect(input, m);
- assertEquals("application/vnd.ms-excel", type.toString());
- }
-
- // OfficeParser can handle it
- assertEquals(true, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
-
- // OOXMLParser won't handle it
- assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
-
-
- // Parse the Excel 5 file
- m = new Metadata();
- try (InputStream input = getTestDocumentAsStream("testEXCEL_5.xls")) {
- ContentHandler handler = new BodyContentHandler(-1);
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- parser.parse(input, handler, m, context);
-
- String content = handler.toString();
-
- // Sheet names
- assertContains("Feuil1", content);
- assertContains("Feuil3", content);
-
- // Text
- assertContains("Sample Excel", content);
- assertContains("Number", content);
-
- // Numbers
- assertContains("15", content);
- assertContains("225", content);
-
- // Metadata was also fetched
- assertEquals("Simple Excel document", m.get(TikaCoreProperties.TITLE));
- assertEquals("Keith Bennett", m.get(TikaCoreProperties.CREATOR));
- }
-
- // Parse the Excel 95 file
- m = new Metadata();
- try (InputStream input = getTestDocumentAsStream("testEXCEL_95.xls")) {
- ContentHandler handler = new BodyContentHandler(-1);
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- parser.parse(input, handler, m, context);
-
- String content = handler.toString();
-
- // Sheet name
- assertContains("Foglio1", content);
-
- // Very boring file, no actual text or numbers!
-
- // Metadata was also fetched
- assertEquals(null, m.get(TikaCoreProperties.TITLE));
- assertEquals("Marco Quaranta", m.get(Office.LAST_AUTHOR));
- }
- }
-
- /**
- * Ensures that custom OLE2 (HPSF) properties are extracted
- */
- @Test
- public void testCustomProperties() throws Exception {
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
-
- XMLResult r = getXML("testEXCEL_custom_props.xls", new OfficeParser(), new Metadata(), context);
- Metadata metadata = r.metadata;
- assertEquals("application/vnd.ms-excel", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("", metadata.get(TikaCoreProperties.MODIFIER));
- assertEquals("2011-08-22T13:45:54Z", metadata.get(TikaCoreProperties.MODIFIED));
- assertEquals("2006-09-12T15:06:44Z", metadata.get(TikaCoreProperties.CREATED));
- assertEquals("Microsoft Excel", metadata.get(OfficeOpenXMLExtended.APPLICATION));
- assertEquals("true", metadata.get("custom:myCustomBoolean"));
- assertEquals("3", metadata.get("custom:myCustomNumber"));
- assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
- assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
- assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
- }
-
- @Test
- public void testHeaderAndFooterExtraction() throws Exception {
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.UK);
-
- XMLResult r = getXML("testEXCEL_headers_footers.xls", new OfficeParser(),
- new Metadata(), context);
-
- Metadata metadata = r.metadata;
- assertEquals(
- "application/vnd.ms-excel",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Internal spreadsheet", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Aeham Abushwashi", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Aeham Abushwashi", metadata.get(Metadata.AUTHOR));
-
- String content = r.xml;
- assertContains("John Smith1", content);
- assertContains("John Smith50", content);
- assertContains("1 Corporate HQ", content);
- assertContains("Header - Corporate Spreadsheet", content);
- assertContains("Header - For Internal Use Only", content);
- assertContains("Header - Author: John Smith", content);
- assertContains("Footer - Corporate Spreadsheet", content);
- assertContains("Footer - For Internal Use Only", content);
- assertContains("Footer - Author: John Smith", content);
-
- }
-
- @Test
- public void testHyperlinksInXLS() throws Exception {
- String xml = getXML("testEXCEL_hyperlinks.xls").xml;
- //external url
- assertContains("<a href=\"http://tika.apache.org/\">", xml);
- //mail url
- assertContains("<a href=\"mailto:user@tika.apache.org?subject=help\">", xml);
- //external linked file
- assertContains("<a href=\"linked_file.txt.htm\">", xml);
-
- //TODO: not extracting these yet
- //link on textbox
-// assertContains("<a href=\"http://tika.apache.org/1.12/gettingstarted.html\">", xml);
- }
-
- @Test
- public void testEmbeddedPDF() throws Exception {
- List<Metadata> metadataList = getRecursiveJson("testEXCEL_embeddedPDF.xls");
- assertEquals("application/pdf", metadataList.get(2).get(Metadata.CONTENT_TYPE));
- }
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.InputStream;
+import java.util.List;
+import java.util.Locale;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class ExcelParserTest extends TikaTest {
+ @Test
+ @SuppressWarnings("deprecation") // Checks legacy Tika-1.0 style metadata keys
+ public void testExcelParser() throws Exception {
+
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+ XMLResult r = getXML("testEXCEL.xls", new OfficeParser(), new Metadata(), context);
+
+ assertEquals(
+ "application/vnd.ms-excel",
+ r.metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Simple Excel document", r.metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Keith Bennett", r.metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Keith Bennett", r.metadata.get(Metadata.AUTHOR));
+
+ // Mon Oct 01 17:13:56 BST 2007
+ assertEquals("2007-10-01T16:13:56Z", r.metadata.get(TikaCoreProperties.CREATED));
+ assertEquals("2007-10-01T16:13:56Z", r.metadata.get(Metadata.CREATION_DATE));
+
+ // Mon Oct 01 17:31:43 BST 2007
+ assertEquals("2007-10-01T16:31:43Z", r.metadata.get(TikaCoreProperties.MODIFIED));
+ assertEquals("2007-10-01T16:31:43Z", r.metadata.get(Metadata.DATE));
+
+ String content = r.xml;
+ assertContains("Sample Excel Worksheet", content);
+ assertContains("Numbers and their Squares", content);
+ assertContains("<tr>\t<td />\t<td>Number</td>\t<td>Square", content);
+ assertContains("9", content);
+ assertNotContained("9.0", content);
+ assertContains("196", content);
+ assertNotContained("196.0", content);
+
+ }
+
+ @Test
+ public void testExcelParserFormatting() throws Exception {
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+ XMLResult r = getXML("testEXCEL-formats.xls", new OfficeParser(), new Metadata(), context);
+
+ assertEquals(
+ "application/vnd.ms-excel",
+ r.metadata.get(Metadata.CONTENT_TYPE));
+
+ String content = r.xml;
+
+ // Number #,##0.00
+ assertContains("1,599.99", content);
+ assertContains("-1,599.99", content);
+
+ // Currency $#,##0.00;[Red]($#,##0.00)
+ assertContains("$1,599.99", content);
+ assertContains("($1,599.99)", content);
+
+ // Scientific 0.00E+00
+ // poi <=3.8beta1 returns 1.98E08, newer versions return 1.98+E08
+ assertTrue(content.contains("1.98E08") || content.contains("1.98E+08"));
+ assertTrue(content.contains("-1.98E08") || content.contains("-1.98E+08"));
+
+ // Percentage.
+ assertContains("2.50%", content);
+ // Excel rounds up to 3%, but that requires Java 1.6 or later
+ if (System.getProperty("java.version").startsWith("1.5")) {
+ assertContains("2%", content);
+ } else {
+ assertContains("3%", content);
+ }
+
+ // Time Format: h:mm
+ assertContains("6:15", content);
+ assertContains("18:15", content);
+
+ // Date Format: d-mmm-yy
+ assertContains("17-May-07", content);
+
+ // Date Format: m/d/yy
+ assertContains("10/3/09", content);
+
+ // Date/Time Format: m/d/yy h:mm
+ assertContains("1/19/08 4:35", content);
+
+ // Fraction (2.5): # ?/?
+ assertContains("2 1/2", content);
+
+
+ // Below assertions represent outstanding formatting issues to be addressed
+ // they are included to allow the issues to be progressed with the Apache POI
+ // team - See TIKA-103.
+
+ /*************************************************************************
+ // Custom Number (0 "dollars and" .00 "cents")
+ assertContains("19 dollars and .99 cents", content);
+
+ // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy)
+ assertContains("At 4:20 AM on Thursday May 17, 2007", content);
+ **************************************************************************/
+
+
+ }
+
+ @Test
+ public void testExcelParserPassword() throws Exception {
+ try {
+ XMLResult r = getXML("testEXCEL_protected_passtika.xls");
+ fail("Document is encrypted, shouldn't parse");
+ } catch (EncryptedDocumentException e) {
+ // Good
+ }
+
+ // Try again, this time with the password
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+ context.set(PasswordProvider.class, new PasswordProvider() {
+ @Override
+ public String getPassword(Metadata metadata) {
+ return "tika";
+ }
+ });
+ XMLResult r = getXML("testEXCEL_protected_passtika.xls", new OfficeParser(), new Metadata(), context);
+
+ assertEquals(
+ "application/vnd.ms-excel",
+ r.metadata.get(Metadata.CONTENT_TYPE));
+
+ assertEquals(null, r.metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Antoni", r.metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("2011-11-25T09:52:48Z", r.metadata.get(TikaCoreProperties.CREATED));
+
+ String content = r.xml;
+ assertContains("This is an Encrypted Excel spreadsheet", content);
+ assertNotContained("9.0", content);
+
+ }
+
+ /**
+ * TIKA-214 - Ensure we extract labels etc from Charts
+ */
+ @Test
+ public void testExcelParserCharts() throws Exception {
+
+ XMLResult r = getXML("testEXCEL-charts.xls", new OfficeParser());
+ assertEquals(
+ "application/vnd.ms-excel",
+ r.metadata.get(Metadata.CONTENT_TYPE));
+
+ String content = r.xml;
+
+ // The first sheet has a pie chart
+ assertContains("charttabyodawg", content);
+ assertContains("WhamPuff", content);
+
+ // The second sheet has a bar chart and some text
+ assertContains("Sheet1", content);
+ assertContains("Test Excel Spreasheet", content);
+ assertContains("foo", content);
+ assertContains("bar", content);
+ assertContains("fizzlepuff", content);
+ assertContains("whyaxis", content);
+ assertContains("eksaxis", content);
+
+ // The third sheet has some text
+ assertContains("Sheet2", content);
+ assertContains("dingdong", content);
+
+ }
+
+ @Test
+ public void testJXL() throws Exception {
+
+ XMLResult r = getXML("jxl.xls", new OfficeParser());
+ assertEquals(
+ "application/vnd.ms-excel",
+ r.metadata.get(Metadata.CONTENT_TYPE));
+ assertContains("Number Formats", r.xml);
+
+ }
+
+ @Test
+ public void testWorksSpreadsheet70() throws Exception {
+ assertContains("Microsoft Works",
+ getXML("testWORKSSpreadsheet7.0.xlr", new OfficeParser()).xml);
+ }
+
+ /**
+ * We don't currently support the .xlsb file format
+ * (an OOXML container with binary blobs), but we
+ * shouldn't break on these files either (TIKA-826)
+ */
+ @Test
+ public void testExcelXLSB() throws Exception {
+ Detector detector = new DefaultDetector();
+ AutoDetectParser parser = new AutoDetectParser();
+
+ Metadata m = new Metadata();
+ m.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");
+
+ // Should be detected correctly
+ MediaType type;
+ try (InputStream input = getTestDocumentAsStream("testEXCEL.xlsb")) {
+ type = detector.detect(input, m);
+ assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString());
+ }
+
+ // OfficeParser won't handle it
+ assertEquals(false, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
+
+ // OOXMLParser won't handle it
+ assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
+
+ // AutoDetectParser doesn't break on it
+ assertContains("<body />", getXML("testEXCEL.xlsb").xml);
+
+ }
+
+ /**
+ * Excel 5 and 95 are older formats, and only get basic support
+ */
+ @Test
+ public void testExcel95() throws Exception {
+ Detector detector = new DefaultDetector();
+ AutoDetectParser parser = new AutoDetectParser();
+ MediaType type;
+ Metadata m;
+
+ // First try detection of Excel 5
+ m = new Metadata();
+ m.add(Metadata.RESOURCE_NAME_KEY, "excel_5.xls");
+ try (InputStream input = getTestDocumentAsStream("testEXCEL_5.xls")) {
+ type = detector.detect(input, m);
+ assertEquals("application/vnd.ms-excel", type.toString());
+ }
+
+ // Now Excel 95
+ m = new Metadata();
+ m.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");
+ try (InputStream input = getTestDocumentAsStream("testEXCEL_95.xls")) {
+ type = detector.detect(input, m);
+ assertEquals("application/vnd.ms-excel", type.toString());
+ }
+
+ // OfficeParser can handle it
+ assertEquals(true, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
+
+ // OOXMLParser won't handle it
+ assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
+
+
+ // Parse the Excel 5 file
+ m = new Metadata();
+ try (InputStream input = getTestDocumentAsStream("testEXCEL_5.xls")) {
+ ContentHandler handler = new BodyContentHandler(-1);
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+ parser.parse(input, handler, m, context);
+
+ String content = handler.toString();
+
+ // Sheet names
+ assertContains("Feuil1", content);
+ assertContains("Feuil3", content);
+
+ // Text
+ assertContains("Sample Excel", content);
+ assertContains("Number", content);
+
+ // Numbers
+ assertContains("15", content);
+ assertContains("225", content);
+
+ // Metadata was also fetched
+ assertEquals("Simple Excel document", m.get(TikaCoreProperties.TITLE));
+ assertEquals("Keith Bennett", m.get(TikaCoreProperties.CREATOR));
+ }
+
+ // Parse the Excel 95 file
+ m = new Metadata();
+ try (InputStream input = getTestDocumentAsStream("testEXCEL_95.xls")) {
+ ContentHandler handler = new BodyContentHandler(-1);
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+ parser.parse(input, handler, m, context);
+
+ String content = handler.toString();
+
+ // Sheet name
+ assertContains("Foglio1", content);
+
+ // Very boring file, no actual text or numbers!
+
+ // Metadata was also fetched
+ assertEquals(null, m.get(TikaCoreProperties.TITLE));
+ assertEquals("Marco Quaranta", m.get(Office.LAST_AUTHOR));
+ }
+ }
+
+ /**
+ * Ensures that custom OLE2 (HPSF) properties are extracted
+ */
+ @Test
+ public void testCustomProperties() throws Exception {
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+
+ XMLResult r = getXML("testEXCEL_custom_props.xls", new OfficeParser(), new Metadata(), context);
+ Metadata metadata = r.metadata;
+ assertEquals("application/vnd.ms-excel", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("", metadata.get(TikaCoreProperties.MODIFIER));
+ assertEquals("2011-08-22T13:45:54Z", metadata.get(TikaCoreProperties.MODIFIED));
+ assertEquals("2006-09-12T15:06:44Z", metadata.get(TikaCoreProperties.CREATED));
+ assertEquals("Microsoft Excel", metadata.get(OfficeOpenXMLExtended.APPLICATION));
+ assertEquals("true", metadata.get("custom:myCustomBoolean"));
+ assertEquals("3", metadata.get("custom:myCustomNumber"));
+ assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
+ assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
+ assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
+ }
+
+ @Test
+ public void testHeaderAndFooterExtraction() throws Exception {
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.UK);
+
+ XMLResult r = getXML("testEXCEL_headers_footers.xls", new OfficeParser(),
+ new Metadata(), context);
+
+ Metadata metadata = r.metadata;
+ assertEquals(
+ "application/vnd.ms-excel",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Internal spreadsheet", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Aeham Abushwashi", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Aeham Abushwashi", metadata.get(Metadata.AUTHOR));
+
+ String content = r.xml;
+ assertContains("John Smith1", content);
+ assertContains("John Smith50", content);
+ assertContains("1 Corporate HQ", content);
+ assertContains("Header - Corporate Spreadsheet", content);
+ assertContains("Header - For Internal Use Only", content);
+ assertContains("Header - Author: John Smith", content);
+ assertContains("Footer - Corporate Spreadsheet", content);
+ assertContains("Footer - For Internal Use Only", content);
+ assertContains("Footer - Author: John Smith", content);
+
+ }
+
+ @Test
+ public void testHyperlinksInXLS() throws Exception {
+ String xml = getXML("testEXCEL_hyperlinks.xls").xml;
+ //external url
+ assertContains("<a href=\"http://tika.apache.org/\">", xml);
+ //mail url
+ assertContains("<a href=\"mailto:user@tika.apache.org?subject=help\">", xml);
+ //external linked file
+ assertContains("<a href=\"linked_file.txt.htm\">", xml);
+
+ //TODO: not extracting these yet
+ //link on textbox
+// assertContains("<a href=\"http://tika.apache.org/1.12/gettingstarted.html\">", xml);
+ }
+
+ @Test
+ public void testEmbeddedPDF() throws Exception {
+ List<Metadata> metadataList = getRecursiveJson("testEXCEL_embeddedPDF.xls");
+ assertEquals("application/pdf", metadataList.get(2).get(Metadata.CONTENT_TYPE));
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java
index 07644dd..beffee6 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java
@@ -1,46 +1,46 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import static org.junit.Assert.assertTrue;
-
-import java.io.InputStream;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.microsoft.ooxml.OOXMLParserTest;
-import org.junit.Test;
-
-
-public class OfficeParserTest extends TikaTest {
-
- @Test
- public void parseOfficeWord() throws Exception {
- Metadata metadata = new Metadata();
- Parser parser = new OfficeParser();
-
- String xml = getXML(getTestDocument("test.doc"), parser, metadata).xml;
-
- assertTrue(xml.contains("test"));
- }
-
- private InputStream getTestDocument(String name) {
- return TikaInputStream.get(OOXMLParserTest.class.getResourceAsStream("/test-documents/" + name));
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLParserTest;
+import org.junit.Test;
+
+
+public class OfficeParserTest extends TikaTest {
+
+ @Test
+ public void parseOfficeWord() throws Exception {
+ Metadata metadata = new Metadata();
+ Parser parser = new OfficeParser();
+
+ String xml = getXML(getTestDocument("test.doc"), parser, metadata).xml;
+
+ assertTrue(xml.contains("test"));
+ }
+
+ private InputStream getTestDocument(String name) {
+ return TikaInputStream.get(OOXMLParserTest.class.getResourceAsStream("/test-documents/" + name));
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index fbf8114..8662e65 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -1,239 +1,239 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
-import javax.xml.transform.OutputKeys;
-import javax.xml.transform.sax.SAXTransformerFactory;
-import javax.xml.transform.sax.TransformerHandler;
-import javax.xml.transform.stream.StreamResult;
-import java.io.InputStream;
-import java.io.StringWriter;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-/**
- * Test case for parsing Outlook files.
- */
-public class OutlookParserTest extends TikaTest {
-
- @Test
- public void testOutlookParsing() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
- "/test-documents/test-outlook.msg")) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
-
- assertEquals(
- "application/vnd.ms-outlook",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals(
- "Microsoft Outlook Express 6",
- metadata.get(TikaCoreProperties.TITLE));
- assertEquals(
- "Nouvel utilisateur de Outlook Express",
- metadata.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
- assertEquals(
- "L'\u00C9quipe Microsoft Outlook Express",
- metadata.get(TikaCoreProperties.CREATOR));
- assertEquals(
- "L'\u00C9quipe Microsoft Outlook Express",
- metadata.get(Metadata.AUTHOR));
-
- // Stored as Thu, 5 Apr 2007 09:26:06 -0700
- assertEquals(
- "2007-04-05T16:26:06Z",
- metadata.get(TikaCoreProperties.CREATED));
-
- String content = handler.toString();
- assertContains("Microsoft Outlook Express 6", content);
- assertContains("L'\u00C9quipe Microsoft Outlook Express", content);
- assertContains("Nouvel utilisateur de Outlook Express", content);
- assertContains("Messagerie et groupes de discussion", content);
- }
-
- /**
- * Test case for TIKA-197
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-197">TIKA-197</a>
- */
- @Test
- public void testMultipleCopies() throws Exception {
- Parser parser = new AutoDetectParser();
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
- "/test-documents/testMSG.msg")) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
-
- assertEquals(
- "application/vnd.ms-outlook",
- metadata.get(Metadata.CONTENT_TYPE));
-
- String content = handler.toString();
- Pattern pattern = Pattern.compile("From");
- Matcher matcher = pattern.matcher(content);
- assertTrue(matcher.find());
- assertFalse(matcher.find());
- }
-
- /**
- * Test case for TIKA-395, to ensure parser works for new Outlook formats.
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-395">TIKA-395</a>
- */
- @Test
- public void testOutlookNew() throws Exception {
- Parser parser = new AutoDetectParser();
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
- "/test-documents/test-outlook2003.msg")) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
-
- assertEquals(
- "application/vnd.ms-outlook",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals(
- "Welcome to Microsoft Office Outlook 2003",
- metadata.get(TikaCoreProperties.TITLE));
-
- String content = handler.toString();
- assertContains("Outlook 2003", content);
- assertContains("Streamlined Mail Experience", content);
- assertContains("Navigation Pane", content);
- }
-
- @Test
- public void testOutlookHTMLVersion() throws Exception {
- Parser parser = new AutoDetectParser();
- Metadata metadata = new Metadata();
-
- // Check the HTML version
- StringWriter sw = new StringWriter();
- SAXTransformerFactory factory = (SAXTransformerFactory)
- SAXTransformerFactory.newInstance();
- TransformerHandler handler = factory.newTransformerHandler();
- handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
- handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
- handler.setResult(new StreamResult(sw));
-
- try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
- "/test-documents/testMSG_chinese.msg")) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
-
- // As the HTML version should have been processed, ensure
- // we got some of the links
- String content = sw.toString();
- assertContains("<dd>tests.chang@fengttt.com</dd>", content);
- assertContains("<p>Alfresco MSG format testing", content);
- assertContains("<li>1", content);
- assertContains("<li>2", content);
-
- // Make sure we don't have nested html docs
- assertEquals(2, content.split("<body>").length);
- assertEquals(2, content.split("<\\/body>").length);
-
- // Make sure that the Chinese actually came through
- assertContains("\u5F35\u6BD3\u502B", metadata.get(TikaCoreProperties.CREATOR));
- assertContains("\u9673\u60E0\u73CD", content);
- }
-
- @Test
- public void testOutlookForwarded() throws Exception {
- Parser parser = new AutoDetectParser();
- Metadata metadata = new Metadata();
-
- // Check the HTML version
- StringWriter sw = new StringWriter();
- SAXTransformerFactory factory = (SAXTransformerFactory)
- SAXTransformerFactory.newInstance();
- TransformerHandler handler = factory.newTransformerHandler();
- handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
- handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
- handler.setResult(new StreamResult(sw));
-
- try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
- "/test-documents/testMSG_forwarded.msg")) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
-
- // Make sure we don't have nested docs
- String content = sw.toString();
- assertEquals(2, content.split("<body>").length);
- assertEquals(2, content.split("<\\/body>").length);
- }
-
- @Test
- public void testOutlookHTMLfromRTF() throws Exception {
- Parser parser = new AutoDetectParser();
- Metadata metadata = new Metadata();
-
- // Check the HTML version
- StringWriter sw = new StringWriter();
- SAXTransformerFactory factory = (SAXTransformerFactory)
- SAXTransformerFactory.newInstance();
- TransformerHandler handler = factory.newTransformerHandler();
- handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
- handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
- handler.setResult(new StreamResult(sw));
-
- try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
- "/test-documents/test-outlook2003.msg")) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
-
- // As the HTML version should have been processed, ensure
- // we got some of the links
- String content = sw.toString().replaceAll("<p>\\s+", "<p>");
- assertContains("<dd>New Outlook User</dd>", content);
- assertContains("designed <i>to help you", content);
- assertContains("<p><a href=\"http://r.office.microsoft.com/r/rlidOutlookWelcomeMail10?clid=1033\">Cached Exchange Mode</a>", content);
-
- // Link - check text around it, and the link itself
- assertContains("sign up for a free subscription", content);
- assertContains("Office Newsletter", content);
- assertContains("newsletter will be sent to you", content);
- assertContains("http://r.office.microsoft.com/r/rlidNewsletterSignUp?clid=1033", content);
-
- // Make sure we don't have nested html docs
- assertEquals(2, content.split("<body>").length);
- assertEquals(2, content.split("<\\/body>").length);
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing Outlook files.
+ */
+public class OutlookParserTest extends TikaTest {
+
+ @Test
+ public void testOutlookParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+ "/test-documents/test-outlook.msg")) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+
+ assertEquals(
+ "application/vnd.ms-outlook",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals(
+ "Microsoft Outlook Express 6",
+ metadata.get(TikaCoreProperties.TITLE));
+ assertEquals(
+ "Nouvel utilisateur de Outlook Express",
+ metadata.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
+ assertEquals(
+ "L'\u00C9quipe Microsoft Outlook Express",
+ metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals(
+ "L'\u00C9quipe Microsoft Outlook Express",
+ metadata.get(Metadata.AUTHOR));
+
+ // Stored as Thu, 5 Apr 2007 09:26:06 -0700
+ assertEquals(
+ "2007-04-05T16:26:06Z",
+ metadata.get(TikaCoreProperties.CREATED));
+
+ String content = handler.toString();
+ assertContains("Microsoft Outlook Express 6", content);
+ assertContains("L'\u00C9quipe Microsoft Outlook Express", content);
+ assertContains("Nouvel utilisateur de Outlook Express", content);
+ assertContains("Messagerie et groupes de discussion", content);
+ }
+
+ /**
+ * Test case for TIKA-197
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-197">TIKA-197</a>
+ */
+ @Test
+ public void testMultipleCopies() throws Exception {
+ Parser parser = new AutoDetectParser();
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+ "/test-documents/testMSG.msg")) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+
+ assertEquals(
+ "application/vnd.ms-outlook",
+ metadata.get(Metadata.CONTENT_TYPE));
+
+ String content = handler.toString();
+ Pattern pattern = Pattern.compile("From");
+ Matcher matcher = pattern.matcher(content);
+ assertTrue(matcher.find());
+ assertFalse(matcher.find());
+ }
+
+ /**
+ * Test case for TIKA-395, to ensure parser works for new Outlook formats.
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-395">TIKA-395</a>
+ */
+ @Test
+ public void testOutlookNew() throws Exception {
+ Parser parser = new AutoDetectParser();
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+ "/test-documents/test-outlook2003.msg")) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+
+ assertEquals(
+ "application/vnd.ms-outlook",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals(
+ "Welcome to Microsoft Office Outlook 2003",
+ metadata.get(TikaCoreProperties.TITLE));
+
+ String content = handler.toString();
+ assertContains("Outlook 2003", content);
+ assertContains("Streamlined Mail Experience", content);
+ assertContains("Navigation Pane", content);
+ }
+
+ @Test
+ public void testOutlookHTMLVersion() throws Exception {
+ Parser parser = new AutoDetectParser();
+ Metadata metadata = new Metadata();
+
+ // Check the HTML version
+ StringWriter sw = new StringWriter();
+ SAXTransformerFactory factory = (SAXTransformerFactory)
+ SAXTransformerFactory.newInstance();
+ TransformerHandler handler = factory.newTransformerHandler();
+ handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+ handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+ handler.setResult(new StreamResult(sw));
+
+ try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+ "/test-documents/testMSG_chinese.msg")) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+
+ // As the HTML version should have been processed, ensure
+ // we got some of the links
+ String content = sw.toString();
+ assertContains("<dd>tests.chang@fengttt.com</dd>", content);
+ assertContains("<p>Alfresco MSG format testing", content);
+ assertContains("<li>1", content);
+ assertContains("<li>2", content);
+
+ // Make sure we don't have nested html docs
+ assertEquals(2, content.split("<body>").length);
+ assertEquals(2, content.split("<\\/body>").length);
+
+ // Make sure that the Chinese actually came through
+ assertContains("\u5F35\u6BD3\u502B", metadata.get(TikaCoreProperties.CREATOR));
+ assertContains("\u9673\u60E0\u73CD", content);
+ }
+
+ @Test
+ public void testOutlookForwarded() throws Exception {
+ Parser parser = new AutoDetectParser();
+ Metadata metadata = new Metadata();
+
+ // Check the HTML version
+ StringWriter sw = new StringWriter();
+ SAXTransformerFactory factory = (SAXTransformerFactory)
+ SAXTransformerFactory.newInstance();
+ TransformerHandler handler = factory.newTransformerHandler();
+ handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+ handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+ handler.setResult(new StreamResult(sw));
+
+ try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+ "/test-documents/testMSG_forwarded.msg")) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+
+ // Make sure we don't have nested docs
+ String content = sw.toString();
+ assertEquals(2, content.split("<body>").length);
+ assertEquals(2, content.split("<\\/body>").length);
+ }
+
+ @Test
+ public void testOutlookHTMLfromRTF() throws Exception {
+ Parser parser = new AutoDetectParser();
+ Metadata metadata = new Metadata();
+
+ // Check the HTML version
+ StringWriter sw = new StringWriter();
+ SAXTransformerFactory factory = (SAXTransformerFactory)
+ SAXTransformerFactory.newInstance();
+ TransformerHandler handler = factory.newTransformerHandler();
+ handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+ handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+ handler.setResult(new StreamResult(sw));
+
+ try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+ "/test-documents/test-outlook2003.msg")) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+
+ // As the HTML version should have been processed, ensure
+ // we got some of the links
+ String content = sw.toString().replaceAll("<p>\\s+", "<p>");
+ assertContains("<dd>New Outlook User</dd>", content);
+ assertContains("designed <i>to help you", content);
+ assertContains("<p><a href=\"http://r.office.microsoft.com/r/rlidOutlookWelcomeMail10?clid=1033\">Cached Exchange Mode</a>", content);
+
+ // Link - check text around it, and the link itself
+ assertContains("sign up for a free subscription", content);
+ assertContains("Office Newsletter", content);
+ assertContains("newsletter will be sent to you", content);
+ assertContains("http://r.office.microsoft.com/r/rlidNewsletterSignUp?clid=1033", content);
+
+ // Make sure we don't have nested html docs
+ assertEquals(2, content.split("<body>").length);
+ assertEquals(2, content.split("<\\/body>").length);
+ }
+}
[09/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
index 77773e0..f9df9e0 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
@@ -1,544 +1,544 @@
-/**
- * ******************************************************************************
- * Copyright (C) 2005-2009, International Business Machines Corporation and *
- * others. All Rights Reserved. *
- * ******************************************************************************
- */
-package org.apache.tika.parser.txt;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.Reader;
-import java.nio.charset.Charset;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-
-
-/**
- * <code>CharsetDetector</code> provides a facility for detecting the
- * charset or encoding of character data in an unknown format.
- * The input data can either be from an input stream or an array of bytes.
- * The result of the detection operation is a list of possibly matching
- * charsets, or, for simple use, you can just ask for a Java Reader that
- * will will work over the input data.
- * <p/>
- * Character set detection is at best an imprecise operation. The detection
- * process will attempt to identify the charset that best matches the characteristics
- * of the byte data, but the process is partly statistical in nature, and
- * the results can not be guaranteed to always be correct.
- * <p/>
- * For best accuracy in charset detection, the input data should be primarily
- * in a single language, and a minimum of a few hundred bytes worth of plain text
- * in the language are needed. The detection process will attempt to
- * ignore html or xml style markup that could otherwise obscure the content.
- * <p/>
- * @stable ICU 3.4
- */
-public class CharsetDetector {
-
-// Question: Should we have getters corresponding to the setters for input text
-// and declared encoding?
-
-// A thought: If we were to create our own type of Java Reader, we could defer
-// figuring out an actual charset for data that starts out with too much English
-// only ASCII until the user actually read through to something that didn't look
-// like 7 bit English. If nothing else ever appeared, we would never need to
-// actually choose the "real" charset. All assuming that the application just
-// wants the data, and doesn't care about a char set name.
-
- private static final int kBufSize = 12000;
- private static final int MAX_CONFIDENCE = 100;
- private static String[] fCharsetNames;
- /*
- * List of recognizers for all charsets known to the implementation.
- */
- private static ArrayList<CharsetRecognizer> fCSRecognizers = createRecognizers();
- /*
- * The following items are accessed by individual CharsetRecongizers during
- * the recognition process
- *
- */
- byte[] fInputBytes = // The text to be checked. Markup will have been
- new byte[kBufSize]; // removed if appropriate.
- int fInputLen; // Length of the byte data in fInputText.
- short fByteStats[] = // byte frequency statistics for the input text.
- new short[256]; // Value is percent, not absolute.
- boolean fC1Bytes = // True if any bytes in the range 0x80 - 0x9F are in the input;
- false;
- String fDeclaredEncoding;
- //
- // Stuff private to CharsetDetector
- //
- byte[] fRawInput; // Original, untouched input bytes.
- // If user gave us a byte array, this is it.
- // If user gave us a stream, it's read to a
- // buffer here.
- int fRawLength; // Length of data in fRawInput array.
- InputStream fInputStream; // User's input stream, or null if the user
- boolean fStripTags = // If true, setText() will strip tags from input text.
- false;
-
- /**
- * Constructor
- *
- * @stable ICU 3.4
- */
- public CharsetDetector() {
- }
-
- /**
- * Get the names of all char sets that can be recognized by the char set detector.
- *
- * @return an array of the names of all charsets that can be recognized
- * by the charset detector.
- *
- * @stable ICU 3.4
- */
- public static String[] getAllDetectableCharsets() {
- return fCharsetNames;
- }
-
- /*
- * Create the singleton instances of the CharsetRecognizer classes
- */
- private static ArrayList<CharsetRecognizer> createRecognizers() {
- ArrayList<CharsetRecognizer> recognizers = new ArrayList<CharsetRecognizer>();
-
- recognizers.add(new CharsetRecog_UTF8());
-
- recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE());
- recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE());
- recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE());
- recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE());
-
- recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
- recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
- recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
- recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
- recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030());
- recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp());
- recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr());
- recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5());
-
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr());
-
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr());
-
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_en());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_de());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_es());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_fr());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_it());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl());
-
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM866_ru());
-
- // Create an array of all charset names, as a side effect.
- // Needed for the getAllDetectableCharsets() API.
- String[] charsetNames = new String[recognizers.size()];
- int out = 0;
-
- for (CharsetRecognizer recognizer : recognizers) {
- String name = recognizer.getName();
-
- if (out == 0 || !name.equals(charsetNames[out - 1])) {
- charsetNames[out++] = name;
- }
- }
-
- fCharsetNames = new String[out];
- System.arraycopy(charsetNames, 0, fCharsetNames, 0, out);
-
- return recognizers;
- }
-
- /**
- * Set the declared encoding for charset detection.
- * The declared encoding of an input text is an encoding obtained
- * from an http header or xml declaration or similar source that
- * can be provided as additional information to the charset detector.
- * A match between a declared encoding and a possible detected encoding
- * will raise the quality of that detected encoding by a small delta,
- * and will also appear as a "reason" for the match.
- * <p/>
- * A declared encoding that is incompatible with the input data being
- * analyzed will not be added to the list of possible encodings.
- *
- * @param encoding The declared encoding
- *
- * @stable ICU 3.4
- */
- public CharsetDetector setDeclaredEncoding(String encoding) {
- setCanonicalDeclaredEncoding(encoding);
- return this;
- }
-
- /**
- * Set the input text (byte) data whose charset is to be detected.
- *
- * @param in the input text of unknown encoding
- *
- * @return This CharsetDetector
- *
- * @stable ICU 3.4
- */
- public CharsetDetector setText(byte[] in) {
- fRawInput = in;
- fRawLength = in.length;
-
- MungeInput();
-
- return this;
- }
- // Value is rounded up, so zero really means zero occurences.
-
- /**
- * Set the input text (byte) data whose charset is to be detected.
- * <p/>
- * The input stream that supplies the character data must have markSupported()
- * == true; the charset detection process will read a small amount of data,
- * then return the stream to its original position via
- * the InputStream.reset() operation. The exact amount that will
- * be read depends on the characteristics of the data itself.
- *
- * @param in the input text of unknown encoding
- *
- * @return This CharsetDetector
- *
- * @stable ICU 3.4
- */
-
- public CharsetDetector setText(InputStream in) throws IOException {
- fInputStream = in;
- fInputStream.mark(kBufSize);
- fRawInput = new byte[kBufSize]; // Always make a new buffer because the
- // previous one may have come from the caller,
- // in which case we can't touch it.
- fRawLength = 0;
- int remainingLength = kBufSize;
- while (remainingLength > 0) {
- // read() may give data in smallish chunks, esp. for remote sources. Hence, this loop.
- int bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength);
- if (bytesRead <= 0) {
- break;
- }
- fRawLength += bytesRead;
- remainingLength -= bytesRead;
- }
- fInputStream.reset();
-
- MungeInput(); // Strip html markup, collect byte stats.
- return this;
- }
-
- /**
- * Return the charset that best matches the supplied input data.
- *
- * Note though, that because the detection
- * only looks at the start of the input data,
- * there is a possibility that the returned charset will fail to handle
- * the full set of input data.
- * <p/>
- * Raise an exception if
- * <ul>
- * <li>no charset appears to match the data.</li>
- * <li>no input text has been provided</li>
- * </ul>
- *
- * @return a CharsetMatch object representing the best matching charset, or
- * <code>null</code> if there are no matches.
- *
- * @stable ICU 3.4
- */
- public CharsetMatch detect() {
-// TODO: A better implementation would be to copy the detect loop from
-// detectAll(), and cut it short as soon as a match with a high confidence
-// is found. This is something to be done later, after things are otherwise
-// working.
- CharsetMatch matches[] = detectAll();
-
- if (matches == null || matches.length == 0) {
- return null;
- }
-
- return matches[0];
- }
-
- /**
- * Return an array of all charsets that appear to be plausible
- * matches with the input data. The array is ordered with the
- * best quality match first.
- * <p/>
- * Raise an exception if
- * <ul>
- * <li>no charsets appear to match the input data.</li>
- * <li>no input text has been provided</li>
- * </ul>
- *
- * @return An array of CharsetMatch objects representing possibly matching charsets.
- *
- * @stable ICU 3.4
- */
- public CharsetMatch[] detectAll() {
- CharsetRecognizer csr;
- int i;
- int detectResults;
- int confidence;
- ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>();
-
- // Iterate over all possible charsets, remember all that
- // give a match quality > 0.
- for (i = 0; i < fCSRecognizers.size(); i++) {
- csr = fCSRecognizers.get(i);
- detectResults = csr.match(this);
- confidence = detectResults & 0x000000ff;
- if (confidence > 0) {
- // Just to be safe, constrain
- confidence = Math.min(confidence, MAX_CONFIDENCE);
-
- // Apply charset hint.
- if ((fDeclaredEncoding != null) && (fDeclaredEncoding.equalsIgnoreCase(csr.getName()))) {
- // Reduce lack of confidence (delta between "sure" and current) by 50%.
- confidence += (MAX_CONFIDENCE - confidence) / 2;
- }
-
- CharsetMatch m = new CharsetMatch(this, csr, confidence);
- matches.add(m);
- }
- }
-
- Collections.sort(matches); // CharsetMatch compares on confidence
- Collections.reverse(matches); // Put best match first.
- CharsetMatch[] resultArray = new CharsetMatch[matches.size()];
- resultArray = matches.toArray(resultArray);
- return resultArray;
- }
-
- /**
- * Autodetect the charset of an inputStream, and return a Java Reader
- * to access the converted input data.
- * <p/>
- * This is a convenience method that is equivalent to
- * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
- * <p/>
- * For the input stream that supplies the character data, markSupported()
- * must be true; the charset detection will read a small amount of data,
- * then return the stream to its original position via
- * the InputStream.reset() operation. The exact amount that will
- * be read depends on the characteristics of the data itself.
- *<p/>
- * Raise an exception if no charsets appear to match the input data.
- *
- * @param in The source of the byte data in the unknown charset.
- *
- * @param declaredEncoding A declared encoding for the data, if available,
- * or null or an empty string if none is available.
- *
- * @stable ICU 3.4
- */
- public Reader getReader(InputStream in, String declaredEncoding) {
- setCanonicalDeclaredEncoding(declaredEncoding);
-
- try {
- setText(in);
-
- CharsetMatch match = detect();
-
- if (match == null) {
- return null;
- }
-
- return match.getReader();
- } catch (IOException e) {
- return null;
- }
- }
-
- /**
- * Autodetect the charset of an inputStream, and return a String
- * containing the converted input data.
- * <p/>
- * This is a convenience method that is equivalent to
- * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
- *<p/>
- * Raise an exception if no charsets appear to match the input data.
- *
- * @param in The source of the byte data in the unknown charset.
- *
- * @param declaredEncoding A declared encoding for the data, if available,
- * or null or an empty string if none is available.
- *
- * @stable ICU 3.4
- */
- public String getString(byte[] in, String declaredEncoding) {
- setCanonicalDeclaredEncoding(declaredEncoding);
-
- try {
- setText(in);
-
- CharsetMatch match = detect();
-
- if (match == null) {
- return null;
- }
-
- return match.getString(-1);
- } catch (IOException e) {
- return null;
- }
- }
- // gave us a byte array.
-
- /**
- * Test whether or not input filtering is enabled.
- *
- * @return <code>true</code> if input text will be filtered.
- *
- * @see #enableInputFilter
- *
- * @stable ICU 3.4
- */
- public boolean inputFilterEnabled() {
- return fStripTags;
- }
-
- /**
- * Enable filtering of input text. If filtering is enabled,
- * text within angle brackets ("<" and ">") will be removed
- * before detection.
- *
- * @param filter <code>true</code> to enable input text filtering.
- *
- * @return The previous setting.
- *
- * @stable ICU 3.4
- */
- public boolean enableInputFilter(boolean filter) {
- boolean previous = fStripTags;
-
- fStripTags = filter;
-
- return previous;
- }
-
- /**
- * Try to set fDeclaredEncoding to the canonical name for <encoding>, if it exists.
- *
- * @param encoding - name of character encoding
- */
- private void setCanonicalDeclaredEncoding(String encoding) {
- if ((encoding == null) || encoding.isEmpty()) {
- return;
- }
-
- Charset cs = Charset.forName(encoding);
- if (cs != null) {
- fDeclaredEncoding = cs.name();
- }
- }
-
- /*
- * MungeInput - after getting a set of raw input data to be analyzed, preprocess
- * it by removing what appears to be html markup.
- */
- private void MungeInput() {
- int srci = 0;
- int dsti = 0;
- byte b;
- boolean inMarkup = false;
- int openTags = 0;
- int badTags = 0;
-
- //
- // html / xml markup stripping.
- // quick and dirty, not 100% accurate, but hopefully good enough, statistically.
- // discard everything within < brackets >
- // Count how many total '<' and illegal (nested) '<' occur, so we can make some
- // guess as to whether the input was actually marked up at all.
- if (fStripTags) {
- for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) {
- b = fRawInput[srci];
- if (b == (byte) '<') {
- if (inMarkup) {
- badTags++;
- }
- inMarkup = true;
- openTags++;
- }
-
- if (!inMarkup) {
- fInputBytes[dsti++] = b;
- }
-
- if (b == (byte) '>') {
- inMarkup = false;
- }
- }
-
- fInputLen = dsti;
- }
-
- //
- // If it looks like this input wasn't marked up, or if it looks like it's
- // essentially nothing but markup abandon the markup stripping.
- // Detection will have to work on the unstripped input.
- //
- if (openTags < 5 || openTags / 5 < badTags ||
- (fInputLen < 100 && fRawLength > 600)) {
- int limit = fRawLength;
-
- if (limit > kBufSize) {
- limit = kBufSize;
- }
-
- for (srci = 0; srci < limit; srci++) {
- fInputBytes[srci] = fRawInput[srci];
- }
- fInputLen = srci;
- }
-
- //
- // Tally up the byte occurence statistics.
- // These are available for use by the various detectors.
- //
- Arrays.fill(fByteStats, (short) 0);
- for (srci = 0; srci < fInputLen; srci++) {
- int val = fInputBytes[srci] & 0x00ff;
- fByteStats[val]++;
- }
-
- fC1Bytes = false;
- for (int i = 0x80; i <= 0x9F; i += 1) {
- if (fByteStats[i] != 0) {
- fC1Bytes = true;
- break;
- }
- }
- }
-}
+/**
+ * ******************************************************************************
+ * Copyright (C) 2005-2009, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
+ * ******************************************************************************
+ */
+package org.apache.tika.parser.txt;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+
+
+/**
+ * <code>CharsetDetector</code> provides a facility for detecting the
+ * charset or encoding of character data in an unknown format.
+ * The input data can either be from an input stream or an array of bytes.
+ * The result of the detection operation is a list of possibly matching
+ * charsets, or, for simple use, you can just ask for a Java Reader that
+ * will will work over the input data.
+ * <p/>
+ * Character set detection is at best an imprecise operation. The detection
+ * process will attempt to identify the charset that best matches the characteristics
+ * of the byte data, but the process is partly statistical in nature, and
+ * the results can not be guaranteed to always be correct.
+ * <p/>
+ * For best accuracy in charset detection, the input data should be primarily
+ * in a single language, and a minimum of a few hundred bytes worth of plain text
+ * in the language are needed. The detection process will attempt to
+ * ignore html or xml style markup that could otherwise obscure the content.
+ * <p/>
+ * @stable ICU 3.4
+ */
+public class CharsetDetector {
+
+// Question: Should we have getters corresponding to the setters for input text
+// and declared encoding?
+
+// A thought: If we were to create our own type of Java Reader, we could defer
+// figuring out an actual charset for data that starts out with too much English
+// only ASCII until the user actually read through to something that didn't look
+// like 7 bit English. If nothing else ever appeared, we would never need to
+// actually choose the "real" charset. All assuming that the application just
+// wants the data, and doesn't care about a char set name.
+
+ private static final int kBufSize = 12000;
+ private static final int MAX_CONFIDENCE = 100;
+ private static String[] fCharsetNames;
+ /*
+ * List of recognizers for all charsets known to the implementation.
+ */
+ private static ArrayList<CharsetRecognizer> fCSRecognizers = createRecognizers();
+ /*
+ * The following items are accessed by individual CharsetRecongizers during
+ * the recognition process
+ *
+ */
+ byte[] fInputBytes = // The text to be checked. Markup will have been
+ new byte[kBufSize]; // removed if appropriate.
+ int fInputLen; // Length of the byte data in fInputText.
+ short fByteStats[] = // byte frequency statistics for the input text.
+ new short[256]; // Value is percent, not absolute.
+ boolean fC1Bytes = // True if any bytes in the range 0x80 - 0x9F are in the input;
+ false;
+ String fDeclaredEncoding;
+ //
+ // Stuff private to CharsetDetector
+ //
+ byte[] fRawInput; // Original, untouched input bytes.
+ // If user gave us a byte array, this is it.
+ // If user gave us a stream, it's read to a
+ // buffer here.
+ int fRawLength; // Length of data in fRawInput array.
+ InputStream fInputStream; // User's input stream, or null if the user
+ boolean fStripTags = // If true, setText() will strip tags from input text.
+ false;
+
+ /**
+ * Constructor
+ *
+ * @stable ICU 3.4
+ */
+ public CharsetDetector() {
+ }
+
+ /**
+ * Get the names of all char sets that can be recognized by the char set detector.
+ *
+ * @return an array of the names of all charsets that can be recognized
+ * by the charset detector.
+ *
+ * @stable ICU 3.4
+ */
+ public static String[] getAllDetectableCharsets() {
+ return fCharsetNames;
+ }
+
+ /*
+ * Create the singleton instances of the CharsetRecognizer classes
+ */
+ private static ArrayList<CharsetRecognizer> createRecognizers() {
+ ArrayList<CharsetRecognizer> recognizers = new ArrayList<CharsetRecognizer>();
+
+ recognizers.add(new CharsetRecog_UTF8());
+
+ recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE());
+ recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE());
+ recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE());
+ recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE());
+
+ recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
+ recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
+ recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
+ recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
+ recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030());
+ recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp());
+ recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr());
+ recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5());
+
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr());
+
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr());
+
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_en());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_de());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_es());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_fr());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_it());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl());
+
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM866_ru());
+
+ // Create an array of all charset names, as a side effect.
+ // Needed for the getAllDetectableCharsets() API.
+ String[] charsetNames = new String[recognizers.size()];
+ int out = 0;
+
+ for (CharsetRecognizer recognizer : recognizers) {
+ String name = recognizer.getName();
+
+ if (out == 0 || !name.equals(charsetNames[out - 1])) {
+ charsetNames[out++] = name;
+ }
+ }
+
+ fCharsetNames = new String[out];
+ System.arraycopy(charsetNames, 0, fCharsetNames, 0, out);
+
+ return recognizers;
+ }
+
+ /**
+ * Set the declared encoding for charset detection.
+ * The declared encoding of an input text is an encoding obtained
+ * from an http header or xml declaration or similar source that
+ * can be provided as additional information to the charset detector.
+ * A match between a declared encoding and a possible detected encoding
+ * will raise the quality of that detected encoding by a small delta,
+ * and will also appear as a "reason" for the match.
+ * <p/>
+ * A declared encoding that is incompatible with the input data being
+ * analyzed will not be added to the list of possible encodings.
+ *
+ * @param encoding The declared encoding
+ *
+ * @stable ICU 3.4
+ */
+ public CharsetDetector setDeclaredEncoding(String encoding) {
+ setCanonicalDeclaredEncoding(encoding);
+ return this;
+ }
+
+ /**
+ * Set the input text (byte) data whose charset is to be detected.
+ *
+ * @param in the input text of unknown encoding
+ *
+ * @return This CharsetDetector
+ *
+ * @stable ICU 3.4
+ */
+ public CharsetDetector setText(byte[] in) {
+ fRawInput = in;
+ fRawLength = in.length;
+
+ MungeInput();
+
+ return this;
+ }
+ // Value is rounded up, so zero really means zero occurences.
+
+ /**
+ * Set the input text (byte) data whose charset is to be detected.
+ * <p/>
+ * The input stream that supplies the character data must have markSupported()
+ * == true; the charset detection process will read a small amount of data,
+ * then return the stream to its original position via
+ * the InputStream.reset() operation. The exact amount that will
+ * be read depends on the characteristics of the data itself.
+ *
+ * @param in the input text of unknown encoding
+ *
+ * @return This CharsetDetector
+ *
+ * @stable ICU 3.4
+ */
+
+ public CharsetDetector setText(InputStream in) throws IOException {
+ fInputStream = in;
+ fInputStream.mark(kBufSize);
+ fRawInput = new byte[kBufSize]; // Always make a new buffer because the
+ // previous one may have come from the caller,
+ // in which case we can't touch it.
+ fRawLength = 0;
+ int remainingLength = kBufSize;
+ while (remainingLength > 0) {
+ // read() may give data in smallish chunks, esp. for remote sources. Hence, this loop.
+ int bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength);
+ if (bytesRead <= 0) {
+ break;
+ }
+ fRawLength += bytesRead;
+ remainingLength -= bytesRead;
+ }
+ fInputStream.reset();
+
+ MungeInput(); // Strip html markup, collect byte stats.
+ return this;
+ }
+
+ /**
+ * Return the charset that best matches the supplied input data.
+ *
+ * Note though, that because the detection
+ * only looks at the start of the input data,
+ * there is a possibility that the returned charset will fail to handle
+ * the full set of input data.
+ * <p/>
+ * Raise an exception if
+ * <ul>
+ * <li>no charset appears to match the data.</li>
+ * <li>no input text has been provided</li>
+ * </ul>
+ *
+ * @return a CharsetMatch object representing the best matching charset, or
+ * <code>null</code> if there are no matches.
+ *
+ * @stable ICU 3.4
+ */
+ public CharsetMatch detect() {
+// TODO: A better implementation would be to copy the detect loop from
+// detectAll(), and cut it short as soon as a match with a high confidence
+// is found. This is something to be done later, after things are otherwise
+// working.
+ CharsetMatch matches[] = detectAll();
+
+ if (matches == null || matches.length == 0) {
+ return null;
+ }
+
+ return matches[0];
+ }
+
+ /**
+ * Return an array of all charsets that appear to be plausible
+ * matches with the input data. The array is ordered with the
+ * best quality match first.
+ * <p/>
+ * Raise an exception if
+ * <ul>
+ * <li>no charsets appear to match the input data.</li>
+ * <li>no input text has been provided</li>
+ * </ul>
+ *
+ * @return An array of CharsetMatch objects representing possibly matching charsets.
+ *
+ * @stable ICU 3.4
+ */
+ public CharsetMatch[] detectAll() {
+ CharsetRecognizer csr;
+ int i;
+ int detectResults;
+ int confidence;
+ ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>();
+
+ // Iterate over all possible charsets, remember all that
+ // give a match quality > 0.
+ for (i = 0; i < fCSRecognizers.size(); i++) {
+ csr = fCSRecognizers.get(i);
+ detectResults = csr.match(this);
+ confidence = detectResults & 0x000000ff;
+ if (confidence > 0) {
+ // Just to be safe, constrain
+ confidence = Math.min(confidence, MAX_CONFIDENCE);
+
+ // Apply charset hint.
+ if ((fDeclaredEncoding != null) && (fDeclaredEncoding.equalsIgnoreCase(csr.getName()))) {
+ // Reduce lack of confidence (delta between "sure" and current) by 50%.
+ confidence += (MAX_CONFIDENCE - confidence) / 2;
+ }
+
+ CharsetMatch m = new CharsetMatch(this, csr, confidence);
+ matches.add(m);
+ }
+ }
+
+ Collections.sort(matches); // CharsetMatch compares on confidence
+ Collections.reverse(matches); // Put best match first.
+ CharsetMatch[] resultArray = new CharsetMatch[matches.size()];
+ resultArray = matches.toArray(resultArray);
+ return resultArray;
+ }
+
+ /**
+ * Autodetect the charset of an inputStream, and return a Java Reader
+ * to access the converted input data.
+ * <p/>
+ * This is a convenience method that is equivalent to
+ * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
+ * <p/>
+ * For the input stream that supplies the character data, markSupported()
+ * must be true; the charset detection will read a small amount of data,
+ * then return the stream to its original position via
+ * the InputStream.reset() operation. The exact amount that will
+ * be read depends on the characteristics of the data itself.
+ *<p/>
+ * Raise an exception if no charsets appear to match the input data.
+ *
+ * @param in The source of the byte data in the unknown charset.
+ *
+ * @param declaredEncoding A declared encoding for the data, if available,
+ * or null or an empty string if none is available.
+ *
+ * @stable ICU 3.4
+ */
+ public Reader getReader(InputStream in, String declaredEncoding) {
+ setCanonicalDeclaredEncoding(declaredEncoding);
+
+ try {
+ setText(in);
+
+ CharsetMatch match = detect();
+
+ if (match == null) {
+ return null;
+ }
+
+ return match.getReader();
+ } catch (IOException e) {
+ return null;
+ }
+ }
+
+ /**
+ * Autodetect the charset of an inputStream, and return a String
+ * containing the converted input data.
+ * <p/>
+ * This is a convenience method that is equivalent to
+ * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
+ *<p/>
+ * Raise an exception if no charsets appear to match the input data.
+ *
+ * @param in The source of the byte data in the unknown charset.
+ *
+ * @param declaredEncoding A declared encoding for the data, if available,
+ * or null or an empty string if none is available.
+ *
+ * @stable ICU 3.4
+ */
+ public String getString(byte[] in, String declaredEncoding) {
+ setCanonicalDeclaredEncoding(declaredEncoding);
+
+ try {
+ setText(in);
+
+ CharsetMatch match = detect();
+
+ if (match == null) {
+ return null;
+ }
+
+ return match.getString(-1);
+ } catch (IOException e) {
+ return null;
+ }
+ }
+ // gave us a byte array.
+
+ /**
+ * Test whether or not input filtering is enabled.
+ *
+ * @return <code>true</code> if input text will be filtered.
+ *
+ * @see #enableInputFilter
+ *
+ * @stable ICU 3.4
+ */
+ public boolean inputFilterEnabled() {
+ return fStripTags;
+ }
+
+ /**
+ * Enable filtering of input text. If filtering is enabled,
+ * text within angle brackets ("<" and ">") will be removed
+ * before detection.
+ *
+ * @param filter <code>true</code> to enable input text filtering.
+ *
+ * @return The previous setting.
+ *
+ * @stable ICU 3.4
+ */
+ public boolean enableInputFilter(boolean filter) {
+ boolean previous = fStripTags;
+
+ fStripTags = filter;
+
+ return previous;
+ }
+
+ /**
+ * Try to set fDeclaredEncoding to the canonical name for <encoding>, if it exists.
+ *
+ * @param encoding - name of character encoding
+ */
+ private void setCanonicalDeclaredEncoding(String encoding) {
+ if ((encoding == null) || encoding.isEmpty()) {
+ return;
+ }
+
+ Charset cs = Charset.forName(encoding);
+ if (cs != null) {
+ fDeclaredEncoding = cs.name();
+ }
+ }
+
+ /*
+ * MungeInput - after getting a set of raw input data to be analyzed, preprocess
+ * it by removing what appears to be html markup.
+ */
+ private void MungeInput() {
+ int srci = 0;
+ int dsti = 0;
+ byte b;
+ boolean inMarkup = false;
+ int openTags = 0;
+ int badTags = 0;
+
+ //
+ // html / xml markup stripping.
+ // quick and dirty, not 100% accurate, but hopefully good enough, statistically.
+ // discard everything within < brackets >
+ // Count how many total '<' and illegal (nested) '<' occur, so we can make some
+ // guess as to whether the input was actually marked up at all.
+ if (fStripTags) {
+ for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) {
+ b = fRawInput[srci];
+ if (b == (byte) '<') {
+ if (inMarkup) {
+ badTags++;
+ }
+ inMarkup = true;
+ openTags++;
+ }
+
+ if (!inMarkup) {
+ fInputBytes[dsti++] = b;
+ }
+
+ if (b == (byte) '>') {
+ inMarkup = false;
+ }
+ }
+
+ fInputLen = dsti;
+ }
+
+ //
+ // If it looks like this input wasn't marked up, or if it looks like it's
+ // essentially nothing but markup abandon the markup stripping.
+ // Detection will have to work on the unstripped input.
+ //
+ if (openTags < 5 || openTags / 5 < badTags ||
+ (fInputLen < 100 && fRawLength > 600)) {
+ int limit = fRawLength;
+
+ if (limit > kBufSize) {
+ limit = kBufSize;
+ }
+
+ for (srci = 0; srci < limit; srci++) {
+ fInputBytes[srci] = fRawInput[srci];
+ }
+ fInputLen = srci;
+ }
+
+ //
+ // Tally up the byte occurence statistics.
+ // These are available for use by the various detectors.
+ //
+ Arrays.fill(fByteStats, (short) 0);
+ for (srci = 0; srci < fInputLen; srci++) {
+ int val = fInputBytes[srci] & 0x00ff;
+ fByteStats[val]++;
+ }
+
+ fC1Bytes = false;
+ for (int i = 0x80; i <= 0x9F; i += 1) {
+ if (fByteStats[i] != 0) {
+ fC1Bytes = true;
+ break;
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
index 9244cd9..22219ab 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
@@ -1,286 +1,286 @@
-/**
- * ******************************************************************************
- * Copyright (C) 2005-2007, International Business Machines Corporation and *
- * others. All Rights Reserved. *
- * ******************************************************************************
- */
-package org.apache.tika.parser.txt;
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.Reader;
-
-
-/**
- * This class represents a charset that has been identified by a CharsetDetector
- * as a possible encoding for a set of input data. From an instance of this
- * class, you can ask for a confidence level in the charset identification,
- * or for Java Reader or String to access the original byte data in Unicode form.
- * <p/>
- * Instances of this class are created only by CharsetDetectors.
- * <p/>
- * Note: this class has a natural ordering that is inconsistent with equals.
- * The natural ordering is based on the match confidence value.
- *
- * @stable ICU 3.4
- */
-public class CharsetMatch implements Comparable<CharsetMatch> {
-
-
- /**
- * Bit flag indicating the match is based on the the encoding scheme.
- *
- * @see #getMatchType
- * @stable ICU 3.4
- */
- static public final int ENCODING_SCHEME = 1;
- /**
- * Bit flag indicating the match is based on the presence of a BOM.
- *
- * @see #getMatchType
- * @stable ICU 3.4
- */
- static public final int BOM = 2;
- /**
- * Bit flag indicating he match is based on the declared encoding.
- *
- * @see #getMatchType
- * @stable ICU 3.4
- */
- static public final int DECLARED_ENCODING = 4;
- /**
- * Bit flag indicating the match is based on language statistics.
- *
- * @see #getMatchType
- * @stable ICU 3.4
- */
- static public final int LANG_STATISTICS = 8;
- //
- // Private Data
- //
- private int fConfidence;
- private CharsetRecognizer fRecognizer;
- private byte[] fRawInput = null; // Original, untouched input bytes.
- // If user gave us a byte array, this is it.
- private int fRawLength; // Length of data in fRawInput array.
- private InputStream fInputStream = null; // User's input stream, or null if the user
-
- /*
- * Constructor. Implementation internal
- */
- CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
- fRecognizer = rec;
- fConfidence = conf;
-
- // The references to the original aplication input data must be copied out
- // of the charset recognizer to here, in case the application resets the
- // recognizer before using this CharsetMatch.
- if (det.fInputStream == null) {
- // We only want the existing input byte data if it came straight from the user,
- // not if is just the head of a stream.
- fRawInput = det.fRawInput;
- fRawLength = det.fRawLength;
- }
- fInputStream = det.fInputStream;
- }
-
- /**
- * Create a java.io.Reader for reading the Unicode character data corresponding
- * to the original byte data supplied to the Charset detect operation.
- * <p/>
- * CAUTION: if the source of the byte data was an InputStream, a Reader
- * can be created for only one matching char set using this method. If more
- * than one charset needs to be tried, the caller will need to reset
- * the InputStream and create InputStreamReaders itself, based on the charset name.
- *
- * @return the Reader for the Unicode character data.
- *
- * @stable ICU 3.4
- */
- public Reader getReader() {
- InputStream inputStream = fInputStream;
-
- if (inputStream == null) {
- inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength);
- }
-
- try {
- inputStream.reset();
- return new InputStreamReader(inputStream, getName());
- } catch (IOException e) {
- return null;
- }
- }
-
- /**
- * Create a Java String from Unicode character data corresponding
- * to the original byte data supplied to the Charset detect operation.
- *
- * @return a String created from the converted input data.
- *
- * @stable ICU 3.4
- */
- public String getString() throws java.io.IOException {
- return getString(-1);
-
- }
-
- /**
- * Create a Java String from Unicode character data corresponding
- * to the original byte data supplied to the Charset detect operation.
- * The length of the returned string is limited to the specified size;
- * the string will be trunctated to this length if necessary. A limit value of
- * zero or less is ignored, and treated as no limit.
- *
- * @param maxLength The maximium length of the String to be created when the
- * source of the data is an input stream, or -1 for
- * unlimited length.
- * @return a String created from the converted input data.
- *
- * @stable ICU 3.4
- */
- public String getString(int maxLength) throws java.io.IOException {
- String result = null;
- if (fInputStream != null) {
- StringBuffer sb = new StringBuffer();
- char[] buffer = new char[1024];
- Reader reader = getReader();
- int max = maxLength < 0 ? Integer.MAX_VALUE : maxLength;
- int bytesRead = 0;
-
- while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) {
- sb.append(buffer, 0, bytesRead);
- max -= bytesRead;
- }
-
- reader.close();
-
- return sb.toString();
- } else {
- result = new String(fRawInput, getName());
- }
- return result;
-
- }
-
- /**
- * Get an indication of the confidence in the charset detected.
- * Confidence values range from 0-100, with larger numbers indicating
- * a better match of the input data to the characteristics of the
- * charset.
- *
- * @return the confidence in the charset match
- *
- * @stable ICU 3.4
- */
- public int getConfidence() {
- return fConfidence;
- }
-
- /**
- * Return flags indicating what it was about the input data
- * that caused this charset to be considered as a possible match.
- * The result is a bitfield containing zero or more of the flags
- * ENCODING_SCHEME, BOM, DECLARED_ENCODING, and LANG_STATISTICS.
- * A result of zero means no information is available.
- * <p>
- * Note: currently, this method always returns zero.
- * <p>
- *
- * @return the type of match found for this charset.
- *
- * @draft ICU 3.4
- * @provisional This API might change or be removed in a future release.
- */
- public int getMatchType() {
-// TODO: create a list of enum-like constants for common combinations of types of matches.
- return 0;
- }
-
- /**
- * Get the name of the detected charset.
- * The name will be one that can be used with other APIs on the
- * platform that accept charset names. It is the "Canonical name"
- * as defined by the class java.nio.charset.Charset; for
- * charsets that are registered with the IANA charset registry,
- * this is the MIME-preferred registerd name.
- *
- * @see java.nio.charset.Charset
- * @see java.io.InputStreamReader
- *
- * @return The name of the charset.
- *
- * @stable ICU 3.4
- */
- public String getName() {
- return fRecognizer.getName();
- }
-
- /**
- * Get the ISO code for the language of the detected charset.
- *
- * @return The ISO code for the language or <code>null</code> if the language cannot be determined.
- *
- * @stable ICU 3.4
- */
- public String getLanguage() {
- return fRecognizer.getLanguage();
- }
-
- /**
- * Compare to other CharsetMatch objects.
- * Comparison is based on the match confidence value, which
- * allows CharsetDetector.detectAll() to order its results.
- *
- * @param o the CharsetMatch object to compare against.
- * @return a negative integer, zero, or a positive integer as the
- * confidence level of this CharsetMatch
- * is less than, equal to, or greater than that of
- * the argument.
- * @throws ClassCastException if the argument is not a CharsetMatch.
- * @stable ICU 3.4
- */
- public int compareTo(CharsetMatch other) {
- int compareResult = 0;
- if (this.fConfidence > other.fConfidence) {
- compareResult = 1;
- } else if (this.fConfidence < other.fConfidence) {
- compareResult = -1;
- }
- return compareResult;
- }
-
- /**
- * compare this CharsetMatch to another based on confidence value
- * @param o the CharsetMatch object to compare against
- * @return true if equal
- */
- public boolean equals(Object o) {
- if (o instanceof CharsetMatch) {
- CharsetMatch that = (CharsetMatch) o;
- return (this.fConfidence == that.fConfidence);
- }
-
- return false;
- }
-
- /**
- * generates a hashCode based on the confidence value
- * @return the hashCode
- */
- public int hashCode() {
- return fConfidence;
- }
- // gave us a byte array.
-
- public String toString() {
- String s = "Match of " + fRecognizer.getName();
- if (fRecognizer.getLanguage() != null) {
- s += " in " + fRecognizer.getLanguage();
- }
- s += " with confidence " + fConfidence;
- return s;
- }
-}
+/**
+ * ******************************************************************************
+ * Copyright (C) 2005-2007, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
+ * ******************************************************************************
+ */
+package org.apache.tika.parser.txt;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+
+
+/**
+ * This class represents a charset that has been identified by a CharsetDetector
+ * as a possible encoding for a set of input data. From an instance of this
+ * class, you can ask for a confidence level in the charset identification,
+ * or for Java Reader or String to access the original byte data in Unicode form.
+ * <p/>
+ * Instances of this class are created only by CharsetDetectors.
+ * <p/>
+ * Note: this class has a natural ordering that is inconsistent with equals.
+ * The natural ordering is based on the match confidence value.
+ *
+ * @stable ICU 3.4
+ */
+public class CharsetMatch implements Comparable<CharsetMatch> {
+
+
+ /**
+ * Bit flag indicating the match is based on the the encoding scheme.
+ *
+ * @see #getMatchType
+ * @stable ICU 3.4
+ */
+ static public final int ENCODING_SCHEME = 1;
+ /**
+ * Bit flag indicating the match is based on the presence of a BOM.
+ *
+ * @see #getMatchType
+ * @stable ICU 3.4
+ */
+ static public final int BOM = 2;
+ /**
+ * Bit flag indicating he match is based on the declared encoding.
+ *
+ * @see #getMatchType
+ * @stable ICU 3.4
+ */
+ static public final int DECLARED_ENCODING = 4;
+ /**
+ * Bit flag indicating the match is based on language statistics.
+ *
+ * @see #getMatchType
+ * @stable ICU 3.4
+ */
+ static public final int LANG_STATISTICS = 8;
+ //
+ // Private Data
+ //
+ private int fConfidence;
+ private CharsetRecognizer fRecognizer;
+ private byte[] fRawInput = null; // Original, untouched input bytes.
+ // If user gave us a byte array, this is it.
+ private int fRawLength; // Length of data in fRawInput array.
+ private InputStream fInputStream = null; // User's input stream, or null if the user
+
+ /*
+ * Constructor. Implementation internal
+ */
+ CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
+ fRecognizer = rec;
+ fConfidence = conf;
+
+ // The references to the original aplication input data must be copied out
+ // of the charset recognizer to here, in case the application resets the
+ // recognizer before using this CharsetMatch.
+ if (det.fInputStream == null) {
+ // We only want the existing input byte data if it came straight from the user,
+ // not if is just the head of a stream.
+ fRawInput = det.fRawInput;
+ fRawLength = det.fRawLength;
+ }
+ fInputStream = det.fInputStream;
+ }
+
+ /**
+ * Create a java.io.Reader for reading the Unicode character data corresponding
+ * to the original byte data supplied to the Charset detect operation.
+ * <p/>
+ * CAUTION: if the source of the byte data was an InputStream, a Reader
+ * can be created for only one matching char set using this method. If more
+ * than one charset needs to be tried, the caller will need to reset
+ * the InputStream and create InputStreamReaders itself, based on the charset name.
+ *
+ * @return the Reader for the Unicode character data.
+ *
+ * @stable ICU 3.4
+ */
+ public Reader getReader() {
+ InputStream inputStream = fInputStream;
+
+ if (inputStream == null) {
+ inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength);
+ }
+
+ try {
+ inputStream.reset();
+ return new InputStreamReader(inputStream, getName());
+ } catch (IOException e) {
+ return null;
+ }
+ }
+
+ /**
+ * Create a Java String from Unicode character data corresponding
+ * to the original byte data supplied to the Charset detect operation.
+ *
+ * @return a String created from the converted input data.
+ *
+ * @stable ICU 3.4
+ */
+ public String getString() throws java.io.IOException {
+ return getString(-1);
+
+ }
+
+ /**
+ * Create a Java String from Unicode character data corresponding
+ * to the original byte data supplied to the Charset detect operation.
+ * The length of the returned string is limited to the specified size;
+ * the string will be trunctated to this length if necessary. A limit value of
+ * zero or less is ignored, and treated as no limit.
+ *
+ * @param maxLength The maximium length of the String to be created when the
+ * source of the data is an input stream, or -1 for
+ * unlimited length.
+ * @return a String created from the converted input data.
+ *
+ * @stable ICU 3.4
+ */
+ public String getString(int maxLength) throws java.io.IOException {
+ String result = null;
+ if (fInputStream != null) {
+ StringBuffer sb = new StringBuffer();
+ char[] buffer = new char[1024];
+ Reader reader = getReader();
+ int max = maxLength < 0 ? Integer.MAX_VALUE : maxLength;
+ int bytesRead = 0;
+
+ while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) {
+ sb.append(buffer, 0, bytesRead);
+ max -= bytesRead;
+ }
+
+ reader.close();
+
+ return sb.toString();
+ } else {
+ result = new String(fRawInput, getName());
+ }
+ return result;
+
+ }
+
+ /**
+ * Get an indication of the confidence in the charset detected.
+ * Confidence values range from 0-100, with larger numbers indicating
+ * a better match of the input data to the characteristics of the
+ * charset.
+ *
+ * @return the confidence in the charset match
+ *
+ * @stable ICU 3.4
+ */
+ public int getConfidence() {
+ return fConfidence;
+ }
+
+ /**
+ * Return flags indicating what it was about the input data
+ * that caused this charset to be considered as a possible match.
+ * The result is a bitfield containing zero or more of the flags
+ * ENCODING_SCHEME, BOM, DECLARED_ENCODING, and LANG_STATISTICS.
+ * A result of zero means no information is available.
+ * <p>
+ * Note: currently, this method always returns zero.
+ * <p>
+ *
+ * @return the type of match found for this charset.
+ *
+ * @draft ICU 3.4
+ * @provisional This API might change or be removed in a future release.
+ */
+ public int getMatchType() {
+// TODO: create a list of enum-like constants for common combinations of types of matches.
+ return 0;
+ }
+
+ /**
+ * Get the name of the detected charset.
+ * The name will be one that can be used with other APIs on the
+ * platform that accept charset names. It is the "Canonical name"
+ * as defined by the class java.nio.charset.Charset; for
+ * charsets that are registered with the IANA charset registry,
+ * this is the MIME-preferred registerd name.
+ *
+ * @see java.nio.charset.Charset
+ * @see java.io.InputStreamReader
+ *
+ * @return The name of the charset.
+ *
+ * @stable ICU 3.4
+ */
+ public String getName() {
+ return fRecognizer.getName();
+ }
+
+ /**
+ * Get the ISO code for the language of the detected charset.
+ *
+ * @return The ISO code for the language or <code>null</code> if the language cannot be determined.
+ *
+ * @stable ICU 3.4
+ */
+ public String getLanguage() {
+ return fRecognizer.getLanguage();
+ }
+
+ /**
+ * Compare to other CharsetMatch objects.
+ * Comparison is based on the match confidence value, which
+ * allows CharsetDetector.detectAll() to order its results.
+ *
+ * @param o the CharsetMatch object to compare against.
+ * @return a negative integer, zero, or a positive integer as the
+ * confidence level of this CharsetMatch
+ * is less than, equal to, or greater than that of
+ * the argument.
+ * @throws ClassCastException if the argument is not a CharsetMatch.
+ * @stable ICU 3.4
+ */
+ public int compareTo(CharsetMatch other) {
+ int compareResult = 0;
+ if (this.fConfidence > other.fConfidence) {
+ compareResult = 1;
+ } else if (this.fConfidence < other.fConfidence) {
+ compareResult = -1;
+ }
+ return compareResult;
+ }
+
+ /**
+ * compare this CharsetMatch to another based on confidence value
+ * @param o the CharsetMatch object to compare against
+ * @return true if equal
+ */
+ public boolean equals(Object o) {
+ if (o instanceof CharsetMatch) {
+ CharsetMatch that = (CharsetMatch) o;
+ return (this.fConfidence == that.fConfidence);
+ }
+
+ return false;
+ }
+
+ /**
+ * generates a hashCode based on the confidence value
+ * @return the hashCode
+ */
+ public int hashCode() {
+ return fConfidence;
+ }
+ // gave us a byte array.
+
+ public String toString() {
+ String s = "Match of " + fRecognizer.getName();
+ if (fRecognizer.getLanguage() != null) {
+ s += " in " + fRecognizer.getLanguage();
+ }
+ s += " with confidence " + fConfidence;
+ return s;
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
index 16835d6..129c9a8 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
@@ -1,163 +1,163 @@
-/*
-*******************************************************************************
-* Copyright (C) 2005 - 2008, International Business Machines Corporation and *
-* others. All Rights Reserved. *
-*******************************************************************************
-*/
-package org.apache.tika.parser.txt;
-
-/**
- * class CharsetRecog_2022 part of the ICU charset detection imlementation.
- * This is a superclass for the individual detectors for
- * each of the detectable members of the ISO 2022 family
- * of encodings.
- * <p/>
- * The separate classes are nested within this class.
- *
- * @internal
- */
-abstract class CharsetRecog_2022 extends CharsetRecognizer {
-
-
- /**
- * Matching function shared among the 2022 detectors JP, CN and KR
- * Counts up the number of legal an unrecognized escape sequences in
- * the sample of text, and computes a score based on the total number &
- * the proportion that fit the encoding.
- *
- * @param text the byte buffer containing text to analyse
- * @param textLen the size of the text in the byte.
- * @param escapeSequences the byte escape sequences to test for.
- * @return match quality, in the range of 0-100.
- */
- int match(byte[] text, int textLen, byte[][] escapeSequences) {
- int i, j;
- int escN;
- int hits = 0;
- int misses = 0;
- int shifts = 0;
- int quality;
- scanInput:
- for (i = 0; i < textLen; i++) {
- if (text[i] == 0x1b) {
- checkEscapes:
- for (escN = 0; escN < escapeSequences.length; escN++) {
- byte[] seq = escapeSequences[escN];
-
- if ((textLen - i) < seq.length) {
- continue checkEscapes;
- }
-
- for (j = 1; j < seq.length; j++) {
- if (seq[j] != text[i + j]) {
- continue checkEscapes;
- }
- }
-
- hits++;
- i += seq.length - 1;
- continue scanInput;
- }
-
- misses++;
- }
-
- if (text[i] == 0x0e || text[i] == 0x0f) {
- // Shift in/out
- shifts++;
- }
- }
-
- if (hits == 0) {
- return 0;
- }
-
- //
- // Initial quality is based on relative proportion of recongized vs.
- // unrecognized escape sequences.
- // All good: quality = 100;
- // half or less good: quality = 0;
- // linear inbetween.
- quality = (100 * hits - 100 * misses) / (hits + misses);
-
- // Back off quality if there were too few escape sequences seen.
- // Include shifts in this computation, so that KR does not get penalized
- // for having only a single Escape sequence, but many shifts.
- if (hits + shifts < 5) {
- quality -= (5 - (hits + shifts)) * 10;
- }
-
- if (quality < 0) {
- quality = 0;
- }
- return quality;
- }
-
-
- static class CharsetRecog_2022JP extends CharsetRecog_2022 {
- private byte[][] escapeSequences = {
- {0x1b, 0x24, 0x28, 0x43}, // KS X 1001:1992
- {0x1b, 0x24, 0x28, 0x44}, // JIS X 212-1990
- {0x1b, 0x24, 0x40}, // JIS C 6226-1978
- {0x1b, 0x24, 0x41}, // GB 2312-80
- {0x1b, 0x24, 0x42}, // JIS X 208-1983
- {0x1b, 0x26, 0x40}, // JIS X 208 1990, 1997
- {0x1b, 0x28, 0x42}, // ASCII
- {0x1b, 0x28, 0x48}, // JIS-Roman
- {0x1b, 0x28, 0x49}, // Half-width katakana
- {0x1b, 0x28, 0x4a}, // JIS-Roman
- {0x1b, 0x2e, 0x41}, // ISO 8859-1
- {0x1b, 0x2e, 0x46} // ISO 8859-7
- };
-
- String getName() {
- return "ISO-2022-JP";
- }
-
- int match(CharsetDetector det) {
- return match(det.fInputBytes, det.fInputLen, escapeSequences);
- }
- }
-
- static class CharsetRecog_2022KR extends CharsetRecog_2022 {
- private byte[][] escapeSequences = {
- {0x1b, 0x24, 0x29, 0x43}
- };
-
- String getName() {
- return "ISO-2022-KR";
- }
-
- int match(CharsetDetector det) {
- return match(det.fInputBytes, det.fInputLen, escapeSequences);
- }
-
- }
-
- static class CharsetRecog_2022CN extends CharsetRecog_2022 {
- private byte[][] escapeSequences = {
- {0x1b, 0x24, 0x29, 0x41}, // GB 2312-80
- {0x1b, 0x24, 0x29, 0x47}, // CNS 11643-1992 Plane 1
- {0x1b, 0x24, 0x2A, 0x48}, // CNS 11643-1992 Plane 2
- {0x1b, 0x24, 0x29, 0x45}, // ISO-IR-165
- {0x1b, 0x24, 0x2B, 0x49}, // CNS 11643-1992 Plane 3
- {0x1b, 0x24, 0x2B, 0x4A}, // CNS 11643-1992 Plane 4
- {0x1b, 0x24, 0x2B, 0x4B}, // CNS 11643-1992 Plane 5
- {0x1b, 0x24, 0x2B, 0x4C}, // CNS 11643-1992 Plane 6
- {0x1b, 0x24, 0x2B, 0x4D}, // CNS 11643-1992 Plane 7
- {0x1b, 0x4e}, // SS2
- {0x1b, 0x4f}, // SS3
- };
-
- String getName() {
- return "ISO-2022-CN";
- }
-
-
- int match(CharsetDetector det) {
- return match(det.fInputBytes, det.fInputLen, escapeSequences);
- }
- }
-
-}
-
+/*
+*******************************************************************************
+* Copyright (C) 2005 - 2008, International Business Machines Corporation and *
+* others. All Rights Reserved. *
+*******************************************************************************
+*/
+package org.apache.tika.parser.txt;
+
+/**
+ * class CharsetRecog_2022 part of the ICU charset detection imlementation.
+ * This is a superclass for the individual detectors for
+ * each of the detectable members of the ISO 2022 family
+ * of encodings.
+ * <p/>
+ * The separate classes are nested within this class.
+ *
+ * @internal
+ */
+abstract class CharsetRecog_2022 extends CharsetRecognizer {
+
+
+ /**
+ * Matching function shared among the 2022 detectors JP, CN and KR
+ * Counts up the number of legal an unrecognized escape sequences in
+ * the sample of text, and computes a score based on the total number &
+ * the proportion that fit the encoding.
+ *
+ * @param text the byte buffer containing text to analyse
+ * @param textLen the size of the text in the byte.
+ * @param escapeSequences the byte escape sequences to test for.
+ * @return match quality, in the range of 0-100.
+ */
+ int match(byte[] text, int textLen, byte[][] escapeSequences) {
+ int i, j;
+ int escN;
+ int hits = 0;
+ int misses = 0;
+ int shifts = 0;
+ int quality;
+ scanInput:
+ for (i = 0; i < textLen; i++) {
+ if (text[i] == 0x1b) {
+ checkEscapes:
+ for (escN = 0; escN < escapeSequences.length; escN++) {
+ byte[] seq = escapeSequences[escN];
+
+ if ((textLen - i) < seq.length) {
+ continue checkEscapes;
+ }
+
+ for (j = 1; j < seq.length; j++) {
+ if (seq[j] != text[i + j]) {
+ continue checkEscapes;
+ }
+ }
+
+ hits++;
+ i += seq.length - 1;
+ continue scanInput;
+ }
+
+ misses++;
+ }
+
+ if (text[i] == 0x0e || text[i] == 0x0f) {
+ // Shift in/out
+ shifts++;
+ }
+ }
+
+ if (hits == 0) {
+ return 0;
+ }
+
+ //
+ // Initial quality is based on relative proportion of recongized vs.
+ // unrecognized escape sequences.
+ // All good: quality = 100;
+ // half or less good: quality = 0;
+ // linear inbetween.
+ quality = (100 * hits - 100 * misses) / (hits + misses);
+
+ // Back off quality if there were too few escape sequences seen.
+ // Include shifts in this computation, so that KR does not get penalized
+ // for having only a single Escape sequence, but many shifts.
+ if (hits + shifts < 5) {
+ quality -= (5 - (hits + shifts)) * 10;
+ }
+
+ if (quality < 0) {
+ quality = 0;
+ }
+ return quality;
+ }
+
+
+ static class CharsetRecog_2022JP extends CharsetRecog_2022 {
+ private byte[][] escapeSequences = {
+ {0x1b, 0x24, 0x28, 0x43}, // KS X 1001:1992
+ {0x1b, 0x24, 0x28, 0x44}, // JIS X 212-1990
+ {0x1b, 0x24, 0x40}, // JIS C 6226-1978
+ {0x1b, 0x24, 0x41}, // GB 2312-80
+ {0x1b, 0x24, 0x42}, // JIS X 208-1983
+ {0x1b, 0x26, 0x40}, // JIS X 208 1990, 1997
+ {0x1b, 0x28, 0x42}, // ASCII
+ {0x1b, 0x28, 0x48}, // JIS-Roman
+ {0x1b, 0x28, 0x49}, // Half-width katakana
+ {0x1b, 0x28, 0x4a}, // JIS-Roman
+ {0x1b, 0x2e, 0x41}, // ISO 8859-1
+ {0x1b, 0x2e, 0x46} // ISO 8859-7
+ };
+
+ String getName() {
+ return "ISO-2022-JP";
+ }
+
+ int match(CharsetDetector det) {
+ return match(det.fInputBytes, det.fInputLen, escapeSequences);
+ }
+ }
+
+ static class CharsetRecog_2022KR extends CharsetRecog_2022 {
+ private byte[][] escapeSequences = {
+ {0x1b, 0x24, 0x29, 0x43}
+ };
+
+ String getName() {
+ return "ISO-2022-KR";
+ }
+
+ int match(CharsetDetector det) {
+ return match(det.fInputBytes, det.fInputLen, escapeSequences);
+ }
+
+ }
+
+ static class CharsetRecog_2022CN extends CharsetRecog_2022 {
+ private byte[][] escapeSequences = {
+ {0x1b, 0x24, 0x29, 0x41}, // GB 2312-80
+ {0x1b, 0x24, 0x29, 0x47}, // CNS 11643-1992 Plane 1
+ {0x1b, 0x24, 0x2A, 0x48}, // CNS 11643-1992 Plane 2
+ {0x1b, 0x24, 0x29, 0x45}, // ISO-IR-165
+ {0x1b, 0x24, 0x2B, 0x49}, // CNS 11643-1992 Plane 3
+ {0x1b, 0x24, 0x2B, 0x4A}, // CNS 11643-1992 Plane 4
+ {0x1b, 0x24, 0x2B, 0x4B}, // CNS 11643-1992 Plane 5
+ {0x1b, 0x24, 0x2B, 0x4C}, // CNS 11643-1992 Plane 6
+ {0x1b, 0x24, 0x2B, 0x4D}, // CNS 11643-1992 Plane 7
+ {0x1b, 0x4e}, // SS2
+ {0x1b, 0x4f}, // SS3
+ };
+
+ String getName() {
+ return "ISO-2022-CN";
+ }
+
+
+ int match(CharsetDetector det) {
+ return match(det.fInputBytes, det.fInputLen, escapeSequences);
+ }
+ }
+
+}
+
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
index ad69fa0..55a3957 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
@@ -1,99 +1,99 @@
-/**
- * ******************************************************************************
- * Copyright (C) 2005 - 2007, International Business Machines Corporation and *
- * others. All Rights Reserved. *
- * ******************************************************************************
- */
-package org.apache.tika.parser.txt;
-
-/**
- * Charset recognizer for UTF-8
- *
- * @internal
- */
-class CharsetRecog_UTF8 extends CharsetRecognizer {
-
- String getName() {
- return "UTF-8";
- }
-
- /* (non-Javadoc)
- * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
- */
- int match(CharsetDetector det) {
- boolean hasBOM = false;
- int numValid = 0;
- int numInvalid = 0;
- byte input[] = det.fRawInput;
- int i;
- int trailBytes = 0;
- int confidence;
-
- if (det.fRawLength >= 3 &&
- (input[0] & 0xFF) == 0xef && (input[1] & 0xFF) == 0xbb && (input[2] & 0xFF) == 0xbf) {
- hasBOM = true;
- }
-
- // Scan for multi-byte sequences
- for (i = 0; i < det.fRawLength; i++) {
- int b = input[i];
- if ((b & 0x80) == 0) {
- continue; // ASCII
- }
-
- // Hi bit on char found. Figure out how long the sequence should be
- if ((b & 0x0e0) == 0x0c0) {
- trailBytes = 1;
- } else if ((b & 0x0f0) == 0x0e0) {
- trailBytes = 2;
- } else if ((b & 0x0f8) == 0xf0) {
- trailBytes = 3;
- } else {
- numInvalid++;
- if (numInvalid > 5) {
- break;
- }
- trailBytes = 0;
- }
-
- // Verify that we've got the right number of trail bytes in the sequence
- for (; ; ) {
- i++;
- if (i >= det.fRawLength) {
- break;
- }
- b = input[i];
- if ((b & 0xc0) != 0x080) {
- numInvalid++;
- break;
- }
- if (--trailBytes == 0) {
- numValid++;
- break;
- }
- }
-
- }
-
- // Cook up some sort of confidence score, based on presense of a BOM
- // and the existence of valid and/or invalid multi-byte sequences.
- confidence = 0;
- if (hasBOM && numInvalid == 0) {
- confidence = 100;
- } else if (hasBOM && numValid > numInvalid * 10) {
- confidence = 80;
- } else if (numValid > 3 && numInvalid == 0) {
- confidence = 100;
- } else if (numValid > 0 && numInvalid == 0) {
- confidence = 80;
- } else if (numValid == 0 && numInvalid == 0) {
- // Plain ASCII.
- confidence = 10;
- } else if (numValid > numInvalid * 10) {
- // Probably corruput utf-8 data. Valid sequences aren't likely by chance.
- confidence = 25;
- }
- return confidence;
- }
-
-}
+/**
+ * ******************************************************************************
+ * Copyright (C) 2005 - 2007, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
+ * ******************************************************************************
+ */
+package org.apache.tika.parser.txt;
+
+/**
+ * Charset recognizer for UTF-8
+ *
+ * @internal
+ */
+class CharsetRecog_UTF8 extends CharsetRecognizer {
+
+ String getName() {
+ return "UTF-8";
+ }
+
+ /* (non-Javadoc)
+ * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
+ */
+ int match(CharsetDetector det) {
+ boolean hasBOM = false;
+ int numValid = 0;
+ int numInvalid = 0;
+ byte input[] = det.fRawInput;
+ int i;
+ int trailBytes = 0;
+ int confidence;
+
+ if (det.fRawLength >= 3 &&
+ (input[0] & 0xFF) == 0xef && (input[1] & 0xFF) == 0xbb && (input[2] & 0xFF) == 0xbf) {
+ hasBOM = true;
+ }
+
+ // Scan for multi-byte sequences
+ for (i = 0; i < det.fRawLength; i++) {
+ int b = input[i];
+ if ((b & 0x80) == 0) {
+ continue; // ASCII
+ }
+
+ // Hi bit on char found. Figure out how long the sequence should be
+ if ((b & 0x0e0) == 0x0c0) {
+ trailBytes = 1;
+ } else if ((b & 0x0f0) == 0x0e0) {
+ trailBytes = 2;
+ } else if ((b & 0x0f8) == 0xf0) {
+ trailBytes = 3;
+ } else {
+ numInvalid++;
+ if (numInvalid > 5) {
+ break;
+ }
+ trailBytes = 0;
+ }
+
+ // Verify that we've got the right number of trail bytes in the sequence
+ for (; ; ) {
+ i++;
+ if (i >= det.fRawLength) {
+ break;
+ }
+ b = input[i];
+ if ((b & 0xc0) != 0x080) {
+ numInvalid++;
+ break;
+ }
+ if (--trailBytes == 0) {
+ numValid++;
+ break;
+ }
+ }
+
+ }
+
+ // Cook up some sort of confidence score, based on presense of a BOM
+ // and the existence of valid and/or invalid multi-byte sequences.
+ confidence = 0;
+ if (hasBOM && numInvalid == 0) {
+ confidence = 100;
+ } else if (hasBOM && numValid > numInvalid * 10) {
+ confidence = 80;
+ } else if (numValid > 3 && numInvalid == 0) {
+ confidence = 100;
+ } else if (numValid > 0 && numInvalid == 0) {
+ confidence = 80;
+ } else if (numValid == 0 && numInvalid == 0) {
+ // Plain ASCII.
+ confidence = 10;
+ } else if (numValid > numInvalid * 10) {
+ // Probably corruput utf-8 data. Valid sequences aren't likely by chance.
+ confidence = 25;
+ }
+ return confidence;
+ }
+
+}
[36/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-advanced-module/src/main/java/org/apache/tika/module/advanced/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-advanced-module/src/main/java/org/apache/tika/module/advanced/internal/Activator.java b/tika-parser-modules/tika-parser-advanced-module/src/main/java/org/apache/tika/module/advanced/internal/Activator.java
index cc22347..1695859 100644
--- a/tika-parser-modules/tika-parser-advanced-module/src/main/java/org/apache/tika/module/advanced/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-advanced-module/src/main/java/org/apache/tika/module/advanced/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.advanced.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
- @Override
- public void start(BundleContext context) throws Exception {
-
- registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
- }
-
- @Override
- public void stop(BundleContext context) throws Exception {
-
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.advanced.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+ @Override
+ public void start(BundleContext context) throws Exception {
+
+ registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+ }
+
+ @Override
+ public void stop(BundleContext context) throws Exception {
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-cad-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-cad-module/pom.xml b/tika-parser-modules/tika-parser-cad-module/pom.xml
index 6e7efb6..a9f8f31 100644
--- a/tika-parser-modules/tika-parser-cad-module/pom.xml
+++ b/tika-parser-modules/tika-parser-cad-module/pom.xml
@@ -1,56 +1,56 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-modules</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-cad-module</artifactId>
- <name>Apache Tika parser CAD module</name>
- <url>http://tika.apache.org/</url>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- </dependency>
-
- <dependency>
- <groupId>commons-lang</groupId>
- <artifactId>commons-lang</artifactId>
- <version>2.6</version>
- </dependency>
-
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-text-module</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-cad-module</artifactId>
+ <name>Apache Tika parser CAD module</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>commons-lang</groupId>
+ <artifactId>commons-lang</artifactId>
+ <version>2.6</version>
+ </dependency>
+
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-text-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/module/cad/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/module/cad/internal/Activator.java b/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/module/cad/internal/Activator.java
index 4a23b73..29a099c 100644
--- a/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/module/cad/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/module/cad/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.cad.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
- @Override
- public void start(BundleContext context) throws Exception {
-
- registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
- }
-
- @Override
- public void stop(BundleContext context) throws Exception {
-
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.cad.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+ @Override
+ public void start(BundleContext context) throws Exception {
+
+ registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+ }
+
+ @Override
+ public void stop(BundleContext context) throws Exception {
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java b/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
index 3f29c1f..875c4ee 100644
--- a/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
+++ b/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
@@ -1,356 +1,356 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.dwg;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.io.StringUtil;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.EndianUtils;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * DWG (CAD Drawing) parser. This is a very basic parser, which just
- * looks for bits of the headers.
- * Note that we use Apache POI for various parts of the processing, as
- * lots of the low level string/int/short concepts are the same.
- */
-public class DWGParser extends AbstractParser {
-
- /** Serial version UID */
- private static final long serialVersionUID = -7744232583079169119L;
-
- private static MediaType TYPE = MediaType.image("vnd.dwg");
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return Collections.singleton(TYPE);
- }
-
- /** The order of the fields in the header */
- private static final Property[] HEADER_PROPERTIES_ENTRIES = {
- TikaCoreProperties.TITLE,
- TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION,
- TikaCoreProperties.CREATOR,
- TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT,
- TikaCoreProperties.COMMENTS,
- TikaCoreProperties.MODIFIER,
- null, // Unknown?
- TikaCoreProperties.RELATION, // Hyperlink
- };
-
- /** For the 2000 file, they're indexed */
- private static final Property[] HEADER_2000_PROPERTIES_ENTRIES = {
- null,
- TikaCoreProperties.RELATION, // 0x01
- TikaCoreProperties.TITLE, // 0x02
- TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, // 0x03
- TikaCoreProperties.CREATOR, // 0x04
- null,
- TikaCoreProperties.COMMENTS,// 0x06
- TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, // 0x07
- TikaCoreProperties.MODIFIER, // 0x08
- };
-
- private static final String HEADER_2000_PROPERTIES_MARKER_STR =
- "DWGPROPS COOKIE";
-
- private static final byte[] HEADER_2000_PROPERTIES_MARKER =
- new byte[HEADER_2000_PROPERTIES_MARKER_STR.length()];
-
- static {
- StringUtil.putCompressedUnicode(
- HEADER_2000_PROPERTIES_MARKER_STR,
- HEADER_2000_PROPERTIES_MARKER, 0);
- }
-
- /**
- * How far to skip after the last standard property, before
- * we find any custom properties that might be there.
- */
- private static final int CUSTOM_PROPERTIES_SKIP = 20;
-
- /**
- * The value of padding bytes other than 0 in some DWG files.
- */
- private static final int[] CUSTOM_PROPERTIES_ALT_PADDING_VALUES = new int[] {0x2, 0, 0, 0};
-
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, TikaException, SAXException {
- // First up, which version of the format are we handling?
- byte[] header = new byte[128];
- IOUtils.readFully(stream, header);
- String version = new String(header, 0, 6, "US-ASCII");
-
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
-
- if (version.equals("AC1015")) {
- metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
- if (skipTo2000PropertyInfoSection(stream, header)) {
- get2000Props(stream,metadata,xhtml);
- }
- } else if (version.equals("AC1018")) {
- metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
- if (skipToPropertyInfoSection(stream, header)) {
- get2004Props(stream,metadata,xhtml);
- }
- } else if (version.equals("AC1021") || version.equals("AC1024")) {
- metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
- if (skipToPropertyInfoSection(stream, header)) {
- get2007and2010Props(stream,metadata,xhtml);
- }
- } else {
- throw new TikaException(
- "Unsupported AutoCAD drawing version: " + version);
- }
-
- xhtml.endDocument();
- }
-
- /**
- * Stored as US-ASCII
- */
- private void get2004Props(
- InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
- throws IOException, TikaException, SAXException {
- // Standard properties
- for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
- String headerValue = read2004String(stream);
- handleHeader(i, headerValue, metadata, xhtml);
- }
-
- // Custom properties
- int customCount = skipToCustomProperties(stream);
- for (int i = 0; i < customCount; i++) {
- String propName = read2004String(stream);
- String propValue = read2004String(stream);
- if(propName.length() > 0 && propValue.length() > 0) {
- metadata.add(propName, propValue);
- }
- }
- }
-
- private String read2004String(InputStream stream) throws IOException, TikaException {
- int stringLen = EndianUtils.readUShortLE(stream);
-
- byte[] stringData = new byte[stringLen];
- IOUtils.readFully(stream, stringData);
-
- // Often but not always null terminated
- if (stringData[stringLen-1] == 0) {
- stringLen--;
- }
- String value = StringUtil.getFromCompressedUnicode(stringData, 0, stringLen);
- return value;
- }
-
- /**
- * Stored as UCS2, so 16 bit "unicode"
- */
- private void get2007and2010Props(
- InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
- throws IOException, TikaException, SAXException {
- // Standard properties
- for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
- String headerValue = read2007and2010String(stream);
- handleHeader(i, headerValue, metadata, xhtml);
- }
-
- // Custom properties
- int customCount = skipToCustomProperties(stream);
- for (int i = 0; i < customCount; i++) {
- String propName = read2007and2010String(stream);
- String propValue = read2007and2010String(stream);
- if(propName.length() > 0 && propValue.length() > 0) {
- metadata.add(propName, propValue);
- }
- }
- }
-
- private String read2007and2010String(InputStream stream) throws IOException, TikaException {
- int stringLen = EndianUtils.readUShortLE(stream);
-
- byte[] stringData = new byte[stringLen * 2];
- IOUtils.readFully(stream, stringData);
- String value = StringUtil.getFromUnicodeLE(stringData);
-
- // Some strings are null terminated
- if(value.charAt(value.length()-1) == 0) {
- value = value.substring(0, value.length()-1);
- }
-
- return value;
- }
-
- private void get2000Props(
- InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
- throws IOException, TikaException, SAXException {
- int propCount = 0;
- while(propCount < 30) {
- int propIdx = EndianUtils.readUShortLE(stream);
- int length = EndianUtils.readUShortLE(stream);
- int valueType = stream.read();
-
- if(propIdx == 0x28) {
- // This one seems not to follow the pattern
- length = 0x19;
- } else if(propIdx == 90) {
- // We think this means the end of properties
- break;
- }
-
- byte[] value = new byte[length];
- IOUtils.readFully(stream, value);
- if(valueType == 0x1e) {
- // Normal string, good
- String val = StringUtil.getFromCompressedUnicode(value, 0, length);
-
- // Is it one we can look up by index?
- if(propIdx < HEADER_2000_PROPERTIES_ENTRIES.length) {
- metadata.add(HEADER_2000_PROPERTIES_ENTRIES[propIdx], val);
- xhtml.element("p", val);
- } else if(propIdx == 0x012c) {
- int splitAt = val.indexOf('=');
- if(splitAt > -1) {
- String propName = val.substring(0, splitAt);
- String propVal = val.substring(splitAt+1);
- metadata.add(propName, propVal);
- }
- }
- } else {
- // No idea...
- }
-
- propCount++;
- }
- }
-
- private void handleHeader(
- int headerNumber, String value, Metadata metadata,
- XHTMLContentHandler xhtml) throws SAXException {
- if(value == null || value.length() == 0) {
- return;
- }
-
- Property headerProp = HEADER_PROPERTIES_ENTRIES[headerNumber];
- if(headerProp != null) {
- metadata.set(headerProp, value);
- }
-
- xhtml.element("p", value);
- }
-
- /**
- * Grab the offset, then skip there
- */
- private boolean skipToPropertyInfoSection(InputStream stream, byte[] header)
- throws IOException, TikaException {
- // The offset is stored in the header from 0x20 onwards
- long offsetToSection = EndianUtils.getLongLE(header, 0x20);
-
- // Sanity check the offset. Some files seem to use a different format,
- // and the offset isn't available at 0x20. Until we can work out how
- // to find the offset in those files, skip them if detected
- if (offsetToSection > 0xa00000l) {
- // Header should never be more than 10mb into the file, something is wrong
- offsetToSection = 0;
- }
-
- // Work out how far to skip, and sanity check
- long toSkip = offsetToSection - header.length;
- if(offsetToSection == 0){
- return false;
- }
- while (toSkip > 0) {
- byte[] skip = new byte[Math.min((int) toSkip, 0x4000)];
- IOUtils.readFully(stream, skip);
- toSkip -= skip.length;
- }
- return true;
- }
-
- /**
- * We think it can be anywhere...
- */
- private boolean skipTo2000PropertyInfoSection(InputStream stream, byte[] header)
- throws IOException {
- int val = 0;
- while(val != -1) {
- val = stream.read();
- if(val == HEADER_2000_PROPERTIES_MARKER[0]) {
- boolean going = true;
- for(int i=1; i<HEADER_2000_PROPERTIES_MARKER.length && going; i++) {
- val = stream.read();
- if(val != HEADER_2000_PROPERTIES_MARKER[i]) going = false;
- }
- if(going) {
- // Bingo, found it
- return true;
- }
- }
- }
- return false;
- }
-
- private int skipToCustomProperties(InputStream stream)
- throws IOException, TikaException {
- // There should be 4 zero bytes or CUSTOM_PROPERTIES_ALT_PADDING_VALUES next
- byte[] padding = new byte[4];
- IOUtils.readFully(stream, padding);
- if((padding[0] == 0 && padding[1] == 0 &&
- padding[2] == 0 && padding[3] == 0) ||
- (padding[0] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[0] &&
- padding[1] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[1] &&
- padding[2] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[2] &&
- padding[3] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[3])) {
-
- // Looks hopeful, skip on
- padding = new byte[CUSTOM_PROPERTIES_SKIP];
- IOUtils.readFully(stream, padding);
-
- // We should now have the count
- int count = EndianUtils.readUShortLE(stream);
-
- // Sanity check it
- if(count > 0 && count < 0x7f) {
- // Looks plausible
- return count;
- } else {
- // No properties / count is too high to trust
- return 0;
- }
- } else {
- // No padding. That probably means no custom props
- return 0;
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dwg;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.io.StringUtil;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.EndianUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * DWG (CAD Drawing) parser. This is a very basic parser, which just
+ * looks for bits of the headers.
+ * Note that we use Apache POI for various parts of the processing, as
+ * lots of the low level string/int/short concepts are the same.
+ */
+public class DWGParser extends AbstractParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = -7744232583079169119L;
+
+ private static MediaType TYPE = MediaType.image("vnd.dwg");
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return Collections.singleton(TYPE);
+ }
+
+ /** The order of the fields in the header */
+ private static final Property[] HEADER_PROPERTIES_ENTRIES = {
+ TikaCoreProperties.TITLE,
+ TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION,
+ TikaCoreProperties.CREATOR,
+ TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT,
+ TikaCoreProperties.COMMENTS,
+ TikaCoreProperties.MODIFIER,
+ null, // Unknown?
+ TikaCoreProperties.RELATION, // Hyperlink
+ };
+
+ /** For the 2000 file, they're indexed */
+ private static final Property[] HEADER_2000_PROPERTIES_ENTRIES = {
+ null,
+ TikaCoreProperties.RELATION, // 0x01
+ TikaCoreProperties.TITLE, // 0x02
+ TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, // 0x03
+ TikaCoreProperties.CREATOR, // 0x04
+ null,
+ TikaCoreProperties.COMMENTS,// 0x06
+ TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, // 0x07
+ TikaCoreProperties.MODIFIER, // 0x08
+ };
+
+ private static final String HEADER_2000_PROPERTIES_MARKER_STR =
+ "DWGPROPS COOKIE";
+
+ private static final byte[] HEADER_2000_PROPERTIES_MARKER =
+ new byte[HEADER_2000_PROPERTIES_MARKER_STR.length()];
+
+ static {
+ StringUtil.putCompressedUnicode(
+ HEADER_2000_PROPERTIES_MARKER_STR,
+ HEADER_2000_PROPERTIES_MARKER, 0);
+ }
+
+ /**
+ * How far to skip after the last standard property, before
+ * we find any custom properties that might be there.
+ */
+ private static final int CUSTOM_PROPERTIES_SKIP = 20;
+
+ /**
+ * The value of padding bytes other than 0 in some DWG files.
+ */
+ private static final int[] CUSTOM_PROPERTIES_ALT_PADDING_VALUES = new int[] {0x2, 0, 0, 0};
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, TikaException, SAXException {
+ // First up, which version of the format are we handling?
+ byte[] header = new byte[128];
+ IOUtils.readFully(stream, header);
+ String version = new String(header, 0, 6, "US-ASCII");
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+ if (version.equals("AC1015")) {
+ metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+ if (skipTo2000PropertyInfoSection(stream, header)) {
+ get2000Props(stream,metadata,xhtml);
+ }
+ } else if (version.equals("AC1018")) {
+ metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+ if (skipToPropertyInfoSection(stream, header)) {
+ get2004Props(stream,metadata,xhtml);
+ }
+ } else if (version.equals("AC1021") || version.equals("AC1024")) {
+ metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+ if (skipToPropertyInfoSection(stream, header)) {
+ get2007and2010Props(stream,metadata,xhtml);
+ }
+ } else {
+ throw new TikaException(
+ "Unsupported AutoCAD drawing version: " + version);
+ }
+
+ xhtml.endDocument();
+ }
+
+ /**
+ * Stored as US-ASCII
+ */
+ private void get2004Props(
+ InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
+ throws IOException, TikaException, SAXException {
+ // Standard properties
+ for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
+ String headerValue = read2004String(stream);
+ handleHeader(i, headerValue, metadata, xhtml);
+ }
+
+ // Custom properties
+ int customCount = skipToCustomProperties(stream);
+ for (int i = 0; i < customCount; i++) {
+ String propName = read2004String(stream);
+ String propValue = read2004String(stream);
+ if(propName.length() > 0 && propValue.length() > 0) {
+ metadata.add(propName, propValue);
+ }
+ }
+ }
+
+ private String read2004String(InputStream stream) throws IOException, TikaException {
+ int stringLen = EndianUtils.readUShortLE(stream);
+
+ byte[] stringData = new byte[stringLen];
+ IOUtils.readFully(stream, stringData);
+
+ // Often but not always null terminated
+ if (stringData[stringLen-1] == 0) {
+ stringLen--;
+ }
+ String value = StringUtil.getFromCompressedUnicode(stringData, 0, stringLen);
+ return value;
+ }
+
+ /**
+ * Stored as UCS2, so 16 bit "unicode"
+ */
+ private void get2007and2010Props(
+ InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
+ throws IOException, TikaException, SAXException {
+ // Standard properties
+ for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
+ String headerValue = read2007and2010String(stream);
+ handleHeader(i, headerValue, metadata, xhtml);
+ }
+
+ // Custom properties
+ int customCount = skipToCustomProperties(stream);
+ for (int i = 0; i < customCount; i++) {
+ String propName = read2007and2010String(stream);
+ String propValue = read2007and2010String(stream);
+ if(propName.length() > 0 && propValue.length() > 0) {
+ metadata.add(propName, propValue);
+ }
+ }
+ }
+
+ private String read2007and2010String(InputStream stream) throws IOException, TikaException {
+ int stringLen = EndianUtils.readUShortLE(stream);
+
+ byte[] stringData = new byte[stringLen * 2];
+ IOUtils.readFully(stream, stringData);
+ String value = StringUtil.getFromUnicodeLE(stringData);
+
+ // Some strings are null terminated
+ if(value.charAt(value.length()-1) == 0) {
+ value = value.substring(0, value.length()-1);
+ }
+
+ return value;
+ }
+
+ private void get2000Props(
+ InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
+ throws IOException, TikaException, SAXException {
+ int propCount = 0;
+ while(propCount < 30) {
+ int propIdx = EndianUtils.readUShortLE(stream);
+ int length = EndianUtils.readUShortLE(stream);
+ int valueType = stream.read();
+
+ if(propIdx == 0x28) {
+ // This one seems not to follow the pattern
+ length = 0x19;
+ } else if(propIdx == 90) {
+ // We think this means the end of properties
+ break;
+ }
+
+ byte[] value = new byte[length];
+ IOUtils.readFully(stream, value);
+ if(valueType == 0x1e) {
+ // Normal string, good
+ String val = StringUtil.getFromCompressedUnicode(value, 0, length);
+
+ // Is it one we can look up by index?
+ if(propIdx < HEADER_2000_PROPERTIES_ENTRIES.length) {
+ metadata.add(HEADER_2000_PROPERTIES_ENTRIES[propIdx], val);
+ xhtml.element("p", val);
+ } else if(propIdx == 0x012c) {
+ int splitAt = val.indexOf('=');
+ if(splitAt > -1) {
+ String propName = val.substring(0, splitAt);
+ String propVal = val.substring(splitAt+1);
+ metadata.add(propName, propVal);
+ }
+ }
+ } else {
+ // No idea...
+ }
+
+ propCount++;
+ }
+ }
+
+ private void handleHeader(
+ int headerNumber, String value, Metadata metadata,
+ XHTMLContentHandler xhtml) throws SAXException {
+ if(value == null || value.length() == 0) {
+ return;
+ }
+
+ Property headerProp = HEADER_PROPERTIES_ENTRIES[headerNumber];
+ if(headerProp != null) {
+ metadata.set(headerProp, value);
+ }
+
+ xhtml.element("p", value);
+ }
+
+ /**
+ * Grab the offset, then skip there
+ */
+ private boolean skipToPropertyInfoSection(InputStream stream, byte[] header)
+ throws IOException, TikaException {
+ // The offset is stored in the header from 0x20 onwards
+ long offsetToSection = EndianUtils.getLongLE(header, 0x20);
+
+ // Sanity check the offset. Some files seem to use a different format,
+ // and the offset isn't available at 0x20. Until we can work out how
+ // to find the offset in those files, skip them if detected
+ if (offsetToSection > 0xa00000l) {
+ // Header should never be more than 10mb into the file, something is wrong
+ offsetToSection = 0;
+ }
+
+ // Work out how far to skip, and sanity check
+ long toSkip = offsetToSection - header.length;
+ if(offsetToSection == 0){
+ return false;
+ }
+ while (toSkip > 0) {
+ byte[] skip = new byte[Math.min((int) toSkip, 0x4000)];
+ IOUtils.readFully(stream, skip);
+ toSkip -= skip.length;
+ }
+ return true;
+ }
+
+ /**
+ * We think it can be anywhere...
+ */
+ private boolean skipTo2000PropertyInfoSection(InputStream stream, byte[] header)
+ throws IOException {
+ int val = 0;
+ while(val != -1) {
+ val = stream.read();
+ if(val == HEADER_2000_PROPERTIES_MARKER[0]) {
+ boolean going = true;
+ for(int i=1; i<HEADER_2000_PROPERTIES_MARKER.length && going; i++) {
+ val = stream.read();
+ if(val != HEADER_2000_PROPERTIES_MARKER[i]) going = false;
+ }
+ if(going) {
+ // Bingo, found it
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ private int skipToCustomProperties(InputStream stream)
+ throws IOException, TikaException {
+ // There should be 4 zero bytes or CUSTOM_PROPERTIES_ALT_PADDING_VALUES next
+ byte[] padding = new byte[4];
+ IOUtils.readFully(stream, padding);
+ if((padding[0] == 0 && padding[1] == 0 &&
+ padding[2] == 0 && padding[3] == 0) ||
+ (padding[0] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[0] &&
+ padding[1] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[1] &&
+ padding[2] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[2] &&
+ padding[3] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[3])) {
+
+ // Looks hopeful, skip on
+ padding = new byte[CUSTOM_PROPERTIES_SKIP];
+ IOUtils.readFully(stream, padding);
+
+ // We should now have the count
+ int count = EndianUtils.readUShortLE(stream);
+
+ // Sanity check it
+ if(count > 0 && count < 0x7f) {
+ // Looks plausible
+ return count;
+ } else {
+ // No properties / count is too high to trust
+ return 0;
+ }
+ } else {
+ // No padding. That probably means no custom props
+ return 0;
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-code-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-code-module/pom.xml b/tika-parser-modules/tika-parser-code-module/pom.xml
index cf59c0e..5d33f82 100644
--- a/tika-parser-modules/tika-parser-code-module/pom.xml
+++ b/tika-parser-modules/tika-parser-code-module/pom.xml
@@ -1,69 +1,69 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-modules</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-code-module</artifactId>
- <name>Apache Tika parser code module</name>
- <url>http://tika.apache.org/</url>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>org.ow2.asm</groupId>
- <artifactId>asm</artifactId>
- <version>5.0.4</version>
- </dependency>
- <dependency>
- <groupId>org.codelibs</groupId>
- <artifactId>jhighlight</artifactId>
- <version>1.0.2</version>
- </dependency>
- <dependency>
- <groupId>org.ccil.cowan.tagsoup</groupId>
- <artifactId>tagsoup</artifactId>
- <version>1.2.1</version>
- </dependency>
- <dependency>
- <groupId>commons-io</groupId>
- <artifactId>commons-io</artifactId>
- <version>${commons.io.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-text-module</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-code-module</artifactId>
+ <name>Apache Tika parser code module</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.ow2.asm</groupId>
+ <artifactId>asm</artifactId>
+ <version>5.0.4</version>
+ </dependency>
+ <dependency>
+ <groupId>org.codelibs</groupId>
+ <artifactId>jhighlight</artifactId>
+ <version>1.0.2</version>
+ </dependency>
+ <dependency>
+ <groupId>org.ccil.cowan.tagsoup</groupId>
+ <artifactId>tagsoup</artifactId>
+ <version>1.2.1</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>${commons.io.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-text-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/module/code/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/module/code/internal/Activator.java b/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/module/code/internal/Activator.java
index 040618d..095e643 100644
--- a/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/module/code/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/module/code/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.code.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
- @Override
- public void start(BundleContext context) throws Exception {
-
- registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
- }
-
- @Override
- public void stop(BundleContext context) throws Exception {
-
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.code.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+ @Override
+ public void start(BundleContext context) throws Exception {
+
+ registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+ }
+
+ @Override
+ public void stop(BundleContext context) throws Exception {
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java b/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java
index 48f8cbf..481046f 100644
--- a/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java
+++ b/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java
@@ -1,54 +1,54 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.asm;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Parser for Java .class files.
- */
-public class ClassParser extends AbstractParser {
-
- /** Serial version UID */
- private static final long serialVersionUID = -3531388963354454357L;
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.singleton(MediaType.application("java-vm"));
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- new XHTMLClassVisitor(handler, metadata).parse(stream);
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.asm;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for Java .class files.
+ */
+public class ClassParser extends AbstractParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = -3531388963354454357L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.singleton(MediaType.application("java-vm"));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ new XHTMLClassVisitor(handler, metadata).parse(stream);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java b/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java
index 03deb43..c8ea317 100644
--- a/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java
+++ b/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java
@@ -1,323 +1,323 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.asm;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.objectweb.asm.AnnotationVisitor;
-import org.objectweb.asm.Attribute;
-import org.objectweb.asm.ClassReader;
-import org.objectweb.asm.ClassVisitor;
-import org.objectweb.asm.FieldVisitor;
-import org.objectweb.asm.MethodVisitor;
-import org.objectweb.asm.Opcodes;
-import org.objectweb.asm.Type;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Class visitor that generates XHTML SAX events to describe the
- * contents of the visited class.
- */
-class XHTMLClassVisitor extends ClassVisitor {
-
- private final XHTMLContentHandler xhtml;
-
- private final Metadata metadata;
-
- private Type type;
-
- private String packageName;
-
- public XHTMLClassVisitor(ContentHandler handler, Metadata metadata) {
- super(Opcodes.ASM5);
- this.xhtml = new XHTMLContentHandler(handler, metadata);
- this.metadata = metadata;
- }
-
- public void parse(InputStream stream)
- throws TikaException, SAXException, IOException {
- try {
- ClassReader reader = new ClassReader(stream);
- reader.accept(this, ClassReader.SKIP_FRAMES | ClassReader.SKIP_CODE);
- } catch (RuntimeException e) {
- if (e.getCause() instanceof SAXException) {
- throw (SAXException) e.getCause();
- } else {
- throw new TikaException("Failed to parse a Java class", e);
- }
- }
- }
-
- public void visit(
- int version, int access, String name, String signature,
- String superName, String[] interfaces) {
- type = Type.getObjectType(name);
-
- String className = type.getClassName();
- int dot = className.lastIndexOf('.');
- if (dot != -1) {
- packageName = className.substring(0, dot);
- className = className.substring(dot + 1);
- }
-
- metadata.set(TikaCoreProperties.TITLE, className);
- metadata.set(Metadata.RESOURCE_NAME_KEY, className + ".class");
-
- try {
- xhtml.startDocument();
- xhtml.startElement("pre");
-
- if (packageName != null) {
- writeKeyword("package");
- xhtml.characters(" " + packageName + ";\n");
- }
-
- writeAccess(access);
- if (isSet(access, Opcodes.ACC_INTERFACE)) {
- writeKeyword("interface");
- writeSpace();
- writeType(type);
- writeSpace();
- writeInterfaces("extends", interfaces);
- } else if (isSet(access, Opcodes.ACC_ENUM)) {
- writeKeyword("enum");
- writeSpace();
- writeType(type);
- writeSpace();
- } else {
- writeKeyword("class");
- writeSpace();
- writeType(type);
- writeSpace();
- if (superName != null) {
- Type superType = Type.getObjectType(superName);
- if (!superType.getClassName().equals("java.lang.Object")) {
- writeKeyword("extends");
- writeSpace();
- writeType(superType);
- writeSpace();
- }
- }
- writeInterfaces("implements", interfaces);
- }
- xhtml.characters("{\n");
- } catch (SAXException e) {
- throw new RuntimeException(e);
- }
- }
-
- private void writeInterfaces(String keyword, String[] interfaces)
- throws SAXException {
- if (interfaces != null && interfaces.length > 0) {
- writeKeyword(keyword);
- String separator = " ";
- for (String iface : interfaces) {
- xhtml.characters(separator);
- writeType(Type.getObjectType(iface));
- separator = ", ";
- }
- writeSpace();
- }
- }
-
- public void visitEnd() {
- try {
- xhtml.characters("}\n");
- xhtml.endElement("pre");
- xhtml.endDocument();
- } catch (SAXException e) {
- throw new RuntimeException(e);
- }
- }
-
- /**
- * Ignored.
- */
- public void visitOuterClass(String owner, String name, String desc) {
- }
-
- /**
- * Ignored.
- */
- public void visitSource(String source, String debug) {
- }
-
-
- /**
- * Ignored.
- */
- public AnnotationVisitor visitAnnotation(String desc, boolean visible) {
- return null;
- }
-
- /**
- * Ignored.
- */
- public void visitAttribute(Attribute attr) {
- }
-
- /**
- * Ignored.
- */
- public void visitInnerClass(
- String name, String outerName, String innerName, int access) {
- }
-
- /**
- * Visits a field.
- */
- public FieldVisitor visitField(
- int access, String name, String desc, String signature,
- Object value) {
- if (!isSet(access, Opcodes.ACC_SYNTHETIC)) {
- try {
- xhtml.characters(" ");
- writeAccess(access);
- writeType(Type.getType(desc));
- writeSpace();
- writeIdentifier(name);
-
- if (isSet(access, Opcodes.ACC_STATIC) && value != null) {
- xhtml.characters(" = ");
- xhtml.characters(value.toString());
- }
-
- writeSemicolon();
- writeNewline();
- } catch (SAXException e) {
- throw new RuntimeException(e);
- }
- }
-
- return null;
- }
-
- /**
- * Visits a method.
- */
- public MethodVisitor visitMethod(
- int access, String name, String desc, String signature,
- String[] exceptions) {
- if (!isSet(access, Opcodes.ACC_SYNTHETIC)) {
- try {
- xhtml.characters(" ");
- writeAccess(access);
- writeType(Type.getReturnType(desc));
- writeSpace();
- if ("<init>".equals(name)) {
- writeType(type);
- } else {
- writeIdentifier(name);
- }
-
- xhtml.characters("(");
- String separator = "";
- for (Type arg : Type.getArgumentTypes(desc)) {
- xhtml.characters(separator);
- writeType(arg);
- separator = ", ";
- }
- xhtml.characters(")");
-
- if (exceptions != null && exceptions.length > 0) {
- writeSpace();
- writeKeyword("throws");
- separator = " ";
- for (String exception : exceptions) {
- xhtml.characters(separator);
- writeType(Type.getObjectType(exception));
- separator = ", ";
- }
- }
-
- writeSemicolon();
- writeNewline();
- } catch (SAXException e) {
- throw new RuntimeException(e);
- }
- }
-
- return null;
- }
-
- private void writeIdentifier(String identifier) throws SAXException {
- xhtml.startElement("span", "class", "java-identifier");
- xhtml.characters(identifier);
- xhtml.endElement("span");
- }
-
- private void writeKeyword(String keyword) throws SAXException {
- xhtml.startElement("span", "class", "java-keyword");
- xhtml.characters(keyword);
- xhtml.endElement("span");
- }
-
- private void writeSemicolon() throws SAXException {
- xhtml.characters(";");
- }
-
- private void writeSpace() throws SAXException {
- xhtml.characters(" ");
- }
-
- private void writeNewline() throws SAXException {
- xhtml.characters("\n");
- }
-
- private void writeAccess(int access) throws SAXException {
- writeAccess(access, Opcodes.ACC_PRIVATE, "private");
- writeAccess(access, Opcodes.ACC_PROTECTED, "protected");
- writeAccess(access, Opcodes.ACC_PUBLIC, "public");
- writeAccess(access, Opcodes.ACC_STATIC, "static");
- writeAccess(access, Opcodes.ACC_FINAL, "final");
- writeAccess(access, Opcodes.ACC_ABSTRACT, "abstract");
- writeAccess(access, Opcodes.ACC_SYNCHRONIZED, "synchronized");
- writeAccess(access, Opcodes.ACC_TRANSIENT, "transient");
- writeAccess(access, Opcodes.ACC_VOLATILE, "volatile");
- writeAccess(access, Opcodes.ACC_NATIVE, "native");
- }
-
- private void writeAccess(int access, int code, String keyword)
- throws SAXException {
- if (isSet(access, code)) {
- writeKeyword(keyword);
- xhtml.characters(" ");
- }
- }
-
- private void writeType(Type type) throws SAXException {
- String name = type.getClassName();
- if (name.startsWith(packageName + ".")) {
- xhtml.characters(name.substring(packageName.length() + 1));
- } else if (name.startsWith("java.lang.")) {
- xhtml.characters(name.substring("java.lang.".length()));
- } else {
- xhtml.characters(name);
- }
- }
-
- private static boolean isSet(int value, int flag) {
- return (value & flag) != 0;
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.asm;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.objectweb.asm.AnnotationVisitor;
+import org.objectweb.asm.Attribute;
+import org.objectweb.asm.ClassReader;
+import org.objectweb.asm.ClassVisitor;
+import org.objectweb.asm.FieldVisitor;
+import org.objectweb.asm.MethodVisitor;
+import org.objectweb.asm.Opcodes;
+import org.objectweb.asm.Type;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Class visitor that generates XHTML SAX events to describe the
+ * contents of the visited class.
+ */
+class XHTMLClassVisitor extends ClassVisitor {
+
+ private final XHTMLContentHandler xhtml;
+
+ private final Metadata metadata;
+
+ private Type type;
+
+ private String packageName;
+
+ public XHTMLClassVisitor(ContentHandler handler, Metadata metadata) {
+ super(Opcodes.ASM5);
+ this.xhtml = new XHTMLContentHandler(handler, metadata);
+ this.metadata = metadata;
+ }
+
+ public void parse(InputStream stream)
+ throws TikaException, SAXException, IOException {
+ try {
+ ClassReader reader = new ClassReader(stream);
+ reader.accept(this, ClassReader.SKIP_FRAMES | ClassReader.SKIP_CODE);
+ } catch (RuntimeException e) {
+ if (e.getCause() instanceof SAXException) {
+ throw (SAXException) e.getCause();
+ } else {
+ throw new TikaException("Failed to parse a Java class", e);
+ }
+ }
+ }
+
+ public void visit(
+ int version, int access, String name, String signature,
+ String superName, String[] interfaces) {
+ type = Type.getObjectType(name);
+
+ String className = type.getClassName();
+ int dot = className.lastIndexOf('.');
+ if (dot != -1) {
+ packageName = className.substring(0, dot);
+ className = className.substring(dot + 1);
+ }
+
+ metadata.set(TikaCoreProperties.TITLE, className);
+ metadata.set(Metadata.RESOURCE_NAME_KEY, className + ".class");
+
+ try {
+ xhtml.startDocument();
+ xhtml.startElement("pre");
+
+ if (packageName != null) {
+ writeKeyword("package");
+ xhtml.characters(" " + packageName + ";\n");
+ }
+
+ writeAccess(access);
+ if (isSet(access, Opcodes.ACC_INTERFACE)) {
+ writeKeyword("interface");
+ writeSpace();
+ writeType(type);
+ writeSpace();
+ writeInterfaces("extends", interfaces);
+ } else if (isSet(access, Opcodes.ACC_ENUM)) {
+ writeKeyword("enum");
+ writeSpace();
+ writeType(type);
+ writeSpace();
+ } else {
+ writeKeyword("class");
+ writeSpace();
+ writeType(type);
+ writeSpace();
+ if (superName != null) {
+ Type superType = Type.getObjectType(superName);
+ if (!superType.getClassName().equals("java.lang.Object")) {
+ writeKeyword("extends");
+ writeSpace();
+ writeType(superType);
+ writeSpace();
+ }
+ }
+ writeInterfaces("implements", interfaces);
+ }
+ xhtml.characters("{\n");
+ } catch (SAXException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ private void writeInterfaces(String keyword, String[] interfaces)
+ throws SAXException {
+ if (interfaces != null && interfaces.length > 0) {
+ writeKeyword(keyword);
+ String separator = " ";
+ for (String iface : interfaces) {
+ xhtml.characters(separator);
+ writeType(Type.getObjectType(iface));
+ separator = ", ";
+ }
+ writeSpace();
+ }
+ }
+
+ public void visitEnd() {
+ try {
+ xhtml.characters("}\n");
+ xhtml.endElement("pre");
+ xhtml.endDocument();
+ } catch (SAXException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ /**
+ * Ignored.
+ */
+ public void visitOuterClass(String owner, String name, String desc) {
+ }
+
+ /**
+ * Ignored.
+ */
+ public void visitSource(String source, String debug) {
+ }
+
+
+ /**
+ * Ignored.
+ */
+ public AnnotationVisitor visitAnnotation(String desc, boolean visible) {
+ return null;
+ }
+
+ /**
+ * Ignored.
+ */
+ public void visitAttribute(Attribute attr) {
+ }
+
+ /**
+ * Ignored.
+ */
+ public void visitInnerClass(
+ String name, String outerName, String innerName, int access) {
+ }
+
+ /**
+ * Visits a field.
+ */
+ public FieldVisitor visitField(
+ int access, String name, String desc, String signature,
+ Object value) {
+ if (!isSet(access, Opcodes.ACC_SYNTHETIC)) {
+ try {
+ xhtml.characters(" ");
+ writeAccess(access);
+ writeType(Type.getType(desc));
+ writeSpace();
+ writeIdentifier(name);
+
+ if (isSet(access, Opcodes.ACC_STATIC) && value != null) {
+ xhtml.characters(" = ");
+ xhtml.characters(value.toString());
+ }
+
+ writeSemicolon();
+ writeNewline();
+ } catch (SAXException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ return null;
+ }
+
+ /**
+ * Visits a method.
+ */
+ public MethodVisitor visitMethod(
+ int access, String name, String desc, String signature,
+ String[] exceptions) {
+ if (!isSet(access, Opcodes.ACC_SYNTHETIC)) {
+ try {
+ xhtml.characters(" ");
+ writeAccess(access);
+ writeType(Type.getReturnType(desc));
+ writeSpace();
+ if ("<init>".equals(name)) {
+ writeType(type);
+ } else {
+ writeIdentifier(name);
+ }
+
+ xhtml.characters("(");
+ String separator = "";
+ for (Type arg : Type.getArgumentTypes(desc)) {
+ xhtml.characters(separator);
+ writeType(arg);
+ separator = ", ";
+ }
+ xhtml.characters(")");
+
+ if (exceptions != null && exceptions.length > 0) {
+ writeSpace();
+ writeKeyword("throws");
+ separator = " ";
+ for (String exception : exceptions) {
+ xhtml.characters(separator);
+ writeType(Type.getObjectType(exception));
+ separator = ", ";
+ }
+ }
+
+ writeSemicolon();
+ writeNewline();
+ } catch (SAXException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ return null;
+ }
+
+ private void writeIdentifier(String identifier) throws SAXException {
+ xhtml.startElement("span", "class", "java-identifier");
+ xhtml.characters(identifier);
+ xhtml.endElement("span");
+ }
+
+ private void writeKeyword(String keyword) throws SAXException {
+ xhtml.startElement("span", "class", "java-keyword");
+ xhtml.characters(keyword);
+ xhtml.endElement("span");
+ }
+
+ private void writeSemicolon() throws SAXException {
+ xhtml.characters(";");
+ }
+
+ private void writeSpace() throws SAXException {
+ xhtml.characters(" ");
+ }
+
+ private void writeNewline() throws SAXException {
+ xhtml.characters("\n");
+ }
+
+ private void writeAccess(int access) throws SAXException {
+ writeAccess(access, Opcodes.ACC_PRIVATE, "private");
+ writeAccess(access, Opcodes.ACC_PROTECTED, "protected");
+ writeAccess(access, Opcodes.ACC_PUBLIC, "public");
+ writeAccess(access, Opcodes.ACC_STATIC, "static");
+ writeAccess(access, Opcodes.ACC_FINAL, "final");
+ writeAccess(access, Opcodes.ACC_ABSTRACT, "abstract");
+ writeAccess(access, Opcodes.ACC_SYNCHRONIZED, "synchronized");
+ writeAccess(access, Opcodes.ACC_TRANSIENT, "transient");
+ writeAccess(access, Opcodes.ACC_VOLATILE, "volatile");
+ writeAccess(access, Opcodes.ACC_NATIVE, "native");
+ }
+
+ private void writeAccess(int access, int code, String keyword)
+ throws SAXException {
+ if (isSet(access, code)) {
+ writeKeyword(keyword);
+ xhtml.characters(" ");
+ }
+ }
+
+ private void writeType(Type type) throws SAXException {
+ String name = type.getClassName();
+ if (name.startsWith(packageName + ".")) {
+ xhtml.characters(name.substring(packageName.length() + 1));
+ } else if (name.startsWith("java.lang.")) {
+ xhtml.characters(name.substring("java.lang.".length()));
+ } else {
+ xhtml.characters(name);
+ }
+ }
+
+ private static boolean isSet(int value, int flag) {
+ return (value & flag) != 0;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java b/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
index 63e4bf6..d17bde7 100644
--- a/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
+++ b/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
@@ -1,142 +1,142 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.code;
-
-import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.CPP;
-import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.GROOVY;
-import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.JAVA;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.StringReader;
-import java.nio.charset.Charset;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.config.ServiceLoader;
-import org.apache.tika.detect.AutoDetectReader;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.ccil.cowan.tagsoup.HTMLSchema;
-import org.ccil.cowan.tagsoup.Schema;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
-
-import com.uwyn.jhighlight.renderer.Renderer;
-import com.uwyn.jhighlight.renderer.XhtmlRendererFactory;
-/**
- * Generic Source code parser for Java, Groovy, C++.
- * Aware: This parser uses JHightlight library (https://github.com/codelibs/jhighlight) under CDDL/LGPL dual license
- *
- * @author Hong-Thai.Nguyen
- * @since 1.6
- */
-public class SourceCodeParser implements Parser {
-
- private static final long serialVersionUID = -4543476498190054160L;
-
- private static final Pattern authorPattern = Pattern.compile("(?im)@author (.*) *$");
-
- private static final Map<MediaType, String> TYPES_TO_RENDERER = new HashMap<MediaType, String>() {
- private static final long serialVersionUID = -741976157563751152L;
- {
- put(MediaType.text("x-c++src"), CPP);
- put(MediaType.text("x-java-source"), JAVA);
- put(MediaType.text("x-groovy"), GROOVY);
- }
- };
-
- private static final ServiceLoader LOADER = new ServiceLoader(SourceCodeParser.class.getClassLoader());
-
- //Parse the HTML document
- private static final Schema HTML_SCHEMA = new HTMLSchema();
-
- @Override
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return TYPES_TO_RENDERER.keySet();
- }
-
- @Override
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
-
- try (AutoDetectReader reader = new AutoDetectReader(
- new CloseShieldInputStream(stream), metadata,
- context.get(ServiceLoader.class, LOADER))) {
- Charset charset = reader.getCharset();
- String mediaType = metadata.get(Metadata.CONTENT_TYPE);
- String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
- if (mediaType != null && name != null) {
- MediaType type = MediaType.parse(mediaType);
- metadata.set(Metadata.CONTENT_TYPE, type.toString());
- metadata.set(Metadata.CONTENT_ENCODING, charset.name());
-
- StringBuilder out = new StringBuilder();
- String line;
- int nbLines = 0;
- while ((line = reader.readLine()) != null) {
- out.append(line + System.getProperty("line.separator"));
- String author = parserAuthor(line);
- if (author != null) {
- metadata.add(TikaCoreProperties.CREATOR, author);
- }
- nbLines ++;
- }
- metadata.set("LoC", String.valueOf(nbLines));
- Renderer renderer = getRenderer(type.toString());
-
- String codeAsHtml = renderer.highlight(name, out.toString(), charset.name(), false);
-
- Schema schema = context.get(Schema.class, HTML_SCHEMA);
-
- org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();
- parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
- parser.setContentHandler(handler);
- parser.parse(new InputSource(new StringReader(codeAsHtml)));
- }
- }
-
- }
-
- private Renderer getRenderer(String mimeType) {
- MediaType mt = MediaType.parse(mimeType);
- String type = TYPES_TO_RENDERER.get(mt);
- if (type == null) {
- throw new RuntimeException("unparseable content type " + mimeType);
- }
- return XhtmlRendererFactory.getRenderer(type);
- }
-
-
- private String parserAuthor(String line) {
- Matcher m = authorPattern.matcher(line);
- if (m.find()) {
- return m.group(1).trim();
- }
-
- return null;
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.code;
+
+import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.CPP;
+import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.GROOVY;
+import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.JAVA;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringReader;
+import java.nio.charset.Charset;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.ccil.cowan.tagsoup.HTMLSchema;
+import org.ccil.cowan.tagsoup.Schema;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import com.uwyn.jhighlight.renderer.Renderer;
+import com.uwyn.jhighlight.renderer.XhtmlRendererFactory;
+/**
+ * Generic Source code parser for Java, Groovy, C++.
+ * Aware: This parser uses JHightlight library (https://github.com/codelibs/jhighlight) under CDDL/LGPL dual license
+ *
+ * @author Hong-Thai.Nguyen
+ * @since 1.6
+ */
+public class SourceCodeParser implements Parser {
+
+ private static final long serialVersionUID = -4543476498190054160L;
+
+ private static final Pattern authorPattern = Pattern.compile("(?im)@author (.*) *$");
+
+ private static final Map<MediaType, String> TYPES_TO_RENDERER = new HashMap<MediaType, String>() {
+ private static final long serialVersionUID = -741976157563751152L;
+ {
+ put(MediaType.text("x-c++src"), CPP);
+ put(MediaType.text("x-java-source"), JAVA);
+ put(MediaType.text("x-groovy"), GROOVY);
+ }
+ };
+
+ private static final ServiceLoader LOADER = new ServiceLoader(SourceCodeParser.class.getClassLoader());
+
+ //Parse the HTML document
+ private static final Schema HTML_SCHEMA = new HTMLSchema();
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return TYPES_TO_RENDERER.keySet();
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+
+ try (AutoDetectReader reader = new AutoDetectReader(
+ new CloseShieldInputStream(stream), metadata,
+ context.get(ServiceLoader.class, LOADER))) {
+ Charset charset = reader.getCharset();
+ String mediaType = metadata.get(Metadata.CONTENT_TYPE);
+ String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+ if (mediaType != null && name != null) {
+ MediaType type = MediaType.parse(mediaType);
+ metadata.set(Metadata.CONTENT_TYPE, type.toString());
+ metadata.set(Metadata.CONTENT_ENCODING, charset.name());
+
+ StringBuilder out = new StringBuilder();
+ String line;
+ int nbLines = 0;
+ while ((line = reader.readLine()) != null) {
+ out.append(line + System.getProperty("line.separator"));
+ String author = parserAuthor(line);
+ if (author != null) {
+ metadata.add(TikaCoreProperties.CREATOR, author);
+ }
+ nbLines ++;
+ }
+ metadata.set("LoC", String.valueOf(nbLines));
+ Renderer renderer = getRenderer(type.toString());
+
+ String codeAsHtml = renderer.highlight(name, out.toString(), charset.name(), false);
+
+ Schema schema = context.get(Schema.class, HTML_SCHEMA);
+
+ org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();
+ parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
+ parser.setContentHandler(handler);
+ parser.parse(new InputSource(new StringReader(codeAsHtml)));
+ }
+ }
+
+ }
+
+ private Renderer getRenderer(String mimeType) {
+ MediaType mt = MediaType.parse(mimeType);
+ String type = TYPES_TO_RENDERER.get(mt);
+ if (type == null) {
+ throw new RuntimeException("unparseable content type " + mimeType);
+ }
+ return XhtmlRendererFactory.getRenderer(type);
+ }
+
+
+ private String parserAuthor(String line) {
+ Matcher m = authorPattern.matcher(line);
+ if (m.find()) {
+ return m.group(1).trim();
+ }
+
+ return null;
+ }
+}
[23/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java
index 101b26b..51dc5a5 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java
@@ -1,327 +1,327 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.lzx;
-
-import java.util.concurrent.CancellationException;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.core.ChmCommons.IntelState;
-import org.apache.tika.parser.chm.core.ChmCommons.LzxState;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.apache.tika.parser.chm.exception.ChmParsingException;
-
-public class ChmLzxState implements Cloneable {
- /* Class' members */
- private int window; /* the actual decoding window */
- private long window_size; /* window size (32Kb through 2Mb) */
- private int window_position; /* current offset within the window */
- private int main_tree_elements; /* number of main tree elements */
- private LzxState hadStarted; /* have we started decoding at all yet? */
- private int block_type; /* type of this block */
- private int block_length; /* uncompressed length of this block */
- private int block_remaining; /* uncompressed bytes still left to decode */
- private int frames_read; /* the number of CFDATA blocks processed */
- private int intel_file_size; /* magic header value used for transform */
- private long intel_current_possition; /* current offset in transform space */
- private IntelState intel_state; /* have we seen any translatable data yet? */
- private long R0; /* for the LRU offset system */
- private long R1; /* for the LRU offset system */
- private long R2; /* for the LRU offset system */
-
- // Trees - PRETREE, MAINTREE, LENGTH, ALIGNED
- protected short[] mainTreeLengtsTable;
- protected short[] mainTreeTable;
-
- protected short[] lengthTreeTable;
- protected short[] lengthTreeLengtsTable;
-
- protected short[] alignedLenTable;
- protected short[] alignedTreeTable;
-
- @Override
- public ChmLzxState clone() {
- try {
- ChmLzxState clone = (ChmLzxState)super.clone();
- clone.mainTreeLengtsTable = arrayClone(mainTreeLengtsTable);
- clone.mainTreeTable = arrayClone(mainTreeTable);
- clone.lengthTreeTable = arrayClone(lengthTreeTable);
- clone.lengthTreeLengtsTable = arrayClone(lengthTreeLengtsTable);
- clone.alignedLenTable = arrayClone(alignedLenTable);
- clone.alignedTreeTable = arrayClone(alignedTreeTable);
- return clone;
- } catch (CloneNotSupportedException ex) {
- return null;
- }
- }
-
- protected short[] getMainTreeTable() {
- return mainTreeTable;
- }
-
- protected short[] getAlignedTreeTable() {
- return alignedTreeTable;
- }
-
- protected void setAlignedTreeTable(short[] alignedTreeTable) {
- this.alignedTreeTable = alignedTreeTable;
- }
-
- protected short[] getLengthTreeTable() throws TikaException {
- if (lengthTreeTable != null)
- return this.lengthTreeTable;
- else
- throw new ChmParsingException("lengthTreeTable is null");
- }
-
- protected void setLengthTreeTable(short[] lengthTreeTable) {
- this.lengthTreeTable = lengthTreeTable;
- }
-
- protected void setMainTreeTable(short[] mainTreeTable) {
- this.mainTreeTable = mainTreeTable;
- }
-
- protected short[] getAlignedLenTable() {
- return this.alignedLenTable;
- }
-
- protected void setAlignedLenTable(short[] alignedLenTable) {
- this.alignedLenTable = alignedLenTable;
- }
-
- /**
- * It suits for informative outlook
- */
- public String toString() {
- StringBuilder sb = new StringBuilder();
- sb.append("actual decoding window:=" + getWindow()
- + System.getProperty("line.separator"));
- sb.append("window size (32Kb through 2Mb):=" + getWindowSize()
- + System.getProperty("line.separator"));
- sb.append("current offset within the window:=" + getWindowPosition()
- + System.getProperty("line.separator"));
- sb.append("number of main tree elements:=" + getMainTreeElements()
- + System.getProperty("line.separator"));
- sb.append("have we started decoding at all yet?:=" + getHadStarted()
- + System.getProperty("line.separator"));
- sb.append("type of this block:=" + getBlockType()
- + System.getProperty("line.separator"));
- sb.append("uncompressed length of this block:=" + getBlockLength()
- + System.getProperty("line.separator"));
- sb.append("uncompressed bytes still left to decode:="
- + getBlockRemaining() + System.getProperty("line.separator"));
- sb.append("the number of CFDATA blocks processed:=" + getFramesRead()
- + System.getProperty("line.separator"));
- sb.append("magic header value used for transform:="
- + getIntelFileSize() + System.getProperty("line.separator"));
- sb.append("current offset in transform space:="
- + getIntelCurrentPossition()
- + System.getProperty("line.separator"));
- sb.append("have we seen any translatable data yet?:=" + getIntelState()
- + System.getProperty("line.separator"));
- sb.append("R0 for the LRU offset system:=" + getR0()
- + System.getProperty("line.separator"));
- sb.append("R1 for the LRU offset system:=" + getR1()
- + System.getProperty("line.separator"));
- sb.append("R2 for the LRU offset system:=" + getR2()
- + System.getProperty("line.separator"));
- sb.append("main tree length:=" + getMainTreeLengtsTable().length
- + System.getProperty("line.separator"));
- sb.append("secondary tree length:=" + getLengthTreeLengtsTable().length
- + System.getProperty("line.separator"));
- return sb.toString();
- }
-
- public ChmLzxState(int window) throws TikaException {
- if (window >= 0) {
- int position_slots;
- int win = ChmCommons.getWindowSize(window);
- setWindowSize(1 << win);
- /* LZX supports window sizes of 2^15 (32Kb) through 2^21 (2Mb) */
- if (win < 15 || win > 21)
- throw new ChmParsingException("window less than 15 or window greater than 21");
-
- /* Calculates required position slots */
- if (win == 20)
- position_slots = 42;
- else if (win == 21)
- position_slots = 50;
- else
- position_slots = win << 1;
- //TODO: position_slots is not used ?
- setR0(1);
- setR1(1);
- setR2(1);
- setMainTreeElements(512);
- setHadStarted(LzxState.NOT_STARTED_DECODING);
- setFramesRead(0);
- setBlockRemaining(0);
- setBlockType(ChmConstants.LZX_BLOCKTYPE_INVALID);
- setIntelCurrentPossition(0);
- setIntelState(IntelState.NOT_STARTED);
- setWindowPosition(0);
- setMainTreeLengtsTable(new short[getMainTreeElements()]);
- setLengthTreeLengtsTable(new short[ChmConstants.LZX_NUM_SECONDARY_LENGTHS]);
- } else
- throw new CancellationException(
- "window size should be more than zero");
- }
-
- protected void setWindow(int window) {
- this.window = window;
- }
-
- protected int getWindow() {
- return window;
- }
-
- protected void setWindowSize(long window_size) {
- this.window_size = window_size;
- }
-
- protected long getWindowSize() {
- return window_size;
- }
-
- protected void setWindowPosition(int window_position) {
- this.window_position = window_position;
- }
-
- protected int getWindowPosition() {
- return window_position;
- }
-
- protected void setMainTreeElements(int main_tree_elements) {
- this.main_tree_elements = main_tree_elements;
- }
-
- protected int getMainTreeElements() {
- return main_tree_elements;
- }
-
- protected void setHadStarted(LzxState hadStarted) {
- this.hadStarted = hadStarted;
- }
-
- protected LzxState getHadStarted() {
- return hadStarted;
- }
-
- protected void setBlockType(int block_type) {
- this.block_type = block_type;
- }
-
- public int getBlockType() {
- return block_type;
- }
-
- protected void setBlockLength(int block_length) {
- this.block_length = block_length;
- }
-
- protected int getBlockLength() {
- return block_length;
- }
-
- protected void setBlockRemaining(int block_remaining) {
- this.block_remaining = block_remaining;
- }
-
- protected int getBlockRemaining() {
- return block_remaining;
- }
-
- protected void setFramesRead(int frames_read) {
- this.frames_read = frames_read;
- }
-
- protected void increaseFramesRead() {
- this.frames_read = getFramesRead() + 1;
- }
-
- protected int getFramesRead() {
- return frames_read;
- }
-
- protected void setIntelFileSize(int intel_file_size) {
- this.intel_file_size = intel_file_size;
- }
-
- protected int getIntelFileSize() {
- return intel_file_size;
- }
-
- protected void setIntelCurrentPossition(long intel_current_possition) {
- this.intel_current_possition = intel_current_possition;
- }
-
- protected long getIntelCurrentPossition() {
- return intel_current_possition;
- }
-
- protected void setIntelState(IntelState intel_state) {
- this.intel_state = intel_state;
- }
-
- protected IntelState getIntelState() {
- return intel_state;
- }
-
- protected void setR0(long r0) {
- R0 = r0;
- }
-
- protected long getR0() {
- return R0;
- }
-
- protected void setR1(long r1) {
- R1 = r1;
- }
-
- protected long getR1() {
- return R1;
- }
-
- protected void setR2(long r2) {
- R2 = r2;
- }
-
- protected long getR2() {
- return R2;
- }
-
- public void setMainTreeLengtsTable(short[] mainTreeLengtsTable) {
- this.mainTreeLengtsTable = mainTreeLengtsTable;
- }
-
- public short[] getMainTreeLengtsTable() {
- return mainTreeLengtsTable;
- }
-
- public void setLengthTreeLengtsTable(short[] lengthTreeLengtsTable) {
- this.lengthTreeLengtsTable = lengthTreeLengtsTable;
- }
-
- public short[] getLengthTreeLengtsTable() {
- return lengthTreeLengtsTable;
- }
-
- private static short[] arrayClone(short[] a) {
- return a==null ? null : (short[]) a.clone();
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.lzx;
+
+import java.util.concurrent.CancellationException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmCommons.IntelState;
+import org.apache.tika.parser.chm.core.ChmCommons.LzxState;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+public class ChmLzxState implements Cloneable {
+ /* Class' members */
+ private int window; /* the actual decoding window */
+ private long window_size; /* window size (32Kb through 2Mb) */
+ private int window_position; /* current offset within the window */
+ private int main_tree_elements; /* number of main tree elements */
+ private LzxState hadStarted; /* have we started decoding at all yet? */
+ private int block_type; /* type of this block */
+ private int block_length; /* uncompressed length of this block */
+ private int block_remaining; /* uncompressed bytes still left to decode */
+ private int frames_read; /* the number of CFDATA blocks processed */
+ private int intel_file_size; /* magic header value used for transform */
+ private long intel_current_possition; /* current offset in transform space */
+ private IntelState intel_state; /* have we seen any translatable data yet? */
+ private long R0; /* for the LRU offset system */
+ private long R1; /* for the LRU offset system */
+ private long R2; /* for the LRU offset system */
+
+ // Trees - PRETREE, MAINTREE, LENGTH, ALIGNED
+ protected short[] mainTreeLengtsTable;
+ protected short[] mainTreeTable;
+
+ protected short[] lengthTreeTable;
+ protected short[] lengthTreeLengtsTable;
+
+ protected short[] alignedLenTable;
+ protected short[] alignedTreeTable;
+
+ @Override
+ public ChmLzxState clone() {
+ try {
+ ChmLzxState clone = (ChmLzxState)super.clone();
+ clone.mainTreeLengtsTable = arrayClone(mainTreeLengtsTable);
+ clone.mainTreeTable = arrayClone(mainTreeTable);
+ clone.lengthTreeTable = arrayClone(lengthTreeTable);
+ clone.lengthTreeLengtsTable = arrayClone(lengthTreeLengtsTable);
+ clone.alignedLenTable = arrayClone(alignedLenTable);
+ clone.alignedTreeTable = arrayClone(alignedTreeTable);
+ return clone;
+ } catch (CloneNotSupportedException ex) {
+ return null;
+ }
+ }
+
+ protected short[] getMainTreeTable() {
+ return mainTreeTable;
+ }
+
+ protected short[] getAlignedTreeTable() {
+ return alignedTreeTable;
+ }
+
+ protected void setAlignedTreeTable(short[] alignedTreeTable) {
+ this.alignedTreeTable = alignedTreeTable;
+ }
+
+ protected short[] getLengthTreeTable() throws TikaException {
+ if (lengthTreeTable != null)
+ return this.lengthTreeTable;
+ else
+ throw new ChmParsingException("lengthTreeTable is null");
+ }
+
+ protected void setLengthTreeTable(short[] lengthTreeTable) {
+ this.lengthTreeTable = lengthTreeTable;
+ }
+
+ protected void setMainTreeTable(short[] mainTreeTable) {
+ this.mainTreeTable = mainTreeTable;
+ }
+
+ protected short[] getAlignedLenTable() {
+ return this.alignedLenTable;
+ }
+
+ protected void setAlignedLenTable(short[] alignedLenTable) {
+ this.alignedLenTable = alignedLenTable;
+ }
+
+ /**
+ * It suits for informative outlook
+ */
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("actual decoding window:=" + getWindow()
+ + System.getProperty("line.separator"));
+ sb.append("window size (32Kb through 2Mb):=" + getWindowSize()
+ + System.getProperty("line.separator"));
+ sb.append("current offset within the window:=" + getWindowPosition()
+ + System.getProperty("line.separator"));
+ sb.append("number of main tree elements:=" + getMainTreeElements()
+ + System.getProperty("line.separator"));
+ sb.append("have we started decoding at all yet?:=" + getHadStarted()
+ + System.getProperty("line.separator"));
+ sb.append("type of this block:=" + getBlockType()
+ + System.getProperty("line.separator"));
+ sb.append("uncompressed length of this block:=" + getBlockLength()
+ + System.getProperty("line.separator"));
+ sb.append("uncompressed bytes still left to decode:="
+ + getBlockRemaining() + System.getProperty("line.separator"));
+ sb.append("the number of CFDATA blocks processed:=" + getFramesRead()
+ + System.getProperty("line.separator"));
+ sb.append("magic header value used for transform:="
+ + getIntelFileSize() + System.getProperty("line.separator"));
+ sb.append("current offset in transform space:="
+ + getIntelCurrentPossition()
+ + System.getProperty("line.separator"));
+ sb.append("have we seen any translatable data yet?:=" + getIntelState()
+ + System.getProperty("line.separator"));
+ sb.append("R0 for the LRU offset system:=" + getR0()
+ + System.getProperty("line.separator"));
+ sb.append("R1 for the LRU offset system:=" + getR1()
+ + System.getProperty("line.separator"));
+ sb.append("R2 for the LRU offset system:=" + getR2()
+ + System.getProperty("line.separator"));
+ sb.append("main tree length:=" + getMainTreeLengtsTable().length
+ + System.getProperty("line.separator"));
+ sb.append("secondary tree length:=" + getLengthTreeLengtsTable().length
+ + System.getProperty("line.separator"));
+ return sb.toString();
+ }
+
+ public ChmLzxState(int window) throws TikaException {
+ if (window >= 0) {
+ int position_slots;
+ int win = ChmCommons.getWindowSize(window);
+ setWindowSize(1 << win);
+ /* LZX supports window sizes of 2^15 (32Kb) through 2^21 (2Mb) */
+ if (win < 15 || win > 21)
+ throw new ChmParsingException("window less than 15 or window greater than 21");
+
+ /* Calculates required position slots */
+ if (win == 20)
+ position_slots = 42;
+ else if (win == 21)
+ position_slots = 50;
+ else
+ position_slots = win << 1;
+ //TODO: position_slots is not used ?
+ setR0(1);
+ setR1(1);
+ setR2(1);
+ setMainTreeElements(512);
+ setHadStarted(LzxState.NOT_STARTED_DECODING);
+ setFramesRead(0);
+ setBlockRemaining(0);
+ setBlockType(ChmConstants.LZX_BLOCKTYPE_INVALID);
+ setIntelCurrentPossition(0);
+ setIntelState(IntelState.NOT_STARTED);
+ setWindowPosition(0);
+ setMainTreeLengtsTable(new short[getMainTreeElements()]);
+ setLengthTreeLengtsTable(new short[ChmConstants.LZX_NUM_SECONDARY_LENGTHS]);
+ } else
+ throw new CancellationException(
+ "window size should be more than zero");
+ }
+
+ protected void setWindow(int window) {
+ this.window = window;
+ }
+
+ protected int getWindow() {
+ return window;
+ }
+
+ protected void setWindowSize(long window_size) {
+ this.window_size = window_size;
+ }
+
+ protected long getWindowSize() {
+ return window_size;
+ }
+
+ protected void setWindowPosition(int window_position) {
+ this.window_position = window_position;
+ }
+
+ protected int getWindowPosition() {
+ return window_position;
+ }
+
+ protected void setMainTreeElements(int main_tree_elements) {
+ this.main_tree_elements = main_tree_elements;
+ }
+
+ protected int getMainTreeElements() {
+ return main_tree_elements;
+ }
+
+ protected void setHadStarted(LzxState hadStarted) {
+ this.hadStarted = hadStarted;
+ }
+
+ protected LzxState getHadStarted() {
+ return hadStarted;
+ }
+
+ protected void setBlockType(int block_type) {
+ this.block_type = block_type;
+ }
+
+ public int getBlockType() {
+ return block_type;
+ }
+
+ protected void setBlockLength(int block_length) {
+ this.block_length = block_length;
+ }
+
+ protected int getBlockLength() {
+ return block_length;
+ }
+
+ protected void setBlockRemaining(int block_remaining) {
+ this.block_remaining = block_remaining;
+ }
+
+ protected int getBlockRemaining() {
+ return block_remaining;
+ }
+
+ protected void setFramesRead(int frames_read) {
+ this.frames_read = frames_read;
+ }
+
+ protected void increaseFramesRead() {
+ this.frames_read = getFramesRead() + 1;
+ }
+
+ protected int getFramesRead() {
+ return frames_read;
+ }
+
+ protected void setIntelFileSize(int intel_file_size) {
+ this.intel_file_size = intel_file_size;
+ }
+
+ protected int getIntelFileSize() {
+ return intel_file_size;
+ }
+
+ protected void setIntelCurrentPossition(long intel_current_possition) {
+ this.intel_current_possition = intel_current_possition;
+ }
+
+ protected long getIntelCurrentPossition() {
+ return intel_current_possition;
+ }
+
+ protected void setIntelState(IntelState intel_state) {
+ this.intel_state = intel_state;
+ }
+
+ protected IntelState getIntelState() {
+ return intel_state;
+ }
+
+ protected void setR0(long r0) {
+ R0 = r0;
+ }
+
+ protected long getR0() {
+ return R0;
+ }
+
+ protected void setR1(long r1) {
+ R1 = r1;
+ }
+
+ protected long getR1() {
+ return R1;
+ }
+
+ protected void setR2(long r2) {
+ R2 = r2;
+ }
+
+ protected long getR2() {
+ return R2;
+ }
+
+ public void setMainTreeLengtsTable(short[] mainTreeLengtsTable) {
+ this.mainTreeLengtsTable = mainTreeLengtsTable;
+ }
+
+ public short[] getMainTreeLengtsTable() {
+ return mainTreeLengtsTable;
+ }
+
+ public void setLengthTreeLengtsTable(short[] lengthTreeLengtsTable) {
+ this.lengthTreeLengtsTable = lengthTreeLengtsTable;
+ }
+
+ public short[] getLengthTreeLengtsTable() {
+ return lengthTreeLengtsTable;
+ }
+
+ private static short[] arrayClone(short[] a) {
+ return a==null ? null : (short[]) a.clone();
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java
index c8944be..77f9b3a 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java
@@ -1,222 +1,222 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.lzx;
-
-import java.math.BigInteger;
-import java.util.Arrays;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.core.ChmCommons;
-
-public class ChmSection {
- final private byte[] data;
- final private byte[] prevcontent;
- private int swath;// kiks
- private int total;// remains
- private int buffer;// val
-
- public ChmSection(byte[] data) throws TikaException {
- this(data, null);
- }
-
- public ChmSection(byte[] data, byte[] prevconent) throws TikaException {
- ChmCommons.assertByteArrayNotNull(data);
- this.data = data;
- this.prevcontent = prevconent;
- //setData(data);
- }
-
- /* Utilities */
- public byte[] reverseByteOrder(byte[] toBeReversed) throws TikaException {
- ChmCommons.assertByteArrayNotNull(toBeReversed);
- ChmCommons.reverse(toBeReversed);
- return toBeReversed;
- }
-
- public int checkBit(int i) {
- return ((getBuffer() & (1 << (getTotal() - i))) == 0) ? 0 : 1;
- }
-
- public int getSyncBits(int bit) {
- return getDesyncBits(bit, bit);
- }
-
- public int peekBits(int bit) {
- return getDesyncBits(bit, 0);
- }
-
- private int getDesyncBits(int bit, int removeBit) {
- while (getTotal() < 16) {
- setBuffer((getBuffer() << 16) + unmarshalUByte()
- + (unmarshalUByte() << 8));
- setTotal(getTotal() + 16);
- }
- int tmp = (getBuffer() >>> (getTotal() - bit));
- setTotal(getTotal() - removeBit);
- setBuffer(getBuffer() - ((getBuffer() >>> getTotal()) << getTotal()));
- return tmp;
- }
-
- public int unmarshalUByte() {
- return getByte() & 255;
- }
-
- public byte getByte() {
- if (getSwath() < getData().length) {
- setSwath(getSwath() + 1);
- return getData()[getSwath() - 1];
- } else
- return 0;
- }
-
- public int getLeft() {
- return (getData().length - getSwath());
- }
-
- public byte[] getData() {
- return data;
- }
-
- public byte[] getPrevContent() {
- return prevcontent;
- }
-
- public BigInteger getBigInteger(int i) {
- if (getData() == null)
- return BigInteger.ZERO;
- if (getData().length - getSwath() < i)
- i = getData().length - getSwath();
- byte[] tmp = new byte[i];
- for (int j = i - 1; j >= 0; j--) {
- tmp[i - j - 1] = getData()[getSwath() + j];
- }
- setSwath(getSwath() + i);
- return new BigInteger(tmp);
- }
-
- public byte[] stringToAsciiBytes(String s) {
- char[] c = s.toCharArray();
- byte[] byteval = new byte[c.length];
- for (int i = 0; i < c.length; i++)
- byteval[i] = (byte) c[i];
- return byteval;
- }
-
- public BigInteger unmarshalUlong() {
- return getBigInteger(8);
- }
-
- public long unmarshalUInt() {
- return getBigInteger(4).longValue();
- }
-
- public int unmarshalInt() {
- return getBigInteger(4).intValue();
- }
-
- public byte[] unmarshalBytes(int i) {
- if (i == 0)
- return new byte[1];
- byte[] t = new byte[i];
- for (int j = 0; j < i; j++)
- t[j] = getData()[j + getSwath()];
- setSwath(getSwath() + i);
- return t;
- }
-
- public BigInteger getEncint() {
- byte ob;
- BigInteger bi = BigInteger.ZERO;
- byte[] nb = new byte[1];
- while ((ob = this.getByte()) < 0) {
- nb[0] = (byte) ((ob & 0x7f));
- bi = bi.shiftLeft(7).add(new BigInteger(nb));
- }
- nb[0] = (byte) ((ob & 0x7f));
- bi = bi.shiftLeft(7).add(new BigInteger(nb));
- return bi;
- }
-
- public char unmarshalUtfChar() {
- byte ob;
- int i = 1;
- byte[] ba;
- ob = this.getByte();
- if (ob < 0) {
- i = 2;
- while ((ob << (24 + i)) < 0)
- i++;
- }
- ba = new byte[i];
- ba[0] = ob;
- int j = 1;
- while (j < i) {
- ba[j] = this.getByte();
- j++;
- }
- i = ba.length;
- if (i == 1)
- return (char) ba[0];
- else {
- int n;
- n = ba[0] & 15; // 00001111b, gets last 4 bits
- j = 1;
- while (j < i)
- n = (n << 6) + (ba[j++] & 63);// 00111111b,gets last 6 bits
- return (char) n;
- }
- }
-
-// private void setData(byte[] data) {
-// this.data = data;
-// }
-
- public int getSwath() {
- return swath;
- }
-
- public void setSwath(int swath) {
- this.swath = swath;
- }
-
- public int getTotal() {
- return total;
- }
-
- public void setTotal(int total) {
- this.total = total;
- }
-
- private int getBuffer() {
- return buffer;
- }
-
- private void setBuffer(int buffer) {
- this.buffer = buffer;
- }
-
- /**
- * @param args
- * @throws TikaException
- */
- public static void main(String[] args) throws TikaException {
- byte[] array = { 4, 78, -67, 90, 1, -33 };
- ChmSection chmSection = new ChmSection(array);
- System.out.println("before " + Arrays.toString(array));
- System.out.println("after " + Arrays.toString(chmSection.reverseByteOrder(array)));
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.lzx;
+
+import java.math.BigInteger;
+import java.util.Arrays;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.core.ChmCommons;
+
+/**
+ * Sequential reader over the raw bytes of one CHM section.
+ * <p>
+ * Two cursors are maintained: a byte position ({@code swath}) advanced by
+ * {@link #getByte()} and the {@code unmarshal*} methods, and a bit accumulator
+ * ({@code buffer}/{@code total}) that {@link #getSyncBits(int)} and
+ * {@link #peekBits(int)} refill from the byte cursor 16 bits at a time.
+ * Instances carry mutable cursor state and perform no synchronization.
+ */
+public class ChmSection {
+    /** Upper bound on the sequence length derived from a lead byte in {@link #unmarshalUtfChar()}. */
+    private static final int MAX_UTF_SEQUENCE_LENGTH = 7;
+
+    private final byte[] data;
+    private final byte[] prevcontent;
+    private int swath; // index of the next unread byte in data
+    private int total; // number of valid bits currently held in buffer
+    private int buffer; // bit accumulator
+
+    /**
+     * Creates a section over {@code data} with no previous content.
+     *
+     * @param data section bytes; checked with ChmCommons.assertByteArrayNotNull
+     * @throws TikaException propagated from the not-null check
+     */
+    public ChmSection(byte[] data) throws TikaException {
+        this(data, null);
+    }
+
+    /**
+     * Creates a section over {@code data}, remembering {@code prevconent} so it can
+     * be retrieved later through {@link #getPrevContent()}.
+     *
+     * @param data       section bytes; checked with ChmCommons.assertByteArrayNotNull
+     * @param prevconent previously decoded content, may be {@code null}
+     * @throws TikaException propagated from the not-null check
+     */
+    public ChmSection(byte[] data, byte[] prevconent) throws TikaException {
+        ChmCommons.assertByteArrayNotNull(data);
+        this.data = data;
+        this.prevcontent = prevconent;
+    }
+
+    /* Utilities */
+
+    /**
+     * Passes the array to ChmCommons.reverse and returns the same array instance.
+     *
+     * @param toBeReversed array to reverse; checked with ChmCommons.assertByteArrayNotNull
+     * @return {@code toBeReversed} itself
+     * @throws TikaException propagated from the not-null check
+     */
+    public byte[] reverseByteOrder(byte[] toBeReversed) throws TikaException {
+        ChmCommons.assertByteArrayNotNull(toBeReversed);
+        ChmCommons.reverse(toBeReversed);
+        return toBeReversed;
+    }
+
+    /**
+     * Tests the accumulator bit at position {@code total - i}.
+     *
+     * @return 1 if that bit is set, 0 otherwise
+     */
+    public int checkBit(int i) {
+        return ((getBuffer() & (1 << (getTotal() - i))) == 0) ? 0 : 1;
+    }
+
+    /**
+     * Reads {@code bit} bits from the accumulator and consumes them.
+     */
+    public int getSyncBits(int bit) {
+        return getDesyncBits(bit, bit);
+    }
+
+    /**
+     * Reads {@code bit} bits from the accumulator without consuming any.
+     */
+    public int peekBits(int bit) {
+        return getDesyncBits(bit, 0);
+    }
+
+    /**
+     * Returns the top {@code bit} bits of the accumulator and then consumes
+     * {@code removeBit} of them.
+     */
+    private int getDesyncBits(int bit, int removeBit) {
+        // Refill: each step appends one 16-bit word whose low byte is read first.
+        while (getTotal() < 16) {
+            setBuffer((getBuffer() << 16) + unmarshalUByte()
+                    + (unmarshalUByte() << 8));
+            setTotal(getTotal() + 16);
+        }
+        int tmp = (getBuffer() >>> (getTotal() - bit));
+        setTotal(getTotal() - removeBit);
+        // Clear every bit at or above the new valid-bit count.
+        setBuffer(getBuffer() - ((getBuffer() >>> getTotal()) << getTotal()));
+        return tmp;
+    }
+
+    /**
+     * Reads the next byte as an unsigned value in the range 0..255.
+     */
+    public int unmarshalUByte() {
+        return getByte() & 255;
+    }
+
+    /**
+     * Returns the next byte and advances the byte cursor; once the data is
+     * exhausted it returns 0 and leaves the cursor where it is.
+     */
+    public byte getByte() {
+        if (getSwath() < getData().length) {
+            setSwath(getSwath() + 1);
+            return getData()[getSwath() - 1];
+        } else
+            return 0;
+    }
+
+    /**
+     * @return number of bytes between the byte cursor and the end of the data
+     */
+    public int getLeft() {
+        return (getData().length - getSwath());
+    }
+
+    public byte[] getData() {
+        return data;
+    }
+
+    public byte[] getPrevContent() {
+        return prevcontent;
+    }
+
+    /**
+     * Reads up to {@code i} bytes, least significant byte first, and returns them
+     * as a two's-complement {@link BigInteger}. Fewer bytes are used when the data
+     * runs out; the cursor advances by the number of bytes actually consumed.
+     *
+     * @param i number of bytes requested
+     * @return the decoded value, or {@link BigInteger#ZERO} when no byte can be read
+     */
+    public BigInteger getBigInteger(int i) {
+        if (getData() == null)
+            return BigInteger.ZERO;
+        int count = Math.min(i, getData().length - getSwath());
+        // new BigInteger(byte[]) rejects an empty array, so an exhausted section
+        // (or a non-positive request) yields zero instead of an exception.
+        if (count <= 0)
+            return BigInteger.ZERO;
+        byte[] tmp = new byte[count];
+        // Reverse while copying: BigInteger expects the most significant byte first.
+        for (int j = 0; j < count; j++) {
+            tmp[j] = getData()[getSwath() + count - 1 - j];
+        }
+        setSwath(getSwath() + count);
+        return new BigInteger(tmp);
+    }
+
+    /**
+     * Converts each char of {@code s} to a byte by narrowing cast, so only the low
+     * eight bits of every char are kept.
+     */
+    public byte[] stringToAsciiBytes(String s) {
+        char[] c = s.toCharArray();
+        byte[] byteval = new byte[c.length];
+        for (int i = 0; i < c.length; i++)
+            byteval[i] = (byte) c[i];
+        return byteval;
+    }
+
+    /**
+     * Reads eight bytes through {@link #getBigInteger(int)}.
+     */
+    public BigInteger unmarshalUlong() {
+        return getBigInteger(8);
+    }
+
+    /**
+     * Reads four bytes through {@link #getBigInteger(int)} and widens to long.
+     * NOTE(review): the bytes are decoded as two's complement, so a value with the
+     * top bit set comes back negative despite the "U" in the name; confirm what
+     * callers expect before changing it.
+     */
+    public long unmarshalUInt() {
+        return getBigInteger(4).longValue();
+    }
+
+    /**
+     * Reads four bytes through {@link #getBigInteger(int)} as a signed int.
+     */
+    public int unmarshalInt() {
+        return getBigInteger(4).intValue();
+    }
+
+    /**
+     * Copies the next {@code i} bytes and advances the byte cursor past them.
+     * A request for zero bytes returns a one-element array and moves nothing
+     * (kept as-is for existing callers).
+     */
+    public byte[] unmarshalBytes(int i) {
+        if (i == 0)
+            return new byte[1];
+        byte[] t = new byte[i];
+        for (int j = 0; j < i; j++)
+            t[j] = getData()[j + getSwath()];
+        setSwath(getSwath() + i);
+        return t;
+    }
+
+    /**
+     * Reads a variable-length integer: every byte contributes its low seven bits,
+     * most significant group first, and a byte whose top bit is clear ends the value.
+     */
+    public BigInteger getEncint() {
+        BigInteger bi = BigInteger.ZERO;
+        byte ob;
+        do {
+            ob = this.getByte();
+            bi = bi.shiftLeft(7).add(BigInteger.valueOf(ob & 0x7f));
+        } while (ob < 0); // top bit set means another group follows
+        return bi;
+    }
+
+    /**
+     * Decodes one UTF-8 style multi-byte sequence into a char. The sequence length
+     * is taken from the count of leading one-bits of the first byte; the result is
+     * narrowed to 16 bits.
+     */
+    public char unmarshalUtfChar() {
+        byte lead = this.getByte();
+        if (lead >= 0)
+            return (char) lead;
+
+        // Count leading one-bits. The cap is required: for 0xFF the shifted value
+        // stays negative for every shift count, so an unbounded scan never ends.
+        int length = 2;
+        while (length < MAX_UTF_SEQUENCE_LENGTH && (lead << (24 + length)) < 0)
+            length++;
+
+        // Payload bits of the lead byte depend on the length: 5 for two-byte
+        // sequences, 4 for three-byte sequences, and so on.
+        int n = lead & (0xFF >> (length + 1));
+        for (int k = 1; k < length; k++)
+            n = (n << 6) + (this.getByte() & 63); // low 6 bits of each continuation byte
+        return (char) n;
+    }
+
+    public int getSwath() {
+        return swath;
+    }
+
+    public void setSwath(int swath) {
+        this.swath = swath;
+    }
+
+    public int getTotal() {
+        return total;
+    }
+
+    public void setTotal(int total) {
+        this.total = total;
+    }
+
+    private int getBuffer() {
+        return buffer;
+    }
+
+    private void setBuffer(int buffer) {
+        this.buffer = buffer;
+    }
+
+    /**
+     * Small manual demo: prints a sample array before and after
+     * {@link #reverseByteOrder(byte[])}.
+     *
+     * @param args unused
+     * @throws TikaException propagated from the constructor or reverseByteOrder
+     */
+    public static void main(String[] args) throws TikaException {
+        byte[] array = { 4, 78, -67, 90, 1, -33 };
+        ChmSection chmSection = new ChmSection(array);
+        System.out.println("before " + Arrays.toString(array));
+        System.out.println("after " + Arrays.toString(chmSection.reverseByteOrder(array)));
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
index 0e0e3da..86b1dd4 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
@@ -1,209 +1,209 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mbox;
-
-import java.io.BufferedReader;
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.text.ParseException;
-import java.text.SimpleDateFormat;
-import java.util.Collections;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.LinkedList;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Queue;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Mbox (mailbox) parser. This version extracts each mail from Mbox and uses the
- * DelegatingParser to process each mail.
- */
-public class MboxParser extends AbstractParser {
-
- public static final String MBOX_MIME_TYPE = "application/mbox";
- public static final String MBOX_RECORD_DIVIDER = "From ";
- public static final int MAIL_MAX_SIZE = 50000000;
- /**
- * Serial version UID
- */
- private static final long serialVersionUID = -1762689436731160661L;
- private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("mbox"));
- private static final Pattern EMAIL_HEADER_PATTERN = Pattern.compile("([^ ]+):[ \t]*(.*)");
- private static final Pattern EMAIL_ADDRESS_PATTERN = Pattern.compile("<(.*@.*)>");
-
- private static final String EMAIL_HEADER_METADATA_PREFIX = "MboxParser-";
- private static final String EMAIL_FROMLINE_METADATA = EMAIL_HEADER_METADATA_PREFIX + "from";
- private final Map<Integer, Metadata> trackingMetadata = new HashMap<Integer, Metadata>();
- private boolean tracking = false;
-
- public static Date parseDate(String headerContent) throws ParseException {
- SimpleDateFormat dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US);
- return dateFormat.parse(headerContent);
- }
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
- throws IOException, TikaException, SAXException {
-
- EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class,
- new ParsingEmbeddedDocumentExtractor(context));
-
- String charsetName = "windows-1252";
-
- metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
- metadata.set(Metadata.CONTENT_ENCODING, charsetName);
-
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
-
- InputStreamReader isr = new InputStreamReader(stream, charsetName);
- try (BufferedReader reader = new BufferedReader(isr)) {
- String curLine = reader.readLine();
- int mailItem = 0;
- do {
- if (curLine.startsWith(MBOX_RECORD_DIVIDER)) {
- Metadata mailMetadata = new Metadata();
- Queue<String> multiline = new LinkedList<String>();
- mailMetadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
- mailMetadata.set(Metadata.CONTENT_TYPE, "message/rfc822");
- curLine = reader.readLine();
-
- ByteArrayOutputStream message = new ByteArrayOutputStream(100000);
- do {
- if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
- String latestLine = multiline.poll();
- latestLine += " " + curLine.trim();
- multiline.add(latestLine);
- } else {
- multiline.add(curLine);
- }
-
- message.write(curLine.getBytes(charsetName));
- message.write(0x0A);
- curLine = reader.readLine();
- }
- while (curLine != null && !curLine.startsWith(MBOX_RECORD_DIVIDER) && message.size() < MAIL_MAX_SIZE);
-
- for (String item : multiline) {
- saveHeaderInMetadata(mailMetadata, item);
- }
-
- ByteArrayInputStream messageStream = new ByteArrayInputStream(message.toByteArray());
- message = null;
-
- if (extractor.shouldParseEmbedded(mailMetadata)) {
- extractor.parseEmbedded(messageStream, xhtml, mailMetadata, true);
- }
-
- if (tracking) {
- getTrackingMetadata().put(mailItem++, mailMetadata);
- }
- } else {
- curLine = reader.readLine();
- }
-
- } while (curLine != null && !Thread.currentThread().isInterrupted());
- }
-
- xhtml.endDocument();
- }
-
- public boolean isTracking() {
- return tracking;
- }
-
- public void setTracking(boolean tracking) {
- this.tracking = tracking;
- }
-
- public Map<Integer, Metadata> getTrackingMetadata() {
- return trackingMetadata;
- }
-
- private void saveHeaderInMetadata(Metadata metadata, String curLine) {
- Matcher headerMatcher = EMAIL_HEADER_PATTERN.matcher(curLine);
- if (!headerMatcher.matches()) {
- return; // ignore malformed header lines
- }
-
- String headerTag = headerMatcher.group(1).toLowerCase(Locale.ROOT);
- String headerContent = headerMatcher.group(2);
-
- if (headerTag.equalsIgnoreCase("From")) {
- metadata.set(TikaCoreProperties.CREATOR, headerContent);
- } else if (headerTag.equalsIgnoreCase("To") || headerTag.equalsIgnoreCase("Cc")
- || headerTag.equalsIgnoreCase("Bcc")) {
- Matcher address = EMAIL_ADDRESS_PATTERN.matcher(headerContent);
- if (address.find()) {
- metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, address.group(1));
- } else if (headerContent.indexOf('@') > -1) {
- metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, headerContent);
- }
-
- String property = Metadata.MESSAGE_TO;
- if (headerTag.equalsIgnoreCase("Cc")) {
- property = Metadata.MESSAGE_CC;
- } else if (headerTag.equalsIgnoreCase("Bcc")) {
- property = Metadata.MESSAGE_BCC;
- }
- metadata.add(property, headerContent);
- } else if (headerTag.equalsIgnoreCase("Subject")) {
- metadata.add(Metadata.SUBJECT, headerContent);
- } else if (headerTag.equalsIgnoreCase("Date")) {
- try {
- Date date = parseDate(headerContent);
- metadata.set(TikaCoreProperties.CREATED, date);
- } catch (ParseException e) {
- // ignoring date because format was not understood
- }
- } else if (headerTag.equalsIgnoreCase("Message-Id")) {
- metadata.set(TikaCoreProperties.IDENTIFIER, headerContent);
- } else if (headerTag.equalsIgnoreCase("In-Reply-To")) {
- metadata.set(TikaCoreProperties.RELATION, headerContent);
- } else if (headerTag.equalsIgnoreCase("Content-Type")) {
- // TODO - key off content-type in headers to
- // set mapping to use for content and convert if necessary.
-
- metadata.add(Metadata.CONTENT_TYPE, headerContent);
- metadata.set(TikaCoreProperties.FORMAT, headerContent);
- } else {
- metadata.add(EMAIL_HEADER_METADATA_PREFIX + headerTag, headerContent);
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Collections;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Queue;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Mbox (mailbox) parser. Splits the mailbox on "From " divider lines and hands
+ * each message to the {@link EmbeddedDocumentExtractor} found in the parse
+ * context (a {@link ParsingEmbeddedDocumentExtractor} by default).
+ */
+public class MboxParser extends AbstractParser {
+
+    public static final String MBOX_MIME_TYPE = "application/mbox";
+    public static final String MBOX_RECORD_DIVIDER = "From ";
+    /** Size in bytes after which collecting lines for a single message stops. */
+    public static final int MAIL_MAX_SIZE = 50000000;
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -1762689436731160661L;
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("mbox"));
+    private static final Pattern EMAIL_HEADER_PATTERN = Pattern.compile("([^ ]+):[ \t]*(.*)");
+    private static final Pattern EMAIL_ADDRESS_PATTERN = Pattern.compile("<(.*@.*)>");
+
+    private static final String EMAIL_HEADER_METADATA_PREFIX = "MboxParser-";
+    private static final String EMAIL_FROMLINE_METADATA = EMAIL_HEADER_METADATA_PREFIX + "from";
+    /** Per-message metadata keyed by message index; only filled while {@link #isTracking()} is true. */
+    private final Map<Integer, Metadata> trackingMetadata = new HashMap<Integer, Metadata>();
+    private boolean tracking = false;
+
+    /**
+     * Parses a header date of the form {@code EEE, d MMM yyyy HH:mm:ss Z} using
+     * {@link Locale#US}. A fresh formatter is created on every call.
+     *
+     * @param headerContent raw value of a Date header
+     * @return the parsed date
+     * @throws ParseException if the value does not match the pattern
+     */
+    public static Date parseDate(String headerContent) throws ParseException {
+        SimpleDateFormat dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US);
+        return dateFormat.parse(headerContent);
+    }
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Reads the mailbox line by line as windows-1252 text. Every line starting
+     * with {@link #MBOX_RECORD_DIVIDER} opens a message; the lines up to the next
+     * divider (or {@link #MAIL_MAX_SIZE} bytes) are buffered, scanned for headers
+     * and passed to the embedded document extractor. Lines before the first
+     * divider are ignored. Parsing stops early if the current thread is interrupted.
+     * <p>
+     * NOTE(review): closing the reader also closes the caller-supplied stream;
+     * confirm that this matches what callers of parse expect.
+     */
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, TikaException, SAXException {
+
+        EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class,
+                new ParsingEmbeddedDocumentExtractor(context));
+
+        String charsetName = "windows-1252";
+
+        metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
+        metadata.set(Metadata.CONTENT_ENCODING, charsetName);
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        InputStreamReader isr = new InputStreamReader(stream, charsetName);
+        try (BufferedReader reader = new BufferedReader(isr)) {
+            String curLine = reader.readLine();
+            int mailItem = 0;
+            // A null first line means an empty mailbox, so test before dereferencing.
+            while (curLine != null && !Thread.currentThread().isInterrupted()) {
+                if (!curLine.startsWith(MBOX_RECORD_DIVIDER)) {
+                    curLine = reader.readLine();
+                    continue;
+                }
+
+                Metadata mailMetadata = new Metadata();
+                // Declared as LinkedList so the most recent entry can be taken back with pollLast().
+                LinkedList<String> multiline = new LinkedList<String>();
+                mailMetadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
+                mailMetadata.set(Metadata.CONTENT_TYPE, "message/rfc822");
+                curLine = reader.readLine();
+                if (curLine == null) {
+                    // Divider was the last line of the mailbox: nothing to hand over.
+                    break;
+                }
+
+                ByteArrayOutputStream message = new ByteArrayOutputStream(100000);
+                do {
+                    if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
+                        // Folded line: it continues the line added most recently.
+                        String latestLine = multiline.pollLast();
+                        multiline.add(latestLine == null ? curLine.trim() : latestLine + " " + curLine.trim());
+                    } else {
+                        multiline.add(curLine);
+                    }
+
+                    message.write(curLine.getBytes(charsetName));
+                    message.write(0x0A);
+                    curLine = reader.readLine();
+                }
+                while (curLine != null && !curLine.startsWith(MBOX_RECORD_DIVIDER) && message.size() < MAIL_MAX_SIZE);
+
+                for (String item : multiline) {
+                    saveHeaderInMetadata(mailMetadata, item);
+                }
+
+                ByteArrayInputStream messageStream = new ByteArrayInputStream(message.toByteArray());
+
+                if (extractor.shouldParseEmbedded(mailMetadata)) {
+                    extractor.parseEmbedded(messageStream, xhtml, mailMetadata, true);
+                }
+
+                if (tracking) {
+                    getTrackingMetadata().put(mailItem++, mailMetadata);
+                }
+            }
+        }
+
+        xhtml.endDocument();
+    }
+
+    public boolean isTracking() {
+        return tracking;
+    }
+
+    public void setTracking(boolean tracking) {
+        this.tracking = tracking;
+    }
+
+    public Map<Integer, Metadata> getTrackingMetadata() {
+        return trackingMetadata;
+    }
+
+    /**
+     * Maps one "Name: value" line onto {@code metadata}. Lines that do not match
+     * the header pattern are ignored; headers without a dedicated mapping are
+     * stored under "MboxParser-" plus the lower-cased header name.
+     */
+    private void saveHeaderInMetadata(Metadata metadata, String curLine) {
+        Matcher headerMatcher = EMAIL_HEADER_PATTERN.matcher(curLine);
+        if (!headerMatcher.matches()) {
+            return; // ignore malformed header lines
+        }
+
+        String headerTag = headerMatcher.group(1).toLowerCase(Locale.ROOT);
+        String headerContent = headerMatcher.group(2);
+
+        if (headerTag.equalsIgnoreCase("From")) {
+            metadata.set(TikaCoreProperties.CREATOR, headerContent);
+        } else if (headerTag.equalsIgnoreCase("To") || headerTag.equalsIgnoreCase("Cc")
+                || headerTag.equalsIgnoreCase("Bcc")) {
+            // Prefer the address inside <...>; otherwise fall back to the raw value if it contains '@'.
+            Matcher address = EMAIL_ADDRESS_PATTERN.matcher(headerContent);
+            if (address.find()) {
+                metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, address.group(1));
+            } else if (headerContent.indexOf('@') > -1) {
+                metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, headerContent);
+            }
+
+            String property = Metadata.MESSAGE_TO;
+            if (headerTag.equalsIgnoreCase("Cc")) {
+                property = Metadata.MESSAGE_CC;
+            } else if (headerTag.equalsIgnoreCase("Bcc")) {
+                property = Metadata.MESSAGE_BCC;
+            }
+            metadata.add(property, headerContent);
+        } else if (headerTag.equalsIgnoreCase("Subject")) {
+            metadata.add(Metadata.SUBJECT, headerContent);
+        } else if (headerTag.equalsIgnoreCase("Date")) {
+            try {
+                Date date = parseDate(headerContent);
+                metadata.set(TikaCoreProperties.CREATED, date);
+            } catch (ParseException e) {
+                // ignoring date because format was not understood
+            }
+        } else if (headerTag.equalsIgnoreCase("Message-Id")) {
+            metadata.set(TikaCoreProperties.IDENTIFIER, headerContent);
+        } else if (headerTag.equalsIgnoreCase("In-Reply-To")) {
+            metadata.set(TikaCoreProperties.RELATION, headerContent);
+        } else if (headerTag.equalsIgnoreCase("Content-Type")) {
+            // TODO - key off content-type in headers to
+            // set mapping to use for content and convert if necessary.
+
+            metadata.add(Metadata.CONTENT_TYPE, headerContent);
+            metadata.set(TikaCoreProperties.FORMAT, headerContent);
+        } else {
+            metadata.add(EMAIL_HEADER_METADATA_PREFIX + headerTag, headerContent);
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
index f7eec91..5883bd5 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
@@ -1,203 +1,203 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mbox;
-
-import static java.lang.String.valueOf;
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static java.util.Collections.singleton;
-
-import java.io.ByteArrayInputStream;
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Set;
-
-import com.pff.PSTAttachment;
-import com.pff.PSTFile;
-import com.pff.PSTFolder;
-import com.pff.PSTMessage;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-/**
- * Parser for MS Outlook PST email storage files
- */
-public class OutlookPSTParser extends AbstractParser {
-
- private static final long serialVersionUID = 620998217748364063L;
-
- public static final MediaType MS_OUTLOOK_PST_MIMETYPE = MediaType.application("vnd.ms-outlook-pst");
- private static final Set<MediaType> SUPPORTED_TYPES = singleton(MS_OUTLOOK_PST_MIMETYPE);
-
- private static AttributesImpl createAttribute(String attName, String attValue) {
- AttributesImpl attributes = new AttributesImpl();
- attributes.addAttribute("", attName, attName, "CDATA", attValue);
- return attributes;
- }
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
-
- // Use the delegate parser to parse the contained document
- EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class,
- new ParsingEmbeddedDocumentExtractor(context));
-
- metadata.set(Metadata.CONTENT_TYPE, MS_OUTLOOK_PST_MIMETYPE.toString());
-
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
-
- TikaInputStream in = TikaInputStream.get(stream);
- PSTFile pstFile = null;
- try {
- pstFile = new PSTFile(in.getFile().getPath());
- metadata.set(Metadata.CONTENT_LENGTH, valueOf(pstFile.getFileHandle().length()));
- boolean isValid = pstFile.getFileHandle().getFD().valid();
- metadata.set("isValid", valueOf(isValid));
- if (isValid) {
- parseFolder(xhtml, pstFile.getRootFolder(), embeddedExtractor);
- }
- } catch (Exception e) {
- throw new TikaException(e.getMessage(), e);
- } finally {
- if (pstFile != null && pstFile.getFileHandle() != null) {
- try {
- pstFile.getFileHandle().close();
- } catch (IOException e) {
- //swallow closing exception
- }
- }
- }
-
- xhtml.endDocument();
- }
-
- private void parseFolder(XHTMLContentHandler handler, PSTFolder pstFolder, EmbeddedDocumentExtractor embeddedExtractor)
- throws Exception {
- if (pstFolder.getContentCount() > 0) {
- PSTMessage pstMail = (PSTMessage) pstFolder.getNextChild();
- while (pstMail != null) {
- AttributesImpl attributes = new AttributesImpl();
- attributes.addAttribute("", "class", "class", "CDATA", "embedded");
- attributes.addAttribute("", "id", "id", "CDATA", pstMail.getInternetMessageId());
- handler.startElement("div", attributes);
- handler.element("h1", pstMail.getSubject());
-
- parserMailItem(handler, pstMail, embeddedExtractor);
- parseMailAttachments(handler, pstMail, embeddedExtractor);
-
- handler.endElement("div");
-
- pstMail = (PSTMessage) pstFolder.getNextChild();
- }
- }
-
- if (pstFolder.hasSubfolders()) {
- for (PSTFolder pstSubFolder : pstFolder.getSubFolders()) {
- handler.startElement("div", createAttribute("class", "email-folder"));
- handler.element("h1", pstSubFolder.getDisplayName());
- parseFolder(handler, pstSubFolder, embeddedExtractor);
- handler.endElement("div");
- }
- }
- }
-
- private void parserMailItem(XHTMLContentHandler handler, PSTMessage pstMail, EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException {
- Metadata mailMetadata = new Metadata();
- mailMetadata.set(Metadata.RESOURCE_NAME_KEY, pstMail.getInternetMessageId());
- mailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, pstMail.getInternetMessageId());
- mailMetadata.set(TikaCoreProperties.IDENTIFIER, pstMail.getInternetMessageId());
- mailMetadata.set(TikaCoreProperties.TITLE, pstMail.getSubject());
- mailMetadata.set(Metadata.MESSAGE_FROM, pstMail.getSenderName());
- mailMetadata.set(TikaCoreProperties.CREATOR, pstMail.getSenderName());
- mailMetadata.set(TikaCoreProperties.CREATED, pstMail.getCreationTime());
- mailMetadata.set(TikaCoreProperties.MODIFIED, pstMail.getLastModificationTime());
- mailMetadata.set(TikaCoreProperties.COMMENTS, pstMail.getComment());
- mailMetadata.set("descriptorNodeId", valueOf(pstMail.getDescriptorNodeId()));
- mailMetadata.set("senderEmailAddress", pstMail.getSenderEmailAddress());
- mailMetadata.set("recipients", pstMail.getRecipientsString());
- mailMetadata.set("displayTo", pstMail.getDisplayTo());
- mailMetadata.set("displayCC", pstMail.getDisplayCC());
- mailMetadata.set("displayBCC", pstMail.getDisplayBCC());
- mailMetadata.set("importance", valueOf(pstMail.getImportance()));
- mailMetadata.set("priority", valueOf(pstMail.getPriority()));
- mailMetadata.set("flagged", valueOf(pstMail.isFlagged()));
-
- byte[] mailContent = pstMail.getBody().getBytes(UTF_8);
- embeddedExtractor.parseEmbedded(new ByteArrayInputStream(mailContent), handler, mailMetadata, true);
- }
-
- private void parseMailAttachments(XHTMLContentHandler xhtml, PSTMessage email, EmbeddedDocumentExtractor embeddedExtractor)
- throws TikaException {
- int numberOfAttachments = email.getNumberOfAttachments();
- for (int i = 0; i < numberOfAttachments; i++) {
- File tempFile = null;
- try {
- PSTAttachment attach = email.getAttachment(i);
-
- // Get the filename; both long and short filenames can be used for attachments
- String filename = attach.getLongFilename();
- if (filename.isEmpty()) {
- filename = attach.getFilename();
- }
-
- xhtml.element("p", filename);
-
- Metadata attachMeta = new Metadata();
- attachMeta.set(Metadata.RESOURCE_NAME_KEY, filename);
- attachMeta.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filename);
- AttributesImpl attributes = new AttributesImpl();
- attributes.addAttribute("", "class", "class", "CDATA", "embedded");
- attributes.addAttribute("", "id", "id", "CDATA", filename);
- xhtml.startElement("div", attributes);
- if (embeddedExtractor.shouldParseEmbedded(attachMeta)) {
- TemporaryResources tmp = new TemporaryResources();
- try {
- TikaInputStream tis = TikaInputStream.get(attach.getFileInputStream(), tmp);
- embeddedExtractor.parseEmbedded(tis, xhtml, attachMeta, true);
- } finally {
- tmp.dispose();
- }
- }
- xhtml.endElement("div");
-
- } catch (Exception e) {
- throw new TikaException("Unable to unpack document stream", e);
- } finally {
- if (tempFile != null)
- tempFile.delete();
- }
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import static java.lang.String.valueOf;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static java.util.Collections.singleton;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Set;
+
+import com.pff.PSTAttachment;
+import com.pff.PSTFile;
+import com.pff.PSTFolder;
+import com.pff.PSTMessage;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Parser for MS Outlook PST email storage files.
+ * <p>
+ * The folder tree of the PST file is walked recursively. Each message is
+ * passed to the {@link EmbeddedDocumentExtractor} as an embedded document,
+ * followed by each of its attachments.
+ */
+public class OutlookPSTParser extends AbstractParser {
+
+    private static final long serialVersionUID = 620998217748364063L;
+
+    public static final MediaType MS_OUTLOOK_PST_MIMETYPE = MediaType.application("vnd.ms-outlook-pst");
+    private static final Set<MediaType> SUPPORTED_TYPES = singleton(MS_OUTLOOK_PST_MIMETYPE);
+
+    /**
+     * Builds an attribute list that holds a single CDATA attribute.
+     *
+     * @param attName  used as both the local and the qualified attribute name
+     * @param attValue the attribute value
+     * @return a new attribute list containing only that attribute
+     */
+    private static AttributesImpl createAttribute(String attName, String attValue) {
+        AttributesImpl attributes = new AttributesImpl();
+        attributes.addAttribute("", attName, attName, "CDATA", attValue);
+        return attributes;
+    }
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses a PST file and emits its folders, messages and attachments as XHTML.
+     * <p>
+     * The PST library opens the file by path, so the stream is accessed through
+     * {@link TikaInputStream#getFile()}. Temporary resources created for that
+     * purpose are tracked and disposed once parsing has finished.
+     *
+     * @param stream   the PST content
+     * @param handler  receives the XHTML SAX events
+     * @param metadata receives the content type, the content length and the "isValid" flag
+     * @param context  consulted for an {@link EmbeddedDocumentExtractor}; a
+     *                 {@link ParsingEmbeddedDocumentExtractor} is used when none is registered
+     * @throws TikaException if anything fails while the PST file is read (the
+     *                       original exception is kept as the cause)
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        // Use the delegate parser to parse the contained document
+        EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class,
+                new ParsingEmbeddedDocumentExtractor(context));
+
+        metadata.set(Metadata.CONTENT_TYPE, MS_OUTLOOK_PST_MIMETYPE.toString());
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        // Tracks whatever getFile() has to create so it can be released below.
+        TemporaryResources tmp = new TemporaryResources();
+        PSTFile pstFile = null;
+        try {
+            TikaInputStream in = TikaInputStream.get(stream, tmp);
+            pstFile = new PSTFile(in.getFile().getPath());
+            metadata.set(Metadata.CONTENT_LENGTH, valueOf(pstFile.getFileHandle().length()));
+            boolean isValid = pstFile.getFileHandle().getFD().valid();
+            metadata.set("isValid", valueOf(isValid));
+            if (isValid) {
+                parseFolder(xhtml, pstFile.getRootFolder(), embeddedExtractor);
+            }
+        } catch (Exception e) {
+            throw new TikaException(e.getMessage(), e);
+        } finally {
+            try {
+                // Close the file handle first so the file is no longer open
+                // when the temporary resources are released.
+                if (pstFile != null && pstFile.getFileHandle() != null) {
+                    try {
+                        pstFile.getFileHandle().close();
+                    } catch (IOException ignored) {
+                        // best effort: a failure to close must not hide the parse result
+                    }
+                }
+            } finally {
+                tmp.dispose();
+            }
+        }
+
+        xhtml.endDocument();
+    }
+
+    /**
+     * Emits all messages of a folder, then recurses into its sub-folders.
+     * <p>
+     * Each message becomes a {@code div} of class "embedded" and each
+     * sub-folder a {@code div} of class "email-folder".
+     *
+     * @param handler           receives the XHTML events
+     * @param pstFolder         the folder to walk
+     * @param embeddedExtractor handles message bodies and attachments
+     * @throws Exception if reading the folder or handling a message fails
+     */
+    private void parseFolder(XHTMLContentHandler handler, PSTFolder pstFolder, EmbeddedDocumentExtractor embeddedExtractor)
+            throws Exception {
+        if (pstFolder.getContentCount() > 0) {
+            PSTMessage pstMail = (PSTMessage) pstFolder.getNextChild();
+            while (pstMail != null) {
+                AttributesImpl attributes = new AttributesImpl();
+                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+                attributes.addAttribute("", "id", "id", "CDATA", pstMail.getInternetMessageId());
+                handler.startElement("div", attributes);
+                handler.element("h1", pstMail.getSubject());
+
+                parseMailItem(handler, pstMail, embeddedExtractor);
+                parseMailAttachments(handler, pstMail, embeddedExtractor);
+
+                handler.endElement("div");
+
+                pstMail = (PSTMessage) pstFolder.getNextChild();
+            }
+        }
+
+        if (pstFolder.hasSubfolders()) {
+            for (PSTFolder pstSubFolder : pstFolder.getSubFolders()) {
+                handler.startElement("div", createAttribute("class", "email-folder"));
+                handler.element("h1", pstSubFolder.getDisplayName());
+                parseFolder(handler, pstSubFolder, embeddedExtractor);
+                handler.endElement("div");
+            }
+        }
+    }
+
+    /**
+     * Copies the message properties into a fresh {@link Metadata} object and
+     * passes the message body, encoded as UTF-8, to the embedded extractor.
+     *
+     * @param handler           receives the output of the embedded extractor
+     * @param pstMail           the message to process
+     * @param embeddedExtractor handles the message body
+     */
+    private void parseMailItem(XHTMLContentHandler handler, PSTMessage pstMail, EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException {
+        Metadata mailMetadata = new Metadata();
+        mailMetadata.set(Metadata.RESOURCE_NAME_KEY, pstMail.getInternetMessageId());
+        mailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, pstMail.getInternetMessageId());
+        mailMetadata.set(TikaCoreProperties.IDENTIFIER, pstMail.getInternetMessageId());
+        mailMetadata.set(TikaCoreProperties.TITLE, pstMail.getSubject());
+        mailMetadata.set(Metadata.MESSAGE_FROM, pstMail.getSenderName());
+        mailMetadata.set(TikaCoreProperties.CREATOR, pstMail.getSenderName());
+        mailMetadata.set(TikaCoreProperties.CREATED, pstMail.getCreationTime());
+        mailMetadata.set(TikaCoreProperties.MODIFIED, pstMail.getLastModificationTime());
+        mailMetadata.set(TikaCoreProperties.COMMENTS, pstMail.getComment());
+        mailMetadata.set("descriptorNodeId", valueOf(pstMail.getDescriptorNodeId()));
+        mailMetadata.set("senderEmailAddress", pstMail.getSenderEmailAddress());
+        mailMetadata.set("recipients", pstMail.getRecipientsString());
+        mailMetadata.set("displayTo", pstMail.getDisplayTo());
+        mailMetadata.set("displayCC", pstMail.getDisplayCC());
+        mailMetadata.set("displayBCC", pstMail.getDisplayBCC());
+        mailMetadata.set("importance", valueOf(pstMail.getImportance()));
+        mailMetadata.set("priority", valueOf(pstMail.getPriority()));
+        mailMetadata.set("flagged", valueOf(pstMail.isFlagged()));
+
+        byte[] mailContent = pstMail.getBody().getBytes(UTF_8);
+        embeddedExtractor.parseEmbedded(new ByteArrayInputStream(mailContent), handler, mailMetadata, true);
+    }
+
+    /**
+     * Emits every attachment of a message: a paragraph with the file name,
+     * then a {@code div} of class "embedded" that holds whatever the embedded
+     * extractor produces for the attachment content.
+     *
+     * @param xhtml             receives the XHTML events
+     * @param email             the message whose attachments are processed
+     * @param embeddedExtractor decides whether an attachment is parsed and parses it
+     * @throws TikaException if an attachment cannot be read or parsed
+     */
+    private void parseMailAttachments(XHTMLContentHandler xhtml, PSTMessage email, EmbeddedDocumentExtractor embeddedExtractor)
+            throws TikaException {
+        int numberOfAttachments = email.getNumberOfAttachments();
+        for (int i = 0; i < numberOfAttachments; i++) {
+            try {
+                PSTAttachment attach = email.getAttachment(i);
+
+                // Get the filename; both long and short filenames can be used for attachments
+                String filename = attach.getLongFilename();
+                if (filename == null || filename.isEmpty()) {
+                    filename = attach.getFilename();
+                }
+
+                xhtml.element("p", filename);
+
+                Metadata attachMeta = new Metadata();
+                attachMeta.set(Metadata.RESOURCE_NAME_KEY, filename);
+                attachMeta.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filename);
+                AttributesImpl attributes = new AttributesImpl();
+                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+                attributes.addAttribute("", "id", "id", "CDATA", filename);
+                xhtml.startElement("div", attributes);
+                if (embeddedExtractor.shouldParseEmbedded(attachMeta)) {
+                    TemporaryResources tmp = new TemporaryResources();
+                    // The attachment stream is opened here, so it is closed here as well.
+                    try (TikaInputStream tis = TikaInputStream.get(attach.getFileInputStream(), tmp)) {
+                        embeddedExtractor.parseEmbedded(tis, xhtml, attachMeta, true);
+                    } finally {
+                        tmp.dispose();
+                    }
+                }
+                xhtml.endElement("div");
+
+            } catch (Exception e) {
+                throw new TikaException("Unable to unpack document stream", e);
+            }
+        }
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
index 36439b8..fa932a6 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
@@ -1,99 +1,99 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.odf;
-
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.Locale;
-
-import org.apache.tika.sax.ContentHandlerDecorator;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-/**
- * Content handler decorator that:<ul>
- * <li>Maps old OpenOffice 1.0 Namespaces to the OpenDocument ones</li>
- * <li>Returns a fake DTD when parser requests OpenOffice DTD</li>
- * </ul>
- */
-public class NSNormalizerContentHandler extends ContentHandlerDecorator {
-
- private static final String OLD_NS =
- "http://openoffice.org/2000/";
-
- private static final String NEW_NS =
- "urn:oasis:names:tc:opendocument:xmlns:";
-
- private static final String DTD_PUBLIC_ID =
- "-//OpenOffice.org//DTD OfficeDocument 1.0//EN";
-
- public NSNormalizerContentHandler(ContentHandler handler) {
- super(handler);
- }
-
- private String mapOldNS(String ns) {
- if (ns != null && ns.startsWith(OLD_NS)) {
- return NEW_NS + ns.substring(OLD_NS.length()) + ":1.0";
- } else {
- return ns;
- }
- }
-
- @Override
- public void startElement(
- String namespaceURI, String localName, String qName,
- Attributes atts) throws SAXException {
- AttributesImpl natts = new AttributesImpl();
- for (int i = 0; i < atts.getLength(); i++) {
- natts.addAttribute(
- mapOldNS(atts.getURI(i)), atts.getLocalName(i),
- atts.getQName(i), atts.getType(i), atts.getValue(i));
- }
- super.startElement(mapOldNS(namespaceURI), localName, qName, atts);
- }
-
- @Override
- public void endElement(String namespaceURI, String localName, String qName)
- throws SAXException {
- super.endElement(mapOldNS(namespaceURI), localName, qName);
- }
-
- @Override
- public void startPrefixMapping(String prefix, String uri)
- throws SAXException {
- super.startPrefixMapping(prefix, mapOldNS(uri));
- }
-
- /**
- * do not load any DTDs (may be requested by parser). Fake the DTD by
- * returning a empty string as InputSource
- */
- @Override
- public InputSource resolveEntity(String publicId, String systemId)
- throws IOException, SAXException {
- if ((systemId != null && systemId.toLowerCase(Locale.ROOT).endsWith(".dtd"))
- || DTD_PUBLIC_ID.equals(publicId)) {
- return new InputSource(new StringReader(""));
- } else {
- return super.resolveEntity(publicId, systemId);
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Locale;
+
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Content handler decorator that:<ul>
+ * <li>Maps old OpenOffice 1.0 Namespaces to the OpenDocument ones</li>
+ * <li>Returns a fake DTD when parser requests OpenOffice DTD</li>
+ * </ul>
+ */
+public class NSNormalizerContentHandler extends ContentHandlerDecorator {
+
+    /** Prefix shared by the old OpenOffice 1.0 namespace URIs. */
+    private static final String OLD_NS =
+            "http://openoffice.org/2000/";
+
+    /** Prefix of the OpenDocument namespace URIs that replace the old ones. */
+    private static final String NEW_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:";
+
+    /** Public identifier of the OpenOffice DTD that is answered with an empty document. */
+    private static final String DTD_PUBLIC_ID =
+            "-//OpenOffice.org//DTD OfficeDocument 1.0//EN";
+
+    /**
+     * @param handler the content handler that receives the normalized events
+     */
+    public NSNormalizerContentHandler(ContentHandler handler) {
+        super(handler);
+    }
+
+    /**
+     * Rewrites an old OpenOffice namespace URI to its OpenDocument form, e.g.
+     * {@code http://openoffice.org/2000/text} becomes
+     * {@code urn:oasis:names:tc:opendocument:xmlns:text:1.0}.
+     *
+     * @param ns a namespace URI, may be {@code null}
+     * @return the mapped URI, or {@code ns} unchanged when it is {@code null}
+     *         or does not start with the old prefix
+     */
+    private String mapOldNS(String ns) {
+        if (ns != null && ns.startsWith(OLD_NS)) {
+            return NEW_NS + ns.substring(OLD_NS.length()) + ":1.0";
+        } else {
+            return ns;
+        }
+    }
+
+    /**
+     * Forwards the element with its own namespace and the namespace of every
+     * attribute mapped through {@link #mapOldNS(String)}.
+     */
+    @Override
+    public void startElement(
+            String namespaceURI, String localName, String qName,
+            Attributes atts) throws SAXException {
+        // Copy the attributes so each attribute namespace can be rewritten.
+        AttributesImpl natts = new AttributesImpl();
+        for (int i = 0; i < atts.getLength(); i++) {
+            natts.addAttribute(
+                    mapOldNS(atts.getURI(i)), atts.getLocalName(i),
+                    atts.getQName(i), atts.getType(i), atts.getValue(i));
+        }
+        // Forward the normalized copy; passing the original attributes
+        // would throw away the mapping done above.
+        super.startElement(mapOldNS(namespaceURI), localName, qName, natts);
+    }
+
+    /** Forwards the end tag with the element namespace mapped. */
+    @Override
+    public void endElement(String namespaceURI, String localName, String qName)
+            throws SAXException {
+        super.endElement(mapOldNS(namespaceURI), localName, qName);
+    }
+
+    /** Forwards the prefix mapping with the namespace URI mapped. */
+    @Override
+    public void startPrefixMapping(String prefix, String uri)
+            throws SAXException {
+        super.startPrefixMapping(prefix, mapOldNS(uri));
+    }
+
+    /**
+     * Does not load any DTDs (may be requested by parser). Fakes the DTD by
+     * returning an empty string as InputSource whenever the system id ends
+     * with ".dtd" (case-insensitive) or the public id is the OpenOffice one.
+     * Every other entity is resolved by the superclass.
+     */
+    @Override
+    public InputSource resolveEntity(String publicId, String systemId)
+            throws IOException, SAXException {
+        if ((systemId != null && systemId.toLowerCase(Locale.ROOT).endsWith(".dtd"))
+                || DTD_PUBLIC_ID.equals(publicId)) {
+            return new InputSource(new StringReader(""));
+        } else {
+            return super.resolveEntity(publicId, systemId);
+        }
+    }
+
+}
[29/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
index 4105dfa..1c615f6 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
@@ -1,414 +1,414 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mp3;
-
-import static org.apache.tika.TikaTest.assertContains;
-import static org.junit.Assert.assertEquals;
-
-import java.io.ByteArrayInputStream;
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.XMPDM;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-/**
- * Test case for parsing mp3 files.
- */
-public class Mp3ParserTest {
-
- /**
- * Checks the duration of an MP3 file.
- * @param metadata the metadata object
- * @param expected the expected duration, rounded as seconds
- */
- private static void checkDuration(Metadata metadata, int expected) {
- assertEquals("Wrong duration", expected,
- Math.round(Float.valueOf(metadata.get(XMPDM.DURATION)) / 1000));
- }
-
- /**
- * Test that with only ID3v1 tags, we get some information out
- */
- @Test
- public void testMp3ParsingID3v1() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
- "/test-documents/testMP3id3v1.mp3")) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
-
- assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
-
- String content = handler.toString();
- assertContains("Test Title", content);
- assertContains("Test Artist", content);
- assertContains("Test Album", content);
- assertContains("2008", content);
- assertContains("Test Comment", content);
- assertContains("Rock", content);
-
- assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
- assertEquals("44100", metadata.get("samplerate"));
- assertEquals("1", metadata.get("channels"));
- checkDuration(metadata, 2);
- }
-
- /**
- * Test that with only ID3v2 tags, we get the full
- * set of information out.
- */
- @Test
- public void testMp3ParsingID3v2() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
- "/test-documents/testMP3id3v2.mp3")) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
-
- // Check core properties
- assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
-
- // Check the textual contents
- String content = handler.toString();
- assertContains("Test Title", content);
- assertContains("Test Artist", content);
- assertContains("Test Album", content);
- assertContains("2008", content);
- assertContains("Test Comment", content);
- assertContains("Rock", content);
- assertContains(", track 1", content);
- assertContains(", disc 1", content);
-
- // Check un-typed audio properties
- assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
- assertEquals("44100", metadata.get("samplerate"));
- assertEquals("1", metadata.get("channels"));
-
- // Check XMPDM-typed audio properties
- assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
- assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
- assertEquals("Test Album Artist", metadata.get(XMPDM.ALBUM_ARTIST));
- assertEquals(null, metadata.get(XMPDM.COMPOSER));
- assertEquals("2008", metadata.get(XMPDM.RELEASE_DATE));
- assertEquals("Rock", metadata.get(XMPDM.GENRE));
- assertEquals("XXX - ID3v1 Comment\nTest Comment", metadata.get(XMPDM.LOG_COMMENT.getName()));
- assertEquals("1", metadata.get(XMPDM.TRACK_NUMBER));
- assertEquals("1/1", metadata.get(XMPDM.DISC_NUMBER));
- assertEquals("1", metadata.get(XMPDM.COMPILATION));
-
- assertEquals("44100", metadata.get(XMPDM.AUDIO_SAMPLE_RATE));
- assertEquals("Mono", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE));
- assertEquals("MP3", metadata.get(XMPDM.AUDIO_COMPRESSOR));
- checkDuration(metadata, 2);
- }
-
- /**
- * Test that with both id3v2 and id3v1, we prefer the
- * details from id3v2
- */
- @Test
- public void testMp3ParsingID3v1v2() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
- "/test-documents/testMP3id3v1_v2.mp3")) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
-
- assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
-
- String content = handler.toString();
- assertContains("Test Title", content);
- assertContains("Test Artist", content);
- assertContains("Test Album", content);
- assertContains("2008", content);
- assertContains("Test Comment", content);
- assertContains("Rock", content);
-
- assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
- assertEquals("44100", metadata.get("samplerate"));
- assertEquals("1", metadata.get("channels"));
- checkDuration(metadata, 2);
- }
-
- /**
- * Test that with only ID3v2 tags, of version 2.4, we get the full
- * set of information out.
- */
- @Test
- public void testMp3ParsingID3v24() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
- "/test-documents/testMP3id3v24.mp3")) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
-
- assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
-
- String content = handler.toString();
- assertContains("Test Title", content);
- assertContains("Test Artist", content);
- assertContains("Test Album", content);
- assertContains("2008", content);
- assertContains("Test Comment", content);
- assertContains("Rock", content);
- assertContains(", disc 1", content);
-
- assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
- assertEquals("44100", metadata.get("samplerate"));
- assertEquals("1", metadata.get("channels"));
- checkDuration(metadata, 2);
-
- // Check XMPDM-typed audio properties
- assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
- assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
- assertEquals("Test Album Artist", metadata.get(XMPDM.ALBUM_ARTIST));
- assertEquals(null, metadata.get(XMPDM.COMPOSER));
- assertEquals("2008", metadata.get(XMPDM.RELEASE_DATE));
- assertEquals("Rock", metadata.get(XMPDM.GENRE));
- assertEquals("1", metadata.get(XMPDM.COMPILATION));
-
- assertEquals(null, metadata.get(XMPDM.TRACK_NUMBER));
- assertEquals("1", metadata.get(XMPDM.DISC_NUMBER));
- }
-
- /**
- * Tests that a file with characters not in the ISO 8859-1
- * range is correctly handled
- */
- @Test
- public void testMp3ParsingID3i18n() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
- "/test-documents/testMP3i18n.mp3")) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
-
- assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Une chason en Fran\u00e7ais", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Test Artist \u2468\u2460", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Test Artist \u2468\u2460", metadata.get(Metadata.AUTHOR));
- assertEquals("Test Artist \u2468\u2460", metadata.get(XMPDM.ARTIST));
- assertEquals("Test Album \u2460\u2468", metadata.get(XMPDM.ALBUM));
-
- assertEquals(
- "Eng - Comment Desc\nThis is a \u1357\u2468\u2460 Comment",
- metadata.get(XMPDM.LOG_COMMENT)
- );
-
- assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
- assertEquals("44100", metadata.get("samplerate"));
- assertEquals("1", metadata.get("channels"));
- checkDuration(metadata, 2);
- }
-
-
- /**
- * Tests that a file with both lyrics and
- * ID3v2 tags gets both extracted correctly
- */
- @Test
- public void testMp3ParsingLyrics() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- // Note - our test file has a lyrics tag, but lacks any
- // lyrics in the tags, so we can't test that bit
- // TODO Find a better sample file
-
- try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
- "/test-documents/testMP3lyrics.mp3")) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
-
- assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
-
- String content = handler.toString();
- assertContains("Test Title", content);
- assertContains("Test Artist", content);
- assertContains("Test Album", content);
- assertContains("2008", content);
- assertContains("Test Comment", content);
- assertContains("Rock", content);
-
- assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
- assertEquals("44100", metadata.get("samplerate"));
- assertEquals("2", metadata.get("channels"));
- checkDuration(metadata, 1);
- }
-
- @Test
- public void testID3v2Frame() throws Exception {
- byte[] empty = new byte[] {
- 0x49, 0x44, 0x33, 3, 1, 0,
- 0, 0, 0, 0
- };
-
- assertEquals(11, ID3v2Frame.getInt(new byte[] {0,0,0,0x0b}));
- assertEquals(257, ID3v2Frame.getInt(new byte[] {0,0,1,1}));
-
- ID3v2Frame f = (ID3v2Frame)
- ID3v2Frame.createFrameIfPresent(new ByteArrayInputStream(empty));
- assertEquals(3, f.getMajorVersion());
- assertEquals(1, f.getMinorVersion());
- assertEquals(0, f.getFlags());
- assertEquals(0, f.getLength());
- assertEquals(0, f.getData().length);
-
- assertEquals("", ID3v2Frame.getTagString(f.getData(), 0, 0));
- assertEquals("", ID3v2Frame.getTagString(new byte[] {0,0,0,0}, 0, 3));
- assertEquals("A", ID3v2Frame.getTagString(new byte[] {(byte)'A',0,0,0}, 0, 3));
- }
-
- @Test
- public void testTIKA1589_noId3ReturnsDurationCorrectly() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
- "/test-documents/testMP3noid3.mp3")) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
-
- assertEquals("2455.510986328125", metadata.get(XMPDM.DURATION));
- }
-
- /**
- * This test will do nothing, unless you've downloaded the
- * mp3 file from TIKA-424 - the file cannot be
- * distributed with Tika.
- * This test will check for the complicated set of ID3v2.4
- * tags.
- */
- @Test
- public void testTIKA424() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
- "/test-documents/test2.mp3")) {
- if (stream == null) {
- // You haven't downloaded the file
- // Skip the test
- return;
- }
- parser.parse(stream, handler, metadata, new ParseContext());
- }
-
- assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Plus loin vers l'ouest", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Merzhin", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Merzhin", metadata.get(Metadata.AUTHOR));
-
- String content = handler.toString();
- assertContains("Plus loin vers l'ouest", content);
-
- assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
- assertEquals("44100", metadata.get("samplerate"));
- assertEquals("2", metadata.get("channels"));
- }
-
- /**
- * This tests that we can handle without errors (but perhaps not
- * all content) a file with a very very large ID3 frame that
- * has been truncated before the end of the ID3 tags.
- * In this case, it is a file with JPEG data in the ID3, which
- * is trunacted before the end of the JPEG bit of the ID3 frame.
- */
- @Test
- public void testTIKA474() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
- "/test-documents/testMP3truncated.mp3")) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
-
- // Check we could get the headers from the start
- assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Girl you have no faith in medicine", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("The White Stripes", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("The White Stripes", metadata.get(Metadata.AUTHOR));
-
- String content = handler.toString();
- assertContains("Girl you have no faith in medicine", content);
- assertContains("The White Stripes", content);
- assertContains("Elephant", content);
- assertContains("2003", content);
-
- // File lacks any audio frames, so we can't know these
- assertEquals(null, metadata.get("version"));
- assertEquals(null, metadata.get("samplerate"));
- assertEquals(null, metadata.get("channels"));
- }
-
- // TIKA-1024
- @Test
- public void testNakedUTF16BOM() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
- "/test-documents/testNakedUTF16BOM.mp3")) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
- assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("", metadata.get(XMPDM.GENRE));
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMPDM;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing mp3 files.
+ */
+public class Mp3ParserTest {
+
+    /**
+     * Asserts the duration recorded for an MP3 file.
+     * The stored {@link XMPDM#DURATION} value is divided by 1000 and rounded
+     * before it is compared (presumably milliseconds to seconds).
+     *
+     * @param metadata the metadata object
+     * @param expected the expected duration, rounded as seconds
+     */
+    private static void checkDuration(Metadata metadata, int expected) {
+        float rawDuration = Float.valueOf(metadata.get(XMPDM.DURATION));
+        int roundedDuration = Math.round(rawDuration / 1000);
+        assertEquals("Wrong duration", expected, roundedDuration);
+    }
+
+ /**
+ * Test that with only ID3v1 tags, we get some information out
+ */
+ @Test
+ public void testMp3ParsingID3v1() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+ "/test-documents/testMP3id3v1.mp3")) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+
+ assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
+
+ String content = handler.toString();
+ assertContains("Test Title", content);
+ assertContains("Test Artist", content);
+ assertContains("Test Album", content);
+ assertContains("2008", content);
+ assertContains("Test Comment", content);
+ assertContains("Rock", content);
+
+ assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
+ assertEquals("44100", metadata.get("samplerate"));
+ assertEquals("1", metadata.get("channels"));
+ checkDuration(metadata, 2);
+ }
+
+ /**
+ * Test that with only ID3v2 tags, we get the full
+ * set of information out, including the XMPDM-typed properties.
+ */
+ @Test
+ public void testMp3ParsingID3v2() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+ "/test-documents/testMP3id3v2.mp3")) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+
+ // Check core properties
+ assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
+
+ // Check the textual contents
+ String content = handler.toString();
+ assertContains("Test Title", content);
+ assertContains("Test Artist", content);
+ assertContains("Test Album", content);
+ assertContains("2008", content);
+ assertContains("Test Comment", content);
+ assertContains("Rock", content);
+ assertContains(", track 1", content); // track and disc numbers also appear in the text
+ assertContains(", disc 1", content);
+
+ // Check un-typed audio properties
+ assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
+ assertEquals("44100", metadata.get("samplerate"));
+ assertEquals("1", metadata.get("channels"));
+
+ // Check XMPDM-typed audio properties
+ assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
+ assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
+ assertEquals("Test Album Artist", metadata.get(XMPDM.ALBUM_ARTIST));
+ assertEquals(null, metadata.get(XMPDM.COMPOSER)); // no composer value is expected for this sample
+ assertEquals("2008", metadata.get(XMPDM.RELEASE_DATE));
+ assertEquals("Rock", metadata.get(XMPDM.GENRE));
+ assertEquals("XXX - ID3v1 Comment\nTest Comment", metadata.get(XMPDM.LOG_COMMENT.getName())); // looked up via the property's name
+ assertEquals("1", metadata.get(XMPDM.TRACK_NUMBER));
+ assertEquals("1/1", metadata.get(XMPDM.DISC_NUMBER));
+ assertEquals("1", metadata.get(XMPDM.COMPILATION));
+
+ assertEquals("44100", metadata.get(XMPDM.AUDIO_SAMPLE_RATE)); // typed counterparts of the un-typed audio checks above
+ assertEquals("Mono", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE));
+ assertEquals("MP3", metadata.get(XMPDM.AUDIO_COMPRESSOR));
+ checkDuration(metadata, 2); // duration must round to 2 seconds
+ }
+
+ /**
+ * Test that with both id3v2 and id3v1 tags present, we prefer the
+ * details from id3v2
+ */
+ @Test
+ public void testMp3ParsingID3v1v2() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+ "/test-documents/testMP3id3v1_v2.mp3")) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+
+ assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE)); // Check core properties
+ assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
+
+ String content = handler.toString(); // Check the textual contents
+ assertContains("Test Title", content);
+ assertContains("Test Artist", content);
+ assertContains("Test Album", content);
+ assertContains("2008", content);
+ assertContains("Test Comment", content);
+ assertContains("Rock", content);
+
+ assertEquals("MPEG 3 Layer III Version 1", metadata.get("version")); // Check un-typed audio properties
+ assertEquals("44100", metadata.get("samplerate"));
+ assertEquals("1", metadata.get("channels"));
+ checkDuration(metadata, 2); // duration must round to 2 seconds
+ }
+
+ /**
+ * Test that with only ID3v2 tags, of version 2.4, we get the full
+ * set of information out, including the XMPDM-typed properties.
+ */
+ @Test
+ public void testMp3ParsingID3v24() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+ "/test-documents/testMP3id3v24.mp3")) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+
+ assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE)); // Check core properties
+ assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
+
+ String content = handler.toString(); // Check the textual contents
+ assertContains("Test Title", content);
+ assertContains("Test Artist", content);
+ assertContains("Test Album", content);
+ assertContains("2008", content);
+ assertContains("Test Comment", content);
+ assertContains("Rock", content);
+ assertContains(", disc 1", content); // only the disc number is checked in the text here
+
+ assertEquals("MPEG 3 Layer III Version 1", metadata.get("version")); // Check un-typed audio properties
+ assertEquals("44100", metadata.get("samplerate"));
+ assertEquals("1", metadata.get("channels"));
+ checkDuration(metadata, 2); // duration must round to 2 seconds
+
+ // Check XMPDM-typed audio properties
+ assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
+ assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
+ assertEquals("Test Album Artist", metadata.get(XMPDM.ALBUM_ARTIST));
+ assertEquals(null, metadata.get(XMPDM.COMPOSER)); // no composer value is expected for this sample
+ assertEquals("2008", metadata.get(XMPDM.RELEASE_DATE));
+ assertEquals("Rock", metadata.get(XMPDM.GENRE));
+ assertEquals("1", metadata.get(XMPDM.COMPILATION));
+
+ assertEquals(null, metadata.get(XMPDM.TRACK_NUMBER)); // no track number is expected for this sample
+ assertEquals("1", metadata.get(XMPDM.DISC_NUMBER)); // plain "1" is expected here
+ }
+
+ /**
+ * Tests that a file whose tags contain characters not in the
+ * ISO 8859-1 range is correctly handled
+ */
+ @Test
+ public void testMp3ParsingID3i18n() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+ "/test-documents/testMP3i18n.mp3")) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+
+ assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Une chason en Fran\u00e7ais", metadata.get(TikaCoreProperties.TITLE)); // escape is U+00E7 (c with cedilla)
+ assertEquals("Test Artist \u2468\u2460", metadata.get(TikaCoreProperties.CREATOR)); // escapes are U+2468 and U+2460
+ assertEquals("Test Artist \u2468\u2460", metadata.get(Metadata.AUTHOR));
+ assertEquals("Test Artist \u2468\u2460", metadata.get(XMPDM.ARTIST));
+ assertEquals("Test Album \u2460\u2468", metadata.get(XMPDM.ALBUM)); // same two characters, in the opposite order
+
+ assertEquals(
+ "Eng - Comment Desc\nThis is a \u1357\u2468\u2460 Comment",
+ metadata.get(XMPDM.LOG_COMMENT)
+ );
+
+ assertEquals("MPEG 3 Layer III Version 1", metadata.get("version")); // Check un-typed audio properties
+ assertEquals("44100", metadata.get("samplerate"));
+ assertEquals("1", metadata.get("channels"));
+ checkDuration(metadata, 2); // duration must round to 2 seconds
+ }
+
+
+ /**
+ * Tests that a file with both a lyrics tag and
+ * ID3v2 tags gets its metadata extracted correctly
+ */
+ @Test
+ public void testMp3ParsingLyrics() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ // Note - our test file has a lyrics tag, but lacks any
+ // lyrics in the tags, so we can't test that bit
+ // TODO Find a better sample file
+
+ try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+ "/test-documents/testMP3lyrics.mp3")) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+
+ assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE)); // Check core properties
+ assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
+
+ String content = handler.toString(); // Check the textual contents
+ assertContains("Test Title", content);
+ assertContains("Test Artist", content);
+ assertContains("Test Album", content);
+ assertContains("2008", content);
+ assertContains("Test Comment", content);
+ assertContains("Rock", content);
+
+ assertEquals("MPEG 3 Layer III Version 1", metadata.get("version")); // Check un-typed audio properties
+ assertEquals("44100", metadata.get("samplerate"));
+ assertEquals("2", metadata.get("channels")); // this sample is expected to report 2 channels
+ checkDuration(metadata, 1); // duration must round to 1 second
+ }
+
+ @Test
+ public void testID3v2Frame() throws Exception { // exercises ID3v2Frame's static helpers and header parsing on a 10-byte input
+ byte[] empty = new byte[] {
+ 0x49, 0x44, 0x33, 3, 1, 0, // ASCII "ID3", then the bytes asserted below as major version 3, minor version 1, flags 0
+ 0, 0, 0, 0 // four zero bytes; the frame is asserted below to have length 0
+ };
+
+ assertEquals(11, ID3v2Frame.getInt(new byte[] {0,0,0,0x0b}));
+ assertEquals(257, ID3v2Frame.getInt(new byte[] {0,0,1,1})); // 257 = 1*256 + 1, so the last byte is the least significant
+
+ ID3v2Frame f = (ID3v2Frame)
+ ID3v2Frame.createFrameIfPresent(new ByteArrayInputStream(empty));
+ assertEquals(3, f.getMajorVersion());
+ assertEquals(1, f.getMinorVersion());
+ assertEquals(0, f.getFlags());
+ assertEquals(0, f.getLength());
+ assertEquals(0, f.getData().length);
+
+ assertEquals("", ID3v2Frame.getTagString(f.getData(), 0, 0)); // empty data yields the empty string
+ assertEquals("", ID3v2Frame.getTagString(new byte[] {0,0,0,0}, 0, 3)); // all-zero bytes also yield the empty string
+ assertEquals("A", ID3v2Frame.getTagString(new byte[] {(byte)'A',0,0,0}, 0, 3)); // zero bytes after 'A' are not part of the result
+ }
+
+ @Test
+ public void testTIKA1589_noId3ReturnsDurationCorrectly() throws Exception { // TIKA-1589: duration for the testMP3noid3.mp3 sample
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+ "/test-documents/testMP3noid3.mp3")) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+
+ assertEquals("2455.510986328125", metadata.get(XMPDM.DURATION)); // compares the exact stored string, not a rounded value
+ }
+
+ /**
+ * This test will do nothing, unless you've downloaded the
+ * mp3 file from TIKA-424 - the file cannot be
+ * distributed with Tika.
+ * When the file is present, this test checks the complicated
+ * set of ID3v2.4 tags it contains.
+ */
+ @Test
+ public void testTIKA424() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+ "/test-documents/test2.mp3")) {
+ if (stream == null) {
+ // You haven't downloaded the file
+ // Skip the test
+ return; // NOTE(review): a plain return makes the test end without any assertion, so it is reported as passed rather than skipped
+ }
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+
+ assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE)); // Check core properties
+ assertEquals("Plus loin vers l'ouest", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Merzhin", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Merzhin", metadata.get(Metadata.AUTHOR));
+
+ String content = handler.toString(); // Check the textual contents
+ assertContains("Plus loin vers l'ouest", content);
+
+ assertEquals("MPEG 3 Layer III Version 1", metadata.get("version")); // Check un-typed audio properties
+ assertEquals("44100", metadata.get("samplerate"));
+ assertEquals("2", metadata.get("channels"));
+ }
+
+ /**
+ * This tests that we can handle without errors (but perhaps not
+ * all content) a file with a very very large ID3 frame that
+ * has been truncated before the end of the ID3 tags.
+ * In this case, it is a file with JPEG data in the ID3, which
+ * is truncated before the end of the JPEG bit of the ID3 frame.
+ */
+ @Test
+ public void testTIKA474() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+ "/test-documents/testMP3truncated.mp3")) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+
+ // Check we could get the headers from the start
+ assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Girl you have no faith in medicine", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("The White Stripes", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("The White Stripes", metadata.get(Metadata.AUTHOR));
+
+ String content = handler.toString(); // Check the textual contents
+ assertContains("Girl you have no faith in medicine", content);
+ assertContains("The White Stripes", content);
+ assertContains("Elephant", content);
+ assertContains("2003", content);
+
+ // File lacks any audio frames, so we can't know these
+ assertEquals(null, metadata.get("version"));
+ assertEquals(null, metadata.get("samplerate"));
+ assertEquals(null, metadata.get("channels"));
+ }
+
+ // TIKA-1024: parses the testNakedUTF16BOM.mp3 sample and checks the resulting genre
+ @Test
+ public void testNakedUTF16BOM() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+ "/test-documents/testNakedUTF16BOM.mp3")) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+ assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("", metadata.get(XMPDM.GENRE)); // the genre must come back as the empty string, not null
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
index 36c0efe..aeaf71e 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
@@ -1,92 +1,92 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.ocr;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import java.io.File;
-import java.io.InputStream;
-
-import org.apache.tika.TikaTest;
-import org.junit.Test;
-
-public class TesseractOCRConfigTest extends TikaTest {
-
- @Test
- public void testNoConfig() throws Exception {
- TesseractOCRConfig config = new TesseractOCRConfig();
- assertEquals("Invalid default tesseractPath value", "", config.getTesseractPath());
- assertEquals("Invalid default tessdataPath value", "", config.getTessdataPath());
- assertEquals("Invalid default language value", "eng", config.getLanguage());
- assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode());
- assertEquals("Invalid default minFileSizeToOcr value", 0, config.getMinFileSizeToOcr());
- assertEquals("Invalid default maxFileSizeToOcr value", Integer.MAX_VALUE, config.getMaxFileSizeToOcr());
- assertEquals("Invalid default timeout value", 120, config.getTimeout());
- }
-
- @Test
- public void testPartialConfig() throws Exception {
-
- InputStream stream = TesseractOCRConfigTest.class.getResourceAsStream(
- "/test-properties/TesseractOCRConfig-partial.properties");
-
- TesseractOCRConfig config = new TesseractOCRConfig(stream);
- assertEquals("Invalid default tesseractPath value", "", config.getTesseractPath());
- assertEquals("Invalid default tessdataPath value", "", config.getTessdataPath());
- assertEquals("Invalid overridden language value", "fra+deu", config.getLanguage());
- assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode());
- assertEquals("Invalid overridden minFileSizeToOcr value", 1, config.getMinFileSizeToOcr());
- assertEquals("Invalid default maxFileSizeToOcr value", Integer.MAX_VALUE, config.getMaxFileSizeToOcr());
- assertEquals("Invalid overridden timeout value", 240, config.getTimeout());
- }
-
- @Test
- public void testFullConfig() throws Exception {
-
- InputStream stream = TesseractOCRConfigTest.class.getResourceAsStream(
- "/test-properties/TesseractOCRConfig-full.properties");
-
- TesseractOCRConfig config = new TesseractOCRConfig(stream);
- assertEquals("Invalid overridden tesseractPath value", "/opt/tesseract" + File.separator, config.getTesseractPath());
- assertEquals("Invalid overridden tesseractPath value", "/usr/local/share" + File.separator, config.getTessdataPath());
- assertEquals("Invalid overridden language value", "fra+deu", config.getLanguage());
- assertEquals("Invalid overridden pageSegMode value", "2", config.getPageSegMode());
- assertEquals("Invalid overridden minFileSizeToOcr value", 1, config.getMinFileSizeToOcr());
- assertEquals("Invalid overridden maxFileSizeToOcr value", 2000000, config.getMaxFileSizeToOcr());
- assertEquals("Invalid overridden timeout value", 240, config.getTimeout());
- }
-
- @Test(expected=IllegalArgumentException.class)
- public void testValidateLanguage() {
- TesseractOCRConfig config = new TesseractOCRConfig();
- config.setLanguage("eng");
- config.setLanguage("eng+fra");
- assertTrue("Couldn't set valid values", true);
- config.setLanguage("rm -Rf *");
- }
-
- @Test(expected=IllegalArgumentException.class)
- public void testValidatePageSegMode() {
- TesseractOCRConfig config = new TesseractOCRConfig();
- config.setPageSegMode("0");
- config.setPageSegMode("10");
- assertTrue("Couldn't set valid values", true);
- config.setPageSegMode("11");
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.junit.Test;
+
+public class TesseractOCRConfigTest extends TikaTest { // checks TesseractOCRConfig defaults, property overrides and setter validation
+
+ @Test
+ public void testNoConfig() throws Exception { // a config built with no properties must expose the default values
+ TesseractOCRConfig config = new TesseractOCRConfig();
+ assertEquals("Invalid default tesseractPath value", "", config.getTesseractPath());
+ assertEquals("Invalid default tessdataPath value", "", config.getTessdataPath());
+ assertEquals("Invalid default language value", "eng", config.getLanguage());
+ assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode());
+ assertEquals("Invalid default minFileSizeToOcr value", 0, config.getMinFileSizeToOcr());
+ assertEquals("Invalid default maxFileSizeToOcr value", Integer.MAX_VALUE, config.getMaxFileSizeToOcr());
+ assertEquals("Invalid default timeout value", 120, config.getTimeout());
+ }
+
+ @Test
+ public void testPartialConfig() throws Exception { // a partial properties file overrides some values and leaves the rest at defaults
+
+ InputStream stream = TesseractOCRConfigTest.class.getResourceAsStream(
+ "/test-properties/TesseractOCRConfig-partial.properties"); // NOTE(review): stream is not closed here; confirm whether the constructor closes it
+
+ TesseractOCRConfig config = new TesseractOCRConfig(stream);
+ assertEquals("Invalid default tesseractPath value", "", config.getTesseractPath());
+ assertEquals("Invalid default tessdataPath value", "", config.getTessdataPath());
+ assertEquals("Invalid overridden language value", "fra+deu", config.getLanguage());
+ assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode());
+ assertEquals("Invalid overridden minFileSizeToOcr value", 1, config.getMinFileSizeToOcr());
+ assertEquals("Invalid default maxFileSizeToOcr value", Integer.MAX_VALUE, config.getMaxFileSizeToOcr());
+ assertEquals("Invalid overridden timeout value", 240, config.getTimeout());
+ }
+
+ @Test
+ public void testFullConfig() throws Exception { // a full properties file overrides every value checked below
+
+ InputStream stream = TesseractOCRConfigTest.class.getResourceAsStream(
+ "/test-properties/TesseractOCRConfig-full.properties"); // NOTE(review): stream is not closed here; confirm whether the constructor closes it
+
+ TesseractOCRConfig config = new TesseractOCRConfig(stream);
+ assertEquals("Invalid overridden tesseractPath value", "/opt/tesseract" + File.separator, config.getTesseractPath()); // expected path ends with File.separator
+ assertEquals("Invalid overridden tessdataPath value", "/usr/local/share" + File.separator, config.getTessdataPath()); // message names the property actually checked
+ assertEquals("Invalid overridden language value", "fra+deu", config.getLanguage());
+ assertEquals("Invalid overridden pageSegMode value", "2", config.getPageSegMode());
+ assertEquals("Invalid overridden minFileSizeToOcr value", 1, config.getMinFileSizeToOcr());
+ assertEquals("Invalid overridden maxFileSizeToOcr value", 2000000, config.getMaxFileSizeToOcr());
+ assertEquals("Invalid overridden timeout value", 240, config.getTimeout());
+ }
+
+ @Test(expected=IllegalArgumentException.class)
+ public void testValidateLanguage() { // valid language strings are accepted; a shell-like string must be rejected
+ TesseractOCRConfig config = new TesseractOCRConfig();
+ config.setLanguage("eng");
+ config.setLanguage("eng+fra");
+ assertTrue("Couldn't set valid values", true); // always true; only reached if the setters above did not throw
+ config.setLanguage("rm -Rf *"); // expected to throw IllegalArgumentException
+ }
+
+ @Test(expected=IllegalArgumentException.class)
+ public void testValidatePageSegMode() { // "0" and "10" are accepted; "11" must be rejected
+ TesseractOCRConfig config = new TesseractOCRConfig();
+ config.setPageSegMode("0");
+ config.setPageSegMode("10");
+ assertTrue("Couldn't set valid values", true); // always true; only reached if the setters above did not throw
+ config.setPageSegMode("11"); // expected to throw IllegalArgumentException
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/video/FLVParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/video/FLVParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/video/FLVParserTest.java
index 147113e..d3a876e 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/video/FLVParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/video/FLVParserTest.java
@@ -1,44 +1,44 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.video;
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.tika.Tika;
-import org.apache.tika.metadata.Metadata;
-import org.junit.Test;
-
-public class FLVParserTest {
-
- @Test
- public void testFLV() throws Exception {
- String path = "/test-documents/testFLV.flv";
- Metadata metadata = new Metadata();
-
- String content = new Tika().parseToString(
- FLVParserTest.class.getResourceAsStream(path), metadata);
-
- assertEquals("", content);
- assertEquals("video/x-flv", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("true", metadata.get("hasVideo"));
- assertEquals("false", metadata.get("stereo"));
- assertEquals("true", metadata.get("hasAudio"));
- assertEquals("120.0", metadata.get("height"));
- assertEquals("16.0", metadata.get("audiosamplesize"));
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.video;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.tika.Tika;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Test;
+
+public class FLVParserTest { // parses the testFLV.flv sample through the Tika facade and checks its metadata
+
+ @Test
+ public void testFLV() throws Exception {
+ String path = "/test-documents/testFLV.flv";
+ Metadata metadata = new Metadata();
+
+ String content = new Tika().parseToString(
+ FLVParserTest.class.getResourceAsStream(path), metadata);
+
+ assertEquals("", content); // no text content is expected from this sample
+ assertEquals("video/x-flv", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("true", metadata.get("hasVideo")); // values below are compared as strings under plain string keys
+ assertEquals("false", metadata.get("stereo"));
+ assertEquals("true", metadata.get("hasAudio"));
+ assertEquals("120.0", metadata.get("height"));
+ assertEquals("16.0", metadata.get("audiosamplesize"));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/pom.xml b/tika-parser-modules/tika-parser-office-module/pom.xml
index 4756328..4825076 100644
--- a/tika-parser-modules/tika-parser-office-module/pom.xml
+++ b/tika-parser-modules/tika-parser-office-module/pom.xml
@@ -1,126 +1,126 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-modules</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-office-module</artifactId>
- <name>Apache Tika parser office module</name>
- <url>http://tika.apache.org/</url>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>commons-io</groupId>
- <artifactId>commons-io</artifactId>
- <version>${commons.io.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.poi</groupId>
- <artifactId>poi</artifactId>
- <version>${poi.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.poi</groupId>
- <artifactId>poi-scratchpad</artifactId>
- <version>${poi.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.poi</groupId>
- <artifactId>poi-ooxml</artifactId>
- <version>${poi.version}</version>
- <exclusions>
- <exclusion>
- <groupId>stax</groupId>
- <artifactId>stax-api</artifactId>
- </exclusion>
- <exclusion>
- <groupId>xml-apis</groupId>
- <artifactId>xml-apis</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- <dependency>
- <groupId>com.healthmarketscience.jackcess</groupId>
- <artifactId>jackcess</artifactId>
- <version>2.1.3</version>
- </dependency>
- <dependency>
- <groupId>com.healthmarketscience.jackcess</groupId>
- <artifactId>jackcess-encrypt</artifactId>
- <version>2.1.1</version>
- <exclusions>
- <exclusion>
- <groupId>org.bouncycastle</groupId>
- <artifactId>bcprov-jdk15on</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- <!-- PDFBox (in the tika-parser-pdf-module) and poi's ooxml
- code relies on bouncy castle, as does jackcess-encrypt
- Need to exclude the older library and include the newer one
- if there is a conflict.
- -->
- <dependency>
- <groupId>org.bouncycastle</groupId>
- <artifactId>bcprov-jdk15on</artifactId>
- <version>${bouncycastle.version}</version>
- </dependency>
- <dependency>
- <groupId>com.pff</groupId>
- <artifactId>java-libpst</artifactId>
- <version>0.8.1</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-package-module</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-web-module</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-text-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-log4j12</artifactId>
- <scope>test</scope>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-office-module</artifactId>
+ <name>Apache Tika parser office module</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>${commons.io.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.poi</groupId>
+ <artifactId>poi</artifactId>
+ <version>${poi.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.poi</groupId>
+ <artifactId>poi-scratchpad</artifactId>
+ <version>${poi.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.poi</groupId>
+ <artifactId>poi-ooxml</artifactId>
+ <version>${poi.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>stax</groupId>
+ <artifactId>stax-api</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>xml-apis</groupId>
+ <artifactId>xml-apis</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>com.healthmarketscience.jackcess</groupId>
+ <artifactId>jackcess</artifactId>
+ <version>2.1.3</version>
+ </dependency>
+ <dependency>
+ <groupId>com.healthmarketscience.jackcess</groupId>
+ <artifactId>jackcess-encrypt</artifactId>
+ <version>2.1.1</version>
+ <exclusions>
+ <exclusion>
+ <groupId>org.bouncycastle</groupId>
+ <artifactId>bcprov-jdk15on</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <!-- PDFBox (in the tika-parser-pdf-module) and poi's ooxml
+ code relies on bouncy castle, as does jackcess-encrypt
+ Need to exclude the older library and include the newer one
+ if there is a conflict.
+ -->
+ <dependency>
+ <groupId>org.bouncycastle</groupId>
+ <artifactId>bcprov-jdk15on</artifactId>
+ <version>${bouncycastle.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.pff</groupId>
+ <artifactId>java-libpst</artifactId>
+ <version>0.8.1</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-package-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-web-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-text-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/module/office/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/module/office/internal/Activator.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/module/office/internal/Activator.java
index 32a41ab..8f34381 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/module/office/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/module/office/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.office.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
- @Override
- public void start(BundleContext context) throws Exception {
-
- registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
- }
-
- @Override
- public void stop(BundleContext context) throws Exception {
-
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.office.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+ @Override
+ public void start(BundleContext context) throws Exception {
+
+ registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+ }
+
+ @Override
+ public void stop(BundleContext context) throws Exception {
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java
index c3e85c1..94c5aa5 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java
@@ -1,112 +1,112 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm;
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Set;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.ParserProxy;
-import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
-import org.apache.tika.parser.chm.core.ChmExtractor;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.EmbeddedContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-public class ChmParser extends AbstractParser {
-
- /** Serial version UID */
- private static final long serialVersionUID = 5938777307516469802L;
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- MediaType.application("vnd.ms-htmlhelp"),
- MediaType.application("chm"),
- MediaType.application("x-chm"))));
-
- private final Parser htmlProxy;
-
- public ChmParser() {
- this.htmlProxy = createParserProxy("org.apache.tika.parser.html.HtmlParser");
- }
- @Override
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- @Override
- public void parse(InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context) throws IOException,
- SAXException, TikaException {
- ChmExtractor chmExtractor = new ChmExtractor(stream);
-
- // metadata
- metadata.set(Metadata.CONTENT_TYPE, "application/vnd.ms-htmlhelp");
-
- // content
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
-
- for (DirectoryListingEntry entry : chmExtractor.getChmDirList().getDirectoryListingEntryList()) {
- final String entryName = entry.getName();
- if (entryName.endsWith(".html")
- || entryName.endsWith(".htm")
- ) {
-// AttributesImpl attrs = new AttributesImpl();
-// attrs.addAttribute("", "name", "name", "String", entryName);
-// xhtml.startElement("", "document", "document", attrs);
-
- byte[] data = chmExtractor.extractChmEntry(entry);
-
- parsePage(data, xhtml);
-
-// xhtml.endElement("", "", "document");
- }
- }
-
- xhtml.endDocument();
- }
-
-
- private void parsePage(byte[] byteObject, ContentHandler xhtml) throws TikaException {// throws IOException
- InputStream stream = null;
- Metadata metadata = new Metadata();
- ContentHandler handler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));// -1
- ParseContext parser = new ParseContext();
- try {
- stream = new ByteArrayInputStream(byteObject);
- htmlProxy.parse(stream, handler, metadata, parser);
- } catch (SAXException e) {
- throw new RuntimeException(e);
- } catch (IOException e) {
- // Pushback overflow from tagsoup
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserProxy;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.core.ChmExtractor;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class ChmParser extends AbstractParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = 5938777307516469802L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.application("vnd.ms-htmlhelp"),
+ MediaType.application("chm"),
+ MediaType.application("x-chm"))));
+
+ private final Parser htmlProxy;
+
+ public ChmParser() {
+ this.htmlProxy = createParserProxy("org.apache.tika.parser.html.HtmlParser");
+ }
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context) throws IOException,
+ SAXException, TikaException {
+ ChmExtractor chmExtractor = new ChmExtractor(stream);
+
+ // metadata
+ metadata.set(Metadata.CONTENT_TYPE, "application/vnd.ms-htmlhelp");
+
+ // content
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+ for (DirectoryListingEntry entry : chmExtractor.getChmDirList().getDirectoryListingEntryList()) {
+ final String entryName = entry.getName();
+ if (entryName.endsWith(".html")
+ || entryName.endsWith(".htm")
+ ) {
+// AttributesImpl attrs = new AttributesImpl();
+// attrs.addAttribute("", "name", "name", "String", entryName);
+// xhtml.startElement("", "document", "document", attrs);
+
+ byte[] data = chmExtractor.extractChmEntry(entry);
+
+ parsePage(data, xhtml);
+
+// xhtml.endElement("", "", "document");
+ }
+ }
+
+ xhtml.endDocument();
+ }
+
+
+ private void parsePage(byte[] byteObject, ContentHandler xhtml) throws TikaException {// throws IOException
+ InputStream stream = null;
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));// -1
+ ParseContext parser = new ParseContext();
+ try {
+ stream = new ByteArrayInputStream(byteObject);
+ htmlProxy.parse(stream, handler, metadata, parser);
+ } catch (SAXException e) {
+ throw new RuntimeException(e);
+ } catch (IOException e) {
+ // Pushback overflow from tagsoup
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java
index 42b0830..e8bf1cc 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java
@@ -1,39 +1,39 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.accessor;
-
-import java.io.Serializable;
-
-import org.apache.tika.exception.TikaException;
-
-/**
- *
- * Defines an accessor interface
- *
- * @param <T>
- */
-public interface ChmAccessor<T> extends Serializable {
- /**
- * Parses chm accessor
- *
- * @param data
- * chm file
- * @param chmAccessor
- * @throws TikaException
- */
- void parse(byte[] data, T chmAccessor) throws TikaException;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.io.Serializable;
+
+import org.apache.tika.exception.TikaException;
+
+/**
+ *
+ * Defines an accessor interface
+ *
+ * @param <T>
+ */
+public interface ChmAccessor<T> extends Serializable {
+ /**
+ * Parses chm accessor
+ *
+ * @param data
+ * chm file
+ * @param chmAccessor
+ * @throws TikaException
+ */
+ void parse(byte[] data, T chmAccessor) throws TikaException;
+}
[15/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index 9d9d372..bfec2ad 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -1,506 +1,506 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Locale;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
-import org.apache.log4j.Level;
-import org.apache.log4j.Logger;
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLCore;
-import org.apache.tika.metadata.OfficeOpenXMLExtended;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Ignore;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class WordParserTest extends TikaTest {
-
- @Test
- public void testWordParser() throws Exception {
- try (InputStream input = WordParserTest.class.getResourceAsStream(
- "/test-documents/testWORD.doc")) {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
- new OfficeParser().parse(input, handler, metadata, new ParseContext());
-
- assertEquals(
- "application/msword",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
- assertContains("Sample Word Document", handler.toString());
- }
- }
-
- @Test
- public void testWordWithWAV() throws Exception {
- try (InputStream input = WordParserTest.class.getResourceAsStream(
- "/test-documents/Doc1_ole.doc")) {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
- new OfficeParser().parse(input, handler, metadata, new ParseContext());
-
- assertContains("MSj00974840000[1].wav", handler.toString());
- }
- }
-
- /**
- * Test that the word converter is able to generate the
- * correct HTML for the document
- */
- @Test
- public void testWordHTML() throws Exception {
-
- // Try with a document containing various tables and
- // formattings
- XMLResult result = getXML("testWORD.doc");
- String xml = result.xml;
- Metadata metadata = result.metadata;
-
- assertEquals(
- "application/msword",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
- assertTrue(xml.contains("Sample Word Document"));
-
- // Check that custom headings came through
- assertTrue(xml.contains("<h1 class=\"title\">"));
- // Regular headings
- assertTrue(xml.contains("<h1>Heading Level 1</h1>"));
- assertTrue(xml.contains("<h3>Heading Level 3</h3>"));
- // Bold and italic
- assertTrue(xml.contains("<b>BOLD</b>"));
- assertTrue(xml.contains("<i>ITALIC</i>"));
- // Table
- assertTrue(xml.contains("<table>"));
- assertTrue(xml.contains("<td>"));
- // TODO - Check for the nested table
- // Links
- assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));
- // Paragraphs with other styles
- assertTrue(xml.contains("<p class=\"signature\">This one"));
-
- // Try with a document that contains images
- xml = getXML("testWORD_3imgs.doc").xml;
-
- // Images 1-3
- assertTrue("Image not found in:\n" + xml, xml.contains("src=\"embedded:image1.png\""));
- assertTrue("Image not found in:\n" + xml, xml.contains("src=\"embedded:image2.jpg\""));
- assertTrue("Image not found in:\n" + xml, xml.contains("src=\"embedded:image3.png\""));
-
- // Text too
- assertTrue(xml.contains("<p>The end!"));
-
- // TIKA-692: test document containing multiple
- // character runs within a bold tag:
- xml = getXML("testWORD_bold_character_runs.doc").xml;
-
- // Make sure bold text arrived as single
- // contiguous string even though Word parser
- // handled this as 3 character runs
- assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
-
- // TIKA-692: test document containing multiple
- // character runs within a bold tag:
- xml = getXML("testWORD_bold_character_runs2.doc").xml;
-
- // Make sure bold text arrived as single
- // contiguous string even though Word parser
- // handled this as 3 character runs
- assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
- }
-
- @Test
- public void testEmbeddedNames() throws Exception {
- String result = getXML("testWORD_embedded_pdf.doc").xml;
-
- // Make sure the embedded div comes out after "Here
- // is the pdf file" and before "Bye Bye":
- int i = result.indexOf("Here is the pdf file:");
- assertTrue(i != -1);
- int j = result.indexOf("<div class=\"embedded\" id=\"_1402837031\" />");
- assertTrue(j != -1);
- int k = result.indexOf("Bye Bye");
- assertTrue(k != -1);
-
- assertTrue(i < j);
- assertTrue(j < k);
- }
-
- // TIKA-982
- @Test
- public void testEmbeddedRTF() throws Exception {
- String result = getXML("testWORD_embedded_rtf.doc").xml;
- assertTrue(result.contains("<div class=\"embedded\" id=\"_1404039792\" />"));
- assertTrue(result.contains("_1404039792.rtf"));
- }
-
- // TIKA-1019
- @Test
- public void testDocumentLink() throws Exception {
- String result = getXML("testDocumentLink.doc").xml;
- assertTrue(result.contains("<div class=\"embedded\" id=\"_1327495610\" />"));
- assertTrue(result.contains("_1327495610.unknown"));
- }
-
- @Test
- public void testWord6Parser() throws Exception {
- try (InputStream input = WordParserTest.class.getResourceAsStream(
- "/test-documents/testWORD6.doc")) {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
- new OfficeParser().parse(input, handler, metadata, new ParseContext());
-
- assertEquals(
- "application/msword",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("The quick brown fox jumps over the lazy dog", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(OfficeOpenXMLCore.SUBJECT));
- assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(Metadata.SUBJECT));
- assertEquals("Nevin Nollop", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Nevin Nollop", metadata.get(Metadata.AUTHOR));
- assertContains("The quick brown fox jumps over the lazy dog", handler.toString());
- }
- }
-
- @Test
- public void testVarious() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = WordParserTest.class.getResourceAsStream(
- "/test-documents/testWORD_various.doc")) {
- new OfficeParser().parse(stream, handler, metadata, new ParseContext());
- }
-
- String content = handler.toString();
- //content = content.replaceAll("\\s+"," ");
- assertContains("Footnote appears here", content);
- assertContains("This is a footnote.", content);
- assertContains("This is the header text.", content);
- assertContains("This is the footer text.", content);
- assertContains("Here is a text box", content);
- assertContains("Bold", content);
- assertContains("italic", content);
- assertContains("underline", content);
- assertContains("superscript", content);
- assertContains("subscript", content);
- assertContains("Here is a citation:", content);
- assertContains("Figure 1 This is a caption for Figure 1", content);
- assertContains("(Kramer)", content);
- assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
- assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," "));
- assertContains("This is a hyperlink", content);
- assertContains("Here is a list:", content);
- for(int row=1;row<=3;row++) {
- //assertContains("�\tBullet " + row, content);
- //assertContains("\u00b7\tBullet " + row, content);
- assertContains("Bullet " + row, content);
- }
- assertContains("Here is a numbered list:", content);
- for(int row=1;row<=3;row++) {
- //assertContains(row + ")\tNumber bullet " + row, content);
- //assertContains(row + ") Number bullet " + row, content);
- // TODO: WordExtractor fails to number the bullets:
- assertContains("Number bullet " + row, content);
- }
-
- for(int row=1;row<=2;row++) {
- for(int col=1;col<=3;col++) {
- assertContains("Row " + row + " Col " + col, content);
- }
- }
-
- assertContains("Keyword1 Keyword2", content);
- assertEquals("Keyword1 Keyword2",
- metadata.get(TikaCoreProperties.KEYWORDS));
-
- assertContains("Subject is here", content);
- // TODO: Move to OO subject in Tika 2.0
- assertEquals("Subject is here",
- metadata.get(Metadata.SUBJECT));
- assertEquals("Subject is here",
- metadata.get(OfficeOpenXMLCore.SUBJECT));
-
- assertContains("Suddenly some Japanese text:", content);
- // Special version of (GHQ)
- assertContains("\uff08\uff27\uff28\uff31\uff09", content);
- // 6 other characters
- assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content);
-
- assertContains("And then some Gothic text:", content);
- assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
- }
-
- /**
- * TIKA-1044 - Handle documents where parts of the
- * text have no formatting or styles applied to them
- */
- @Test
- public void testNoFormat() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = WordParserTest.class.getResourceAsStream(
- "/test-documents/testWORD_no_format.doc")) {
- new OfficeParser().parse(stream, handler, metadata, new ParseContext());
- }
-
- String content = handler.toString();
- assertContains("Will generate an exception", content);
- }
-
- /**
- * Ensures that custom OLE2 (HPSF) properties are extracted
- */
- @Test
- public void testCustomProperties() throws Exception {
- Metadata metadata = new Metadata();
-
- try (InputStream input = WordParserTest.class.getResourceAsStream(
- "/test-documents/testWORD_custom_props.doc")) {
- ContentHandler handler = new BodyContentHandler(-1);
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- new OfficeParser().parse(input, handler, metadata, context);
- }
-
- assertEquals("application/msword", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("EJ04325S", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Etienne Jouvin", metadata.get(TikaCoreProperties.MODIFIER));
- assertEquals("Etienne Jouvin", metadata.get(Metadata.LAST_AUTHOR));
- assertEquals("2012-01-03T22:14:00Z", metadata.get(TikaCoreProperties.MODIFIED));
- assertEquals("2012-01-03T22:14:00Z", metadata.get(Metadata.DATE));
- assertEquals("2010-10-05T09:03:00Z", metadata.get(TikaCoreProperties.CREATED));
- assertEquals("2010-10-05T09:03:00Z", metadata.get(Metadata.CREATION_DATE));
- assertEquals("Microsoft Office Word", metadata.get(OfficeOpenXMLExtended.APPLICATION));
- assertEquals("1", metadata.get(Office.PAGE_COUNT));
- assertEquals("2", metadata.get(Office.WORD_COUNT));
- assertEquals("My Title", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("My Keyword", metadata.get(TikaCoreProperties.KEYWORDS));
- assertEquals("Normal.dotm", metadata.get(OfficeOpenXMLExtended.TEMPLATE));
- assertEquals("My Comments", metadata.get(TikaCoreProperties.COMMENTS));
- // TODO: Move to OO subject in Tika 2.0
- assertEquals("My subject", metadata.get(Metadata.SUBJECT));
- assertEquals("My subject", metadata.get(OfficeOpenXMLCore.SUBJECT));
- assertEquals("EDF-DIT", metadata.get(OfficeOpenXMLExtended.COMPANY));
- assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
- assertEquals("2010-12-30T23:00:00Z", metadata.get("custom:MyCustomDate"));
- }
-
- @Test
- public void testExceptions1() throws Exception {
- XMLResult xml;
- Level logLevelStart = Logger.getRootLogger().getLevel();
- Logger.getRootLogger().setLevel(Level.ERROR);
- try {
- xml = getXML("testException1.doc");
- assertContains("total population", xml.xml);
- xml = getXML("testException2.doc");
- assertContains("electric charge", xml.xml);
- } finally {
- Logger.getRootLogger().setLevel(logLevelStart);
- }
- }
-
- @Test
- public void testTabularSymbol() throws Exception {
- assertContains("one two", getXML("testWORD_tabular_symbol.doc").xml.replaceAll("\\s+", " "));
- }
-
- /**
- * TIKA-1229 Hyperlinks in Headers should be output as such,
- * not plain text with control characters
- */
- @Test
- public void testHeaderHyperlinks() throws Exception {
- XMLResult result = getXML("testWORD_header_hyperlink.doc");
- String xml = result.xml;
- Metadata metadata = result.metadata;
-
- assertEquals(
- "application/msword",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Lutz Theurer", metadata.get(TikaCoreProperties.CREATOR));
- assertContains("example.com", xml);
-
- // Check we don't have the special text HYPERLINK
- assertFalse(xml.contains("HYPERLINK"));
-
- // Check we do have the link
- assertContains("<a href=\"http://tw-systemhaus.de\">http:", xml);
-
- // Check we do have the email
- assertContains("<a href=\"mailto:ab@example.com\">ab@", xml);
- }
-
- @Test
- public void testControlCharacter() throws Exception {
- assertContains("1. Introduzione<b> </a></b> </p>", getXML("testControlCharacters.doc").xml.replaceAll("\\s+", " "));
- }
-
- @Test
- public void testParagraphsAfterTables() throws Exception {
- XMLResult result = getXML("test_TIKA-1251.doc");
-
- String xml = result.xml;
- Metadata metadata = result.metadata;
-
- assertEquals(
- "application/msword",
- metadata.get(Metadata.CONTENT_TYPE));
-
- assertContains("<p>1. Organisering av vakten:</p>", xml);
-
- }
-
- @Test
- public void testHyperlinkStringIOOBESmartQuote() throws Exception {
- //TIKA-1512, one cause: closing double quote is a smart quote
- //test file contributed by user
- XMLResult result = getXML("testWORD_closingSmartQInHyperLink.doc");
- assertContains("href=\"https://issues.apache.org/jira/browse/TIKA-1512", result.xml);
- }
-
- @Test
- @Ignore //until we determine whether we can include test docs or not
- public void testHyperlinkStringLongNoCloseQuote() throws Exception {
- //TIKA-1512, one cause: no closing quote on really long string
- //test file derived from govdocs1 012152.doc
- XMLResult result = getXML("testWORD_longHyperLinkNoCloseQuote.doc");
- assertContains("href=\"http://www.lexis.com", result.xml);
- }
-
- @Test
- @Ignore //until we determine whether we can include test docs or not
- public void testHyperlinkStringLongCarriageReturn() throws Exception {
- //TIKA-1512, one cause: no closing quote, but carriage return
- //test file derived from govdocs1 040044.doc
- XMLResult result = getXML("testWORD_hyperLinkCarriageReturn.doc");
- assertContains("href=\"http://www.nib.org", result.xml);
- }
-
- @Test
- public void testDOCParagraphNumbering() throws Exception {
- String xml = getXML("testWORD_numbered_list.doc").xml;
- assertContains("1) This", xml);
- assertContains("a) Is", xml);
- assertContains("i) A multi", xml);
- assertContains("ii) Level", xml);
- assertContains("1. Within cell 1", xml);
- assertContains("b. Cell b", xml);
- assertContains("iii) List", xml);
- assertContains("2) foo", xml);
- assertContains("ii) baz", xml);
- assertContains("ii) foo", xml);
- assertContains("II. bar", xml);
- assertContains("6. six", xml);
- assertContains("7. seven", xml);
- assertContains("a. seven a", xml);
- assertContains("e. seven e", xml);
- assertContains("2. A ii 2", xml);
- assertContains("3. page break list 3", xml);
- assertContains("Some-1-CrazyFormat Greek numbering with crazy format - alpha", xml);
- assertContains("1.1.1. 1.1.1", xml);
- assertContains("1.1. 1.2->1.1 //set the value", xml);
-
- assertContains("add a list here", xml);
- //TODO: not currently pulling numbers out of comments
- assertContains(">comment list 1", xml);
-
- }
-
- @Test
- public void testDOCOverrideParagraphNumbering() throws Exception {
- String xml = getXML("testWORD_override_list_numbering.doc").xml;
-
- //Test 1
- assertContains("1.1.1.1...1 1.1.1.1...1", xml);
- assertContains("1st.2.3someText 1st.2.3someText", xml);
- assertContains("1st.2.2someOtherText.1 1st.2.2someOtherText.1", xml);
- assertContains("5th 5th", xml);
-
-
- //Test 2
- assertContains("1.a.I 1.a.I", xml);
- //test no reset because level 2 is not sufficient to reset
- assertContains("1.b.III 1.b.III", xml);
- //test restarted because of level 0's increment to 2
- assertContains("2.a.I 2.a.I", xml);
- //test handling of skipped level
- assertContains("2.b 2.b", xml);
-
- //Test 3
- assertContains("(1)) (1))", xml);
- //tests start level 1 at 17 and
- assertContains("2.17 2.17", xml);
- //tests that isLegal turns everything into decimal
- assertContains("2.18.2.1 2.18.2.1", xml);
- assertContains(">2 2", xml);
-
- //Test4
- assertContains(">1 1", xml);
- assertContains(">A A", xml);
- assertContains(">B B", xml);
- assertContains(">C C", xml);
- assertContains(">4 4", xml);
-
- //Test5
- assertContains(">00 00", xml);
- assertContains(">01 01", xml);
- assertContains(">01. 01.", xml);
- assertContains(">01..1 01..1", xml);
- assertContains(">02 02", xml);
- }
-
- @Test
- public void testMultiAuthorsManagers() throws Exception {
- XMLResult r = getXML("testWORD_multi_authors.doc");
- String[] authors = r.metadata.getValues(TikaCoreProperties.CREATOR);
- assertEquals(3, authors.length);
- assertEquals("author2", authors[1]);
-
- String[] managers = r.metadata.getValues(OfficeOpenXMLExtended.MANAGER);
- assertEquals(2, managers.length);
- assertEquals("manager1", managers[0]);
- assertEquals("manager2", managers[1]);
- }
-
- @Test
- public void testOrigLocation() throws Exception {
- Metadata metadata = getXML("testException2.doc").metadata;
- List<String> values = Arrays.asList(metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
- assertContains("C:\\Lab Documents\\Lab Manuals\\Physics 275-6\\276-s00\\07-Force-on-a-current-S00.doc", values);
- assertContains("Hard Drive:Course Folders:276:276-s00:07-Force-on-a-current-S00", values);
- }
-}
-
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Locale;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class WordParserTest extends TikaTest {
+
+ @Test
+ public void testWordParser() throws Exception {
+ try (InputStream input = WordParserTest.class.getResourceAsStream(
+ "/test-documents/testWORD.doc")) {
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ new OfficeParser().parse(input, handler, metadata, new ParseContext());
+
+ assertEquals(
+ "application/msword",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+ assertContains("Sample Word Document", handler.toString());
+ }
+ }
+
+ @Test
+ public void testWordWithWAV() throws Exception {
+ try (InputStream input = WordParserTest.class.getResourceAsStream(
+ "/test-documents/Doc1_ole.doc")) {
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ new OfficeParser().parse(input, handler, metadata, new ParseContext());
+
+ assertContains("MSj00974840000[1].wav", handler.toString());
+ }
+ }
+
+ /**
+ * Test that the word converter is able to generate the
+ * correct HTML for the document
+ */
+ @Test
+ public void testWordHTML() throws Exception {
+
+ // Try with a document containing various tables and
+ // formattings
+ XMLResult result = getXML("testWORD.doc");
+ String xml = result.xml;
+ Metadata metadata = result.metadata;
+
+ assertEquals(
+ "application/msword",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+ assertTrue(xml.contains("Sample Word Document"));
+
+ // Check that custom headings came through
+ assertTrue(xml.contains("<h1 class=\"title\">"));
+ // Regular headings
+ assertTrue(xml.contains("<h1>Heading Level 1</h1>"));
+ assertTrue(xml.contains("<h3>Heading Level 3</h3>"));
+ // Bold and italic
+ assertTrue(xml.contains("<b>BOLD</b>"));
+ assertTrue(xml.contains("<i>ITALIC</i>"));
+ // Table
+ assertTrue(xml.contains("<table>"));
+ assertTrue(xml.contains("<td>"));
+ // TODO - Check for the nested table
+ // Links
+ assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));
+ // Paragraphs with other styles
+ assertTrue(xml.contains("<p class=\"signature\">This one"));
+
+ // Try with a document that contains images
+ xml = getXML("testWORD_3imgs.doc").xml;
+
+ // Images 1-3
+ assertTrue("Image not found in:\n" + xml, xml.contains("src=\"embedded:image1.png\""));
+ assertTrue("Image not found in:\n" + xml, xml.contains("src=\"embedded:image2.jpg\""));
+ assertTrue("Image not found in:\n" + xml, xml.contains("src=\"embedded:image3.png\""));
+
+ // Text too
+ assertTrue(xml.contains("<p>The end!"));
+
+ // TIKA-692: test document containing multiple
+ // character runs within a bold tag:
+ xml = getXML("testWORD_bold_character_runs.doc").xml;
+
+ // Make sure bold text arrived as single
+ // contiguous string even though Word parser
+ // handled this as 3 character runs
+ assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
+
+ // TIKA-692: test document containing multiple
+ // character runs within a bold tag:
+ xml = getXML("testWORD_bold_character_runs2.doc").xml;
+
+ // Make sure bold text arrived as single
+ // contiguous string even though Word parser
+ // handled this as 3 character runs
+ assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
+ }
+
+ @Test
+ public void testEmbeddedNames() throws Exception {
+ String result = getXML("testWORD_embedded_pdf.doc").xml;
+
+ // Make sure the embedded div comes out after "Here
+ // is the pdf file" and before "Bye Bye":
+ int i = result.indexOf("Here is the pdf file:");
+ assertTrue(i != -1);
+ int j = result.indexOf("<div class=\"embedded\" id=\"_1402837031\" />");
+ assertTrue(j != -1);
+ int k = result.indexOf("Bye Bye");
+ assertTrue(k != -1);
+
+ assertTrue(i < j);
+ assertTrue(j < k);
+ }
+
+ // TIKA-982
+ @Test
+ public void testEmbeddedRTF() throws Exception {
+ String result = getXML("testWORD_embedded_rtf.doc").xml;
+ assertTrue(result.contains("<div class=\"embedded\" id=\"_1404039792\" />"));
+ assertTrue(result.contains("_1404039792.rtf"));
+ }
+
+ // TIKA-1019
+ @Test
+ public void testDocumentLink() throws Exception {
+ String result = getXML("testDocumentLink.doc").xml;
+ assertTrue(result.contains("<div class=\"embedded\" id=\"_1327495610\" />"));
+ assertTrue(result.contains("_1327495610.unknown"));
+ }
+
+ @Test
+ public void testWord6Parser() throws Exception {
+ try (InputStream input = WordParserTest.class.getResourceAsStream(
+ "/test-documents/testWORD6.doc")) {
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ new OfficeParser().parse(input, handler, metadata, new ParseContext());
+
+ assertEquals(
+ "application/msword",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("The quick brown fox jumps over the lazy dog", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(OfficeOpenXMLCore.SUBJECT));
+ assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(Metadata.SUBJECT));
+ assertEquals("Nevin Nollop", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Nevin Nollop", metadata.get(Metadata.AUTHOR));
+ assertContains("The quick brown fox jumps over the lazy dog", handler.toString());
+ }
+ }
+
+ @Test
+ public void testVarious() throws Exception {
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = WordParserTest.class.getResourceAsStream(
+ "/test-documents/testWORD_various.doc")) {
+ new OfficeParser().parse(stream, handler, metadata, new ParseContext());
+ }
+
+ String content = handler.toString();
+ //content = content.replaceAll("\\s+"," ");
+ assertContains("Footnote appears here", content);
+ assertContains("This is a footnote.", content);
+ assertContains("This is the header text.", content);
+ assertContains("This is the footer text.", content);
+ assertContains("Here is a text box", content);
+ assertContains("Bold", content);
+ assertContains("italic", content);
+ assertContains("underline", content);
+ assertContains("superscript", content);
+ assertContains("subscript", content);
+ assertContains("Here is a citation:", content);
+ assertContains("Figure 1 This is a caption for Figure 1", content);
+ assertContains("(Kramer)", content);
+ assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
+ assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," "));
+ assertContains("This is a hyperlink", content);
+ assertContains("Here is a list:", content);
+ for(int row=1;row<=3;row++) {
+ //assertContains("�\tBullet " + row, content);
+ //assertContains("\u00b7\tBullet " + row, content);
+ assertContains("Bullet " + row, content);
+ }
+ assertContains("Here is a numbered list:", content);
+ for(int row=1;row<=3;row++) {
+ //assertContains(row + ")\tNumber bullet " + row, content);
+ //assertContains(row + ") Number bullet " + row, content);
+ // TODO: WordExtractor fails to number the bullets:
+ assertContains("Number bullet " + row, content);
+ }
+
+ for(int row=1;row<=2;row++) {
+ for(int col=1;col<=3;col++) {
+ assertContains("Row " + row + " Col " + col, content);
+ }
+ }
+
+ assertContains("Keyword1 Keyword2", content);
+ assertEquals("Keyword1 Keyword2",
+ metadata.get(TikaCoreProperties.KEYWORDS));
+
+ assertContains("Subject is here", content);
+ // TODO: Move to OO subject in Tika 2.0
+ assertEquals("Subject is here",
+ metadata.get(Metadata.SUBJECT));
+ assertEquals("Subject is here",
+ metadata.get(OfficeOpenXMLCore.SUBJECT));
+
+ assertContains("Suddenly some Japanese text:", content);
+ // Special version of (GHQ)
+ assertContains("\uff08\uff27\uff28\uff31\uff09", content);
+ // 6 other characters
+ assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content);
+
+ assertContains("And then some Gothic text:", content);
+ assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
+ }
+
+ /**
+ * TIKA-1044 - Handle documents where parts of the
+ * text have no formatting or styles applied to them
+ */
+ @Test
+ public void testNoFormat() throws Exception {
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = WordParserTest.class.getResourceAsStream(
+ "/test-documents/testWORD_no_format.doc")) {
+ new OfficeParser().parse(stream, handler, metadata, new ParseContext());
+ }
+
+ String content = handler.toString();
+ assertContains("Will generate an exception", content);
+ }
+
+ /**
+ * Ensures that custom OLE2 (HPSF) properties are extracted
+ */
+ @Test
+ public void testCustomProperties() throws Exception {
+ Metadata metadata = new Metadata();
+
+ try (InputStream input = WordParserTest.class.getResourceAsStream(
+ "/test-documents/testWORD_custom_props.doc")) {
+ ContentHandler handler = new BodyContentHandler(-1);
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+ new OfficeParser().parse(input, handler, metadata, context);
+ }
+
+ assertEquals("application/msword", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("EJ04325S", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Etienne Jouvin", metadata.get(TikaCoreProperties.MODIFIER));
+ assertEquals("Etienne Jouvin", metadata.get(Metadata.LAST_AUTHOR));
+ assertEquals("2012-01-03T22:14:00Z", metadata.get(TikaCoreProperties.MODIFIED));
+ assertEquals("2012-01-03T22:14:00Z", metadata.get(Metadata.DATE));
+ assertEquals("2010-10-05T09:03:00Z", metadata.get(TikaCoreProperties.CREATED));
+ assertEquals("2010-10-05T09:03:00Z", metadata.get(Metadata.CREATION_DATE));
+ assertEquals("Microsoft Office Word", metadata.get(OfficeOpenXMLExtended.APPLICATION));
+ assertEquals("1", metadata.get(Office.PAGE_COUNT));
+ assertEquals("2", metadata.get(Office.WORD_COUNT));
+ assertEquals("My Title", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("My Keyword", metadata.get(TikaCoreProperties.KEYWORDS));
+ assertEquals("Normal.dotm", metadata.get(OfficeOpenXMLExtended.TEMPLATE));
+ assertEquals("My Comments", metadata.get(TikaCoreProperties.COMMENTS));
+ // TODO: Move to OO subject in Tika 2.0
+ assertEquals("My subject", metadata.get(Metadata.SUBJECT));
+ assertEquals("My subject", metadata.get(OfficeOpenXMLCore.SUBJECT));
+ assertEquals("EDF-DIT", metadata.get(OfficeOpenXMLExtended.COMPANY));
+ assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
+ assertEquals("2010-12-30T23:00:00Z", metadata.get("custom:MyCustomDate"));
+ }
+
+ @Test
+ public void testExceptions1() throws Exception {
+ XMLResult xml;
+ Level logLevelStart = Logger.getRootLogger().getLevel();
+ Logger.getRootLogger().setLevel(Level.ERROR);
+ try {
+ xml = getXML("testException1.doc");
+ assertContains("total population", xml.xml);
+ xml = getXML("testException2.doc");
+ assertContains("electric charge", xml.xml);
+ } finally {
+ Logger.getRootLogger().setLevel(logLevelStart);
+ }
+ }
+
+ @Test
+ public void testTabularSymbol() throws Exception {
+ assertContains("one two", getXML("testWORD_tabular_symbol.doc").xml.replaceAll("\\s+", " "));
+ }
+
+ /**
+ * TIKA-1229 Hyperlinks in Headers should be output as such,
+ * not plain text with control characters
+ */
+ @Test
+ public void testHeaderHyperlinks() throws Exception {
+ XMLResult result = getXML("testWORD_header_hyperlink.doc");
+ String xml = result.xml;
+ Metadata metadata = result.metadata;
+
+ assertEquals(
+ "application/msword",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Lutz Theurer", metadata.get(TikaCoreProperties.CREATOR));
+ assertContains("example.com", xml);
+
+ // Check we don't have the special text HYPERLINK
+ assertFalse(xml.contains("HYPERLINK"));
+
+ // Check we do have the link
+ assertContains("<a href=\"http://tw-systemhaus.de\">http:", xml);
+
+ // Check we do have the email
+ assertContains("<a href=\"mailto:ab@example.com\">ab@", xml);
+ }
+
+ @Test
+ public void testControlCharacter() throws Exception {
+ assertContains("1. Introduzione<b> </a></b> </p>", getXML("testControlCharacters.doc").xml.replaceAll("\\s+", " "));
+ }
+
+ @Test
+ public void testParagraphsAfterTables() throws Exception {
+ XMLResult result = getXML("test_TIKA-1251.doc");
+
+ String xml = result.xml;
+ Metadata metadata = result.metadata;
+
+ assertEquals(
+ "application/msword",
+ metadata.get(Metadata.CONTENT_TYPE));
+
+ assertContains("<p>1. Organisering av vakten:</p>", xml);
+
+ }
+
+ @Test
+ public void testHyperlinkStringIOOBESmartQuote() throws Exception {
+ //TIKA-1512, one cause: closing double quote is a smart quote
+ //test file contributed by user
+ XMLResult result = getXML("testWORD_closingSmartQInHyperLink.doc");
+ assertContains("href=\"https://issues.apache.org/jira/browse/TIKA-1512", result.xml);
+ }
+
+ @Test
+ @Ignore //until we determine whether we can include test docs or not
+ public void testHyperlinkStringLongNoCloseQuote() throws Exception {
+ //TIKA-1512, one cause: no closing quote on really long string
+ //test file derived from govdocs1 012152.doc
+ XMLResult result = getXML("testWORD_longHyperLinkNoCloseQuote.doc");
+ assertContains("href=\"http://www.lexis.com", result.xml);
+ }
+
+ @Test
+ @Ignore //until we determine whether we can include test docs or not
+ public void testHyperlinkStringLongCarriageReturn() throws Exception {
+ //TIKA-1512, one cause: no closing quote, but carriage return
+ //test file derived from govdocs1 040044.doc
+ XMLResult result = getXML("testWORD_hyperLinkCarriageReturn.doc");
+ assertContains("href=\"http://www.nib.org", result.xml);
+ }
+
+ @Test
+ public void testDOCParagraphNumbering() throws Exception {
+ String xml = getXML("testWORD_numbered_list.doc").xml;
+ assertContains("1) This", xml);
+ assertContains("a) Is", xml);
+ assertContains("i) A multi", xml);
+ assertContains("ii) Level", xml);
+ assertContains("1. Within cell 1", xml);
+ assertContains("b. Cell b", xml);
+ assertContains("iii) List", xml);
+ assertContains("2) foo", xml);
+ assertContains("ii) baz", xml);
+ assertContains("ii) foo", xml);
+ assertContains("II. bar", xml);
+ assertContains("6. six", xml);
+ assertContains("7. seven", xml);
+ assertContains("a. seven a", xml);
+ assertContains("e. seven e", xml);
+ assertContains("2. A ii 2", xml);
+ assertContains("3. page break list 3", xml);
+ assertContains("Some-1-CrazyFormat Greek numbering with crazy format - alpha", xml);
+ assertContains("1.1.1. 1.1.1", xml);
+ assertContains("1.1. 1.2->1.1 //set the value", xml);
+
+ assertContains("add a list here", xml);
+ //TODO: not currently pulling numbers out of comments
+ assertContains(">comment list 1", xml);
+
+ }
+
+ @Test
+ public void testDOCOverrideParagraphNumbering() throws Exception {
+ String xml = getXML("testWORD_override_list_numbering.doc").xml;
+
+ //Test 1
+ assertContains("1.1.1.1...1 1.1.1.1...1", xml);
+ assertContains("1st.2.3someText 1st.2.3someText", xml);
+ assertContains("1st.2.2someOtherText.1 1st.2.2someOtherText.1", xml);
+ assertContains("5th 5th", xml);
+
+
+ //Test 2
+ assertContains("1.a.I 1.a.I", xml);
+ //test no reset because level 2 is not sufficient to reset
+ assertContains("1.b.III 1.b.III", xml);
+ //test restarted because of level 0's increment to 2
+ assertContains("2.a.I 2.a.I", xml);
+ //test handling of skipped level
+ assertContains("2.b 2.b", xml);
+
+ //Test 3
+ assertContains("(1)) (1))", xml);
+ //tests start level 1 at 17 and
+ assertContains("2.17 2.17", xml);
+ //tests that isLegal turns everything into decimal
+ assertContains("2.18.2.1 2.18.2.1", xml);
+ assertContains(">2 2", xml);
+
+ //Test4
+ assertContains(">1 1", xml);
+ assertContains(">A A", xml);
+ assertContains(">B B", xml);
+ assertContains(">C C", xml);
+ assertContains(">4 4", xml);
+
+ //Test5
+ assertContains(">00 00", xml);
+ assertContains(">01 01", xml);
+ assertContains(">01. 01.", xml);
+ assertContains(">01..1 01..1", xml);
+ assertContains(">02 02", xml);
+ }
+
+ @Test
+ public void testMultiAuthorsManagers() throws Exception {
+ XMLResult r = getXML("testWORD_multi_authors.doc");
+ String[] authors = r.metadata.getValues(TikaCoreProperties.CREATOR);
+ assertEquals(3, authors.length);
+ assertEquals("author2", authors[1]);
+
+ String[] managers = r.metadata.getValues(OfficeOpenXMLExtended.MANAGER);
+ assertEquals(2, managers.length);
+ assertEquals("manager1", managers[0]);
+ assertEquals("manager2", managers[1]);
+ }
+
+ @Test
+ public void testOrigLocation() throws Exception {
+ Metadata metadata = getXML("testException2.doc").metadata;
+ List<String> values = Arrays.asList(metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
+ assertContains("C:\\Lab Documents\\Lab Manuals\\Physics 275-6\\276-s00\\07-Force-on-a-current-S00.doc", values);
+ assertContains("Hard Drive:Course Folders:276:276-s00:07-Force-on-a-current-S00", values);
+ }
+}
+
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
index 24551bc..15f0c74 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
@@ -1,340 +1,340 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.odf;
-
-import static org.junit.Assert.assertEquals;
-
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.StandardCopyOption;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLCore;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.opendocument.OpenOfficeParser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class ODFParserTest extends TikaTest {
- /**
- * For now, allow us to run some tests against both
- * the old and the new parser
- */
- private Parser[] getParsers() {
- return new Parser[] {
- new OpenDocumentParser(),
- new OpenOfficeParser()
- };
- }
-
- @Test
- public void testOO3() throws Exception {
- for (Parser parser : getParsers()) {
- XMLResult r = getXML("testODFwithOOo3.odt", parser);
- assertEquals(
- "application/vnd.oasis.opendocument.text",
- r.metadata.get(Metadata.CONTENT_TYPE));
-
- String content = r.xml;
- assertContains("Tika is part of the Lucene project.", content);
- assertContains("Solr", content);
- assertContains("one embedded", content);
- assertContains("Rectangle Title", content);
- assertContains("a blue background and dark border", content);
-
- }
- }
-
- @Test
- public void testOO2() throws Exception {
- for (Parser parser : getParsers()) {
- XMLResult r = getXML("testOpenOffice2.odt", parser);
- Metadata metadata = r.metadata;
- assertEquals(
- "application/vnd.oasis.opendocument.text",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("en-US", metadata.get(Metadata.LANGUAGE));
- assertEquals("PT1M7S", metadata.get(Metadata.EDIT_TIME));
- assertEquals(
- "NeoOffice/2.2$Unix OpenOffice.org_project/680m18$Build-9161",
- metadata.get("generator"));
-
- // Check date metadata, both old-style and new-style
- assertEquals("2007-09-14T11:07:10", metadata.get(TikaCoreProperties.MODIFIED));
- assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.MODIFIED));
- assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.DATE));
- assertEquals("2007-09-14T11:06:08", metadata.get(TikaCoreProperties.CREATED));
- assertEquals("2007-09-14T11:06:08", metadata.get(Metadata.CREATION_DATE));
-
- // Check the document statistics
- assertEquals("1", metadata.get(Office.PAGE_COUNT));
- assertEquals("1", metadata.get(Office.PARAGRAPH_COUNT));
- assertEquals("14", metadata.get(Office.WORD_COUNT));
- assertEquals("78", metadata.get(Office.CHARACTER_COUNT));
- assertEquals("0", metadata.get(Office.TABLE_COUNT));
- assertEquals("0", metadata.get(Office.OBJECT_COUNT));
- assertEquals("0", metadata.get(Office.IMAGE_COUNT));
-
- // Check the Tika-1.0 style document statistics
- assertEquals("1", metadata.get(Metadata.PAGE_COUNT));
- assertEquals("1", metadata.get(Metadata.PARAGRAPH_COUNT));
- assertEquals("14", metadata.get(Metadata.WORD_COUNT));
- assertEquals("78", metadata.get(Metadata.CHARACTER_COUNT));
- assertEquals("0", metadata.get(Metadata.TABLE_COUNT));
- assertEquals("0", metadata.get(Metadata.OBJECT_COUNT));
- assertEquals("0", metadata.get(Metadata.IMAGE_COUNT));
-
- // Check the very old style statistics (these will be removed shortly)
- assertEquals("0", metadata.get("nbTab"));
- assertEquals("0", metadata.get("nbObject"));
- assertEquals("0", metadata.get("nbImg"));
- assertEquals("1", metadata.get("nbPage"));
- assertEquals("1", metadata.get("nbPara"));
- assertEquals("14", metadata.get("nbWord"));
- assertEquals("78", metadata.get("nbCharacter"));
-
- // Custom metadata tags present but without values
- assertEquals(null, metadata.get("custom:Info 1"));
- assertEquals(null, metadata.get("custom:Info 2"));
- assertEquals(null, metadata.get("custom:Info 3"));
- assertEquals(null, metadata.get("custom:Info 4"));
-
- assertContains(
- "This is a sample Open Office document,"
- + " written in NeoOffice 2.2.1 for the Mac.",
- r.xml);
-
- }
- }
-
- /**
- * Similar to {@link #testOO2()}, but using a different
- * OO2 file with different metadata in it
- */
- @Test
- public void testOO2Metadata() throws Exception {
- XMLResult r = getXML("testOpenOffice2.odf", new OpenDocumentParser());
- Metadata metadata = r.metadata;
- assertEquals(
- "application/vnd.oasis.opendocument.formula",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals(null, metadata.get(TikaCoreProperties.MODIFIED));
- assertEquals("2006-01-27T11:55:22", metadata.get(Metadata.CREATION_DATE));
- assertEquals("The quick brown fox jumps over the lazy dog",
- metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Gym class featuring a brown fox and lazy dog",
- metadata.get(TikaCoreProperties.DESCRIPTION));
- assertEquals("Gym class featuring a brown fox and lazy dog",
- metadata.get(OfficeOpenXMLCore.SUBJECT));
- assertEquals("Gym class featuring a brown fox and lazy dog",
- metadata.get(Metadata.SUBJECT));
- assertEquals("PT0S", metadata.get(Metadata.EDIT_TIME));
- assertEquals("1", metadata.get("editing-cycles"));
- assertEquals(
- "OpenOffice.org/2.2$Win32 OpenOffice.org_project/680m14$Build-9134",
- metadata.get("generator"));
- assertEquals("Pangram, fox, dog", metadata.get(Metadata.KEYWORDS));
-
- // User defined metadata
- assertEquals("Text 1", metadata.get("custom:Info 1"));
- assertEquals("2", metadata.get("custom:Info 2"));
- assertEquals("false", metadata.get("custom:Info 3"));
- assertEquals("true", metadata.get("custom:Info 4"));
-
- // No statistics present
- assertEquals(null, metadata.get(Metadata.PAGE_COUNT));
- assertEquals(null, metadata.get(Metadata.PARAGRAPH_COUNT));
- assertEquals(null, metadata.get(Metadata.WORD_COUNT));
- assertEquals(null, metadata.get(Metadata.CHARACTER_COUNT));
- assertEquals(null, metadata.get(Metadata.TABLE_COUNT));
- assertEquals(null, metadata.get(Metadata.OBJECT_COUNT));
- assertEquals(null, metadata.get(Metadata.IMAGE_COUNT));
- assertEquals(null, metadata.get("nbTab"));
- assertEquals(null, metadata.get("nbObject"));
- assertEquals(null, metadata.get("nbImg"));
- assertEquals(null, metadata.get("nbPage"));
- assertEquals(null, metadata.get("nbPara"));
- assertEquals(null, metadata.get("nbWord"));
- assertEquals(null, metadata.get("nbCharacter"));
-
- // Note - contents of maths files not currently supported
- assertContains("<body />", r.xml);
-
- }
-
- /**
- * Similar to {@link #testOO2()} )}, but using an OO3 file
- */
- @Test
- public void testOO3Metadata() throws Exception {
- XMLResult r = getXML("testODFwithOOo3.odt", new OpenDocumentParser());
- Metadata metadata = r.metadata;
- assertEquals(
- "application/vnd.oasis.opendocument.text",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("2009-10-05T21:22:38", metadata.get(TikaCoreProperties.MODIFIED));
- assertEquals("2009-10-05T19:04:01", metadata.get(TikaCoreProperties.CREATED));
- assertEquals("2009-10-05T19:04:01", metadata.get(Metadata.CREATION_DATE));
- assertEquals("Apache Tika", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Test document", metadata.get(OfficeOpenXMLCore.SUBJECT));
- assertEquals("Test document", metadata.get(Metadata.SUBJECT));
- assertEquals("A rather complex document", metadata.get(TikaCoreProperties.DESCRIPTION));
- assertEquals("Bart Hanssens", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Bart Hanssens", metadata.get("initial-creator"));
- assertEquals("2", metadata.get("editing-cycles"));
- assertEquals("PT02H03M24S", metadata.get(Metadata.EDIT_TIME));
- assertEquals(
- "OpenOffice.org/3.1$Unix OpenOffice.org_project/310m19$Build-9420",
- metadata.get("generator"));
- assertEquals("Apache, Lucene, Tika", metadata.get(Metadata.KEYWORDS));
-
- // User defined metadata
- assertEquals("Bart Hanssens", metadata.get("custom:Editor"));
- assertEquals(null, metadata.get("custom:Info 2"));
- assertEquals(null, metadata.get("custom:Info 3"));
- assertEquals(null, metadata.get("custom:Info 4"));
-
- // Check the document statistics
- assertEquals("2", metadata.get(Office.PAGE_COUNT));
- assertEquals("13", metadata.get(Office.PARAGRAPH_COUNT));
- assertEquals("54", metadata.get(Office.WORD_COUNT));
- assertEquals("351", metadata.get(Office.CHARACTER_COUNT));
- assertEquals("0", metadata.get(Office.TABLE_COUNT));
- assertEquals("2", metadata.get(Office.OBJECT_COUNT));
- assertEquals("0", metadata.get(Office.IMAGE_COUNT));
-
- // Check the Tika-1.0 style document statistics
- assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
- assertEquals("13", metadata.get(Metadata.PARAGRAPH_COUNT));
- assertEquals("54", metadata.get(Metadata.WORD_COUNT));
- assertEquals("351", metadata.get(Metadata.CHARACTER_COUNT));
- assertEquals("0", metadata.get(Metadata.TABLE_COUNT));
- assertEquals("2", metadata.get(Metadata.OBJECT_COUNT));
- assertEquals("0", metadata.get(Metadata.IMAGE_COUNT));
-
- // Check the old style statistics (these will be removed shortly)
- assertEquals("0", metadata.get("nbTab"));
- assertEquals("2", metadata.get("nbObject"));
- assertEquals("0", metadata.get("nbImg"));
- assertEquals("2", metadata.get("nbPage"));
- assertEquals("13", metadata.get("nbPara"));
- assertEquals("54", metadata.get("nbWord"));
- assertEquals("351", metadata.get("nbCharacter"));
-
- assertContains(
- "Tika is part of the Lucene project.", r.xml);
-
-
- }
-
- @Test
- public void testODPMasterFooter() throws Exception {
- assertContains("Master footer is here",
- getXML("testMasterFooter.odp").xml);
- }
-
- @Test
- public void testODTFooter() throws Exception {
- XMLResult r = getXML("testFooter.odt");
- assertContains("Here is some text...", r.xml);
- assertContains("Here is some text on page 2", r.xml);
- assertContains("Here is footer text", r.xml);
- }
-
- @Test
- public void testODSFooter() throws Exception {
- assertContains("Here is a footer in the center area",
- getXML("testFooter.ods").xml);
-
- }
-
- @Test
- public void testFromFile() throws Exception {
- OpenDocumentParser parser = new OpenDocumentParser();
- Path tmp = null;
- try {
- tmp = Files.createTempFile("test-odf-", ".odt");
- Files.copy(getTestDocumentAsStream("testODFwithOOo3.odt"), tmp,
- StandardCopyOption.REPLACE_EXISTING);
- Metadata metadata = new Metadata();
- TikaInputStream tis = TikaInputStream.get(tmp, metadata);
- assertEquals(true, tis.hasFile());
- ContentHandler handler = new BodyContentHandler();
- parser.parse(tis, handler, metadata, new ParseContext());
-
- assertEquals(
- "application/vnd.oasis.opendocument.text",
- metadata.get(Metadata.CONTENT_TYPE));
-
- String content = handler.toString();
- assertContains("Tika is part of the Lucene project.", content);
- } finally {
- Files.delete(tmp);
- }
- }
-
- @Test
- public void testNPEFromFile() throws Exception {
- XMLResult r = getXML("testNPEOpenDocument.odt", new OpenDocumentParser());
- assertEquals(
- "application/vnd.oasis.opendocument.text",
- r.metadata.get(Metadata.CONTENT_TYPE));
-
- assertContains("primero hay que generar un par de claves", r.xml);
-
- }
-
- // TIKA-1063: Test basic style support.
- @Test
- public void testODTStyles() throws Exception {
- String xml = getXML("testStyles.odt").xml;
- assertContains("This <i>is</i> <b>just</b> a <u>test</u>", xml);
- assertContains("<p>And <b>another <i>test</i> is</b> here.</p>", xml);
- assertContains("<ol>\t<li><p>One</p>", xml);
- assertContains("</ol>", xml);
- assertContains("<ul>\t<li><p>First</p>", xml);
- assertContains("</ul>", xml);
- }
-
- //TIKA-1600: Test that null pointer doesn't break parsing.
- @Test
- public void testNullStylesInODTFooter() throws Exception {
-
- XMLResult r = getXML("testODT-TIKA-6000.odt", new OpenDocumentParser(), new Metadata(), new ParseContext());
-
- assertEquals("application/vnd.oasis.opendocument.text", r.metadata.get(Metadata.CONTENT_TYPE));
-
- String content = r.xml;
-
- assertContains("Utilisation de ce document", content);
- assertContains("Copyright and License", content);
- assertContains("Changer la langue", content);
- assertContains("La page d\u2019accueil permet de faire une recherche simple", content);
-
- }
- @Test //TIKA-1916
- public void testMissingMeta() throws Exception {
- String xml = getXML("testODTNoMeta.odt").xml;
- assertContains("Test text", xml);
- }
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import static org.junit.Assert.assertEquals;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.opendocument.OpenOfficeParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class ODFParserTest extends TikaTest {
+ /**
+ * For now, allow us to run some tests against both
+ * the old and the new parser
+ */
+ private Parser[] getParsers() {
+ return new Parser[] {
+ new OpenDocumentParser(),
+ new OpenOfficeParser()
+ };
+ }
+
+ @Test
+ public void testOO3() throws Exception {
+ for (Parser parser : getParsers()) {
+ XMLResult r = getXML("testODFwithOOo3.odt", parser);
+ assertEquals(
+ "application/vnd.oasis.opendocument.text",
+ r.metadata.get(Metadata.CONTENT_TYPE));
+
+ String content = r.xml;
+ assertContains("Tika is part of the Lucene project.", content);
+ assertContains("Solr", content);
+ assertContains("one embedded", content);
+ assertContains("Rectangle Title", content);
+ assertContains("a blue background and dark border", content);
+
+ }
+ }
+
+ @Test
+ public void testOO2() throws Exception {
+ for (Parser parser : getParsers()) {
+ XMLResult r = getXML("testOpenOffice2.odt", parser);
+ Metadata metadata = r.metadata;
+ assertEquals(
+ "application/vnd.oasis.opendocument.text",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("en-US", metadata.get(Metadata.LANGUAGE));
+ assertEquals("PT1M7S", metadata.get(Metadata.EDIT_TIME));
+ assertEquals(
+ "NeoOffice/2.2$Unix OpenOffice.org_project/680m18$Build-9161",
+ metadata.get("generator"));
+
+ // Check date metadata, both old-style and new-style
+ assertEquals("2007-09-14T11:07:10", metadata.get(TikaCoreProperties.MODIFIED));
+ assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.MODIFIED));
+ assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.DATE));
+ assertEquals("2007-09-14T11:06:08", metadata.get(TikaCoreProperties.CREATED));
+ assertEquals("2007-09-14T11:06:08", metadata.get(Metadata.CREATION_DATE));
+
+ // Check the document statistics
+ assertEquals("1", metadata.get(Office.PAGE_COUNT));
+ assertEquals("1", metadata.get(Office.PARAGRAPH_COUNT));
+ assertEquals("14", metadata.get(Office.WORD_COUNT));
+ assertEquals("78", metadata.get(Office.CHARACTER_COUNT));
+ assertEquals("0", metadata.get(Office.TABLE_COUNT));
+ assertEquals("0", metadata.get(Office.OBJECT_COUNT));
+ assertEquals("0", metadata.get(Office.IMAGE_COUNT));
+
+ // Check the Tika-1.0 style document statistics
+ assertEquals("1", metadata.get(Metadata.PAGE_COUNT));
+ assertEquals("1", metadata.get(Metadata.PARAGRAPH_COUNT));
+ assertEquals("14", metadata.get(Metadata.WORD_COUNT));
+ assertEquals("78", metadata.get(Metadata.CHARACTER_COUNT));
+ assertEquals("0", metadata.get(Metadata.TABLE_COUNT));
+ assertEquals("0", metadata.get(Metadata.OBJECT_COUNT));
+ assertEquals("0", metadata.get(Metadata.IMAGE_COUNT));
+
+ // Check the very old style statistics (these will be removed shortly)
+ assertEquals("0", metadata.get("nbTab"));
+ assertEquals("0", metadata.get("nbObject"));
+ assertEquals("0", metadata.get("nbImg"));
+ assertEquals("1", metadata.get("nbPage"));
+ assertEquals("1", metadata.get("nbPara"));
+ assertEquals("14", metadata.get("nbWord"));
+ assertEquals("78", metadata.get("nbCharacter"));
+
+ // Custom metadata tags present but without values
+ assertEquals(null, metadata.get("custom:Info 1"));
+ assertEquals(null, metadata.get("custom:Info 2"));
+ assertEquals(null, metadata.get("custom:Info 3"));
+ assertEquals(null, metadata.get("custom:Info 4"));
+
+ assertContains(
+ "This is a sample Open Office document,"
+ + " written in NeoOffice 2.2.1 for the Mac.",
+ r.xml);
+
+ }
+ }
+
+ /**
+ * Similar to {@link #testOO2()}, but using a different
+ * OO2 file with different metadata in it
+ */
+ @Test
+ public void testOO2Metadata() throws Exception {
+ XMLResult r = getXML("testOpenOffice2.odf", new OpenDocumentParser());
+ Metadata metadata = r.metadata;
+ assertEquals(
+ "application/vnd.oasis.opendocument.formula",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals(null, metadata.get(TikaCoreProperties.MODIFIED));
+ assertEquals("2006-01-27T11:55:22", metadata.get(Metadata.CREATION_DATE));
+ assertEquals("The quick brown fox jumps over the lazy dog",
+ metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Gym class featuring a brown fox and lazy dog",
+ metadata.get(TikaCoreProperties.DESCRIPTION));
+ assertEquals("Gym class featuring a brown fox and lazy dog",
+ metadata.get(OfficeOpenXMLCore.SUBJECT));
+ assertEquals("Gym class featuring a brown fox and lazy dog",
+ metadata.get(Metadata.SUBJECT));
+ assertEquals("PT0S", metadata.get(Metadata.EDIT_TIME));
+ assertEquals("1", metadata.get("editing-cycles"));
+ assertEquals(
+ "OpenOffice.org/2.2$Win32 OpenOffice.org_project/680m14$Build-9134",
+ metadata.get("generator"));
+ assertEquals("Pangram, fox, dog", metadata.get(Metadata.KEYWORDS));
+
+ // User defined metadata
+ assertEquals("Text 1", metadata.get("custom:Info 1"));
+ assertEquals("2", metadata.get("custom:Info 2"));
+ assertEquals("false", metadata.get("custom:Info 3"));
+ assertEquals("true", metadata.get("custom:Info 4"));
+
+ // No statistics present
+ assertEquals(null, metadata.get(Metadata.PAGE_COUNT));
+ assertEquals(null, metadata.get(Metadata.PARAGRAPH_COUNT));
+ assertEquals(null, metadata.get(Metadata.WORD_COUNT));
+ assertEquals(null, metadata.get(Metadata.CHARACTER_COUNT));
+ assertEquals(null, metadata.get(Metadata.TABLE_COUNT));
+ assertEquals(null, metadata.get(Metadata.OBJECT_COUNT));
+ assertEquals(null, metadata.get(Metadata.IMAGE_COUNT));
+ assertEquals(null, metadata.get("nbTab"));
+ assertEquals(null, metadata.get("nbObject"));
+ assertEquals(null, metadata.get("nbImg"));
+ assertEquals(null, metadata.get("nbPage"));
+ assertEquals(null, metadata.get("nbPara"));
+ assertEquals(null, metadata.get("nbWord"));
+ assertEquals(null, metadata.get("nbCharacter"));
+
+ // Note - contents of maths files not currently supported
+ assertContains("<body />", r.xml);
+
+ }
+
+ /**
+ * Similar to {@link #testOO2()} )}, but using an OO3 file
+ */
+ @Test
+ public void testOO3Metadata() throws Exception {
+ XMLResult r = getXML("testODFwithOOo3.odt", new OpenDocumentParser());
+ Metadata metadata = r.metadata;
+ assertEquals(
+ "application/vnd.oasis.opendocument.text",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("2009-10-05T21:22:38", metadata.get(TikaCoreProperties.MODIFIED));
+ assertEquals("2009-10-05T19:04:01", metadata.get(TikaCoreProperties.CREATED));
+ assertEquals("2009-10-05T19:04:01", metadata.get(Metadata.CREATION_DATE));
+ assertEquals("Apache Tika", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Test document", metadata.get(OfficeOpenXMLCore.SUBJECT));
+ assertEquals("Test document", metadata.get(Metadata.SUBJECT));
+ assertEquals("A rather complex document", metadata.get(TikaCoreProperties.DESCRIPTION));
+ assertEquals("Bart Hanssens", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Bart Hanssens", metadata.get("initial-creator"));
+ assertEquals("2", metadata.get("editing-cycles"));
+ assertEquals("PT02H03M24S", metadata.get(Metadata.EDIT_TIME));
+ assertEquals(
+ "OpenOffice.org/3.1$Unix OpenOffice.org_project/310m19$Build-9420",
+ metadata.get("generator"));
+ assertEquals("Apache, Lucene, Tika", metadata.get(Metadata.KEYWORDS));
+
+ // User defined metadata
+ assertEquals("Bart Hanssens", metadata.get("custom:Editor"));
+ assertEquals(null, metadata.get("custom:Info 2"));
+ assertEquals(null, metadata.get("custom:Info 3"));
+ assertEquals(null, metadata.get("custom:Info 4"));
+
+ // Check the document statistics
+ assertEquals("2", metadata.get(Office.PAGE_COUNT));
+ assertEquals("13", metadata.get(Office.PARAGRAPH_COUNT));
+ assertEquals("54", metadata.get(Office.WORD_COUNT));
+ assertEquals("351", metadata.get(Office.CHARACTER_COUNT));
+ assertEquals("0", metadata.get(Office.TABLE_COUNT));
+ assertEquals("2", metadata.get(Office.OBJECT_COUNT));
+ assertEquals("0", metadata.get(Office.IMAGE_COUNT));
+
+ // Check the Tika-1.0 style document statistics
+ assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
+ assertEquals("13", metadata.get(Metadata.PARAGRAPH_COUNT));
+ assertEquals("54", metadata.get(Metadata.WORD_COUNT));
+ assertEquals("351", metadata.get(Metadata.CHARACTER_COUNT));
+ assertEquals("0", metadata.get(Metadata.TABLE_COUNT));
+ assertEquals("2", metadata.get(Metadata.OBJECT_COUNT));
+ assertEquals("0", metadata.get(Metadata.IMAGE_COUNT));
+
+ // Check the old style statistics (these will be removed shortly)
+ assertEquals("0", metadata.get("nbTab"));
+ assertEquals("2", metadata.get("nbObject"));
+ assertEquals("0", metadata.get("nbImg"));
+ assertEquals("2", metadata.get("nbPage"));
+ assertEquals("13", metadata.get("nbPara"));
+ assertEquals("54", metadata.get("nbWord"));
+ assertEquals("351", metadata.get("nbCharacter"));
+
+ assertContains(
+ "Tika is part of the Lucene project.", r.xml);
+
+
+ }
+
+ @Test
+ public void testODPMasterFooter() throws Exception {
+ assertContains("Master footer is here",
+ getXML("testMasterFooter.odp").xml);
+ }
+
+ @Test
+ public void testODTFooter() throws Exception {
+ XMLResult r = getXML("testFooter.odt");
+ assertContains("Here is some text...", r.xml);
+ assertContains("Here is some text on page 2", r.xml);
+ assertContains("Here is footer text", r.xml);
+ }
+
+ @Test
+ public void testODSFooter() throws Exception {
+ assertContains("Here is a footer in the center area",
+ getXML("testFooter.ods").xml);
+
+ }
+
+ @Test
+ public void testFromFile() throws Exception {
+ OpenDocumentParser parser = new OpenDocumentParser();
+ Path tmp = null;
+ try {
+ tmp = Files.createTempFile("test-odf-", ".odt");
+ Files.copy(getTestDocumentAsStream("testODFwithOOo3.odt"), tmp,
+ StandardCopyOption.REPLACE_EXISTING);
+ Metadata metadata = new Metadata();
+ TikaInputStream tis = TikaInputStream.get(tmp, metadata);
+ assertEquals(true, tis.hasFile());
+ ContentHandler handler = new BodyContentHandler();
+ parser.parse(tis, handler, metadata, new ParseContext());
+
+ assertEquals(
+ "application/vnd.oasis.opendocument.text",
+ metadata.get(Metadata.CONTENT_TYPE));
+
+ String content = handler.toString();
+ assertContains("Tika is part of the Lucene project.", content);
+ } finally {
+ Files.delete(tmp);
+ }
+ }
+
+ @Test
+ public void testNPEFromFile() throws Exception {
+ XMLResult r = getXML("testNPEOpenDocument.odt", new OpenDocumentParser());
+ assertEquals(
+ "application/vnd.oasis.opendocument.text",
+ r.metadata.get(Metadata.CONTENT_TYPE));
+
+ assertContains("primero hay que generar un par de claves", r.xml);
+
+ }
+
+ // TIKA-1063: Test basic style support.
+ @Test
+ public void testODTStyles() throws Exception {
+ String xml = getXML("testStyles.odt").xml;
+ assertContains("This <i>is</i> <b>just</b> a <u>test</u>", xml);
+ assertContains("<p>And <b>another <i>test</i> is</b> here.</p>", xml);
+ assertContains("<ol>\t<li><p>One</p>", xml);
+ assertContains("</ol>", xml);
+ assertContains("<ul>\t<li><p>First</p>", xml);
+ assertContains("</ul>", xml);
+ }
+
+ //TIKA-1600: Test that null pointer doesn't break parsing.
+ @Test
+ public void testNullStylesInODTFooter() throws Exception {
+
+ XMLResult r = getXML("testODT-TIKA-6000.odt", new OpenDocumentParser(), new Metadata(), new ParseContext());
+
+ assertEquals("application/vnd.oasis.opendocument.text", r.metadata.get(Metadata.CONTENT_TYPE));
+
+ String content = r.xml;
+
+ assertContains("Utilisation de ce document", content);
+ assertContains("Copyright and License", content);
+ assertContains("Changer la langue", content);
+ assertContains("La page d\u2019accueil permet de faire une recherche simple", content);
+
+ }
+ @Test //TIKA-1916
+ public void testMissingMeta() throws Exception {
+ String xml = getXML("testODTNoMeta.odt").xml;
+ assertContains("Test text", xml);
+ }
+}
[25/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java
index 119a47b..e423871 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java
@@ -1,102 +1,102 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.core;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-public class ChmConstants {
- /* Prevents instantiation */
- private ChmConstants() {
- }
-
- public static final String DEFAULT_CHARSET = UTF_8.name();
- public static final String ITSF = "ITSF";
- public static final String ITSP = "ITSP";
- public static final String PMGL = "PMGL";
- public static final String LZXC = "LZXC";
- public static final String CHM_PMGI_MARKER = "PMGI";
- public static final int BYTE_ARRAY_LENGHT = 16;
- public static final int CHM_ITSF_V2_LEN = 0x58;
- public static final int CHM_ITSF_V3_LEN = 0x60;
- public static final int CHM_ITSP_V1_LEN = 0x54;
- public static final int CHM_PMGL_LEN = 0x14;
- public static final int CHM_PMGI_LEN = 0x08;
- public static final int CHM_LZXC_RESETTABLE_V1_LEN = 0x28;
- public static final int CHM_LZXC_MIN_LEN = 0x18;
- public static final int CHM_LZXC_V2_LEN = 0x1c;
- public static final int CHM_SIGNATURE_LEN = 4;
- public static final int CHM_VER_2 = 2;
- public static final int CHM_VER_3 = 3;
- public static final int CHM_VER_1 = 1;
- public static final int CHM_WINDOW_SIZE_BLOCK = 0x8000;
-
- /* my hacking */
- public static final int START_PMGL = 0xCC;
- public static final String CONTROL_DATA = "ControlData";
- public static final String RESET_TABLE = "ResetTable";
- public static final String CONTENT = "Content";
-
- /* some constants defined by the LZX specification */
- public static final int LZX_MIN_MATCH = 2;
- public static final int LZX_MAX_MATCH = 257;
- public static final int LZX_NUM_CHARS = 256;
- public static final int LZX_BLOCKTYPE_INVALID = 0; /*
- * also blocktypes 4-7
- * invalid
- */
- public static final int LZX_BLOCKTYPE_VERBATIM = 1;
- public static final int LZX_BLOCKTYPE_ALIGNED = 2;
- public static final int LZX_BLOCKTYPE_UNCOMPRESSED = 3;
- public static final int LZX_PRETREE_NUM_ELEMENTS_BITS = 4; /* ??? */
- public static final int LZX_PRETREE_NUM_ELEMENTS = 20;
- public static final int LZX_ALIGNED_NUM_ELEMENTS = 8; /*
- * aligned offset tree
- * #elements
- */
- public static final int LZX_NUM_PRIMARY_LENGTHS = 7; /*
- * this one missing
- * from spec!
- */
- public static final int LZX_NUM_SECONDARY_LENGTHS = 249; /*
- * length tree
- * #elements
- */
-
- /* LZX huffman defines: tweak tablebits as desired */
- public static final int LZX_PRETREE_MAXSYMBOLS = LZX_PRETREE_NUM_ELEMENTS;
- public static final int LZX_PRETREE_TABLEBITS = 6;
- public static final int LZX_MAINTREE_MAXSYMBOLS = LZX_NUM_CHARS + 50 * 8;
- public static final int LZX_MAIN_MAXSYMBOLS = LZX_NUM_CHARS * 2;
- public static final int LZX_MAINTREE_TABLEBITS = 12;
- public static final int LZX_LENGTH_MAXSYMBOLS = LZX_NUM_SECONDARY_LENGTHS + 1;
- public static final int LZX_LENGTH_TABLEBITS = 12;
- public static final int LZX_ALIGNED_MAXSYMBOLS = LZX_ALIGNED_NUM_ELEMENTS;
- public static final int LZX_ALIGNED_TABLEBITS = 7;
- public static final int LZX_LENTABLE_SAFETY = 64;
-
- public static short[] EXTRA_BITS = { 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5,
- 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
- 15, 15, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
- 17, 17 };
-
- public static int[] POSITION_BASE = { 0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32,
- 48, 64, 96, 128, 192, 256, 384, 512, 768, 1024, 1536, 2048, 3072,
- 4096, 6144, 8192, 12288, 16384, 24576, 32768, 49152, 65536, 98304,
- 131072, 196608, 262144, 393216, 524288, 655360, 786432, 917504,
- 1048576, 1179648, 1310720, 1441792, 1572864, 1703936, 1835008,
- 1966080, 2097152 };
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.core;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+public class ChmConstants {
+ /* Prevents instantiation */
+ private ChmConstants() {
+ }
+
+ public static final String DEFAULT_CHARSET = UTF_8.name();
+ public static final String ITSF = "ITSF";
+ public static final String ITSP = "ITSP";
+ public static final String PMGL = "PMGL";
+ public static final String LZXC = "LZXC";
+ public static final String CHM_PMGI_MARKER = "PMGI";
+ public static final int BYTE_ARRAY_LENGHT = 16;
+ public static final int CHM_ITSF_V2_LEN = 0x58;
+ public static final int CHM_ITSF_V3_LEN = 0x60;
+ public static final int CHM_ITSP_V1_LEN = 0x54;
+ public static final int CHM_PMGL_LEN = 0x14;
+ public static final int CHM_PMGI_LEN = 0x08;
+ public static final int CHM_LZXC_RESETTABLE_V1_LEN = 0x28;
+ public static final int CHM_LZXC_MIN_LEN = 0x18;
+ public static final int CHM_LZXC_V2_LEN = 0x1c;
+ public static final int CHM_SIGNATURE_LEN = 4;
+ public static final int CHM_VER_2 = 2;
+ public static final int CHM_VER_3 = 3;
+ public static final int CHM_VER_1 = 1;
+ public static final int CHM_WINDOW_SIZE_BLOCK = 0x8000;
+
+ /* my hacking */
+ public static final int START_PMGL = 0xCC;
+ public static final String CONTROL_DATA = "ControlData";
+ public static final String RESET_TABLE = "ResetTable";
+ public static final String CONTENT = "Content";
+
+ /* some constants defined by the LZX specification */
+ public static final int LZX_MIN_MATCH = 2;
+ public static final int LZX_MAX_MATCH = 257;
+ public static final int LZX_NUM_CHARS = 256;
+ public static final int LZX_BLOCKTYPE_INVALID = 0; /*
+ * also blocktypes 4-7
+ * invalid
+ */
+ public static final int LZX_BLOCKTYPE_VERBATIM = 1;
+ public static final int LZX_BLOCKTYPE_ALIGNED = 2;
+ public static final int LZX_BLOCKTYPE_UNCOMPRESSED = 3;
+ public static final int LZX_PRETREE_NUM_ELEMENTS_BITS = 4; /* ??? */
+ public static final int LZX_PRETREE_NUM_ELEMENTS = 20;
+ public static final int LZX_ALIGNED_NUM_ELEMENTS = 8; /*
+ * aligned offset tree
+ * #elements
+ */
+ public static final int LZX_NUM_PRIMARY_LENGTHS = 7; /*
+ * this one missing
+ * from spec!
+ */
+ public static final int LZX_NUM_SECONDARY_LENGTHS = 249; /*
+ * length tree
+ * #elements
+ */
+
+ /* LZX huffman defines: tweak tablebits as desired */
+ public static final int LZX_PRETREE_MAXSYMBOLS = LZX_PRETREE_NUM_ELEMENTS;
+ public static final int LZX_PRETREE_TABLEBITS = 6;
+ public static final int LZX_MAINTREE_MAXSYMBOLS = LZX_NUM_CHARS + 50 * 8;
+ public static final int LZX_MAIN_MAXSYMBOLS = LZX_NUM_CHARS * 2;
+ public static final int LZX_MAINTREE_TABLEBITS = 12;
+ public static final int LZX_LENGTH_MAXSYMBOLS = LZX_NUM_SECONDARY_LENGTHS + 1;
+ public static final int LZX_LENGTH_TABLEBITS = 12;
+ public static final int LZX_ALIGNED_MAXSYMBOLS = LZX_ALIGNED_NUM_ELEMENTS;
+ public static final int LZX_ALIGNED_TABLEBITS = 7;
+ public static final int LZX_LENTABLE_SAFETY = 64;
+
+ public static short[] EXTRA_BITS = { 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5,
+ 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
+ 15, 15, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17 };
+
+ public static int[] POSITION_BASE = { 0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32,
+ 48, 64, 96, 128, 192, 256, 384, 512, 768, 1024, 1536, 2048, 3072,
+ 4096, 6144, 8192, 12288, 16384, 24576, 32768, 49152, 65536, 98304,
+ 131072, 196608, 262144, 393216, 524288, 655360, 786432, 917504,
+ 1048576, 1179648, 1310720, 1441792, 1572864, 1703936, 1835008,
+ 1966080, 2097152 };
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
index 85f4177..454c1c4 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
@@ -1,392 +1,392 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.core;
-
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
-import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
-import org.apache.tika.parser.chm.accessor.ChmItspHeader;
-import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
-import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
-import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
-import org.apache.tika.parser.chm.assertion.ChmAssert;
-import org.apache.tika.parser.chm.core.ChmCommons.EntryType;
-import org.apache.tika.parser.chm.lzx.ChmBlockInfo;
-import org.apache.tika.parser.chm.lzx.ChmLzxBlock;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-/**
- * Extracts text from chm file. Enumerates chm entries.
- */
-public class ChmExtractor {
- private List<ChmLzxBlock> lzxBlocksCache = null;
- private ChmDirectoryListingSet chmDirList = null;
- private ChmItsfHeader chmItsfHeader = null;
- private ChmItspHeader chmItspHeader = null;
- private ChmLzxcResetTable chmLzxcResetTable = null;
- private ChmLzxcControlData chmLzxcControlData = null;
- private byte[] data = null;
- private int indexOfContent;
- private long lzxBlockOffset;
- private long lzxBlockLength;
-
- /**
- * Returns lzxc control data.
- *
- * @return ChmLzxcControlData
- */
- private ChmLzxcControlData getChmLzxcControlData() {
- return chmLzxcControlData;
- }
-
- /**
- * Sets lzxc control data
- *
- * @param chmLzxcControlData
- */
- private void setChmLzxcControlData(ChmLzxcControlData chmLzxcControlData) {
- this.chmLzxcControlData = chmLzxcControlData;
- }
-
- private ChmItspHeader getChmItspHeader() {
- return chmItspHeader;
- }
-
- private void setChmItspHeader(ChmItspHeader chmItspHeader) {
- this.chmItspHeader = chmItspHeader;
- }
-
- /**
- * Returns lzxc reset table
- *
- * @return ChmLzxcResetTable
- */
- private ChmLzxcResetTable getChmLzxcResetTable() {
- return chmLzxcResetTable;
- }
-
- /**
- * Sets lzxc reset table
- *
- * @param chmLzxcResetTable
- */
- private void setChmLzxcResetTable(ChmLzxcResetTable chmLzxcResetTable) {
- this.chmLzxcResetTable = chmLzxcResetTable;
- }
-
- /**
- * Returns lzxc hit_cache length
- *
- * @return lzxBlockLength
- */
- private long getLzxBlockLength() {
- return lzxBlockLength;
- }
-
- /**
- * Sets lzxc hit_cache length
- *
- * @param lzxBlockLength
- */
- private void setLzxBlockLength(long lzxBlockLength) {
- this.lzxBlockLength = lzxBlockLength;
- }
-
- /**
- * Returns lzxc hit_cache offset
- *
- * @return lzxBlockOffset
- */
- private long getLzxBlockOffset() {
- return lzxBlockOffset;
- }
-
- /**
- * Sets lzxc hit_cache offset
- */
- private void setLzxBlockOffset(long lzxBlockOffset) {
- this.lzxBlockOffset = lzxBlockOffset;
- }
-
- private int getIndexOfContent() {
- return indexOfContent;
- }
-
- private void setIndexOfContent(int indexOfContent) {
- this.indexOfContent = indexOfContent;
- }
-
- private byte[] getData() {
- return data;
- }
-
- private void setData(byte[] data) {
- this.data = data;
- }
-
- public ChmExtractor(InputStream is) throws TikaException, IOException {
- ChmAssert.assertInputStreamNotNull(is);
- try {
- setData(IOUtils.toByteArray(is));
-
- /* Creates and parses chm itsf header */
- setChmItsfHeader(new ChmItsfHeader());
- // getChmItsfHeader().parse(Arrays.copyOfRange(getData(), 0,
- // ChmConstants.CHM_ITSF_V3_LEN - 1), getChmItsfHeader());
- getChmItsfHeader().parse(ChmCommons.copyOfRange(getData(), 0,
- ChmConstants.CHM_ITSF_V3_LEN - 1), getChmItsfHeader());
-
- /* Creates and parses chm itsp header */
- setChmItspHeader(new ChmItspHeader());
- // getChmItspHeader().parse(Arrays.copyOfRange( getData(), (int)
- // getChmItsfHeader().getDirOffset(),
- // (int) getChmItsfHeader().getDirOffset() +
- // ChmConstants.CHM_ITSP_V1_LEN), getChmItspHeader());
- getChmItspHeader().parse(
- ChmCommons.copyOfRange(getData(), (int) getChmItsfHeader()
- .getDirOffset(), (int) getChmItsfHeader().getDirOffset() +
- ChmConstants.CHM_ITSP_V1_LEN), getChmItspHeader());
-
- /* Creates instance of ChmDirListingContainer */
- setChmDirList(new ChmDirectoryListingSet(getData(),
- getChmItsfHeader(), getChmItspHeader()));
-
- int indexOfControlData = getChmDirList().getControlDataIndex();
- int indexOfResetData = ChmCommons.indexOfResetTableBlock(getData(),
- ChmConstants.LZXC.getBytes(UTF_8));
- byte[] dir_chunk = null;
- if (indexOfResetData > 0)
- dir_chunk = ChmCommons.copyOfRange( getData(), indexOfResetData, indexOfResetData
- + getChmDirList().getDirectoryListingEntryList().get(indexOfControlData).getLength());
- // dir_chunk = Arrays.copyOfRange(getData(), indexOfResetData,
- // indexOfResetData
- // +
- // getChmDirList().getDirectoryListingEntryList().get(indexOfControlData).getLength());
-
- /* Creates and parses chm control data */
- setChmLzxcControlData(new ChmLzxcControlData());
- getChmLzxcControlData().parse(dir_chunk, getChmLzxcControlData());
-
- int indexOfResetTable = getChmDirList().getResetTableIndex();
- setChmLzxcResetTable(new ChmLzxcResetTable());
-
- int startIndex = (int) getChmDirList().getDataOffset()
- + getChmDirList().getDirectoryListingEntryList()
- .get(indexOfResetTable).getOffset();
-
- // assert startIndex < data.length
- ChmAssert.assertCopyingDataIndex(startIndex, getData().length);
-
- // dir_chunk = Arrays.copyOfRange(getData(), startIndex, startIndex
- // +
- // getChmDirList().getDirectoryListingEntryList().get(indexOfResetTable).getLength());
- dir_chunk = ChmCommons.copyOfRange(getData(), startIndex, startIndex
- + getChmDirList().getDirectoryListingEntryList().get(indexOfResetTable).getLength());
-
- getChmLzxcResetTable().parse(dir_chunk, getChmLzxcResetTable());
-
- setIndexOfContent(ChmCommons.indexOf(getChmDirList().getDirectoryListingEntryList(),
- ChmConstants.CONTENT));
- setLzxBlockOffset((getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent()).getOffset()
- + getChmItsfHeader().getDataOffset()));
- setLzxBlockLength(getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent()).getLength());
-
- setLzxBlocksCache(new ArrayList<ChmLzxBlock>());
-
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
-
- /**
- * Enumerates chm entities
- *
- * @return list of chm entities
- */
- public List<String> enumerateChm() {
- List<String> listOfEntries = new ArrayList<String>();
- for (DirectoryListingEntry directoryListingEntry : getChmDirList().getDirectoryListingEntryList()) {
- listOfEntries.add(directoryListingEntry.getName());
- }
- return listOfEntries;
- }
-
- /**
- * Decompresses a chm entry
- *
- * @param directoryListingEntry
- *
- * @return decompressed data
- * @throws TikaException
- */
- public byte[] extractChmEntry(DirectoryListingEntry directoryListingEntry) throws TikaException {
- ByteArrayOutputStream buffer = new ByteArrayOutputStream();
- ChmLzxBlock lzxBlock = null;
- try {
- /* UNCOMPRESSED type is easiest one */
- if (directoryListingEntry.getEntryType() == EntryType.UNCOMPRESSED
- && directoryListingEntry.getLength() > 0
- && !ChmCommons.hasSkip(directoryListingEntry)) {
- int dataOffset = (int) (getChmItsfHeader().getDataOffset() + directoryListingEntry
- .getOffset());
- // dataSegment = Arrays.copyOfRange(getData(), dataOffset,
- // dataOffset + directoryListingEntry.getLength());
- buffer.write(ChmCommons.copyOfRange(
- getData(), dataOffset,
- dataOffset + directoryListingEntry.getLength()));
- } else if (directoryListingEntry.getEntryType() == EntryType.COMPRESSED
- && !ChmCommons.hasSkip(directoryListingEntry)) {
- /* Gets a chm hit_cache info */
- ChmBlockInfo bb = ChmBlockInfo.getChmBlockInfoInstance(
- directoryListingEntry, (int) getChmLzxcResetTable()
- .getBlockLen(), getChmLzxcControlData());
-
- int i = 0, start = 0, hit_cache = 0;
-
- if ((getLzxBlockLength() < Integer.MAX_VALUE)
- && (getLzxBlockOffset() < Integer.MAX_VALUE)) {
- // TODO: Improve the caching
- // caching ... = O(n^2) - depends on startBlock and endBlock
- start = -1;
- if (!getLzxBlocksCache().isEmpty()) {
- for (i = 0; i < getLzxBlocksCache().size(); i++) {
- //lzxBlock = getLzxBlocksCache().get(i);
- int bn = getLzxBlocksCache().get(i).getBlockNumber();
- for (int j = bb.getIniBlock(); j <= bb.getStartBlock(); j++) {
- if (bn == j) {
- if (j > start) {
- start = j;
- hit_cache = i;
- }
- }
- }
- if (start == bb.getStartBlock())
- break;
- }
- }
-
-// if (i == getLzxBlocksCache().size() && i == 0) {
- if (start<0) {
- start = bb.getIniBlock();
-
- byte[] dataSegment = ChmCommons.getChmBlockSegment(
- getData(),
- getChmLzxcResetTable(), start,
- (int) getLzxBlockOffset(),
- (int) getLzxBlockLength());
-
- lzxBlock = new ChmLzxBlock(start, dataSegment,
- getChmLzxcResetTable().getBlockLen(), null);
-
- getLzxBlocksCache().add(lzxBlock);
- } else {
- lzxBlock = getLzxBlocksCache().get(hit_cache);
- }
-
- for (i = start; i <= bb.getEndBlock();) {
- if (i == bb.getStartBlock() && i == bb.getEndBlock()) {
- buffer.write(lzxBlock.getContent(
- bb.getStartOffset(), bb.getEndOffset()));
- break;
- }
-
- if (i == bb.getStartBlock()) {
- buffer.write(lzxBlock.getContent(
- bb.getStartOffset()));
- }
-
- if (i > bb.getStartBlock() && i < bb.getEndBlock()) {
- buffer.write(lzxBlock.getContent());
- }
-
- if (i == bb.getEndBlock()) {
- buffer.write(lzxBlock.getContent(
- 0, bb.getEndOffset()));
- break;
- }
-
- i++;
-
- if (i % getChmLzxcControlData().getResetInterval() == 0) {
- lzxBlock = new ChmLzxBlock(i,
- ChmCommons.getChmBlockSegment(getData(),
- getChmLzxcResetTable(), i,
- (int) getLzxBlockOffset(),
- (int) getLzxBlockLength()),
- getChmLzxcResetTable().getBlockLen(), null);
- } else {
- lzxBlock = new ChmLzxBlock(i,
- ChmCommons.getChmBlockSegment(getData(),
- getChmLzxcResetTable(), i,
- (int) getLzxBlockOffset(),
- (int) getLzxBlockLength()),
- getChmLzxcResetTable().getBlockLen(),
- lzxBlock);
- }
-
- getLzxBlocksCache().add(lzxBlock);
- }
-
- if (getLzxBlocksCache().size() > getChmLzxcResetTable()
- .getBlockCount()) {
- getLzxBlocksCache().clear();
- }
- } //end of if
-
- if (buffer.size() != directoryListingEntry.getLength()) {
- throw new TikaException("CHM file extract error: extracted Length is wrong.");
- }
- } //end of if compressed
- } catch (Exception e) {
- throw new TikaException(e.getMessage());
- }
-
- return buffer.toByteArray();
- }
-
- private void setLzxBlocksCache(List<ChmLzxBlock> lzxBlocksCache) {
- this.lzxBlocksCache = lzxBlocksCache;
- }
-
- private List<ChmLzxBlock> getLzxBlocksCache() {
- return lzxBlocksCache;
- }
-
- private void setChmDirList(ChmDirectoryListingSet chmDirList) {
- this.chmDirList = chmDirList;
- }
-
- public ChmDirectoryListingSet getChmDirList() {
- return chmDirList;
- }
-
- private void setChmItsfHeader(ChmItsfHeader chmItsfHeader) {
- this.chmItsfHeader = chmItsfHeader;
- }
-
- private ChmItsfHeader getChmItsfHeader() {
- return chmItsfHeader;
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.core;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
+import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
+import org.apache.tika.parser.chm.accessor.ChmItspHeader;
+import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
+import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmCommons.EntryType;
+import org.apache.tika.parser.chm.lzx.ChmBlockInfo;
+import org.apache.tika.parser.chm.lzx.ChmLzxBlock;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Extracts text from chm file. Enumerates chm entries.
+ */
+public class ChmExtractor {
+ private List<ChmLzxBlock> lzxBlocksCache = null;
+ private ChmDirectoryListingSet chmDirList = null;
+ private ChmItsfHeader chmItsfHeader = null;
+ private ChmItspHeader chmItspHeader = null;
+ private ChmLzxcResetTable chmLzxcResetTable = null;
+ private ChmLzxcControlData chmLzxcControlData = null;
+ private byte[] data = null;
+ private int indexOfContent;
+ private long lzxBlockOffset;
+ private long lzxBlockLength;
+
+ /**
+ * Returns lzxc control data.
+ *
+ * @return ChmLzxcControlData
+ */
+ private ChmLzxcControlData getChmLzxcControlData() {
+ return chmLzxcControlData;
+ }
+
+ /**
+ * Sets lzxc control data
+ *
+ * @param chmLzxcControlData
+ */
+ private void setChmLzxcControlData(ChmLzxcControlData chmLzxcControlData) {
+ this.chmLzxcControlData = chmLzxcControlData;
+ }
+
+ private ChmItspHeader getChmItspHeader() {
+ return chmItspHeader;
+ }
+
+ private void setChmItspHeader(ChmItspHeader chmItspHeader) {
+ this.chmItspHeader = chmItspHeader;
+ }
+
+ /**
+ * Returns lzxc reset table
+ *
+ * @return ChmLzxcResetTable
+ */
+ private ChmLzxcResetTable getChmLzxcResetTable() {
+ return chmLzxcResetTable;
+ }
+
+ /**
+ * Sets lzxc reset table
+ *
+ * @param chmLzxcResetTable
+ */
+ private void setChmLzxcResetTable(ChmLzxcResetTable chmLzxcResetTable) {
+ this.chmLzxcResetTable = chmLzxcResetTable;
+ }
+
+ /**
+ * Returns lzxc hit_cache length
+ *
+ * @return lzxBlockLength
+ */
+ private long getLzxBlockLength() {
+ return lzxBlockLength;
+ }
+
+ /**
+ * Sets lzxc hit_cache length
+ *
+ * @param lzxBlockLength
+ */
+ private void setLzxBlockLength(long lzxBlockLength) {
+ this.lzxBlockLength = lzxBlockLength;
+ }
+
+ /**
+ * Returns lzxc hit_cache offset
+ *
+ * @return lzxBlockOffset
+ */
+ private long getLzxBlockOffset() {
+ return lzxBlockOffset;
+ }
+
+ /**
+ * Sets lzxc hit_cache offset
+ */
+ private void setLzxBlockOffset(long lzxBlockOffset) {
+ this.lzxBlockOffset = lzxBlockOffset;
+ }
+
+ private int getIndexOfContent() {
+ return indexOfContent;
+ }
+
+ private void setIndexOfContent(int indexOfContent) {
+ this.indexOfContent = indexOfContent;
+ }
+
+ private byte[] getData() {
+ return data;
+ }
+
+ private void setData(byte[] data) {
+ this.data = data;
+ }
+
+ public ChmExtractor(InputStream is) throws TikaException, IOException {
+ ChmAssert.assertInputStreamNotNull(is);
+ try {
+ setData(IOUtils.toByteArray(is));
+
+ /* Creates and parses chm itsf header */
+ setChmItsfHeader(new ChmItsfHeader());
+ // getChmItsfHeader().parse(Arrays.copyOfRange(getData(), 0,
+ // ChmConstants.CHM_ITSF_V3_LEN - 1), getChmItsfHeader());
+ getChmItsfHeader().parse(ChmCommons.copyOfRange(getData(), 0,
+ ChmConstants.CHM_ITSF_V3_LEN - 1), getChmItsfHeader());
+
+ /* Creates and parses chm itsp header */
+ setChmItspHeader(new ChmItspHeader());
+ // getChmItspHeader().parse(Arrays.copyOfRange( getData(), (int)
+ // getChmItsfHeader().getDirOffset(),
+ // (int) getChmItsfHeader().getDirOffset() +
+ // ChmConstants.CHM_ITSP_V1_LEN), getChmItspHeader());
+ getChmItspHeader().parse(
+ ChmCommons.copyOfRange(getData(), (int) getChmItsfHeader()
+ .getDirOffset(), (int) getChmItsfHeader().getDirOffset() +
+ ChmConstants.CHM_ITSP_V1_LEN), getChmItspHeader());
+
+ /* Creates instance of ChmDirListingContainer */
+ setChmDirList(new ChmDirectoryListingSet(getData(),
+ getChmItsfHeader(), getChmItspHeader()));
+
+ int indexOfControlData = getChmDirList().getControlDataIndex();
+ int indexOfResetData = ChmCommons.indexOfResetTableBlock(getData(),
+ ChmConstants.LZXC.getBytes(UTF_8));
+ byte[] dir_chunk = null;
+ if (indexOfResetData > 0)
+ dir_chunk = ChmCommons.copyOfRange( getData(), indexOfResetData, indexOfResetData
+ + getChmDirList().getDirectoryListingEntryList().get(indexOfControlData).getLength());
+ // dir_chunk = Arrays.copyOfRange(getData(), indexOfResetData,
+ // indexOfResetData
+ // +
+ // getChmDirList().getDirectoryListingEntryList().get(indexOfControlData).getLength());
+
+ /* Creates and parses chm control data */
+ setChmLzxcControlData(new ChmLzxcControlData());
+ getChmLzxcControlData().parse(dir_chunk, getChmLzxcControlData());
+
+ int indexOfResetTable = getChmDirList().getResetTableIndex();
+ setChmLzxcResetTable(new ChmLzxcResetTable());
+
+ int startIndex = (int) getChmDirList().getDataOffset()
+ + getChmDirList().getDirectoryListingEntryList()
+ .get(indexOfResetTable).getOffset();
+
+ // assert startIndex < data.length
+ ChmAssert.assertCopyingDataIndex(startIndex, getData().length);
+
+ // dir_chunk = Arrays.copyOfRange(getData(), startIndex, startIndex
+ // +
+ // getChmDirList().getDirectoryListingEntryList().get(indexOfResetTable).getLength());
+ dir_chunk = ChmCommons.copyOfRange(getData(), startIndex, startIndex
+ + getChmDirList().getDirectoryListingEntryList().get(indexOfResetTable).getLength());
+
+ getChmLzxcResetTable().parse(dir_chunk, getChmLzxcResetTable());
+
+ setIndexOfContent(ChmCommons.indexOf(getChmDirList().getDirectoryListingEntryList(),
+ ChmConstants.CONTENT));
+ setLzxBlockOffset((getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent()).getOffset()
+ + getChmItsfHeader().getDataOffset()));
+ setLzxBlockLength(getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent()).getLength());
+
+ setLzxBlocksCache(new ArrayList<ChmLzxBlock>());
+
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+ /**
+ * Enumerates chm entities
+ *
+ * @return list of chm entities
+ */
+ public List<String> enumerateChm() {
+ List<String> listOfEntries = new ArrayList<String>();
+ for (DirectoryListingEntry directoryListingEntry : getChmDirList().getDirectoryListingEntryList()) {
+ listOfEntries.add(directoryListingEntry.getName());
+ }
+ return listOfEntries;
+ }
+
+ /**
+ * Decompresses a chm entry
+ *
+ * @param directoryListingEntry
+ *
+ * @return decompressed data
+ * @throws TikaException
+ */
+ public byte[] extractChmEntry(DirectoryListingEntry directoryListingEntry) throws TikaException {
+ ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+ ChmLzxBlock lzxBlock = null;
+ try {
+ /* UNCOMPRESSED type is easiest one */
+ if (directoryListingEntry.getEntryType() == EntryType.UNCOMPRESSED
+ && directoryListingEntry.getLength() > 0
+ && !ChmCommons.hasSkip(directoryListingEntry)) {
+ int dataOffset = (int) (getChmItsfHeader().getDataOffset() + directoryListingEntry
+ .getOffset());
+ // dataSegment = Arrays.copyOfRange(getData(), dataOffset,
+ // dataOffset + directoryListingEntry.getLength());
+ buffer.write(ChmCommons.copyOfRange(
+ getData(), dataOffset,
+ dataOffset + directoryListingEntry.getLength()));
+ } else if (directoryListingEntry.getEntryType() == EntryType.COMPRESSED
+ && !ChmCommons.hasSkip(directoryListingEntry)) {
+ /* Gets a chm hit_cache info */
+ ChmBlockInfo bb = ChmBlockInfo.getChmBlockInfoInstance(
+ directoryListingEntry, (int) getChmLzxcResetTable()
+ .getBlockLen(), getChmLzxcControlData());
+
+ int i = 0, start = 0, hit_cache = 0;
+
+ if ((getLzxBlockLength() < Integer.MAX_VALUE)
+ && (getLzxBlockOffset() < Integer.MAX_VALUE)) {
+ // TODO: Improve the caching
+ // caching ... = O(n^2) - depends on startBlock and endBlock
+ start = -1;
+ if (!getLzxBlocksCache().isEmpty()) {
+ for (i = 0; i < getLzxBlocksCache().size(); i++) {
+ //lzxBlock = getLzxBlocksCache().get(i);
+ int bn = getLzxBlocksCache().get(i).getBlockNumber();
+ for (int j = bb.getIniBlock(); j <= bb.getStartBlock(); j++) {
+ if (bn == j) {
+ if (j > start) {
+ start = j;
+ hit_cache = i;
+ }
+ }
+ }
+ if (start == bb.getStartBlock())
+ break;
+ }
+ }
+
+// if (i == getLzxBlocksCache().size() && i == 0) {
+ if (start<0) {
+ start = bb.getIniBlock();
+
+ byte[] dataSegment = ChmCommons.getChmBlockSegment(
+ getData(),
+ getChmLzxcResetTable(), start,
+ (int) getLzxBlockOffset(),
+ (int) getLzxBlockLength());
+
+ lzxBlock = new ChmLzxBlock(start, dataSegment,
+ getChmLzxcResetTable().getBlockLen(), null);
+
+ getLzxBlocksCache().add(lzxBlock);
+ } else {
+ lzxBlock = getLzxBlocksCache().get(hit_cache);
+ }
+
+ for (i = start; i <= bb.getEndBlock();) {
+ if (i == bb.getStartBlock() && i == bb.getEndBlock()) {
+ buffer.write(lzxBlock.getContent(
+ bb.getStartOffset(), bb.getEndOffset()));
+ break;
+ }
+
+ if (i == bb.getStartBlock()) {
+ buffer.write(lzxBlock.getContent(
+ bb.getStartOffset()));
+ }
+
+ if (i > bb.getStartBlock() && i < bb.getEndBlock()) {
+ buffer.write(lzxBlock.getContent());
+ }
+
+ if (i == bb.getEndBlock()) {
+ buffer.write(lzxBlock.getContent(
+ 0, bb.getEndOffset()));
+ break;
+ }
+
+ i++;
+
+ if (i % getChmLzxcControlData().getResetInterval() == 0) {
+ lzxBlock = new ChmLzxBlock(i,
+ ChmCommons.getChmBlockSegment(getData(),
+ getChmLzxcResetTable(), i,
+ (int) getLzxBlockOffset(),
+ (int) getLzxBlockLength()),
+ getChmLzxcResetTable().getBlockLen(), null);
+ } else {
+ lzxBlock = new ChmLzxBlock(i,
+ ChmCommons.getChmBlockSegment(getData(),
+ getChmLzxcResetTable(), i,
+ (int) getLzxBlockOffset(),
+ (int) getLzxBlockLength()),
+ getChmLzxcResetTable().getBlockLen(),
+ lzxBlock);
+ }
+
+ getLzxBlocksCache().add(lzxBlock);
+ }
+
+ if (getLzxBlocksCache().size() > getChmLzxcResetTable()
+ .getBlockCount()) {
+ getLzxBlocksCache().clear();
+ }
+ } //end of if
+
+ if (buffer.size() != directoryListingEntry.getLength()) {
+ throw new TikaException("CHM file extract error: extracted Length is wrong.");
+ }
+ } //end of if compressed
+ } catch (Exception e) {
+ throw new TikaException(e.getMessage());
+ }
+
+ return buffer.toByteArray();
+ }
+
+ private void setLzxBlocksCache(List<ChmLzxBlock> lzxBlocksCache) {
+ this.lzxBlocksCache = lzxBlocksCache;
+ }
+
+ private List<ChmLzxBlock> getLzxBlocksCache() {
+ return lzxBlocksCache;
+ }
+
+ private void setChmDirList(ChmDirectoryListingSet chmDirList) {
+ this.chmDirList = chmDirList;
+ }
+
+ public ChmDirectoryListingSet getChmDirList() {
+ return chmDirList;
+ }
+
+ private void setChmItsfHeader(ChmItsfHeader chmItsfHeader) {
+ this.chmItsfHeader = chmItsfHeader;
+ }
+
+ private ChmItsfHeader getChmItsfHeader() {
+ return chmItsfHeader;
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmWrapper.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmWrapper.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmWrapper.java
index 03f81d3..9ed1898 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmWrapper.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmWrapper.java
@@ -1,147 +1,147 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.chm.core;
-
-import java.util.List;
-
-import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
-import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
-import org.apache.tika.parser.chm.accessor.ChmItspHeader;
-import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
-import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
-import org.apache.tika.parser.chm.lzx.ChmLzxBlock;
-
-public class ChmWrapper {
- private List<ChmLzxBlock> lzxBlocksCache = null;
- private ChmDirectoryListingSet chmDirList = null;
- private ChmItsfHeader chmItsfHeader = null;
- private ChmItspHeader chmItspHeader = null;
- private ChmLzxcResetTable chmLzxcResetTable = null;
- private ChmLzxcControlData chmLzxcControlData = null;
- private byte[] data = null;
- private int indexOfContent;
- private long lzxBlockOffset;
- private long lzxBlockLength;
- private int indexOfResetData;
- private int indexOfResetTable;
- private int startIndex;
-
- protected int getStartIndex() {
- return startIndex;
- }
-
- protected void setStartIndex(int startIndex) {
- this.startIndex = startIndex;
- }
-
- protected int getIndexOfResetTable() {
- return indexOfResetTable;
- }
-
- protected void setIndexOfResetTable(int indexOfResetTable) {
- this.indexOfResetTable = indexOfResetTable;
- }
-
- protected List<ChmLzxBlock> getLzxBlocksCache() {
- return lzxBlocksCache;
- }
-
- protected void setLzxBlocksCache(List<ChmLzxBlock> lzxBlocksCache) {
- this.lzxBlocksCache = lzxBlocksCache;
- }
-
- protected ChmDirectoryListingSet getChmDirList() {
- return chmDirList;
- }
-
- protected void setChmDirList(ChmDirectoryListingSet chmDirList) {
- this.chmDirList = chmDirList;
- }
-
- protected ChmItsfHeader getChmItsfHeader() {
- return chmItsfHeader;
- }
-
- protected void setChmItsfHeader(ChmItsfHeader chmItsfHeader) {
- this.chmItsfHeader = chmItsfHeader;
- }
-
- protected ChmLzxcResetTable getChmLzxcResetTable() {
- return chmLzxcResetTable;
- }
-
- protected void setChmLzxcResetTable(ChmLzxcResetTable chmLzxcResetTable) {
- this.chmLzxcResetTable = chmLzxcResetTable;
- }
-
- protected ChmLzxcControlData getChmLzxcControlData() {
- return chmLzxcControlData;
- }
-
- protected void setChmLzxcControlData(ChmLzxcControlData chmLzxcControlData) {
- this.chmLzxcControlData = chmLzxcControlData;
- }
-
- protected byte[] getData() {
- return data;
- }
-
- protected void setData(byte[] data) {
- this.data = data;
- }
-
- protected int getIndexOfContent() {
- return indexOfContent;
- }
-
- protected void setIndexOfContent(int indexOfContent) {
- this.indexOfContent = indexOfContent;
- }
-
- protected long getLzxBlockOffset() {
- return lzxBlockOffset;
- }
-
- protected void setLzxBlockOffset(long lzxBlockOffset) {
- this.lzxBlockOffset = lzxBlockOffset;
- }
-
- protected long getLzxBlockLength() {
- return lzxBlockLength;
- }
-
- protected void setLzxBlockLength(long lzxBlockLength) {
- this.lzxBlockLength = lzxBlockLength;
- }
-
- protected void setChmItspHeader(ChmItspHeader chmItspHeader) {
- this.chmItspHeader = chmItspHeader;
- }
-
- protected ChmItspHeader getChmItspHeader() {
- return chmItspHeader;
- }
-
- protected void setIndexOfResetData(int indexOfResetData) {
- this.indexOfResetData = indexOfResetData;
- }
-
- protected int getIndexOfResetData() {
- return indexOfResetData;
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.chm.core;
+
+import java.util.List;
+
+import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
+import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
+import org.apache.tika.parser.chm.accessor.ChmItspHeader;
+import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
+import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
+import org.apache.tika.parser.chm.lzx.ChmLzxBlock;
+
+public class ChmWrapper {
+ private List<ChmLzxBlock> lzxBlocksCache = null;
+ private ChmDirectoryListingSet chmDirList = null;
+ private ChmItsfHeader chmItsfHeader = null;
+ private ChmItspHeader chmItspHeader = null;
+ private ChmLzxcResetTable chmLzxcResetTable = null;
+ private ChmLzxcControlData chmLzxcControlData = null;
+ private byte[] data = null;
+ private int indexOfContent;
+ private long lzxBlockOffset;
+ private long lzxBlockLength;
+ private int indexOfResetData;
+ private int indexOfResetTable;
+ private int startIndex;
+
+ protected int getStartIndex() {
+ return startIndex;
+ }
+
+ protected void setStartIndex(int startIndex) {
+ this.startIndex = startIndex;
+ }
+
+ protected int getIndexOfResetTable() {
+ return indexOfResetTable;
+ }
+
+ protected void setIndexOfResetTable(int indexOfResetTable) {
+ this.indexOfResetTable = indexOfResetTable;
+ }
+
+ protected List<ChmLzxBlock> getLzxBlocksCache() {
+ return lzxBlocksCache;
+ }
+
+ protected void setLzxBlocksCache(List<ChmLzxBlock> lzxBlocksCache) {
+ this.lzxBlocksCache = lzxBlocksCache;
+ }
+
+ protected ChmDirectoryListingSet getChmDirList() {
+ return chmDirList;
+ }
+
+ protected void setChmDirList(ChmDirectoryListingSet chmDirList) {
+ this.chmDirList = chmDirList;
+ }
+
+ protected ChmItsfHeader getChmItsfHeader() {
+ return chmItsfHeader;
+ }
+
+ protected void setChmItsfHeader(ChmItsfHeader chmItsfHeader) {
+ this.chmItsfHeader = chmItsfHeader;
+ }
+
+ protected ChmLzxcResetTable getChmLzxcResetTable() {
+ return chmLzxcResetTable;
+ }
+
+ protected void setChmLzxcResetTable(ChmLzxcResetTable chmLzxcResetTable) {
+ this.chmLzxcResetTable = chmLzxcResetTable;
+ }
+
+ protected ChmLzxcControlData getChmLzxcControlData() {
+ return chmLzxcControlData;
+ }
+
+ protected void setChmLzxcControlData(ChmLzxcControlData chmLzxcControlData) {
+ this.chmLzxcControlData = chmLzxcControlData;
+ }
+
+ protected byte[] getData() {
+ return data;
+ }
+
+ protected void setData(byte[] data) {
+ this.data = data;
+ }
+
+ protected int getIndexOfContent() {
+ return indexOfContent;
+ }
+
+ protected void setIndexOfContent(int indexOfContent) {
+ this.indexOfContent = indexOfContent;
+ }
+
+ protected long getLzxBlockOffset() {
+ return lzxBlockOffset;
+ }
+
+ protected void setLzxBlockOffset(long lzxBlockOffset) {
+ this.lzxBlockOffset = lzxBlockOffset;
+ }
+
+ protected long getLzxBlockLength() {
+ return lzxBlockLength;
+ }
+
+ protected void setLzxBlockLength(long lzxBlockLength) {
+ this.lzxBlockLength = lzxBlockLength;
+ }
+
+ protected void setChmItspHeader(ChmItspHeader chmItspHeader) {
+ this.chmItspHeader = chmItspHeader;
+ }
+
+ protected ChmItspHeader getChmItspHeader() {
+ return chmItspHeader;
+ }
+
+ protected void setIndexOfResetData(int indexOfResetData) {
+ this.indexOfResetData = indexOfResetData;
+ }
+
+ protected int getIndexOfResetData() {
+ return indexOfResetData;
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/exception/ChmParsingException.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/exception/ChmParsingException.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/exception/ChmParsingException.java
index fbed908..46c522b 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/exception/ChmParsingException.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/exception/ChmParsingException.java
@@ -1,27 +1,27 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.exception;
-
-import org.apache.tika.exception.TikaException;
-
-public class ChmParsingException extends TikaException {
- private static final long serialVersionUID = 6497936044733665210L;
-
- public ChmParsingException(String description) {
- super(description);
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.exception;
+
+import org.apache.tika.exception.TikaException;
+
+public class ChmParsingException extends TikaException {
+ private static final long serialVersionUID = 6497936044733665210L;
+
+ public ChmParsingException(String description) {
+ super(description);
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmBlockInfo.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmBlockInfo.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmBlockInfo.java
index 7f7564d..cda829c 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmBlockInfo.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmBlockInfo.java
@@ -1,235 +1,235 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.lzx;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
-import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
-import org.apache.tika.parser.chm.exception.ChmParsingException;
-
-/**
- * A container that contains chm block information such as: i. initial block is
- * using to reset main tree ii. start block is using for knowing where to start
- * iii. end block is using for knowing where to stop iv. start offset is using
- * for knowing where to start reading v. end offset is using for knowing where
- * to stop reading
- *
- */
-public class ChmBlockInfo {
- /* class members */
- private int iniBlock;
- private int startBlock;
- private int endBlock;
- private int startOffset;
- private int endOffset;
-
- private static ChmBlockInfo chmBlockInfo = null;
-
- private ChmBlockInfo() {
-
- }
-
- /**
- * Returns an information related to the chmBlockInfo
- *
- * @param dle
- * - DirectoryListingEntry
- * @param bytesPerBlock
- * - int, = chmLzxcResetTable.block_length
- * @param clcd
- * - ChmLzxcControlData
- * @param chmBlockInfo
- * - ChmBlockInfo
- *
- * @return ChmBlockInfo
- * @throws TikaException
- */
- protected ChmBlockInfo getChmBlockInfo(DirectoryListingEntry dle,
- int bytesPerBlock, ChmLzxcControlData clcd,
- ChmBlockInfo chmBlockInfo) throws TikaException {
- if (!validateParameters(dle, bytesPerBlock, clcd, chmBlockInfo))
- throw new ChmParsingException("Please check you parameters");
-
- chmBlockInfo.setStartBlock(dle.getOffset() / bytesPerBlock);
- chmBlockInfo.setEndBlock((dle.getOffset() + dle.getLength())
- / bytesPerBlock);
- chmBlockInfo.setStartOffset(dle.getOffset() % bytesPerBlock);
- chmBlockInfo.setEndOffset((dle.getOffset() + dle.getLength())
- % bytesPerBlock);
- // potential problem with casting long to int
- chmBlockInfo
- .setIniBlock(chmBlockInfo.startBlock -
- chmBlockInfo.startBlock % (int) clcd.getResetInterval());
-// .setIniBlock((chmBlockInfo.startBlock - chmBlockInfo.startBlock)
-// % (int) clcd.getResetInterval());
- return chmBlockInfo;
- }
-
- public static ChmBlockInfo getChmBlockInfoInstance(
- DirectoryListingEntry dle, int bytesPerBlock,
- ChmLzxcControlData clcd) {
- setChmBlockInfo(new ChmBlockInfo());
- getChmBlockInfo().setStartBlock(dle.getOffset() / bytesPerBlock);
- getChmBlockInfo().setEndBlock(
- (dle.getOffset() + dle.getLength()) / bytesPerBlock);
- getChmBlockInfo().setStartOffset(dle.getOffset() % bytesPerBlock);
- getChmBlockInfo().setEndOffset(
- (dle.getOffset() + dle.getLength()) % bytesPerBlock);
- // potential problem with casting long to int
- getChmBlockInfo().setIniBlock(
- getChmBlockInfo().startBlock - getChmBlockInfo().startBlock
- % (int) clcd.getResetInterval());
-// (getChmBlockInfo().startBlock - getChmBlockInfo().startBlock)
-// % (int) clcd.getResetInterval());
- return getChmBlockInfo();
- }
-
- /**
- * Returns textual representation of ChmBlockInfo
- */
- public String toString() {
- StringBuilder sb = new StringBuilder();
- sb.append("iniBlock:=" + getIniBlock() + ", ");
- sb.append("startBlock:=" + getStartBlock() + ", ");
- sb.append("endBlock:=" + getEndBlock() + ", ");
- sb.append("startOffset:=" + getStartOffset() + ", ");
- sb.append("endOffset:=" + getEndOffset()
- + System.getProperty("line.separator"));
- return sb.toString();
- }
-
- private boolean validateParameters(DirectoryListingEntry dle,
- int bytesPerBlock, ChmLzxcControlData clcd,
- ChmBlockInfo chmBlockInfo) {
- int goodParameter = 0;
- if (dle != null)
- ++goodParameter;
- if (bytesPerBlock > 0)
- ++goodParameter;
- if (clcd != null)
- ++goodParameter;
- if (chmBlockInfo != null)
- ++goodParameter;
- return (goodParameter == 4);
- }
-
- public static void main(String[] args) {
- }
-
- /**
- * Returns an initial block index
- *
- * @return int
- */
- public int getIniBlock() {
- return iniBlock;
- }
-
- /**
- * Sets the initial block index
- *
- * @param iniBlock
- * - int
- */
- private void setIniBlock(int iniBlock) {
- this.iniBlock = iniBlock;
- }
-
- /**
- * Returns the start block index
- *
- * @return int
- */
- public int getStartBlock() {
- return startBlock;
- }
-
- /**
- * Sets the start block index
- *
- * @param startBlock
- * - int
- */
- private void setStartBlock(int startBlock) {
- this.startBlock = startBlock;
- }
-
- /**
- * Returns the end block index
- *
- * @return - int
- */
- public int getEndBlock() {
- return endBlock;
- }
-
- /**
- * Sets the end block index
- *
- * @param endBlock
- * - int
- */
- private void setEndBlock(int endBlock) {
- this.endBlock = endBlock;
- }
-
- /**
- * Returns the start offset index
- *
- * @return - int
- */
- public int getStartOffset() {
- return startOffset;
- }
-
- /**
- * Sets the start offset index
- *
- * @param startOffset
- * - int
- */
- private void setStartOffset(int startOffset) {
- this.startOffset = startOffset;
- }
-
- /**
- * Returns the end offset index
- *
- * @return - int
- */
- public int getEndOffset() {
- return endOffset;
- }
-
- /**
- * Sets the end offset index
- *
- * @param endOffset
- * - int
- */
- private void setEndOffset(int endOffset) {
- this.endOffset = endOffset;
- }
-
- public static void setChmBlockInfo(ChmBlockInfo chmBlockInfo) {
- ChmBlockInfo.chmBlockInfo = chmBlockInfo;
- }
-
- public static ChmBlockInfo getChmBlockInfo() {
- return chmBlockInfo;
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.lzx;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * A container that contains chm block information such as: i. initial block is
+ * using to reset main tree ii. start block is using for knowing where to start
+ * iii. end block is using for knowing where to stop iv. start offset is using
+ * for knowing where to start reading v. end offset is using for knowing where
+ * to stop reading
+ *
+ */
+public class ChmBlockInfo {
+ /* class members */
+ private int iniBlock;
+ private int startBlock;
+ private int endBlock;
+ private int startOffset;
+ private int endOffset;
+
+ private static ChmBlockInfo chmBlockInfo = null;
+
+ private ChmBlockInfo() {
+
+ }
+
+ /**
+ * Returns an information related to the chmBlockInfo
+ *
+ * @param dle
+ * - DirectoryListingEntry
+ * @param bytesPerBlock
+ * - int, = chmLzxcResetTable.block_length
+ * @param clcd
+ * - ChmLzxcControlData
+ * @param chmBlockInfo
+ * - ChmBlockInfo
+ *
+ * @return ChmBlockInfo
+ * @throws TikaException
+ */
+ protected ChmBlockInfo getChmBlockInfo(DirectoryListingEntry dle,
+ int bytesPerBlock, ChmLzxcControlData clcd,
+ ChmBlockInfo chmBlockInfo) throws TikaException {
+ if (!validateParameters(dle, bytesPerBlock, clcd, chmBlockInfo))
+ throw new ChmParsingException("Please check you parameters");
+
+ chmBlockInfo.setStartBlock(dle.getOffset() / bytesPerBlock);
+ chmBlockInfo.setEndBlock((dle.getOffset() + dle.getLength())
+ / bytesPerBlock);
+ chmBlockInfo.setStartOffset(dle.getOffset() % bytesPerBlock);
+ chmBlockInfo.setEndOffset((dle.getOffset() + dle.getLength())
+ % bytesPerBlock);
+ // potential problem with casting long to int
+ chmBlockInfo
+ .setIniBlock(chmBlockInfo.startBlock -
+ chmBlockInfo.startBlock % (int) clcd.getResetInterval());
+// .setIniBlock((chmBlockInfo.startBlock - chmBlockInfo.startBlock)
+// % (int) clcd.getResetInterval());
+ return chmBlockInfo;
+ }
+
+ public static ChmBlockInfo getChmBlockInfoInstance(
+ DirectoryListingEntry dle, int bytesPerBlock,
+ ChmLzxcControlData clcd) {
+ setChmBlockInfo(new ChmBlockInfo());
+ getChmBlockInfo().setStartBlock(dle.getOffset() / bytesPerBlock);
+ getChmBlockInfo().setEndBlock(
+ (dle.getOffset() + dle.getLength()) / bytesPerBlock);
+ getChmBlockInfo().setStartOffset(dle.getOffset() % bytesPerBlock);
+ getChmBlockInfo().setEndOffset(
+ (dle.getOffset() + dle.getLength()) % bytesPerBlock);
+ // potential problem with casting long to int
+ getChmBlockInfo().setIniBlock(
+ getChmBlockInfo().startBlock - getChmBlockInfo().startBlock
+ % (int) clcd.getResetInterval());
+// (getChmBlockInfo().startBlock - getChmBlockInfo().startBlock)
+// % (int) clcd.getResetInterval());
+ return getChmBlockInfo();
+ }
+
+ /**
+ * Returns textual representation of ChmBlockInfo
+ */
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("iniBlock:=" + getIniBlock() + ", ");
+ sb.append("startBlock:=" + getStartBlock() + ", ");
+ sb.append("endBlock:=" + getEndBlock() + ", ");
+ sb.append("startOffset:=" + getStartOffset() + ", ");
+ sb.append("endOffset:=" + getEndOffset()
+ + System.getProperty("line.separator"));
+ return sb.toString();
+ }
+
+ private boolean validateParameters(DirectoryListingEntry dle,
+ int bytesPerBlock, ChmLzxcControlData clcd,
+ ChmBlockInfo chmBlockInfo) {
+ int goodParameter = 0;
+ if (dle != null)
+ ++goodParameter;
+ if (bytesPerBlock > 0)
+ ++goodParameter;
+ if (clcd != null)
+ ++goodParameter;
+ if (chmBlockInfo != null)
+ ++goodParameter;
+ return (goodParameter == 4);
+ }
+
+ public static void main(String[] args) {
+ }
+
+ /**
+ * Returns an initial block index
+ *
+ * @return int
+ */
+ public int getIniBlock() {
+ return iniBlock;
+ }
+
+ /**
+ * Sets the initial block index
+ *
+ * @param iniBlock
+ * - int
+ */
+ private void setIniBlock(int iniBlock) {
+ this.iniBlock = iniBlock;
+ }
+
+ /**
+ * Returns the start block index
+ *
+ * @return int
+ */
+ public int getStartBlock() {
+ return startBlock;
+ }
+
+ /**
+ * Sets the start block index
+ *
+ * @param startBlock
+ * - int
+ */
+ private void setStartBlock(int startBlock) {
+ this.startBlock = startBlock;
+ }
+
+ /**
+ * Returns the end block index
+ *
+ * @return - int
+ */
+ public int getEndBlock() {
+ return endBlock;
+ }
+
+ /**
+ * Sets the end block index
+ *
+ * @param endBlock
+ * - int
+ */
+ private void setEndBlock(int endBlock) {
+ this.endBlock = endBlock;
+ }
+
+ /**
+ * Returns the start offset index
+ *
+ * @return - int
+ */
+ public int getStartOffset() {
+ return startOffset;
+ }
+
+ /**
+ * Sets the start offset index
+ *
+ * @param startOffset
+ * - int
+ */
+ private void setStartOffset(int startOffset) {
+ this.startOffset = startOffset;
+ }
+
+ /**
+ * Returns the end offset index
+ *
+ * @return - int
+ */
+ public int getEndOffset() {
+ return endOffset;
+ }
+
+ /**
+ * Sets the end offset index
+ *
+ * @param endOffset
+ * - int
+ */
+ private void setEndOffset(int endOffset) {
+ this.endOffset = endOffset;
+ }
+
+ public static void setChmBlockInfo(ChmBlockInfo chmBlockInfo) {
+ ChmBlockInfo.chmBlockInfo = chmBlockInfo;
+ }
+
+ public static ChmBlockInfo getChmBlockInfo() {
+ return chmBlockInfo;
+ }
+}
[31/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
index 53bf241..3b79f31 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
@@ -1,246 +1,246 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mp3;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TailStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.XMPDM;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.mp3.ID3Tags.ID3Comment;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * The <code>Mp3Parser</code> is used to parse ID3 Version 1 Tag information
- * from an MP3 file, if available.
- *
- * @see <a href="http://www.id3.org/ID3v1">MP3 ID3 Version 1 specification</a>
- * @see <a href="http://www.id3.org/id3v2.4.0-structure">MP3 ID3 Version 2.4 Structure Specification</a>
- * @see <a href="http://www.id3.org/id3v2.4.0-frames">MP3 ID3 Version 2.4 Frames Specification</a>
- */
-public class Mp3Parser extends AbstractParser {
-
- /** Serial version UID */
- private static final long serialVersionUID = 8537074922934844370L;
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.singleton(MediaType.audio("mpeg"));
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
-
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- metadata.set(Metadata.CONTENT_TYPE, "audio/mpeg");
- metadata.set(XMPDM.AUDIO_COMPRESSOR, "MP3");
-
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
-
- // Create handlers for the various kinds of ID3 tags
- ID3TagsAndAudio audioAndTags = getAllTagHandlers(stream, handler);
-
- // Process tags metadata if the file has supported tags
- if (audioAndTags.tags.length > 0) {
- CompositeTagHandler tag = new CompositeTagHandler(audioAndTags.tags);
-
- metadata.set(TikaCoreProperties.TITLE, tag.getTitle());
- metadata.set(TikaCoreProperties.CREATOR, tag.getArtist());
- metadata.set(XMPDM.ARTIST, tag.getArtist());
- metadata.set(XMPDM.ALBUM_ARTIST, tag.getAlbumArtist());
- metadata.set(XMPDM.COMPOSER, tag.getComposer());
- metadata.set(XMPDM.ALBUM, tag.getAlbum());
- metadata.set(XMPDM.COMPILATION, tag.getCompilation());
- metadata.set(XMPDM.RELEASE_DATE, tag.getYear());
- metadata.set(XMPDM.GENRE, tag.getGenre());
-
- List<String> comments = new ArrayList<String>();
- for (ID3Comment comment : tag.getComments()) {
- StringBuffer cmt = new StringBuffer();
- if (comment.getLanguage() != null) {
- cmt.append(comment.getLanguage());
- cmt.append(" - ");
- }
- if (comment.getDescription() != null) {
- cmt.append(comment.getDescription());
- if (comment.getText() != null) {
- cmt.append("\n");
- }
- }
- if (comment.getText() != null) {
- cmt.append(comment.getText());
- }
-
- comments.add(cmt.toString());
- metadata.add(XMPDM.LOG_COMMENT.getName(), cmt.toString());
- }
-
- xhtml.element("h1", tag.getTitle());
- xhtml.element("p", tag.getArtist());
-
- // ID3v1.1 Track addition
- StringBuilder sb = new StringBuilder();
- sb.append(tag.getAlbum());
- if (tag.getTrackNumber() != null) {
- sb.append(", track ").append(tag.getTrackNumber());
- metadata.set(XMPDM.TRACK_NUMBER, tag.getTrackNumber());
- }
- if (tag.getDisc() != null) {
- sb.append(", disc ").append(tag.getDisc());
- metadata.set(XMPDM.DISC_NUMBER, tag.getDisc());
- }
- xhtml.element("p", sb.toString());
-
- xhtml.element("p", tag.getYear());
- xhtml.element("p", tag.getGenre());
- xhtml.element("p", String.valueOf(audioAndTags.duration));
- for (String comment : comments) {
- xhtml.element("p", comment);
- }
- }
- if (audioAndTags.duration > 0) {
- metadata.set(XMPDM.DURATION, audioAndTags.duration);
- }
- if (audioAndTags.audio != null) {
- metadata.set("samplerate", String.valueOf(audioAndTags.audio.getSampleRate()));
- metadata.set("channels", String.valueOf(audioAndTags.audio.getChannels()));
- metadata.set("version", audioAndTags.audio.getVersion());
-
- metadata.set(
- XMPDM.AUDIO_SAMPLE_RATE,
- Integer.toString(audioAndTags.audio.getSampleRate()));
- if(audioAndTags.audio.getChannels() == 1) {
- metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Mono");
- } else if(audioAndTags.audio.getChannels() == 2) {
- metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Stereo");
- } else if(audioAndTags.audio.getChannels() == 5) {
- metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "5.1");
- } else if(audioAndTags.audio.getChannels() == 7) {
- metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "7.1");
- }
- }
- if (audioAndTags.lyrics != null && audioAndTags.lyrics.hasLyrics()) {
- xhtml.startElement("p", "class", "lyrics");
- xhtml.characters(audioAndTags.lyrics.lyricsText);
- xhtml.endElement("p");
- }
-
- xhtml.endDocument();
- }
-
- /**
- * Scans the MP3 frames for ID3 tags, and creates ID3Tag Handlers
- * for each supported set of tags.
- */
- protected static ID3TagsAndAudio getAllTagHandlers(InputStream stream, ContentHandler handler)
- throws IOException, SAXException, TikaException {
- ID3v24Handler v24 = null;
- ID3v23Handler v23 = null;
- ID3v22Handler v22 = null;
- ID3v1Handler v1 = null;
- LyricsHandler lyrics = null;
- AudioFrame firstAudio = null;
-
- TailStream tailStream = new TailStream(stream, 10240+128);
- MpegStream mpegStream = new MpegStream(tailStream);
-
- // ID3v2 tags live at the start of the file
- // You can apparently have several different ID3 tag blocks
- // So, keep going until we don't find any more
- MP3Frame f;
- while ((f = ID3v2Frame.createFrameIfPresent(mpegStream)) != null) {
- if(f instanceof ID3v2Frame) {
- ID3v2Frame id3F = (ID3v2Frame)f;
- if (id3F.getMajorVersion() == 4) {
- v24 = new ID3v24Handler(id3F);
- } else if(id3F.getMajorVersion() == 3) {
- v23 = new ID3v23Handler(id3F);
- } else if(id3F.getMajorVersion() == 2) {
- v22 = new ID3v22Handler(id3F);
- }
- }
- }
-
- // Now iterate over all audio frames in the file
- AudioFrame frame = mpegStream.nextFrame();
- float duration = 0;
- while (frame != null)
- {
- duration += frame.getDuration();
- if (firstAudio == null)
- {
- firstAudio = frame;
- }
- mpegStream.skipFrame();
- frame = mpegStream.nextFrame();
- }
-
- // ID3v1 tags live at the end of the file
- // Lyrics live just before ID3v1, at the end of the file
- // Search for both (handlers seek to the end for us)
- lyrics = new LyricsHandler(tailStream.getTail());
- v1 = lyrics.id3v1;
-
- // Go in order of preference
- // Currently, that's newest to oldest
- List<ID3Tags> tags = new ArrayList<ID3Tags>();
-
- if(v24 != null && v24.getTagsPresent()) {
- tags.add(v24);
- }
- if(v23 != null && v23.getTagsPresent()) {
- tags.add(v23);
- }
- if(v22 != null && v22.getTagsPresent()) {
- tags.add(v22);
- }
- if(v1 != null && v1.getTagsPresent()) {
- tags.add(v1);
- }
-
- ID3TagsAndAudio ret = new ID3TagsAndAudio();
- ret.audio = firstAudio;
- ret.lyrics = lyrics;
- ret.tags = tags.toArray(new ID3Tags[tags.size()]);
- ret.duration = duration;
- return ret;
- }
-
- protected static class ID3TagsAndAudio {
- private ID3Tags[] tags;
- private AudioFrame audio;
- private LyricsHandler lyrics;
- private float duration;
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TailStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMPDM;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.mp3.ID3Tags.ID3Comment;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * The <code>Mp3Parser</code> is used to parse ID3 Version 1 Tag information
+ * from an MP3 file, if available.
+ *
+ * @see <a href="http://www.id3.org/ID3v1">MP3 ID3 Version 1 specification</a>
+ * @see <a href="http://www.id3.org/id3v2.4.0-structure">MP3 ID3 Version 2.4 Structure Specification</a>
+ * @see <a href="http://www.id3.org/id3v2.4.0-frames">MP3 ID3 Version 2.4 Frames Specification</a>
+ */
+public class Mp3Parser extends AbstractParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = 8537074922934844370L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.singleton(MediaType.audio("mpeg"));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ metadata.set(Metadata.CONTENT_TYPE, "audio/mpeg");
+ metadata.set(XMPDM.AUDIO_COMPRESSOR, "MP3");
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+ // Create handlers for the various kinds of ID3 tags
+ ID3TagsAndAudio audioAndTags = getAllTagHandlers(stream, handler);
+
+ // Process tags metadata if the file has supported tags
+ if (audioAndTags.tags.length > 0) {
+ CompositeTagHandler tag = new CompositeTagHandler(audioAndTags.tags);
+
+ metadata.set(TikaCoreProperties.TITLE, tag.getTitle());
+ metadata.set(TikaCoreProperties.CREATOR, tag.getArtist());
+ metadata.set(XMPDM.ARTIST, tag.getArtist());
+ metadata.set(XMPDM.ALBUM_ARTIST, tag.getAlbumArtist());
+ metadata.set(XMPDM.COMPOSER, tag.getComposer());
+ metadata.set(XMPDM.ALBUM, tag.getAlbum());
+ metadata.set(XMPDM.COMPILATION, tag.getCompilation());
+ metadata.set(XMPDM.RELEASE_DATE, tag.getYear());
+ metadata.set(XMPDM.GENRE, tag.getGenre());
+
+ List<String> comments = new ArrayList<String>();
+ for (ID3Comment comment : tag.getComments()) {
+ StringBuffer cmt = new StringBuffer();
+ if (comment.getLanguage() != null) {
+ cmt.append(comment.getLanguage());
+ cmt.append(" - ");
+ }
+ if (comment.getDescription() != null) {
+ cmt.append(comment.getDescription());
+ if (comment.getText() != null) {
+ cmt.append("\n");
+ }
+ }
+ if (comment.getText() != null) {
+ cmt.append(comment.getText());
+ }
+
+ comments.add(cmt.toString());
+ metadata.add(XMPDM.LOG_COMMENT.getName(), cmt.toString());
+ }
+
+ xhtml.element("h1", tag.getTitle());
+ xhtml.element("p", tag.getArtist());
+
+ // ID3v1.1 Track addition
+ StringBuilder sb = new StringBuilder();
+ sb.append(tag.getAlbum());
+ if (tag.getTrackNumber() != null) {
+ sb.append(", track ").append(tag.getTrackNumber());
+ metadata.set(XMPDM.TRACK_NUMBER, tag.getTrackNumber());
+ }
+ if (tag.getDisc() != null) {
+ sb.append(", disc ").append(tag.getDisc());
+ metadata.set(XMPDM.DISC_NUMBER, tag.getDisc());
+ }
+ xhtml.element("p", sb.toString());
+
+ xhtml.element("p", tag.getYear());
+ xhtml.element("p", tag.getGenre());
+ xhtml.element("p", String.valueOf(audioAndTags.duration));
+ for (String comment : comments) {
+ xhtml.element("p", comment);
+ }
+ }
+ if (audioAndTags.duration > 0) {
+ metadata.set(XMPDM.DURATION, audioAndTags.duration);
+ }
+ if (audioAndTags.audio != null) {
+ metadata.set("samplerate", String.valueOf(audioAndTags.audio.getSampleRate()));
+ metadata.set("channels", String.valueOf(audioAndTags.audio.getChannels()));
+ metadata.set("version", audioAndTags.audio.getVersion());
+
+ metadata.set(
+ XMPDM.AUDIO_SAMPLE_RATE,
+ Integer.toString(audioAndTags.audio.getSampleRate()));
+ if(audioAndTags.audio.getChannels() == 1) {
+ metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Mono");
+ } else if(audioAndTags.audio.getChannels() == 2) {
+ metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Stereo");
+ } else if(audioAndTags.audio.getChannels() == 5) {
+ metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "5.1");
+ } else if(audioAndTags.audio.getChannels() == 7) {
+ metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "7.1");
+ }
+ }
+ if (audioAndTags.lyrics != null && audioAndTags.lyrics.hasLyrics()) {
+ xhtml.startElement("p", "class", "lyrics");
+ xhtml.characters(audioAndTags.lyrics.lyricsText);
+ xhtml.endElement("p");
+ }
+
+ xhtml.endDocument();
+ }
+
+ /**
+ * Scans the MP3 frames for ID3 tags, and creates ID3Tag Handlers
+ * for each supported set of tags.
+ */
+ protected static ID3TagsAndAudio getAllTagHandlers(InputStream stream, ContentHandler handler)
+ throws IOException, SAXException, TikaException {
+ ID3v24Handler v24 = null;
+ ID3v23Handler v23 = null;
+ ID3v22Handler v22 = null;
+ ID3v1Handler v1 = null;
+ LyricsHandler lyrics = null;
+ AudioFrame firstAudio = null;
+
+ TailStream tailStream = new TailStream(stream, 10240+128);
+ MpegStream mpegStream = new MpegStream(tailStream);
+
+ // ID3v2 tags live at the start of the file
+ // You can apparently have several different ID3 tag blocks
+ // So, keep going until we don't find any more
+ MP3Frame f;
+ while ((f = ID3v2Frame.createFrameIfPresent(mpegStream)) != null) {
+ if(f instanceof ID3v2Frame) {
+ ID3v2Frame id3F = (ID3v2Frame)f;
+ if (id3F.getMajorVersion() == 4) {
+ v24 = new ID3v24Handler(id3F);
+ } else if(id3F.getMajorVersion() == 3) {
+ v23 = new ID3v23Handler(id3F);
+ } else if(id3F.getMajorVersion() == 2) {
+ v22 = new ID3v22Handler(id3F);
+ }
+ }
+ }
+
+ // Now iterate over all audio frames in the file
+ AudioFrame frame = mpegStream.nextFrame();
+ float duration = 0;
+ while (frame != null)
+ {
+ duration += frame.getDuration();
+ if (firstAudio == null)
+ {
+ firstAudio = frame;
+ }
+ mpegStream.skipFrame();
+ frame = mpegStream.nextFrame();
+ }
+
+ // ID3v1 tags live at the end of the file
+ // Lyrics live just before ID3v1, at the end of the file
+ // Search for both (handlers seek to the end for us)
+ lyrics = new LyricsHandler(tailStream.getTail());
+ v1 = lyrics.id3v1;
+
+ // Go in order of preference
+ // Currently, that's newest to oldest
+ List<ID3Tags> tags = new ArrayList<ID3Tags>();
+
+ if(v24 != null && v24.getTagsPresent()) {
+ tags.add(v24);
+ }
+ if(v23 != null && v23.getTagsPresent()) {
+ tags.add(v23);
+ }
+ if(v22 != null && v22.getTagsPresent()) {
+ tags.add(v22);
+ }
+ if(v1 != null && v1.getTagsPresent()) {
+ tags.add(v1);
+ }
+
+ ID3TagsAndAudio ret = new ID3TagsAndAudio();
+ ret.audio = firstAudio;
+ ret.lyrics = lyrics;
+ ret.tags = tags.toArray(new ID3Tags[tags.size()]);
+ ret.duration = duration;
+ return ret;
+ }
+
+ protected static class ID3TagsAndAudio {
+ private ID3Tags[] tags;
+ private AudioFrame audio;
+ private LyricsHandler lyrics;
+ private float duration;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/video/FLVParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/video/FLVParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/video/FLVParser.java
index 1a0b1b9..947b694 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/video/FLVParser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/video/FLVParser.java
@@ -1,268 +1,268 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.video;
-
-import java.io.ByteArrayInputStream;
-import java.io.DataInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Map.Entry;
-import java.util.Set;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-/**
- * <p>
- * Parser for metadata contained in Flash Videos (.flv). Resources:
- * http://osflash.org/flv and for AMF:
- * http://download.macromedia.com/pub/labs/amf/amf0_spec_121207.pdf
- * <p>
- * This parser is capable of extracting the general metadata from header as well
- * as embedded metadata.
- * <p>
- * Known keys for metadata (from file header):
- * <ol>
- * <li>hasVideo: true|false
- * <li>hasSound: true|false
- * </ol>
- * <p>
- * In addition to the above values also metadata that is inserted in to the
- * actual stream will be picked. Usually there are keys like:
- * hasKeyframes, lastkeyframetimestamp, audiocodecid, keyframes, filepositions,
- * hasMetadata, audiosamplerate, videodatarate metadatadate, videocodecid,
- * metadatacreator, audiosize, hasVideo, height, audiosamplesize, framerate,
- * hasCuePoints width, cuePoints, lasttimestamp, canSeekToEnd, datasize,
- * duration, videosize, filesize, audiodatarate, hasAudio, stereo audiodelay
- */
-public class FLVParser extends AbstractParser {
-
- /** Serial version UID */
- private static final long serialVersionUID = -8718013155719197679L;
-
- private static int TYPE_METADATA = 0x12;
- private static byte MASK_AUDIO = 1;
- private static byte MASK_VIDEO = 4;
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.singleton(MediaType.video("x-flv"));
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- private long readUInt32(DataInputStream input) throws IOException {
- return input.readInt() & 0xFFFFFFFFL;
- }
-
- private int readUInt24(DataInputStream input) throws IOException {
- int uint = input.read()<<16;
- uint += input.read()<<8;
- uint += input.read();
- return uint;
- }
-
- private Object readAMFData(DataInputStream input, int type)
- throws IOException {
- if (type == -1) {
- type = input.readUnsignedByte();
- }
- switch (type) {
- case 0:
- return input.readDouble();
- case 1:
- return input.readUnsignedByte() == 1;
- case 2:
- return readAMFString(input);
- case 3:
- return readAMFObject(input);
- case 8:
- return readAMFEcmaArray(input);
- case 10:
- return readAMFStrictArray(input);
- case 11:
- final Date date = new Date((long) input.readDouble());
- input.readShort(); // time zone
- return date;
- case 13:
- return "UNDEFINED";
- default:
- return null;
- }
- }
-
- private Object readAMFStrictArray(DataInputStream input) throws IOException {
- long count = readUInt32(input);
- ArrayList<Object> list = new ArrayList<Object>();
- for (int i = 0; i < count; i++) {
- list.add(readAMFData(input, -1));
- }
- return list;
- }
-
-
- private String readAMFString(DataInputStream input) throws IOException {
- int size = input.readUnsignedShort();
- byte[] chars = new byte[size];
- input.readFully(chars);
- return new String(chars, UTF_8);
- }
-
- private Object readAMFObject(DataInputStream input) throws IOException {
- HashMap<String, Object> array = new HashMap<String, Object>();
- while (true) {
- String key = readAMFString(input);
- int dataType = input.read();
- if (dataType == 9) { // object end marker
- break;
- }
- array.put(key, readAMFData(input, dataType));
- }
- return array;
- }
-
- private Object readAMFEcmaArray(DataInputStream input) throws IOException {
- long size = readUInt32(input);
- HashMap<String, Object> array = new HashMap<String, Object>();
- for (int i = 0; i < size; i++) {
- String key = readAMFString(input);
- int dataType = input.read();
- array.put(key, readAMFData(input, dataType));
- }
- return array;
- }
-
- private boolean checkSignature(DataInputStream fis) throws IOException {
- return fis.read() == 'F' && fis.read() == 'L' && fis.read() == 'V';
- }
-
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- DataInputStream datainput = new DataInputStream(stream);
- if (!checkSignature(datainput)) {
- throw new TikaException("FLV signature not detected");
- }
-
- // header
- int version = datainput.readUnsignedByte();
- if (version != 1) {
- // should be 1, perhaps this is not flv?
- throw new TikaException("Unpexpected FLV version: " + version);
- }
-
- int typeFlags = datainput.readUnsignedByte();
-
- long len = readUInt32(datainput);
- if (len != 9) {
- // we only know about format with header of 9 bytes
- throw new TikaException("Unpexpected FLV header length: " + len);
- }
-
- long sizePrev = readUInt32(datainput);
- if (sizePrev != 0) {
- // should be 0, perhaps this is not flv?
- throw new TikaException(
- "Unpexpected FLV first previous block size: " + sizePrev);
- }
-
- metadata.set(Metadata.CONTENT_TYPE, "video/x-flv");
- metadata.set("hasVideo", Boolean.toString((typeFlags & MASK_VIDEO) != 0));
- metadata.set("hasAudio", Boolean.toString((typeFlags & MASK_AUDIO) != 0));
-
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
-
- // flv tag stream follows...
- while (true) {
- int type = datainput.read();
- if (type == -1) {
- // EOF
- break;
- }
-
- int datalen = readUInt24(datainput); //body length
- readUInt32(datainput); // timestamp
- readUInt24(datainput); // streamid
-
- if (type == TYPE_METADATA) {
- // found metadata Tag, read content to buffer
- byte[] metaBytes = new byte[datalen];
- for (int readCount = 0; readCount < datalen;) {
- int r = stream.read(metaBytes, readCount, datalen - readCount);
- if(r!=-1) {
- readCount += r;
-
- } else {
- break;
- }
- }
-
- ByteArrayInputStream is = new ByteArrayInputStream(metaBytes);
-
- DataInputStream dis = new DataInputStream(is);
-
- Object data = null;
-
- for (int i = 0; i < 2; i++) {
- data = readAMFData(dis, -1);
- }
-
- if (data instanceof Map) {
- // TODO if there are multiple metadata values with same key (in
- // separate AMF blocks, we currently loose previous values)
- Map<String, Object> extractedMetadata = (Map<String, Object>) data;
- for (Entry<String, Object> entry : extractedMetadata.entrySet()) {
- if (entry.getValue() == null) {
- continue;
- }
- metadata.set(entry.getKey(), entry.getValue().toString());
- }
- }
-
- } else {
- // Tag was not metadata, skip over data we cannot handle
- for (int i = 0; i < datalen; i++) {
- datainput.readByte();
- }
- }
-
- sizePrev = readUInt32(datainput); // previous block size
- if (sizePrev != datalen + 11) {
- // file was corrupt or we could not parse it...
- break;
- }
- }
-
- xhtml.endDocument();
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.video;
+
+import java.io.ByteArrayInputStream;
+import java.io.DataInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * <p>
+ * Parser for metadata contained in Flash Videos (.flv). Resources:
+ * http://osflash.org/flv and for AMF:
+ * http://download.macromedia.com/pub/labs/amf/amf0_spec_121207.pdf
+ * <p>
+ * This parser is capable of extracting the general metadata from header as well
+ * as embedded metadata.
+ * <p>
+ * Known keys for metadata (from file header):
+ * <ol>
+ * <li>hasVideo: true|false
+ * <li>hasSound: true|false
+ * </ol>
+ * <p>
+ * In addition to the above values also metadata that is inserted in to the
+ * actual stream will be picked. Usually there are keys like:
+ * hasKeyframes, lastkeyframetimestamp, audiocodecid, keyframes, filepositions,
+ * hasMetadata, audiosamplerate, videodatarate metadatadate, videocodecid,
+ * metadatacreator, audiosize, hasVideo, height, audiosamplesize, framerate,
+ * hasCuePoints width, cuePoints, lasttimestamp, canSeekToEnd, datasize,
+ * duration, videosize, filesize, audiodatarate, hasAudio, stereo audiodelay
+ */
+public class FLVParser extends AbstractParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = -8718013155719197679L;
+
+ private static int TYPE_METADATA = 0x12;
+ private static byte MASK_AUDIO = 1;
+ private static byte MASK_VIDEO = 4;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.singleton(MediaType.video("x-flv"));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ private long readUInt32(DataInputStream input) throws IOException {
+ return input.readInt() & 0xFFFFFFFFL;
+ }
+
+ private int readUInt24(DataInputStream input) throws IOException {
+ int uint = input.read()<<16;
+ uint += input.read()<<8;
+ uint += input.read();
+ return uint;
+ }
+
+ private Object readAMFData(DataInputStream input, int type)
+ throws IOException {
+ if (type == -1) {
+ type = input.readUnsignedByte();
+ }
+ switch (type) {
+ case 0:
+ return input.readDouble();
+ case 1:
+ return input.readUnsignedByte() == 1;
+ case 2:
+ return readAMFString(input);
+ case 3:
+ return readAMFObject(input);
+ case 8:
+ return readAMFEcmaArray(input);
+ case 10:
+ return readAMFStrictArray(input);
+ case 11:
+ final Date date = new Date((long) input.readDouble());
+ input.readShort(); // time zone
+ return date;
+ case 13:
+ return "UNDEFINED";
+ default:
+ return null;
+ }
+ }
+
+ private Object readAMFStrictArray(DataInputStream input) throws IOException {
+ long count = readUInt32(input);
+ ArrayList<Object> list = new ArrayList<Object>();
+ for (int i = 0; i < count; i++) {
+ list.add(readAMFData(input, -1));
+ }
+ return list;
+ }
+
+
+ private String readAMFString(DataInputStream input) throws IOException {
+ int size = input.readUnsignedShort();
+ byte[] chars = new byte[size];
+ input.readFully(chars);
+ return new String(chars, UTF_8);
+ }
+
+ private Object readAMFObject(DataInputStream input) throws IOException {
+ HashMap<String, Object> array = new HashMap<String, Object>();
+ while (true) {
+ String key = readAMFString(input);
+ int dataType = input.read();
+ if (dataType == 9) { // object end marker
+ break;
+ }
+ array.put(key, readAMFData(input, dataType));
+ }
+ return array;
+ }
+
+ private Object readAMFEcmaArray(DataInputStream input) throws IOException {
+ long size = readUInt32(input);
+ HashMap<String, Object> array = new HashMap<String, Object>();
+ for (int i = 0; i < size; i++) {
+ String key = readAMFString(input);
+ int dataType = input.read();
+ array.put(key, readAMFData(input, dataType));
+ }
+ return array;
+ }
+
+ private boolean checkSignature(DataInputStream fis) throws IOException {
+ return fis.read() == 'F' && fis.read() == 'L' && fis.read() == 'V';
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ DataInputStream datainput = new DataInputStream(stream);
+ if (!checkSignature(datainput)) {
+ throw new TikaException("FLV signature not detected");
+ }
+
+ // header
+ int version = datainput.readUnsignedByte();
+ if (version != 1) {
+ // should be 1, perhaps this is not flv?
+ throw new TikaException("Unpexpected FLV version: " + version);
+ }
+
+ int typeFlags = datainput.readUnsignedByte();
+
+ long len = readUInt32(datainput);
+ if (len != 9) {
+ // we only know about format with header of 9 bytes
+ throw new TikaException("Unpexpected FLV header length: " + len);
+ }
+
+ long sizePrev = readUInt32(datainput);
+ if (sizePrev != 0) {
+ // should be 0, perhaps this is not flv?
+ throw new TikaException(
+ "Unpexpected FLV first previous block size: " + sizePrev);
+ }
+
+ metadata.set(Metadata.CONTENT_TYPE, "video/x-flv");
+ metadata.set("hasVideo", Boolean.toString((typeFlags & MASK_VIDEO) != 0));
+ metadata.set("hasAudio", Boolean.toString((typeFlags & MASK_AUDIO) != 0));
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+ // flv tag stream follows...
+ while (true) {
+ int type = datainput.read();
+ if (type == -1) {
+ // EOF
+ break;
+ }
+
+ int datalen = readUInt24(datainput); //body length
+ readUInt32(datainput); // timestamp
+ readUInt24(datainput); // streamid
+
+ if (type == TYPE_METADATA) {
+ // found metadata Tag, read content to buffer
+ byte[] metaBytes = new byte[datalen];
+ for (int readCount = 0; readCount < datalen;) {
+ int r = stream.read(metaBytes, readCount, datalen - readCount);
+ if(r!=-1) {
+ readCount += r;
+
+ } else {
+ break;
+ }
+ }
+
+ ByteArrayInputStream is = new ByteArrayInputStream(metaBytes);
+
+ DataInputStream dis = new DataInputStream(is);
+
+ Object data = null;
+
+ for (int i = 0; i < 2; i++) {
+ data = readAMFData(dis, -1);
+ }
+
+ if (data instanceof Map) {
+ // TODO if there are multiple metadata values with same key (in
+ // separate AMF blocks, we currently loose previous values)
+ Map<String, Object> extractedMetadata = (Map<String, Object>) data;
+ for (Entry<String, Object> entry : extractedMetadata.entrySet()) {
+ if (entry.getValue() == null) {
+ continue;
+ }
+ metadata.set(entry.getKey(), entry.getValue().toString());
+ }
+ }
+
+ } else {
+ // Tag was not metadata, skip over data we cannot handle
+ for (int i = 0; i < datalen; i++) {
+ datainput.readByte();
+ }
+ }
+
+ sizePrev = readUInt32(datainput); // previous block size
+ if (sizePrev != datalen + 11) {
+ // file was corrupt or we could not parse it...
+ break;
+ }
+ }
+
+ xhtml.endDocument();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties b/tika-parser-modules/tika-parser-multimedia-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
index 904e536..cb2151c 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
@@ -1,21 +1,21 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-tesseractPath=
-language=eng
-pageSegMode=1
-maxFileSizeToOcr=2147483647
-minFileSizeToOcr=0
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+tesseractPath=
+language=eng
+pageSegMode=1
+maxFileSizeToOcr=2147483647
+minFileSizeToOcr=0
timeout=120
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java
index ae30df3..d35de32 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java
@@ -1,75 +1,75 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.audio;
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.tika.Tika;
-import org.apache.tika.metadata.Metadata;
-import org.junit.Test;
-
-public class AudioParserTest {
-
- @Test
- public void testWAV() throws Exception {
- String path = "/test-documents/testWAV.wav";
- Metadata metadata = new Metadata();
- String content = new Tika().parseToString(
- AudioParserTest.class.getResourceAsStream(path), metadata);
-
- assertEquals("audio/x-wav", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("44100.0", metadata.get("samplerate"));
- assertEquals("2", metadata.get("channels"));
- assertEquals("16", metadata.get("bits"));
- assertEquals("PCM_SIGNED", metadata.get("encoding"));
-
- assertEquals("", content);
- }
-
- @Test
- public void testAIFF() throws Exception {
- String path = "/test-documents/testAIFF.aif";
- Metadata metadata = new Metadata();
- String content = new Tika().parseToString(
- AudioParserTest.class.getResourceAsStream(path), metadata);
-
- assertEquals("audio/x-aiff", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("44100.0", metadata.get("samplerate"));
- assertEquals("2", metadata.get("channels"));
- assertEquals("16", metadata.get("bits"));
- assertEquals("PCM_SIGNED", metadata.get("encoding"));
-
- assertEquals("", content);
- }
-
- @Test
- public void testAU() throws Exception {
- String path = "/test-documents/testAU.au";
- Metadata metadata = new Metadata();
- String content = new Tika().parseToString(
- AudioParserTest.class.getResourceAsStream(path), metadata);
-
- assertEquals("audio/basic", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("44100.0", metadata.get("samplerate"));
- assertEquals("2", metadata.get("channels"));
- assertEquals("16", metadata.get("bits"));
- assertEquals("PCM_SIGNED", metadata.get("encoding"));
-
- assertEquals("", content);
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.audio;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.tika.Tika;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Test;
+
+public class AudioParserTest {
+
+ @Test
+ public void testWAV() throws Exception {
+ String path = "/test-documents/testWAV.wav";
+ Metadata metadata = new Metadata();
+ String content = new Tika().parseToString(
+ AudioParserTest.class.getResourceAsStream(path), metadata);
+
+ assertEquals("audio/x-wav", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("44100.0", metadata.get("samplerate"));
+ assertEquals("2", metadata.get("channels"));
+ assertEquals("16", metadata.get("bits"));
+ assertEquals("PCM_SIGNED", metadata.get("encoding"));
+
+ assertEquals("", content);
+ }
+
+ @Test
+ public void testAIFF() throws Exception {
+ String path = "/test-documents/testAIFF.aif";
+ Metadata metadata = new Metadata();
+ String content = new Tika().parseToString(
+ AudioParserTest.class.getResourceAsStream(path), metadata);
+
+ assertEquals("audio/x-aiff", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("44100.0", metadata.get("samplerate"));
+ assertEquals("2", metadata.get("channels"));
+ assertEquals("16", metadata.get("bits"));
+ assertEquals("PCM_SIGNED", metadata.get("encoding"));
+
+ assertEquals("", content);
+ }
+
+ @Test
+ public void testAU() throws Exception {
+ String path = "/test-documents/testAU.au";
+ Metadata metadata = new Metadata();
+ String content = new Tika().parseToString(
+ AudioParserTest.class.getResourceAsStream(path), metadata);
+
+ assertEquals("audio/basic", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("44100.0", metadata.get("samplerate"));
+ assertEquals("2", metadata.get("channels"));
+ assertEquals("16", metadata.get("bits"));
+ assertEquals("PCM_SIGNED", metadata.get("encoding"));
+
+ assertEquals("", content);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java
index 9336444..344f2d7 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java
@@ -1,42 +1,42 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.audio;
-
-import static org.junit.Assert.assertEquals;
-import static org.apache.tika.TikaTest.assertContains;
-
-import org.apache.tika.Tika;
-import org.apache.tika.metadata.Metadata;
-import org.junit.Test;
-
-public class MidiParserTest {
-
- @Test
- public void testMID() throws Exception {
- String path = "/test-documents/testMID.mid";
- Metadata metadata = new Metadata();
- String content = new Tika().parseToString(
- MidiParserTest.class.getResourceAsStream(path), metadata);
-
- assertEquals("audio/midi", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("2", metadata.get("tracks"));
- assertEquals("0", metadata.get("patches"));
- assertEquals("PPQ", metadata.get("divisionType"));
-
- assertContains("Untitled", content);
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.audio;
+
+import static org.junit.Assert.assertEquals;
+import static org.apache.tika.TikaTest.assertContains;
+
+import org.apache.tika.Tika;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Test;
+
+public class MidiParserTest {
+
+ @Test
+ public void testMID() throws Exception {
+ String path = "/test-documents/testMID.mid";
+ Metadata metadata = new Metadata();
+ String content = new Tika().parseToString(
+ MidiParserTest.class.getResourceAsStream(path), metadata);
+
+ assertEquals("audio/midi", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("2", metadata.get("tracks"));
+ assertEquals("0", metadata.get("patches"));
+ assertEquals("PPQ", metadata.get("divisionType"));
+
+ assertContains("Untitled", content);
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java
index 51f99db..fdac337 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java
@@ -1,139 +1,139 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.image;
-
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.verify;
-import static org.mockito.Mockito.when;
-
-import java.util.Arrays;
-import java.util.GregorianCalendar;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Locale;
-import java.util.TimeZone;
-
-import com.drew.metadata.Directory;
-import com.drew.metadata.MetadataException;
-import com.drew.metadata.Tag;
-import com.drew.metadata.exif.ExifIFD0Directory;
-import com.drew.metadata.exif.ExifSubIFDDirectory;
-import com.drew.metadata.jpeg.JpegCommentDirectory;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.junit.Test;
-
-public class ImageMetadataExtractorTest {
-
- @SuppressWarnings({"rawtypes", "unchecked"})
- @Test
- public void testHandleDirectories() throws MetadataException {
- Metadata metadata = mock(Metadata.class);
- ImageMetadataExtractor.DirectoryHandler handler1 = mock(ImageMetadataExtractor.DirectoryHandler.class);
- ImageMetadataExtractor e = new ImageMetadataExtractor(metadata, handler1);
-
- Directory directory = new JpegCommentDirectory();
- Iterator directories = mock(Iterator.class);
- when(directories.hasNext()).thenReturn(true, false);
- when(directories.next()).thenReturn(directory);
- when(handler1.supports(JpegCommentDirectory.class)).thenReturn(true);
-
- e.handle(directories);
- verify(handler1).supports(JpegCommentDirectory.class);
- verify(handler1).handle(directory, metadata);
- }
-
- @Test
- public void testExifHandlerSupports() {
- assertTrue(new ImageMetadataExtractor.ExifHandler().supports(ExifIFD0Directory.class));
- assertTrue(new ImageMetadataExtractor.ExifHandler().supports(ExifSubIFDDirectory.class));
- assertFalse(new ImageMetadataExtractor.ExifHandler().supports(Directory.class));
- assertFalse(new ImageMetadataExtractor.ExifHandler().supports(JpegCommentDirectory.class));
- }
-
- @Test
- public void testExifHandlerParseDate() throws MetadataException {
- ExifSubIFDDirectory exif = mock(ExifSubIFDDirectory.class);
- when(exif.containsTag(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(true);
- GregorianCalendar calendar = new GregorianCalendar(TimeZone.getDefault(), Locale.ROOT);
- calendar.setTimeInMillis(0);
- calendar.set(2000, 0, 1, 0, 0, 0);
- when(exif.getDate(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(
- calendar.getTime()); // jvm default timezone as in Metadata Extractor
- Metadata metadata = new Metadata();
-
- new ImageMetadataExtractor.ExifHandler().handle(exif, metadata);
- assertEquals("Should be ISO date without time zone", "2000-01-01T00:00:00",
- metadata.get(TikaCoreProperties.CREATED));
- }
-
- @Test
- public void testExifHandlerParseDateFallback() throws MetadataException {
- ExifIFD0Directory exif = mock(ExifIFD0Directory.class);
- when(exif.containsTag(ExifIFD0Directory.TAG_DATETIME)).thenReturn(true);
- GregorianCalendar calendar = new GregorianCalendar(TimeZone.getDefault(), Locale.ROOT);
- calendar.setTimeInMillis(0);
- calendar.set(1999, 0, 1, 0, 0, 0);
- when(exif.getDate(ExifIFD0Directory.TAG_DATETIME)).thenReturn(
- calendar.getTime()); // jvm default timezone as in Metadata Extractor
- Metadata metadata = new Metadata();
-
- new ImageMetadataExtractor.ExifHandler().handle(exif, metadata);
- assertEquals("Should try EXIF Date/Time if Original is not set", "1999-01-01T00:00:00",
- metadata.get(TikaCoreProperties.CREATED));
- }
-
- @Test
- public void testExifHandlerParseDateError() throws MetadataException {
- ExifIFD0Directory exif = mock(ExifIFD0Directory.class);
- when(exif.containsTag(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(true);
- when(exif.getDate(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(null);
- Metadata metadata = new Metadata();
-
- new ImageMetadataExtractor.ExifHandler().handle(exif, metadata);
- assertEquals("Parsing should proceed without date", null,
- metadata.get(TikaCoreProperties.CREATED));
- }
-
- @Test
- public void testCopyUnknownFieldsHandler() throws MetadataException {
- Directory d = mock(Directory.class);
- Tag t1 = mock(Tag.class);
- when(t1.getTagName()).thenReturn("Image Description");
- when(t1.getDescription()).thenReturn("t1");
- Tag t2 = mock(Tag.class);
- when(t2.getTagName()).thenReturn(Metadata.KEYWORDS);
- when(t2.getDescription()).thenReturn("known");
- Tag t3 = mock(Tag.class);
- when(t3.getTagName()).thenReturn(TikaCoreProperties.DESCRIPTION.getName());
- when(t3.getDescription()).thenReturn("known");
- List<Tag> tags = Arrays.asList(t1, t2, t3);
- when(d.getTags()).thenReturn(tags);
- Metadata metadata = new Metadata();
- new ImageMetadataExtractor.CopyUnknownFieldsHandler().handle(d, metadata);
- assertEquals("t1", metadata.get("Image Description"));
- assertNull("keywords should be excluded from bulk copy because it is a defined field",
- metadata.get(Metadata.KEYWORDS));
- assertNull(metadata.get(TikaCoreProperties.DESCRIPTION));
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+import java.util.Arrays;
+import java.util.GregorianCalendar;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Locale;
+import java.util.TimeZone;
+
+import com.drew.metadata.Directory;
+import com.drew.metadata.MetadataException;
+import com.drew.metadata.Tag;
+import com.drew.metadata.exif.ExifIFD0Directory;
+import com.drew.metadata.exif.ExifSubIFDDirectory;
+import com.drew.metadata.jpeg.JpegCommentDirectory;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Test;
+
+public class ImageMetadataExtractorTest {
+
+ @SuppressWarnings({"rawtypes", "unchecked"})
+ @Test
+ public void testHandleDirectories() throws MetadataException {
+ Metadata metadata = mock(Metadata.class);
+ ImageMetadataExtractor.DirectoryHandler handler1 = mock(ImageMetadataExtractor.DirectoryHandler.class);
+ ImageMetadataExtractor e = new ImageMetadataExtractor(metadata, handler1);
+
+ Directory directory = new JpegCommentDirectory();
+ Iterator directories = mock(Iterator.class);
+ when(directories.hasNext()).thenReturn(true, false);
+ when(directories.next()).thenReturn(directory);
+ when(handler1.supports(JpegCommentDirectory.class)).thenReturn(true);
+
+ e.handle(directories);
+ verify(handler1).supports(JpegCommentDirectory.class);
+ verify(handler1).handle(directory, metadata);
+ }
+
+ @Test
+ public void testExifHandlerSupports() {
+ assertTrue(new ImageMetadataExtractor.ExifHandler().supports(ExifIFD0Directory.class));
+ assertTrue(new ImageMetadataExtractor.ExifHandler().supports(ExifSubIFDDirectory.class));
+ assertFalse(new ImageMetadataExtractor.ExifHandler().supports(Directory.class));
+ assertFalse(new ImageMetadataExtractor.ExifHandler().supports(JpegCommentDirectory.class));
+ }
+
+ @Test
+ public void testExifHandlerParseDate() throws MetadataException {
+ ExifSubIFDDirectory exif = mock(ExifSubIFDDirectory.class);
+ when(exif.containsTag(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(true);
+ GregorianCalendar calendar = new GregorianCalendar(TimeZone.getDefault(), Locale.ROOT);
+ calendar.setTimeInMillis(0);
+ calendar.set(2000, 0, 1, 0, 0, 0);
+ when(exif.getDate(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(
+ calendar.getTime()); // jvm default timezone as in Metadata Extractor
+ Metadata metadata = new Metadata();
+
+ new ImageMetadataExtractor.ExifHandler().handle(exif, metadata);
+ assertEquals("Should be ISO date without time zone", "2000-01-01T00:00:00",
+ metadata.get(TikaCoreProperties.CREATED));
+ }
+
+ @Test
+ public void testExifHandlerParseDateFallback() throws MetadataException {
+ ExifIFD0Directory exif = mock(ExifIFD0Directory.class);
+ when(exif.containsTag(ExifIFD0Directory.TAG_DATETIME)).thenReturn(true);
+ GregorianCalendar calendar = new GregorianCalendar(TimeZone.getDefault(), Locale.ROOT);
+ calendar.setTimeInMillis(0);
+ calendar.set(1999, 0, 1, 0, 0, 0);
+ when(exif.getDate(ExifIFD0Directory.TAG_DATETIME)).thenReturn(
+ calendar.getTime()); // jvm default timezone as in Metadata Extractor
+ Metadata metadata = new Metadata();
+
+ new ImageMetadataExtractor.ExifHandler().handle(exif, metadata);
+ assertEquals("Should try EXIF Date/Time if Original is not set", "1999-01-01T00:00:00",
+ metadata.get(TikaCoreProperties.CREATED));
+ }
+
+ @Test
+ public void testExifHandlerParseDateError() throws MetadataException {
+ ExifIFD0Directory exif = mock(ExifIFD0Directory.class);
+ when(exif.containsTag(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(true);
+ when(exif.getDate(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(null);
+ Metadata metadata = new Metadata();
+
+ new ImageMetadataExtractor.ExifHandler().handle(exif, metadata);
+ assertEquals("Parsing should proceed without date", null,
+ metadata.get(TikaCoreProperties.CREATED));
+ }
+
+ @Test
+ public void testCopyUnknownFieldsHandler() throws MetadataException {
+ Directory d = mock(Directory.class);
+ Tag t1 = mock(Tag.class);
+ when(t1.getTagName()).thenReturn("Image Description");
+ when(t1.getDescription()).thenReturn("t1");
+ Tag t2 = mock(Tag.class);
+ when(t2.getTagName()).thenReturn(Metadata.KEYWORDS);
+ when(t2.getDescription()).thenReturn("known");
+ Tag t3 = mock(Tag.class);
+ when(t3.getTagName()).thenReturn(TikaCoreProperties.DESCRIPTION.getName());
+ when(t3.getDescription()).thenReturn("known");
+ List<Tag> tags = Arrays.asList(t1, t2, t3);
+ when(d.getTags()).thenReturn(tags);
+ Metadata metadata = new Metadata();
+ new ImageMetadataExtractor.CopyUnknownFieldsHandler().handle(d, metadata);
+ assertEquals("t1", metadata.get("Image Description"));
+ assertNull("keywords should be excluded from bulk copy because it is a defined field",
+ metadata.get(Metadata.KEYWORDS));
+ assertNull(metadata.get(TikaCoreProperties.DESCRIPTION));
+ }
+
+}
[10/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
index 01dd436..30f9c98 100644
--- a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
+++ b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
@@ -1,192 +1,192 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.pkg;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-
-import java.io.InputStream;
-import java.util.HashSet;
-import java.util.Set;
-
-import org.apache.commons.codec.binary.Base64;
-import org.apache.commons.compress.archivers.ArchiveStreamFactory;
-import org.apache.tika.Tika;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.helpers.DefaultHandler;
-
-/**
- * Test case for parsing zip files.
- */
-public class ZipParserTest extends AbstractPkgTest {
-
- @Test
- public void testZipParsing() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = ZipParserTest.class.getResourceAsStream(
- "/test-documents/test-documents.zip")) {
- parser.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals("application/zip", metadata.get(Metadata.CONTENT_TYPE));
- String content = handler.toString();
- assertContains("testEXCEL.xls", content);
- assertContains("testHTML.html", content);
- assertContains("testOpenOffice2.odt", content);
- assertContains("testPDF.pdf", content);
- assertContains("testPPT.ppt", content);
- assertContains("testRTF.rtf", content);
- assertContains("testTXT.txt", content);
- assertContains("testWORD.doc", content);
- assertContains("testXML.xml", content);
- }
-
- /**
- * Tests that the ParseContext parser is correctly
- * fired for all the embedded entries.
- */
- @Test
- public void testEmbedded() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = ZipParserTest.class.getResourceAsStream(
- "/test-documents/test-documents.zip")) {
- parser.parse(stream, handler, metadata, trackingContext);
- }
-
- // Should have found all 9 documents
- assertEquals(9, tracker.filenames.size());
- assertEquals(9, tracker.mediatypes.size());
- assertEquals(9, tracker.modifiedAts.size());
-
- // Should have names and modified dates, but not content types,
- // as zip doesn't store the content types
- assertEquals("testEXCEL.xls", tracker.filenames.get(0));
- assertEquals("testHTML.html", tracker.filenames.get(1));
- assertEquals("testOpenOffice2.odt", tracker.filenames.get(2));
- assertEquals("testPDF.pdf", tracker.filenames.get(3));
- assertEquals("testPPT.ppt", tracker.filenames.get(4));
- assertEquals("testRTF.rtf", tracker.filenames.get(5));
- assertEquals("testTXT.txt", tracker.filenames.get(6));
- assertEquals("testWORD.doc", tracker.filenames.get(7));
- assertEquals("testXML.xml", tracker.filenames.get(8));
-
- for(String type : tracker.mediatypes) {
- assertNull(type);
- }
- for(String crt : tracker.createdAts) {
- assertNull(crt);
- }
- for(String mod : tracker.modifiedAts) {
- assertNotNull(mod);
- assertTrue("Modified at " + mod, mod.startsWith("20"));
- }
- }
-
- /**
- * Test case for the ability of the ZIP parser to extract the name of
- * a ZIP entry even if the content of the entry is unreadable due to an
- * unsupported compression method.
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-346">TIKA-346</a>
- */
- @Test
- public void testUnsupportedZipCompressionMethod() throws Exception {
- String content = new Tika().parseToString(
- ZipParserTest.class.getResourceAsStream(
- "/test-documents/moby.zip"));
- assertContains("README", content);
- }
-
- private class GatherRelIDsDocumentExtractor implements EmbeddedDocumentExtractor {
- public Set<String> allRelIDs = new HashSet<String>();
- public boolean shouldParseEmbedded(Metadata metadata) {
- String relID = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID);
- if (relID != null) {
- allRelIDs.add(relID);
- }
- return false;
- }
-
- public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, boolean outputHtml) {
- throw new UnsupportedOperationException("should never be called");
- }
- }
-
- // TIKA-1036
- @Test
- public void testPlaceholders() throws Exception {
- String xml = getXML("testEmbedded.zip").xml;
- assertContains("<div class=\"embedded\" id=\"test1.txt\" />", xml);
- assertContains("<div class=\"embedded\" id=\"test2.txt\" />", xml);
-
- // Also make sure EMBEDDED_RELATIONSHIP_ID was
- // passed when parsing the embedded docs:
- Parser parser = new AutoDetectParser();
- ParseContext context = new ParseContext();
- context.set(Parser.class, parser);
- GatherRelIDsDocumentExtractor relIDs = new GatherRelIDsDocumentExtractor();
- context.set(EmbeddedDocumentExtractor.class, relIDs);
- try (InputStream input = getResourceAsStream("/test-documents/testEmbedded.zip")) {
- parser.parse(input,
- new BodyContentHandler(),
- new Metadata(),
- context);
- }
-
- assertTrue(relIDs.allRelIDs.contains("test1.txt"));
- assertTrue(relIDs.allRelIDs.contains("test2.txt"));
- }
-
- @Test // TIKA-936
- public void testCustomEncoding() throws Exception {
- ArchiveStreamFactory factory = new ArchiveStreamFactory();
- factory.setEntryEncoding("SJIS");
- trackingContext.set(ArchiveStreamFactory.class, factory);
-
- try (InputStream stream = TikaInputStream.get(Base64.decodeBase64(
- "UEsDBBQAAAAIAI+CvUCDo3+zIgAAACgAAAAOAAAAk/qWe4zqg4GDgi50"
- + "eHRr2tj0qulsc2pzRHN609Gm7Y1OvFxNYLHJv6ZV97yCiQEAUEsBAh"
- + "QLFAAAAAgAj4K9QIOjf7MiAAAAKAAAAA4AAAAAAAAAAAAgAAAAAAAA"
- + "AJP6lnuM6oOBg4IudHh0UEsFBgAAAAABAAEAPAAAAE4AAAAAAA=="))) {
- autoDetectParser.parse(
- stream, new DefaultHandler(),
- new Metadata(), trackingContext);
- }
-
- assertEquals(1, tracker.filenames.size());
- assertEquals(
- "\u65E5\u672C\u8A9E\u30E1\u30E2.txt",
- tracker.filenames.get(0));
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.compress.archivers.ArchiveStreamFactory;
+import org.apache.tika.Tika;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Test case for parsing zip files.
+ */
+public class ZipParserTest extends AbstractPkgTest {
+
+ @Test
+ public void testZipParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = ZipParserTest.class.getResourceAsStream(
+ "/test-documents/test-documents.zip")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals("application/zip", metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertContains("testEXCEL.xls", content);
+ assertContains("testHTML.html", content);
+ assertContains("testOpenOffice2.odt", content);
+ assertContains("testPDF.pdf", content);
+ assertContains("testPPT.ppt", content);
+ assertContains("testRTF.rtf", content);
+ assertContains("testTXT.txt", content);
+ assertContains("testWORD.doc", content);
+ assertContains("testXML.xml", content);
+ }
+
+ /**
+ * Tests that the ParseContext parser is correctly
+ * fired for all the embedded entries.
+ */
+ @Test
+ public void testEmbedded() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = ZipParserTest.class.getResourceAsStream(
+ "/test-documents/test-documents.zip")) {
+ parser.parse(stream, handler, metadata, trackingContext);
+ }
+
+ // Should have found all 9 documents
+ assertEquals(9, tracker.filenames.size());
+ assertEquals(9, tracker.mediatypes.size());
+ assertEquals(9, tracker.modifiedAts.size());
+
+ // Should have names and modified dates, but not content types,
+ // as zip doesn't store the content types
+ assertEquals("testEXCEL.xls", tracker.filenames.get(0));
+ assertEquals("testHTML.html", tracker.filenames.get(1));
+ assertEquals("testOpenOffice2.odt", tracker.filenames.get(2));
+ assertEquals("testPDF.pdf", tracker.filenames.get(3));
+ assertEquals("testPPT.ppt", tracker.filenames.get(4));
+ assertEquals("testRTF.rtf", tracker.filenames.get(5));
+ assertEquals("testTXT.txt", tracker.filenames.get(6));
+ assertEquals("testWORD.doc", tracker.filenames.get(7));
+ assertEquals("testXML.xml", tracker.filenames.get(8));
+
+ for(String type : tracker.mediatypes) {
+ assertNull(type);
+ }
+ for(String crt : tracker.createdAts) {
+ assertNull(crt);
+ }
+ for(String mod : tracker.modifiedAts) {
+ assertNotNull(mod);
+ assertTrue("Modified at " + mod, mod.startsWith("20"));
+ }
+ }
+
+ /**
+ * Test case for the ability of the ZIP parser to extract the name of
+ * a ZIP entry even if the content of the entry is unreadable due to an
+ * unsupported compression method.
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-346">TIKA-346</a>
+ */
+ @Test
+ public void testUnsupportedZipCompressionMethod() throws Exception {
+ String content = new Tika().parseToString(
+ ZipParserTest.class.getResourceAsStream(
+ "/test-documents/moby.zip"));
+ assertContains("README", content);
+ }
+
+ private class GatherRelIDsDocumentExtractor implements EmbeddedDocumentExtractor {
+ public Set<String> allRelIDs = new HashSet<String>();
+ public boolean shouldParseEmbedded(Metadata metadata) {
+ String relID = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID);
+ if (relID != null) {
+ allRelIDs.add(relID);
+ }
+ return false;
+ }
+
+ public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, boolean outputHtml) {
+ throw new UnsupportedOperationException("should never be called");
+ }
+ }
+
+ // TIKA-1036
+ @Test
+ public void testPlaceholders() throws Exception {
+ String xml = getXML("testEmbedded.zip").xml;
+ assertContains("<div class=\"embedded\" id=\"test1.txt\" />", xml);
+ assertContains("<div class=\"embedded\" id=\"test2.txt\" />", xml);
+
+ // Also make sure EMBEDDED_RELATIONSHIP_ID was
+ // passed when parsing the embedded docs:
+ Parser parser = new AutoDetectParser();
+ ParseContext context = new ParseContext();
+ context.set(Parser.class, parser);
+ GatherRelIDsDocumentExtractor relIDs = new GatherRelIDsDocumentExtractor();
+ context.set(EmbeddedDocumentExtractor.class, relIDs);
+ try (InputStream input = getResourceAsStream("/test-documents/testEmbedded.zip")) {
+ parser.parse(input,
+ new BodyContentHandler(),
+ new Metadata(),
+ context);
+ }
+
+ assertTrue(relIDs.allRelIDs.contains("test1.txt"));
+ assertTrue(relIDs.allRelIDs.contains("test2.txt"));
+ }
+
+ @Test // TIKA-936
+ public void testCustomEncoding() throws Exception {
+ ArchiveStreamFactory factory = new ArchiveStreamFactory();
+ factory.setEntryEncoding("SJIS");
+ trackingContext.set(ArchiveStreamFactory.class, factory);
+
+ try (InputStream stream = TikaInputStream.get(Base64.decodeBase64(
+ "UEsDBBQAAAAIAI+CvUCDo3+zIgAAACgAAAAOAAAAk/qWe4zqg4GDgi50"
+ + "eHRr2tj0qulsc2pzRHN609Gm7Y1OvFxNYLHJv6ZV97yCiQEAUEsBAh"
+ + "QLFAAAAAgAj4K9QIOjf7MiAAAAKAAAAA4AAAAAAAAAAAAgAAAAAAAA"
+ + "AJP6lnuM6oOBg4IudHh0UEsFBgAAAAABAAEAPAAAAE4AAAAAAA=="))) {
+ autoDetectParser.parse(
+ stream, new DefaultHandler(),
+ new Metadata(), trackingContext);
+ }
+
+ assertEquals(1, tracker.filenames.size());
+ assertEquals(
+ "\u65E5\u672C\u8A9E\u30E1\u30E2.txt",
+ tracker.filenames.get(0));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-pdf-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/pom.xml b/tika-parser-modules/tika-parser-pdf-module/pom.xml
index 11f259e..568303c 100644
--- a/tika-parser-modules/tika-parser-pdf-module/pom.xml
+++ b/tika-parser-modules/tika-parser-pdf-module/pom.xml
@@ -1,126 +1,126 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-modules</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-pdf-module</artifactId>
- <name>Apache Tika parser pdf module</name>
- <url>http://tika.apache.org/</url>
-
- <properties>
- <commons.logging.version>1.1.3</commons.logging.version>
- </properties>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-multimedia-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-xmp-commons</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>commons-io</groupId>
- <artifactId>commons-io</artifactId>
- <version>${commons.io.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.pdfbox</groupId>
- <artifactId>pdfbox</artifactId>
- <version>${pdfbox.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.pdfbox</groupId>
- <artifactId>pdfbox-tools</artifactId>
- <version>${pdfbox.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.pdfbox</groupId>
- <artifactId>jempbox</artifactId>
- <version>${jempbox.version}</version>
- </dependency>
- <!-- TIKA-370: PDFBox declares the Bouncy Castle dependencies
- as optional, but we prefer to have them always to avoid
- problems with encrypted PDFs. -->
- <dependency>
- <groupId>org.bouncycastle</groupId>
- <artifactId>bcmail-jdk15on</artifactId>
- <version>${bouncycastle.version}</version>
- </dependency>
- <dependency>
- <groupId>org.bouncycastle</groupId>
- <artifactId>bcprov-jdk15on</artifactId>
- <version>${bouncycastle.version}</version>
- </dependency>
- <dependency>
- <groupId>commons-logging</groupId>
- <artifactId>commons-logging</artifactId>
- <version>${commons.logging.version}</version>
- </dependency>
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-log4j12</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-package-module</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-text-module</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-office-module</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
- </dependency>
- <!-- Copied from PDFBox:
- For legal reasons (incompatible license), jai-imageio-core is to be used
- only in the tests and may not be distributed. See also LEGAL-195 -->
- <dependency>
- <groupId>com.github.jai-imageio</groupId>
- <artifactId>jai-imageio-core</artifactId>
- <scope>test</scope>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-pdf-module</artifactId>
+ <name>Apache Tika parser pdf module</name>
+ <url>http://tika.apache.org/</url>
+
+ <properties>
+ <commons.logging.version>1.1.3</commons.logging.version>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-multimedia-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-xmp-commons</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>${commons.io.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.pdfbox</groupId>
+ <artifactId>pdfbox</artifactId>
+ <version>${pdfbox.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.pdfbox</groupId>
+ <artifactId>pdfbox-tools</artifactId>
+ <version>${pdfbox.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.pdfbox</groupId>
+ <artifactId>jempbox</artifactId>
+ <version>${jempbox.version}</version>
+ </dependency>
+ <!-- TIKA-370: PDFBox declares the Bouncy Castle dependencies
+ as optional, but we prefer to have them always to avoid
+ problems with encrypted PDFs. -->
+ <dependency>
+ <groupId>org.bouncycastle</groupId>
+ <artifactId>bcmail-jdk15on</artifactId>
+ <version>${bouncycastle.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.bouncycastle</groupId>
+ <artifactId>bcprov-jdk15on</artifactId>
+ <version>${bouncycastle.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-logging</groupId>
+ <artifactId>commons-logging</artifactId>
+ <version>${commons.logging.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-package-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-text-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-office-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <!-- Copied from PDFBox:
+ For legal reasons (incompatible license), jai-imageio-core is to be used
+ only in the tests and may not be distributed. See also LEGAL-195 -->
+ <dependency>
+ <groupId>com.github.jai-imageio</groupId>
+ <artifactId>jai-imageio-core</artifactId>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/module/pdf/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/module/pdf/internal/Activator.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/module/pdf/internal/Activator.java
index 9860934..d38a96d 100644
--- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/module/pdf/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/module/pdf/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.pdf.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
- @Override
- public void start(BundleContext context) throws Exception {
-
- registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
- }
-
- @Override
- public void stop(BundleContext context) throws Exception {
-
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.pdf.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+ @Override
+ public void start(BundleContext context) throws Exception {
+
+ registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+ }
+
+ @Override
+ public void stop(BundleContext context) throws Exception {
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-scientific-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/pom.xml b/tika-parser-modules/tika-parser-scientific-module/pom.xml
index 7afe2d6..1b3eb96 100644
--- a/tika-parser-modules/tika-parser-scientific-module/pom.xml
+++ b/tika-parser-modules/tika-parser-scientific-module/pom.xml
@@ -1,136 +1,136 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-modules</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-scientific-module</artifactId>
- <name>Apache Tika parser scientific module</name>
- <url>http://tika.apache.org/</url>
-
- <properties>
- <netcdf-java.version>4.5.5</netcdf-java.version>
- <sis.version>0.6</sis.version>
- </properties>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-exec</artifactId>
- <version>${commons.exec}</version>
- </dependency>
- <dependency>
- <groupId>com.googlecode.json-simple</groupId>
- <artifactId>json-simple</artifactId>
- <version>1.1.1</version>
- <exclusions>
- <exclusion>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- <dependency>
- <groupId>org.apache.sis.core</groupId>
- <artifactId>sis-utility</artifactId>
- <version>${sis.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.sis.storage</groupId>
- <artifactId>sis-netcdf</artifactId>
- <version>${sis.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.sis.core</groupId>
- <artifactId>sis-metadata</artifactId>
- <version>${sis.version}</version>
- </dependency>
- <!-- edu.ucar dependencies -->
- <dependency>
- <groupId>edu.ucar</groupId>
- <artifactId>netcdf4</artifactId>
- <version>${netcdf-java.version}</version>
- </dependency>
- <dependency>
- <groupId>edu.ucar</groupId>
- <artifactId>grib</artifactId>
- <version>${netcdf-java.version}</version>
- </dependency>
- <dependency>
- <groupId>edu.ucar</groupId>
- <artifactId>cdm</artifactId>
- <version>${netcdf-java.version}</version>
- <exclusions>
- <exclusion>
- <groupId>org.slf4j</groupId>
- <artifactId>jcl-over-slf4j</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- <dependency>
- <groupId>edu.ucar</groupId>
- <artifactId>httpservices</artifactId>
- <version>${netcdf-java.version}</version>
- </dependency>
- <!-- Apache cTAKES -->
- <dependency>
- <groupId>org.apache.ctakes</groupId>
- <artifactId>ctakes-core</artifactId>
- <version>3.2.2</version>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>commons-io</groupId>
- <artifactId>commons-io</artifactId>
- <version>${commons.io.version}</version>
- </dependency>
- <!-- Upstream parser libraries -->
- <dependency>
- <groupId>net.sourceforge.jmatio</groupId>
- <artifactId>jmatio</artifactId>
- <version>1.0</version>
- </dependency>
- <!-- Apache Commons CSV -->
- <dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-csv</artifactId>
- <version>1.0</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-text-module</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-scientific-module</artifactId>
+ <name>Apache Tika parser scientific module</name>
+ <url>http://tika.apache.org/</url>
+
+ <properties>
+ <netcdf-java.version>4.5.5</netcdf-java.version>
+ <sis.version>0.6</sis.version>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-exec</artifactId>
+ <version>${commons.exec}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.googlecode.json-simple</groupId>
+ <artifactId>json-simple</artifactId>
+ <version>1.1.1</version>
+ <exclusions>
+ <exclusion>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.sis.core</groupId>
+ <artifactId>sis-utility</artifactId>
+ <version>${sis.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.sis.storage</groupId>
+ <artifactId>sis-netcdf</artifactId>
+ <version>${sis.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.sis.core</groupId>
+ <artifactId>sis-metadata</artifactId>
+ <version>${sis.version}</version>
+ </dependency>
+ <!-- edu.ucar dependencies -->
+ <dependency>
+ <groupId>edu.ucar</groupId>
+ <artifactId>netcdf4</artifactId>
+ <version>${netcdf-java.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>edu.ucar</groupId>
+ <artifactId>grib</artifactId>
+ <version>${netcdf-java.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>edu.ucar</groupId>
+ <artifactId>cdm</artifactId>
+ <version>${netcdf-java.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>org.slf4j</groupId>
+ <artifactId>jcl-over-slf4j</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>edu.ucar</groupId>
+ <artifactId>httpservices</artifactId>
+ <version>${netcdf-java.version}</version>
+ </dependency>
+ <!-- Apache cTAKES -->
+ <dependency>
+ <groupId>org.apache.ctakes</groupId>
+ <artifactId>ctakes-core</artifactId>
+ <version>3.2.2</version>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>${commons.io.version}</version>
+ </dependency>
+ <!-- Upstream parser libraries -->
+ <dependency>
+ <groupId>net.sourceforge.jmatio</groupId>
+ <artifactId>jmatio</artifactId>
+ <version>1.0</version>
+ </dependency>
+ <!-- Apache Commons CSV -->
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-csv</artifactId>
+ <version>1.0</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-text-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/module/scientific/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/module/scientific/internal/Activator.java b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/module/scientific/internal/Activator.java
index 0195b63..741b64e 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/module/scientific/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/module/scientific/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.scientific.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
- @Override
- public void start(BundleContext context) throws Exception {
-
- registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
- }
-
- @Override
- public void stop(BundleContext context) throws Exception {
-
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.scientific.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+ @Override
+ public void start(BundleContext context) throws Exception {
+
+ registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+ }
+
+ @Override
+ public void stop(BundleContext context) throws Exception {
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/hdf/HDFParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/hdf/HDFParser.java b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/hdf/HDFParser.java
index 0a3121b..821493b 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/hdf/HDFParser.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/hdf/HDFParser.java
@@ -1,122 +1,122 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.hdf;
-
-//JDK imports
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.netcdf.NetCDFParser;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import ucar.nc2.Attribute;
-import ucar.nc2.Group;
-import ucar.nc2.NetcdfFile;
-
-/**
- *
- * Since the {@link NetCDFParser} depends on the <a
- * href="http://www.unidata.ucar.edu/software/netcdf-java" >NetCDF-Java</a> API,
- * we are able to use it to parse HDF files as well. See <a href=
- * "http://www.unidata.ucar.edu/software/netcdf-java/formats/FileTypes.html"
- * >this link</a> for more information.
- */
-public class HDFParser extends AbstractParser {
-
- /** Serial version UID */
- private static final long serialVersionUID = 1091208208003437549L;
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.singleton(MediaType.application("x-hdf"));
-
- /*
- * (non-Javadoc)
- *
- * @see
- * org.apache.tika.parser.netcdf.NetCDFParser#getSupportedTypes(org.apache
- * .tika.parser.ParseContext)
- */
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- /*
- * (non-Javadoc)
- *
- * @see
- * org.apache.tika.parser.netcdf.NetCDFParser#parse(java.io.InputStream,
- * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata,
- * org.apache.tika.parser.ParseContext)
- */
- public void parse(InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context) throws IOException,
- SAXException, TikaException {
- ByteArrayOutputStream os = new ByteArrayOutputStream();
- IOUtils.copy(stream, os);
-
- String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
- if (name == null) {
- name = "";
- }
- try {
- NetcdfFile ncFile = NetcdfFile.openInMemory(name, os.toByteArray());
- unravelStringMet(ncFile, null, metadata);
- } catch (IOException e) {
- throw new TikaException("HDF parse error", e);
- }
-
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
- xhtml.endDocument();
- }
-
- protected void unravelStringMet(NetcdfFile ncFile, Group group, Metadata met) {
- if (group == null) {
- group = ncFile.getRootGroup();
- }
-
- // get file type
- met.set("File-Type-Description", ncFile.getFileTypeDescription());
- // unravel its string attrs
- for (Attribute attribute : group.getAttributes()) {
- if (attribute.isString()) {
- met.add(attribute.getFullName(), attribute.getStringValue());
- } else {
- // try and cast its value to a string
- met.add(attribute.getFullName(), String.valueOf(attribute
- .getNumericValue()));
- }
- }
-
- for (Group g : group.getGroups()) {
- unravelStringMet(ncFile, g, met);
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.hdf;
+
+//JDK imports
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.netcdf.NetCDFParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import ucar.nc2.Attribute;
+import ucar.nc2.Group;
+import ucar.nc2.NetcdfFile;
+
+/**
+ *
+ * Since the {@link NetCDFParser} depends on the <a
+ * href="http://www.unidata.ucar.edu/software/netcdf-java" >NetCDF-Java</a> API,
+ * we are able to use it to parse HDF files as well. See <a href=
+ * "http://www.unidata.ucar.edu/software/netcdf-java/formats/FileTypes.html"
+ * >this link</a> for more information.
+ */
+public class HDFParser extends AbstractParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = 1091208208003437549L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.singleton(MediaType.application("x-hdf"));
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see
+ * org.apache.tika.parser.netcdf.NetCDFParser#getSupportedTypes(org.apache
+ * .tika.parser.ParseContext)
+ */
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see
+ * org.apache.tika.parser.netcdf.NetCDFParser#parse(java.io.InputStream,
+ * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata,
+ * org.apache.tika.parser.ParseContext)
+ */
+ public void parse(InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context) throws IOException,
+ SAXException, TikaException {
+ ByteArrayOutputStream os = new ByteArrayOutputStream();
+ IOUtils.copy(stream, os);
+
+ String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+ if (name == null) {
+ name = "";
+ }
+ try {
+ NetcdfFile ncFile = NetcdfFile.openInMemory(name, os.toByteArray());
+ unravelStringMet(ncFile, null, metadata);
+ } catch (IOException e) {
+ throw new TikaException("HDF parse error", e);
+ }
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.endDocument();
+ }
+
+ protected void unravelStringMet(NetcdfFile ncFile, Group group, Metadata met) {
+ if (group == null) {
+ group = ncFile.getRootGroup();
+ }
+
+ // get file type
+ met.set("File-Type-Description", ncFile.getFileTypeDescription());
+ // unravel its string attrs
+ for (Attribute attribute : group.getAttributes()) {
+ if (attribute.isString()) {
+ met.add(attribute.getFullName(), attribute.getStringValue());
+ } else {
+ // try and cast its value to a string
+ met.add(attribute.getFullName(), String.valueOf(attribute
+ .getNumericValue()));
+ }
+ }
+
+ for (Group g : group.getGroups()) {
+ unravelStringMet(ncFile, g, met);
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java
index 1ee4dc7..d54754b 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java
@@ -1,72 +1,72 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.hdf;
-
-//JDK imports
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-
-import org.apache.tika.TikaTest;
-import org.junit.Test;
-
-//TIKA imports
-
-/**
- *
- * Test suite for the {@link HDFParser}.
- *
- */
-public class HDFParserTest extends TikaTest {
-
- @Test
- public void testParseGlobalMetadata() throws Exception {
- if(System.getProperty("java.version").startsWith("1.5")) {
- return;
- }
- /*
- * this is a publicly available HDF5 file from the MLS mission:
- *
- *
- * ftp://acdisc.gsfc.nasa.gov/data/s4pa///Aura_MLS_Level2/ML2O3.002//2009
- * /MLS-Aura_L2GP-O3_v02-23-c01_2009d122.he5
- */
-
- XMLResult r = getXML("test.he5", new HDFParser());
- assertNotNull(r.metadata);
- assertEquals("5", r.metadata.get("GranuleMonth"));
- }
-
- @Test
- public void testHDF4() throws Exception {
- if(System.getProperty("java.version").startsWith("1.5")) {
- return;
- }
-
- /*
- * this is a publicly available HDF4 file from the HD4 examples:
- *
- * http://www.hdfgroup.org/training/hdf4_chunking/Chunkit/bin/input54kmdata.hdf
- */
- XMLResult r = getXML("test.hdf", new HDFParser());
- assertNotNull(r.metadata);
- assertEquals("Direct read of HDF4 file through CDM library", r.metadata.get("_History"));
- assertEquals("Ascending", r.metadata.get("Pass"));
- assertEquals("Hierarchical Data Format, version 4",
- r.metadata.get("File-Type-Description"));
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.hdf;
+
+//JDK imports
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+import org.apache.tika.TikaTest;
+import org.junit.Test;
+
+//TIKA imports
+
+/**
+ *
+ * Test suite for the {@link HDFParser}.
+ *
+ */
+public class HDFParserTest extends TikaTest {
+
+ @Test
+ public void testParseGlobalMetadata() throws Exception {
+ if(System.getProperty("java.version").startsWith("1.5")) {
+ return;
+ }
+ /*
+ * this is a publicly available HDF5 file from the MLS mission:
+ *
+ *
+ * ftp://acdisc.gsfc.nasa.gov/data/s4pa///Aura_MLS_Level2/ML2O3.002//2009
+ * /MLS-Aura_L2GP-O3_v02-23-c01_2009d122.he5
+ */
+
+ XMLResult r = getXML("test.he5", new HDFParser());
+ assertNotNull(r.metadata);
+ assertEquals("5", r.metadata.get("GranuleMonth"));
+ }
+
+ @Test
+ public void testHDF4() throws Exception {
+ if(System.getProperty("java.version").startsWith("1.5")) {
+ return;
+ }
+
+ /*
+ * this is a publicly available HDF4 file from the HD4 examples:
+ *
+ * http://www.hdfgroup.org/training/hdf4_chunking/Chunkit/bin/input54kmdata.hdf
+ */
+ XMLResult r = getXML("test.hdf", new HDFParser());
+ assertNotNull(r.metadata);
+ assertEquals("Direct read of HDF4 file through CDM library", r.metadata.get("_History"));
+ assertEquals("Ascending", r.metadata.get("Pass"));
+ assertEquals("Hierarchical Data Format, version 4",
+ r.metadata.get("File-Type-Description"));
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java
index 7d0f2e8..77a8cc8 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java
@@ -1,61 +1,61 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.netcdf;
-
-//JDK imports
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.junit.Test;
-
-//TIKA imports
-
-/**
- * Test cases to exercise the {@link NetCDFParser}.
- */
-public class NetCDFParserTest extends TikaTest {
-
- @Test
- public void testParseGlobalMetadata() throws Exception {
-
- XMLResult r = getXML("sresa1b_ncar_ccsm3_0_run1_200001.nc", new NetCDFParser());
- assertEquals(r.metadata.get(TikaCoreProperties.TITLE),
- "model output prepared for IPCC AR4");
- assertEquals(r.metadata.get(Metadata.CONTACT), "ccsm@ucar.edu");
- assertEquals(r.metadata.get(Metadata.PROJECT_ID),
- "IPCC Fourth Assessment");
- assertEquals(r.metadata.get(Metadata.CONVENTIONS), "CF-1.0");
- assertEquals(r.metadata.get(Metadata.REALIZATION), "1");
- assertEquals(r.metadata.get(Metadata.EXPERIMENT_ID),
- "720 ppm stabilization experiment (SRESA1B)");
- assertEquals(r.metadata.get("File-Type-Description"),
- "NetCDF-3/CDM");
-
- assertContains("long_name = \"Surface area\"", r.xml);
- assertContains("float area(lat=128, lon=256)", r.xml);
- assertContains("float lat(lat=128)", r.xml);
- assertContains("double lat_bnds(lat=128, bnds=2)", r.xml);
- assertContains("double lon_bnds(lon=256, bnds=2)", r.xml);
-
-
-
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.netcdf;
+
+//JDK imports
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Test;
+
+//TIKA imports
+
+/**
+ * Test cases to exercise the {@link NetCDFParser}.
+ */
+public class NetCDFParserTest extends TikaTest {
+
+ @Test
+ public void testParseGlobalMetadata() throws Exception {
+
+ XMLResult r = getXML("sresa1b_ncar_ccsm3_0_run1_200001.nc", new NetCDFParser());
+ assertEquals(r.metadata.get(TikaCoreProperties.TITLE),
+ "model output prepared for IPCC AR4");
+ assertEquals(r.metadata.get(Metadata.CONTACT), "ccsm@ucar.edu");
+ assertEquals(r.metadata.get(Metadata.PROJECT_ID),
+ "IPCC Fourth Assessment");
+ assertEquals(r.metadata.get(Metadata.CONVENTIONS), "CF-1.0");
+ assertEquals(r.metadata.get(Metadata.REALIZATION), "1");
+ assertEquals(r.metadata.get(Metadata.EXPERIMENT_ID),
+ "720 ppm stabilization experiment (SRESA1B)");
+ assertEquals(r.metadata.get("File-Type-Description"),
+ "NetCDF-3/CDM");
+
+ assertContains("long_name = \"Surface area\"", r.xml);
+ assertContains("float area(lat=128, lon=256)", r.xml);
+ assertContains("float lat(lat=128)", r.xml);
+ assertContains("double lat_bnds(lat=128, bnds=2)", r.xml);
+ assertContains("double lon_bnds(lon=256, bnds=2)", r.xml);
+
+
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/pom.xml b/tika-parser-modules/tika-parser-text-module/pom.xml
index 1389d08..aca729b 100644
--- a/tika-parser-modules/tika-parser-text-module/pom.xml
+++ b/tika-parser-modules/tika-parser-text-module/pom.xml
@@ -1,67 +1,67 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-modules</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-text-module</artifactId>
- <name>Apache Tika parser text module</name>
- <url>http://tika.apache.org/</url>
-
- <properties>
- <commons.logging.version>1.1.3</commons.logging.version>
- </properties>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>com.googlecode.juniversalchardet</groupId>
- <artifactId>juniversalchardet</artifactId>
- <version>1.0.3</version>
- </dependency>
- <dependency>
- <groupId>commons-io</groupId>
- <artifactId>commons-io</artifactId>
- <version>${commons.io.version}</version>
- </dependency>
- <dependency>
- <groupId>commons-codec</groupId>
- <artifactId>commons-codec</artifactId>
- <version>${codec.version}</version>
- </dependency>
- <dependency>
- <groupId>commons-logging</groupId>
- <artifactId>commons-logging</artifactId>
- <version>${commons.logging.version}</version>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-text-module</artifactId>
+ <name>Apache Tika parser text module</name>
+ <url>http://tika.apache.org/</url>
+
+ <properties>
+ <commons.logging.version>1.1.3</commons.logging.version>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.googlecode.juniversalchardet</groupId>
+ <artifactId>juniversalchardet</artifactId>
+ <version>1.0.3</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>${commons.io.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ <version>${codec.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-logging</groupId>
+ <artifactId>commons-logging</artifactId>
+ <version>${commons.logging.version}</version>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/module/text/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/module/text/internal/Activator.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/module/text/internal/Activator.java
index 80716d8..59836c6 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/module/text/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/module/text/internal/Activator.java
@@ -1,20 +1,20 @@
-package org.apache.tika.module.text.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
- @Override
- public void start(BundleContext context) throws Exception {
-
- registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
- }
-
- @Override
- public void stop(BundleContext context) throws Exception {
-
- }
-
-}
+package org.apache.tika.module.text.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+ @Override
+ public void start(BundleContext context) throws Exception {
+
+ registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+ }
+
+ @Override
+ public void stop(BundleContext context) throws Exception {
+
+ }
+
+}
[19/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmBlockInfo.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmBlockInfo.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmBlockInfo.java
index 20458bc..4c2bdfd 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmBlockInfo.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmBlockInfo.java
@@ -1,125 +1,125 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertTrue;
-
-import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
-import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
-import org.apache.tika.parser.chm.accessor.ChmItspHeader;
-import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
-import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
-import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.apache.tika.parser.chm.lzx.ChmBlockInfo;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Test;
-
-/**
- * Tests major functionality of ChmBlockInfo
- *
- */
-public class TestChmBlockInfo {
- private byte[] data;
- private ChmBlockInfo chmBlockInfo;
- private ChmDirectoryListingSet chmDirListCont = null;
- private ChmLzxcResetTable clrt = null;
- private ChmLzxcControlData chmLzxcControlData = null;
-
- @Before
- public void setUp() throws Exception {
- data = TestParameters.chmData;
- /* Creates and parses itsf header */
- ChmItsfHeader chmItsHeader = new ChmItsfHeader();
- // chmItsHeader.parse(Arrays.copyOfRange(data, 0,
- // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
- chmItsHeader.parse(ChmCommons.copyOfRange(data, 0,
- ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
- /* Creates and parses itsp block */
- ChmItspHeader chmItspHeader = new ChmItspHeader();
- // chmItspHeader.parse(Arrays.copyOfRange( data, (int)
- // chmItsHeader.getDirOffset(),
- // (int) chmItsHeader.getDirOffset()
- // + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
- chmItspHeader.parse(ChmCommons.copyOfRange(data,
- (int) chmItsHeader.getDirOffset(),
- (int) chmItsHeader.getDirOffset()
- + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
- /* Creating instance of ChmDirListingContainer */
- chmDirListCont = new ChmDirectoryListingSet(data, chmItsHeader,
- chmItspHeader);
- int indexOfControlData = chmDirListCont.getControlDataIndex();
-
- int indexOfResetTable = ChmCommons.indexOfResetTableBlock(data,
- ChmConstants.LZXC.getBytes(UTF_8));
- byte[] dir_chunk = null;
- if (indexOfResetTable > 0) {
- // dir_chunk = Arrays.copyOfRange( data, indexOfResetTable,
- // indexOfResetTable
- // +
- // chmDirListCont.getDirectoryListingEntryList().get(indexOfControlData).getLength());
- dir_chunk = ChmCommons.copyOfRange(data, indexOfResetTable,
- indexOfResetTable
- + chmDirListCont.getDirectoryListingEntryList()
- .get(indexOfControlData).getLength());
- }
-
- /* Creates and parses control block */
- chmLzxcControlData = new ChmLzxcControlData();
- chmLzxcControlData.parse(dir_chunk, chmLzxcControlData);
-
- int indexOfFeList = chmDirListCont.getResetTableIndex();
- int startIndex = (int) chmDirListCont.getDataOffset()
- + chmDirListCont.getDirectoryListingEntryList()
- .get(indexOfFeList).getOffset();
- // dir_chunk = Arrays.copyOfRange(data, startIndex , startIndex +
- // chmDirListCont.getDirectoryListingEntryList().get(indexOfFeList).getLength());
- dir_chunk = ChmCommons.copyOfRange(data, startIndex, startIndex
- + chmDirListCont.getDirectoryListingEntryList().get(indexOfFeList).getLength());
- clrt = new ChmLzxcResetTable();
- clrt.parse(dir_chunk, clrt);
- }
-
- @Test
- public void testToString() {
- if (chmBlockInfo == null)
- testGetChmBlockInfo();
- assertTrue(chmBlockInfo.toString().length() > 0);
- }
-
- @Test
- public void testGetChmBlockInfo() {
- for (DirectoryListingEntry directoryListingEntry : chmDirListCont.getDirectoryListingEntryList()) {
- chmBlockInfo = ChmBlockInfo.getChmBlockInfoInstance(
- directoryListingEntry, (int) clrt.getBlockLen(),
- chmLzxcControlData);
- // Assert.assertTrue(!directoryListingEntry.getName().isEmpty() &&
- // chmBlockInfo.toString() != null);
- assertTrue(!ChmCommons.isEmpty(directoryListingEntry
- .getName()) && chmBlockInfo.toString() != null);
- }
- }
-
- @After
- public void tearDown() throws Exception {
- data = null;
- chmBlockInfo = null;
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
+import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
+import org.apache.tika.parser.chm.accessor.ChmItspHeader;
+import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
+import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.lzx.ChmBlockInfo;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Tests major functionality of ChmBlockInfo
+ *
+ */
+public class TestChmBlockInfo {
+ private byte[] data;
+ private ChmBlockInfo chmBlockInfo;
+ private ChmDirectoryListingSet chmDirListCont = null;
+ private ChmLzxcResetTable clrt = null;
+ private ChmLzxcControlData chmLzxcControlData = null;
+
+ @Before
+ public void setUp() throws Exception {
+ data = TestParameters.chmData;
+ /* Creates and parses itsf header */
+ ChmItsfHeader chmItsHeader = new ChmItsfHeader();
+ // chmItsHeader.parse(Arrays.copyOfRange(data, 0,
+ // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
+ chmItsHeader.parse(ChmCommons.copyOfRange(data, 0,
+ ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
+ /* Creates and parses itsp block */
+ ChmItspHeader chmItspHeader = new ChmItspHeader();
+ // chmItspHeader.parse(Arrays.copyOfRange( data, (int)
+ // chmItsHeader.getDirOffset(),
+ // (int) chmItsHeader.getDirOffset()
+ // + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
+ chmItspHeader.parse(ChmCommons.copyOfRange(data,
+ (int) chmItsHeader.getDirOffset(),
+ (int) chmItsHeader.getDirOffset()
+ + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
+ /* Creating instance of ChmDirListingContainer */
+ chmDirListCont = new ChmDirectoryListingSet(data, chmItsHeader,
+ chmItspHeader);
+ int indexOfControlData = chmDirListCont.getControlDataIndex();
+
+ int indexOfResetTable = ChmCommons.indexOfResetTableBlock(data,
+ ChmConstants.LZXC.getBytes(UTF_8));
+ byte[] dir_chunk = null;
+ if (indexOfResetTable > 0) {
+ // dir_chunk = Arrays.copyOfRange( data, indexOfResetTable,
+ // indexOfResetTable
+ // +
+ // chmDirListCont.getDirectoryListingEntryList().get(indexOfControlData).getLength());
+ dir_chunk = ChmCommons.copyOfRange(data, indexOfResetTable,
+ indexOfResetTable
+ + chmDirListCont.getDirectoryListingEntryList()
+ .get(indexOfControlData).getLength());
+ }
+
+ /* Creates and parses control block */
+ chmLzxcControlData = new ChmLzxcControlData();
+ chmLzxcControlData.parse(dir_chunk, chmLzxcControlData);
+
+ int indexOfFeList = chmDirListCont.getResetTableIndex();
+ int startIndex = (int) chmDirListCont.getDataOffset()
+ + chmDirListCont.getDirectoryListingEntryList()
+ .get(indexOfFeList).getOffset();
+ // dir_chunk = Arrays.copyOfRange(data, startIndex , startIndex +
+ // chmDirListCont.getDirectoryListingEntryList().get(indexOfFeList).getLength());
+ dir_chunk = ChmCommons.copyOfRange(data, startIndex, startIndex
+ + chmDirListCont.getDirectoryListingEntryList().get(indexOfFeList).getLength());
+ clrt = new ChmLzxcResetTable();
+ clrt.parse(dir_chunk, clrt);
+ }
+
+ @Test
+ public void testToString() {
+ if (chmBlockInfo == null)
+ testGetChmBlockInfo();
+ assertTrue(chmBlockInfo.toString().length() > 0);
+ }
+
+ @Test
+ public void testGetChmBlockInfo() {
+ for (DirectoryListingEntry directoryListingEntry : chmDirListCont.getDirectoryListingEntryList()) {
+ chmBlockInfo = ChmBlockInfo.getChmBlockInfoInstance(
+ directoryListingEntry, (int) clrt.getBlockLen(),
+ chmLzxcControlData);
+ // Assert.assertTrue(!directoryListingEntry.getName().isEmpty() &&
+ // chmBlockInfo.toString() != null);
+ assertTrue(!ChmCommons.isEmpty(directoryListingEntry
+ .getName()) && chmBlockInfo.toString() != null);
+ }
+ }
+
+ @After
+ public void tearDown() throws Exception {
+ data = null;
+ chmBlockInfo = null;
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java
index 5f53870..229277d 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java
@@ -1,212 +1,212 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm;
-
-import static java.nio.charset.StandardCharsets.ISO_8859_1;
-import static org.junit.Assert.assertTrue;
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Locale;
-import java.util.Set;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.regex.Pattern;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
-import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
-import org.apache.tika.parser.chm.core.ChmExtractor;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.SAXException;
-
-public class TestChmExtraction extends TikaTest {
-
- private final Parser parser = new ChmParser();
-
- private final List<String> files = Arrays.asList(
- "/test-documents/testChm.chm",
- "/test-documents/testChm2.chm",
- "/test-documents/testChm3.chm");
-
- @Test
- public void testGetText() throws Exception {
- BodyContentHandler handler = new BodyContentHandler();
- new ChmParser().parse(
- new ByteArrayInputStream(TestParameters.chmData),
- handler, new Metadata(), new ParseContext());
- assertTrue(handler.toString().contains(
- "The TCard method accepts only numeric arguments"));
- }
-
- @Test
- public void testChmParser() throws Exception{
- for (String fileName : files) {
- InputStream stream = TestChmExtraction.class.getResourceAsStream(fileName);
- testingChm(stream);
- }
- }
-
- private void testingChm(InputStream stream) throws IOException, SAXException, TikaException {
- try {
- BodyContentHandler handler = new BodyContentHandler(-1);
- parser.parse(stream, handler, new Metadata(), new ParseContext());
- assertTrue(!handler.toString().isEmpty());
- } finally {
- stream.close();
- }
- }
-
- @Test
- public void testExtractChmEntries() throws TikaException, IOException{
- for (String fileName : files) {
- try (InputStream stream = TestChmExtraction.class.getResourceAsStream(fileName)) {
- testExtractChmEntry(stream);
- }
- }
- }
-
- protected boolean findZero(byte[] textData) {
- for (byte b : textData) {
- if (b==0) {
- return true;
- }
- }
-
- return false;
- }
-
- protected boolean niceAscFileName(String name) {
- for (char c : name.toCharArray()) {
- if (c>=127 || c<32) {
- //non-ascii char or control char
- return false;
- }
- }
-
- return true;
- }
-
- protected void testExtractChmEntry(InputStream stream) throws TikaException, IOException{
- ChmExtractor chmExtractor = new ChmExtractor(stream);
- ChmDirectoryListingSet entries = chmExtractor.getChmDirList();
- final Pattern htmlPairP = Pattern.compile("\\Q<html\\E.+\\Q</html>\\E"
- , Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);
-
- Set<String> names = new HashSet<String>();
-
- for (DirectoryListingEntry directoryListingEntry : entries.getDirectoryListingEntryList()) {
- byte[] data = chmExtractor.extractChmEntry(directoryListingEntry);
-
- //Entry names should be nice. Disable this if the test chm do have bad looking but valid entry names.
- if (! niceAscFileName(directoryListingEntry.getName())) {
- throw new TikaException("Warning: File name contains a non ascii char : " + directoryListingEntry.getName());
- }
-
- final String lowName = directoryListingEntry.getName().toLowerCase(Locale.ROOT);
-
- //check duplicate entry name which is seen before.
- if (names.contains(lowName)) {
- throw new TikaException("Duplicate File name detected : " + directoryListingEntry.getName());
- }
- names.add(lowName);
-
- if (lowName.endsWith(".html")
- || lowName.endsWith(".htm")
- || lowName.endsWith(".hhk")
- || lowName.endsWith(".hhc")
- //|| name.endsWith(".bmp")
- ) {
- if (findZero(data)) {
- throw new TikaException("Xhtml/text file contains '\\0' : " + directoryListingEntry.getName());
- }
-
- //validate html
- String html = new String(data, ISO_8859_1);
- if (! htmlPairP.matcher(html).find()) {
- System.err.println(lowName + " is invalid.");
- System.err.println(html);
- throw new TikaException("Invalid xhtml file : " + directoryListingEntry.getName());
- }
-// else {
-// System.err.println(directoryListingEntry.getName() + " is valid.");
-// }
- }
- }
- }
-
-
- @Test
- public void testMultiThreadedChmExtraction() throws InterruptedException {
- ExecutorService executor = Executors.newFixedThreadPool(TestParameters.NTHREADS);
- for (int i = 0; i < TestParameters.NTHREADS; i++) {
- executor.execute(new Runnable() {
- public void run() {
- for (String fileName : files) {
- InputStream stream = null;
- try {
- stream = TestChmExtraction.class.getResourceAsStream(fileName);
- BodyContentHandler handler = new BodyContentHandler(-1);
- parser.parse(stream, handler, new Metadata(), new ParseContext());
- assertTrue(!handler.toString().isEmpty());
- } catch (Exception e) {
- e.printStackTrace();
- } finally {
- try {
- stream.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- }
- });
- }
- executor.shutdown();
- // Waits until all threads will have finished
- while (!executor.isTerminated()) {
- Thread.sleep(500);
- }
- }
-
- @Test
- public void test_TIKA_1446() throws Exception {
- String[] chemFiles = {
- "admin.chm",
- "cmak_ops.CHM",
- "comexp.CHM",
- "gpedit.CHM",
- "IMJPCL.CHM",
- "IMJPCLE.CHM",
- "IMTCEN.CHM",
- "tcpip.CHM",
- "wmicontrol.CHM"
- };
- for (String fileName : chemFiles) {
- testingChm(getTestDocumentAsStream("chm/"+fileName));
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.regex.Pattern;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.core.ChmExtractor;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.SAXException;
+
+/**
+ * End-to-end tests for CHM parsing and entry extraction, run against the
+ * in-memory sample ({@code TestParameters.chmData}) and the CHM files shipped
+ * as test resources.
+ */
+public class TestChmExtraction extends TikaTest {
+
+    /**
+     * Finds an opening {@code <html} that is followed, possibly many lines
+     * later, by a closing {@code </html>}. Compiled once and reused.
+     */
+    private static final Pattern HTML_PAIR = Pattern.compile("\\Q<html\\E.+\\Q</html>\\E",
+            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);
+
+    /** Parser instance shared by the file-based tests, including the multi-threaded one. */
+    private final Parser parser = new ChmParser();
+
+    /** Classpath locations of the CHM files exercised by the file-based tests. */
+    private final List<String> files = Arrays.asList(
+            "/test-documents/testChm.chm",
+            "/test-documents/testChm2.chm",
+            "/test-documents/testChm3.chm");
+
+    /** Parses the in-memory sample and checks that a known sentence is extracted. */
+    @Test
+    public void testGetText() throws Exception {
+        BodyContentHandler handler = new BodyContentHandler();
+        new ChmParser().parse(
+                new ByteArrayInputStream(TestParameters.chmData),
+                handler, new Metadata(), new ParseContext());
+        assertTrue(handler.toString().contains(
+                "The TCard method accepts only numeric arguments"));
+    }
+
+    /** Parses every file in {@link #files} and expects non-empty text from each. */
+    @Test
+    public void testChmParser() throws Exception {
+        for (String fileName : files) {
+            testingChm(openResource(fileName));
+        }
+    }
+
+    /**
+     * Opens a classpath resource relative to this test class.
+     *
+     * @param fileName resource path as listed in {@link #files}
+     * @return the opened stream, never {@code null}
+     */
+    private static InputStream openResource(String fileName) {
+        InputStream stream = TestChmExtraction.class.getResourceAsStream(fileName);
+        // Fail with the resource name instead of a NullPointerException further down.
+        assertTrue("Missing test resource: " + fileName, stream != null);
+        return stream;
+    }
+
+    /**
+     * Parses {@code stream} with the shared parser, asserts that some text was
+     * produced, and always closes the stream.
+     *
+     * @param stream CHM content; closed by this method
+     */
+    private void testingChm(InputStream stream) throws IOException, SAXException, TikaException {
+        // try-with-resources tolerates a null resource, so a missing document
+        // no longer surfaces as an NPE from close().
+        try (InputStream in = stream) {
+            BodyContentHandler handler = new BodyContentHandler(-1);
+            parser.parse(in, handler, new Metadata(), new ParseContext());
+            assertTrue("No text extracted from CHM", !handler.toString().isEmpty());
+        }
+    }
+
+    /** Runs {@link #testExtractChmEntry(InputStream)} over every file in {@link #files}. */
+    @Test
+    public void testExtractChmEntries() throws TikaException, IOException {
+        for (String fileName : files) {
+            try (InputStream stream = openResource(fileName)) {
+                testExtractChmEntry(stream);
+            }
+        }
+    }
+
+    /**
+     * @param textData bytes to scan
+     * @return {@code true} if any byte equals zero
+     */
+    protected boolean findZero(byte[] textData) {
+        for (byte b : textData) {
+            if (b == 0) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * @param name entry name to check
+     * @return {@code false} if the name contains a character below 32 or at/above 127
+     */
+    protected boolean niceAscFileName(String name) {
+        for (int i = 0; i < name.length(); i++) {
+            char c = name.charAt(i);
+            if (c >= 127 || c < 32) {
+                // non-ascii char or control char
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /** @return {@code true} for the lower-cased entry names whose content is validated as HTML/text */
+    private static boolean isHtmlLike(String lowName) {
+        return lowName.endsWith(".html")
+                || lowName.endsWith(".htm")
+                || lowName.endsWith(".hhk")
+                || lowName.endsWith(".hhc");
+    }
+
+    /**
+     * Extracts every directory entry of the CHM in {@code stream} and validates it:
+     * entry names must be printable ASCII and unique ignoring case, and the
+     * HTML-like entries must contain no zero byte and an {@code <html ... </html>} pair.
+     *
+     * @param stream CHM content; not closed by this method
+     * @throws TikaException if any of the checks above fails
+     */
+    protected void testExtractChmEntry(InputStream stream) throws TikaException, IOException {
+        ChmExtractor chmExtractor = new ChmExtractor(stream);
+        ChmDirectoryListingSet entries = chmExtractor.getChmDirList();
+        Set<String> seenNames = new HashSet<String>();
+
+        for (DirectoryListingEntry entry : entries.getDirectoryListingEntryList()) {
+            byte[] data = chmExtractor.extractChmEntry(entry);
+            String name = entry.getName();
+
+            //Entry names should be nice. Disable this if the test chm do have bad looking but valid entry names.
+            if (!niceAscFileName(name)) {
+                throw new TikaException("Warning: File name contains a non ascii char : " + name);
+            }
+
+            String lowName = name.toLowerCase(Locale.ROOT);
+
+            // Set.add returns false when the name was already recorded.
+            if (!seenNames.add(lowName)) {
+                throw new TikaException("Duplicate File name detected : " + name);
+            }
+
+            if (!isHtmlLike(lowName)) {
+                continue;
+            }
+
+            if (findZero(data)) {
+                throw new TikaException("Xhtml/text file contains '\\0' : " + name);
+            }
+
+            //validate html
+            String html = new String(data, ISO_8859_1);
+            if (!HTML_PAIR.matcher(html).find()) {
+                System.err.println(lowName + " is invalid.");
+                System.err.println(html);
+                throw new TikaException("Invalid xhtml file : " + name);
+            }
+        }
+    }
+
+    /**
+     * Parses every file in {@link #files} from {@code TestParameters.NTHREADS}
+     * concurrent tasks that share one parser instance.
+     *
+     * <p>Each task is submitted as a {@link Callable} so that anything it throws,
+     * including an {@link AssertionError}, is captured by its {@link Future} and
+     * rethrown here. Previously worker failures were only printed and the test
+     * passed regardless.
+     */
+    @Test
+    public void testMultiThreadedChmExtraction() throws InterruptedException {
+        ExecutorService executor = Executors.newFixedThreadPool(TestParameters.NTHREADS);
+        try {
+            List<Future<Void>> results = new ArrayList<Future<Void>>();
+            for (int i = 0; i < TestParameters.NTHREADS; i++) {
+                results.add(executor.submit(new Callable<Void>() {
+                    @Override
+                    public Void call() throws Exception {
+                        for (String fileName : files) {
+                            testingChm(openResource(fileName));
+                        }
+                        return null;
+                    }
+                }));
+            }
+            // Future.get() blocks until the task is done, so no polling is needed.
+            for (Future<Void> result : results) {
+                try {
+                    result.get();
+                } catch (ExecutionException e) {
+                    throw new AssertionError("CHM extraction failed in a worker thread", e.getCause());
+                }
+            }
+        } finally {
+            executor.shutdownNow();
+        }
+    }
+
+    /** Parses the CHM files under {@code chm/} associated with TIKA-1446 and expects text from each. */
+    @Test
+    public void test_TIKA_1446() throws Exception {
+        String[] chmFiles = {
+                "admin.chm",
+                "cmak_ops.CHM",
+                "comexp.CHM",
+                "gpedit.CHM",
+                "IMJPCL.CHM",
+                "IMJPCLE.CHM",
+                "IMTCEN.CHM",
+                "tcpip.CHM",
+                "wmicontrol.CHM"
+        };
+        for (String fileName : chmFiles) {
+            testingChm(getTestDocumentAsStream("chm/" + fileName));
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
index 4301240..c072db0 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
@@ -1,63 +1,63 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm;
-
-import java.io.ByteArrayInputStream;
-import java.util.List;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
-import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
-import org.apache.tika.parser.chm.core.ChmExtractor;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import org.junit.Before;
-import org.junit.Test;
-
-public class TestChmExtractor {
- private ChmExtractor chmExtractor = null;
-
- @Before
- public void setUp() throws Exception {
- chmExtractor = new ChmExtractor(
- new ByteArrayInputStream(TestParameters.chmData));
- }
-
- @Test
- public void testEnumerateChm() {
- List<String> chmEntries = chmExtractor.enumerateChm();
- assertEquals(TestParameters.VP_CHM_ENTITIES_NUMBER,
- chmEntries.size());
- }
-
- @Test
- public void testGetChmDirList() {
- assertNotNull(chmExtractor.getChmDirList());
- }
-
- @Test
- public void testExtractChmEntry() throws TikaException{
- ChmDirectoryListingSet entries = chmExtractor.getChmDirList();
-
- int count = 0;
- for (DirectoryListingEntry directoryListingEntry : entries.getDirectoryListingEntryList()) {
- chmExtractor.extractChmEntry(directoryListingEntry);
- ++count;
- }
- assertEquals(TestParameters.VP_CHM_ENTITIES_NUMBER, count);
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import java.io.ByteArrayInputStream;
+import java.util.List;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.core.ChmExtractor;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Tests {@link ChmExtractor} against the in-memory sample held in
+ * {@code TestParameters.chmData}.
+ */
+public class TestChmExtractor {
+    private ChmExtractor chmExtractor;
+
+    /** Builds a fresh extractor over the sample bytes before each test. */
+    @Before
+    public void setUp() throws Exception {
+        ByteArrayInputStream sample = new ByteArrayInputStream(TestParameters.chmData);
+        chmExtractor = new ChmExtractor(sample);
+    }
+
+    /** The enumerated entry list must have the expected number of entries. */
+    @Test
+    public void testEnumerateChm() {
+        List<String> enumerated = chmExtractor.enumerateChm();
+        int actual = enumerated.size();
+        assertEquals(TestParameters.VP_CHM_ENTITIES_NUMBER, actual);
+    }
+
+    /** The directory listing must be available. */
+    @Test
+    public void testGetChmDirList() {
+        assertNotNull(chmExtractor.getChmDirList());
+    }
+
+    /** Every directory entry must be extractable, and their count must match the expected number. */
+    @Test
+    public void testExtractChmEntry() throws TikaException {
+        ChmDirectoryListingSet listing = chmExtractor.getChmDirList();
+
+        int extracted = 0;
+        for (DirectoryListingEntry entry : listing.getDirectoryListingEntryList()) {
+            chmExtractor.extractChmEntry(entry);
+            extracted++;
+        }
+        assertEquals(TestParameters.VP_CHM_ENTITIES_NUMBER, extracted);
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmItsfHeader.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmItsfHeader.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmItsfHeader.java
index 6bda44a..05d3820 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmItsfHeader.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmItsfHeader.java
@@ -1,122 +1,122 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-
-import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Test;
-
-/**
- * Tests all public functions of ChmItsfHeader
- *
- */
-public class TestChmItsfHeader {
- private ChmItsfHeader chmItsfHeader = null;
-
- @Before
- public void setUp() throws Exception {
- chmItsfHeader = new ChmItsfHeader();
- byte[] data = TestParameters.chmData;
- // chmItsfHeader.parse(Arrays.copyOfRange(data, 0,
- // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
- chmItsfHeader.parse(ChmCommons.copyOfRange(data, 0,
- ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
- }
-
- @Test
- public void getDataOffset() {
- assertEquals(TestParameters.VP_DATA_OFFSET_LENGTH,
- chmItsfHeader.getDataOffset());
- }
-
- @Test
- public void getDir_uuid() {
- assertNotNull(chmItsfHeader.getDir_uuid());
- }
-
- @Test
- public void getDirLen() {
- assertEquals(TestParameters.VP_DIRECTORY_LENGTH,
- chmItsfHeader.getDirLen());
- }
-
- @Test
- public void getDirOffset() {
- assertEquals(TestParameters.VP_DIRECTORY_OFFSET,
- chmItsfHeader.getDirOffset());
- }
-
- @Test
- public void getHeaderLen() {
- assertEquals(TestParameters.VP_ITSF_HEADER_LENGTH,
- chmItsfHeader.getHeaderLen());
- }
-
- @Test
- public void getLangId() {
- assertEquals(TestParameters.VP_LANGUAGE_ID,
- chmItsfHeader.getLangId());
- }
-
- @Test
- public void getLastModified() {
- assertEquals(TestParameters.VP_LAST_MODIFIED,
- chmItsfHeader.getLastModified());
- }
-
- @Test
- public void getUnknown_000c() {
- assertEquals(TestParameters.VP_UNKNOWN_000C,
- chmItsfHeader.getUnknown_000c());
- }
-
- @Test
- public void getUnknownLen() {
- assertEquals(TestParameters.VP_UNKNOWN_LEN,
- chmItsfHeader.getUnknownLen());
- }
-
- @Test
- public void getUnknownOffset() {
- assertEquals(TestParameters.VP_UNKNOWN_OFFSET,
- chmItsfHeader.getUnknownOffset());
- }
-
- @Test
- public void getVersion() {
- assertEquals(TestParameters.VP_VERSION,
- chmItsfHeader.getVersion());
- }
-
- @Test
- public void testToString() {
- assertTrue(chmItsfHeader.toString().contains(
- TestParameters.VP_ISTF_SIGNATURE));
- }
-
- @After
- public void tearDown() throws Exception {
- chmItsfHeader = null;
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Tests all public functions of ChmItsfHeader
+ *
+ */
+public class TestChmItsfHeader {
+ private ChmItsfHeader chmItsfHeader = null;
+
+ @Before
+ public void setUp() throws Exception {
+ chmItsfHeader = new ChmItsfHeader();
+ byte[] data = TestParameters.chmData;
+ // chmItsfHeader.parse(Arrays.copyOfRange(data, 0,
+ // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
+ chmItsfHeader.parse(ChmCommons.copyOfRange(data, 0,
+ ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
+ }
+
+ @Test
+ public void getDataOffset() {
+ assertEquals(TestParameters.VP_DATA_OFFSET_LENGTH,
+ chmItsfHeader.getDataOffset());
+ }
+
+ @Test
+ public void getDir_uuid() {
+ assertNotNull(chmItsfHeader.getDir_uuid());
+ }
+
+ @Test
+ public void getDirLen() {
+ assertEquals(TestParameters.VP_DIRECTORY_LENGTH,
+ chmItsfHeader.getDirLen());
+ }
+
+ @Test
+ public void getDirOffset() {
+ assertEquals(TestParameters.VP_DIRECTORY_OFFSET,
+ chmItsfHeader.getDirOffset());
+ }
+
+ @Test
+ public void getHeaderLen() {
+ assertEquals(TestParameters.VP_ITSF_HEADER_LENGTH,
+ chmItsfHeader.getHeaderLen());
+ }
+
+ @Test
+ public void getLangId() {
+ assertEquals(TestParameters.VP_LANGUAGE_ID,
+ chmItsfHeader.getLangId());
+ }
+
+ @Test
+ public void getLastModified() {
+ assertEquals(TestParameters.VP_LAST_MODIFIED,
+ chmItsfHeader.getLastModified());
+ }
+
+ @Test
+ public void getUnknown_000c() {
+ assertEquals(TestParameters.VP_UNKNOWN_000C,
+ chmItsfHeader.getUnknown_000c());
+ }
+
+ @Test
+ public void getUnknownLen() {
+ assertEquals(TestParameters.VP_UNKNOWN_LEN,
+ chmItsfHeader.getUnknownLen());
+ }
+
+ @Test
+ public void getUnknownOffset() {
+ assertEquals(TestParameters.VP_UNKNOWN_OFFSET,
+ chmItsfHeader.getUnknownOffset());
+ }
+
+ @Test
+ public void getVersion() {
+ assertEquals(TestParameters.VP_VERSION,
+ chmItsfHeader.getVersion());
+ }
+
+ @Test
+ public void testToString() {
+ assertTrue(chmItsfHeader.toString().contains(
+ TestParameters.VP_ISTF_SIGNATURE));
+ }
+
+ @After
+ public void tearDown() throws Exception {
+ chmItsfHeader = null;
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmItspHeader.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmItspHeader.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmItspHeader.java
index 91e4ba6..e78e7c8 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmItspHeader.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmItspHeader.java
@@ -1,160 +1,160 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
-import org.apache.tika.parser.chm.accessor.ChmItspHeader;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Test;
-
-/**
- * Tests all public methods of the ChmItspHeader
- *
- */
-public class TestChmItspHeader {
- private ChmItspHeader chmItspHeader = null;
-
- @Before
- public void setUp() throws Exception {
- byte[] data = TestParameters.chmData;
-
- ChmItsfHeader chmItsfHeader = new ChmItsfHeader();
- // chmItsfHeader.parse(Arrays.copyOfRange(data, 0,
- // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
- chmItsfHeader.parse(ChmCommons.copyOfRange(data, 0,
- ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
-
- chmItspHeader = new ChmItspHeader();
- // chmItspHeader.parse(Arrays.copyOfRange( data, (int)
- // chmItsfHeader.getDirOffset(),
- // (int) chmItsfHeader.getDirOffset()
- // + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
- chmItspHeader.parse(ChmCommons.copyOfRange(data,
- (int) chmItsfHeader.getDirOffset(),
- (int) chmItsfHeader.getDirOffset()
- + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
- }
-
- @Test
- public void testGetBlock_len() {
- assertEquals(TestParameters.VP_BLOCK_LENGTH,
- chmItspHeader.getBlock_len());
- }
-
- @Test
- public void testGetBlockidx_intvl() {
- assertEquals(TestParameters.VP_BLOCK_INDEX_INTERVAL,
- chmItspHeader.getBlockidx_intvl());
- }
-
- @Test
- public void testGetHeader_len() {
- assertEquals(TestParameters.VP_ITSP_HEADER_LENGTH,
- chmItspHeader.getHeader_len());
- }
-
- @Test
- public void testGetIndex_depth() {
- assertEquals(TestParameters.VP_INDEX_DEPTH,
- chmItspHeader.getIndex_depth());
- }
-
- @Test
- public void testGetIndex_head() {
- assertEquals(TestParameters.VP_INDEX_HEAD,
- chmItspHeader.getIndex_head());
- }
-
- @Test
- public void testGetIndex_root() {
- assertEquals(TestParameters.VP_INDEX_ROOT,
- chmItspHeader.getIndex_root());
- }
-
- @Test
- public void testGetLang_id() {
- assertEquals(TestParameters.VP_LANGUAGE_ID,
- chmItspHeader.getLang_id());
- }
-
- @Test
- public void testGetNum_blocks() {
- assertEquals(TestParameters.VP_UNKNOWN_NUM_BLOCKS,
- chmItspHeader.getNum_blocks());
- }
-
- @Test
- public void testGetUnknown_000c() {
- assertEquals(TestParameters.VP_ITSP_UNKNOWN_000C,
- chmItspHeader.getUnknown_000c());
- }
-
- @Test
- public void testGetUnknown_0024() {
- assertEquals(TestParameters.VP_ITSP_UNKNOWN_0024,
- chmItspHeader.getUnknown_0024());
- }
-
- @Test
- public void testGetUnknown_002() {
- assertEquals(TestParameters.VP_ITSP_UNKNOWN_002C,
- chmItspHeader.getUnknown_002c());
- }
-
- @Test
- public void testGetUnknown_0044() {
- assertEquals(TestParameters.VP_ITSP_BYTEARR_LEN,
- chmItspHeader.getUnknown_0044().length);
- }
-
- @Test
- public void testGetVersion() {
- assertEquals(TestParameters.VP_ITSP_VERSION,
- chmItspHeader.getVersion());
- }
-
- @Test
- public void testGetSignature() {
- assertEquals(TestParameters.VP_ISTP_SIGNATURE, new String(
- chmItspHeader.getSignature(), UTF_8));
- }
-
- @Test
- public void testGetSystem_uuid() {
- assertEquals(TestParameters.VP_ITSP_BYTEARR_LEN,
- chmItspHeader.getSystem_uuid().length);
- }
-
- @Test
- public void testToString() {
- assertTrue(chmItspHeader.toString().contains(
- TestParameters.VP_ISTP_SIGNATURE));
- }
-
- @After
- public void tearDown() throws Exception {
- chmItspHeader = null;
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
+import org.apache.tika.parser.chm.accessor.ChmItspHeader;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Checks each public accessor of {@link ChmItspHeader} against the expected
+ * values in {@code TestParameters}, using the header parsed from
+ * {@code TestParameters.chmData}.
+ */
+public class TestChmItspHeader {
+    private ChmItspHeader chmItspHeader;
+
+    /**
+     * Parses the ITSF header first, then parses the ITSP header from the slice
+     * of the sample data that starts at the ITSF directory offset.
+     */
+    @Before
+    public void setUp() throws Exception {
+        byte[] data = TestParameters.chmData;
+
+        ChmItsfHeader itsf = new ChmItsfHeader();
+        itsf.parse(
+                ChmCommons.copyOfRange(data, 0, ChmConstants.CHM_ITSF_V3_LEN - 1),
+                itsf);
+
+        chmItspHeader = new ChmItspHeader();
+        chmItspHeader.parse(
+                ChmCommons.copyOfRange(
+                        data,
+                        (int) itsf.getDirOffset(),
+                        (int) itsf.getDirOffset() + ChmConstants.CHM_ITSP_V1_LEN),
+                chmItspHeader);
+    }
+
+    /** Drops the parsed header after each test. */
+    @After
+    public void tearDown() throws Exception {
+        chmItspHeader = null;
+    }
+
+    @Test
+    public void testGetBlock_len() {
+        assertEquals(TestParameters.VP_BLOCK_LENGTH, chmItspHeader.getBlock_len());
+    }
+
+    @Test
+    public void testGetBlockidx_intvl() {
+        assertEquals(TestParameters.VP_BLOCK_INDEX_INTERVAL, chmItspHeader.getBlockidx_intvl());
+    }
+
+    @Test
+    public void testGetHeader_len() {
+        assertEquals(TestParameters.VP_ITSP_HEADER_LENGTH, chmItspHeader.getHeader_len());
+    }
+
+    @Test
+    public void testGetIndex_depth() {
+        assertEquals(TestParameters.VP_INDEX_DEPTH, chmItspHeader.getIndex_depth());
+    }
+
+    @Test
+    public void testGetIndex_head() {
+        assertEquals(TestParameters.VP_INDEX_HEAD, chmItspHeader.getIndex_head());
+    }
+
+    @Test
+    public void testGetIndex_root() {
+        assertEquals(TestParameters.VP_INDEX_ROOT, chmItspHeader.getIndex_root());
+    }
+
+    @Test
+    public void testGetLang_id() {
+        assertEquals(TestParameters.VP_LANGUAGE_ID, chmItspHeader.getLang_id());
+    }
+
+    @Test
+    public void testGetNum_blocks() {
+        assertEquals(TestParameters.VP_UNKNOWN_NUM_BLOCKS, chmItspHeader.getNum_blocks());
+    }
+
+    @Test
+    public void testGetUnknown_000c() {
+        assertEquals(TestParameters.VP_ITSP_UNKNOWN_000C, chmItspHeader.getUnknown_000c());
+    }
+
+    @Test
+    public void testGetUnknown_0024() {
+        assertEquals(TestParameters.VP_ITSP_UNKNOWN_0024, chmItspHeader.getUnknown_0024());
+    }
+
+    /** Despite the method name, this exercises {@code getUnknown_002c()}. */
+    @Test
+    public void testGetUnknown_002() {
+        assertEquals(TestParameters.VP_ITSP_UNKNOWN_002C, chmItspHeader.getUnknown_002c());
+    }
+
+    @Test
+    public void testGetUnknown_0044() {
+        assertEquals(TestParameters.VP_ITSP_BYTEARR_LEN, chmItspHeader.getUnknown_0044().length);
+    }
+
+    @Test
+    public void testGetVersion() {
+        assertEquals(TestParameters.VP_ITSP_VERSION, chmItspHeader.getVersion());
+    }
+
+    /** The signature bytes, decoded as UTF-8, must equal the expected ITSP signature. */
+    @Test
+    public void testGetSignature() {
+        String signature = new String(chmItspHeader.getSignature(), UTF_8);
+        assertEquals(TestParameters.VP_ISTP_SIGNATURE, signature);
+    }
+
+    @Test
+    public void testGetSystem_uuid() {
+        assertEquals(TestParameters.VP_ITSP_BYTEARR_LEN, chmItspHeader.getSystem_uuid().length);
+    }
+
+    /** The string form must mention the expected ITSP signature. */
+    @Test
+    public void testToString() {
+        String rendered = chmItspHeader.toString();
+        assertTrue(rendered.contains(TestParameters.VP_ISTP_SIGNATURE));
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxState.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxState.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxState.java
index d40874f..c8a8eb7 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxState.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxState.java
@@ -1,101 +1,101 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm;
-
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
-import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
-import org.apache.tika.parser.chm.accessor.ChmItspHeader;
-import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.apache.tika.parser.chm.lzx.ChmLzxState;
-import org.junit.Before;
-import org.junit.Test;
-
-public class TestChmLzxState {
- private ChmLzxState chmLzxState;
- private int windowSize;
-
- @Before
- public void setUp() throws Exception {
- byte[] data = TestParameters.chmData;
-
- /* Creates and parses itsf header */
- ChmItsfHeader chmItsHeader = new ChmItsfHeader();
- // chmItsHeader.parse(Arrays.copyOfRange(data, 0,
- // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
- chmItsHeader.parse(ChmCommons.copyOfRange(data, 0,
- ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
- /* Creates and parses itsp block */
- ChmItspHeader chmItspHeader = new ChmItspHeader();
- // chmItspHeader.parse(Arrays.copyOfRange( data, (int)
- // chmItsHeader.getDirOffset(),
- // (int) chmItsHeader.getDirOffset()
- // + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
- chmItspHeader.parse(ChmCommons.copyOfRange(data,
- (int) chmItsHeader.getDirOffset(),
- (int) chmItsHeader.getDirOffset()
- + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
-
- /* Creating instance of ChmDirListingContainer */
- ChmDirectoryListingSet chmDirListCont = new ChmDirectoryListingSet(
- data, chmItsHeader, chmItspHeader);
- int indexOfControlData = ChmCommons.indexOf(
- chmDirListCont.getDirectoryListingEntryList(),
- ChmConstants.CONTROL_DATA);
-
- int indexOfResetTable = ChmCommons.indexOfResetTableBlock(data,
- ChmConstants.LZXC.getBytes(UTF_8));
- byte[] dir_chunk = null;
- if (indexOfResetTable > 0) {
- // dir_chunk = Arrays.copyOfRange( data, indexOfResetTable,
- // indexOfResetTable
- // +
- // chmDirListCont.getDirectoryListingEntryList().get(indexOfControlData).getLength());
- dir_chunk = ChmCommons.copyOfRange(data, indexOfResetTable,
- indexOfResetTable
- + chmDirListCont.getDirectoryListingEntryList()
- .get(indexOfControlData).getLength());
- }
-
- ChmLzxcControlData clcd = new ChmLzxcControlData();
- clcd.parse(dir_chunk, clcd);
- windowSize = (int) clcd.getWindowSize();
- }
-
- @Test
- public void testChmLzxStateConstructor() throws TikaException {
- chmLzxState = new ChmLzxState(windowSize);
- assertNotNull(chmLzxState);
- }
-
- @Test
- public void testToString() throws TikaException {
- if (chmLzxState == null)
- testChmLzxStateConstructor();
- assertTrue(chmLzxState.toString().length() > 20);
- }
-
- // TODO add more tests
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
+import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
+import org.apache.tika.parser.chm.accessor.ChmItspHeader;
+import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.lzx.ChmLzxState;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Tests {@link ChmLzxState} using the window size read from the LZXC control
+ * data of the sample in {@code TestParameters.chmData}.
+ */
+public class TestChmLzxState {
+    private ChmLzxState chmLzxState;
+    private int windowSize;
+
+    /**
+     * Parses the ITSF and ITSP headers and the directory listing of the sample,
+     * locates the LZXC block, and stores the window size from its control data.
+     */
+    @Before
+    public void setUp() throws Exception {
+        byte[] data = TestParameters.chmData;
+
+        /* Creates and parses itsf header */
+        ChmItsfHeader chmItsHeader = new ChmItsfHeader();
+        chmItsHeader.parse(ChmCommons.copyOfRange(data, 0,
+                ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
+
+        /* Creates and parses itsp block */
+        ChmItspHeader chmItspHeader = new ChmItspHeader();
+        chmItspHeader.parse(ChmCommons.copyOfRange(data,
+                (int) chmItsHeader.getDirOffset(),
+                (int) chmItsHeader.getDirOffset()
+                        + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
+
+        /* Creating instance of ChmDirListingContainer */
+        ChmDirectoryListingSet chmDirListCont = new ChmDirectoryListingSet(
+                data, chmItsHeader, chmItspHeader);
+        int indexOfControlData = ChmCommons.indexOf(
+                chmDirListCont.getDirectoryListingEntryList(),
+                ChmConstants.CONTROL_DATA);
+
+        int indexOfResetTable = ChmCommons.indexOfResetTableBlock(data,
+                ChmConstants.LZXC.getBytes(UTF_8));
+        byte[] dir_chunk = null;
+        if (indexOfResetTable > 0) {
+            dir_chunk = ChmCommons.copyOfRange(data, indexOfResetTable,
+                    indexOfResetTable
+                            + chmDirListCont.getDirectoryListingEntryList()
+                                    .get(indexOfControlData).getLength());
+        }
+        // Fail with a clear message here rather than handing a null chunk to parse().
+        assertNotNull("LZXC block not found in the CHM test data", dir_chunk);
+
+        ChmLzxcControlData clcd = new ChmLzxcControlData();
+        clcd.parse(dir_chunk, clcd);
+        windowSize = (int) clcd.getWindowSize();
+    }
+
+    /** A state object can be constructed for the window size read in {@link #setUp()}. */
+    @Test
+    public void testChmLzxStateConstructor() throws TikaException {
+        chmLzxState = new ChmLzxState(windowSize);
+        assertNotNull(chmLzxState);
+    }
+
+    /**
+     * The string form of a freshly built state is longer than 20 characters.
+     *
+     * <p>The state is built here directly: JUnit 4 runs each test method on a new
+     * instance of the test class, so the field is never populated by another test.
+     */
+    @Test
+    public void testToString() throws TikaException {
+        chmLzxState = new ChmLzxState(windowSize);
+        assertTrue(chmLzxState.toString().length() > 20);
+    }
+
+    // TODO add more tests
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcControlData.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcControlData.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcControlData.java
index 4449b70..e7992bf 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcControlData.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcControlData.java
@@ -1,144 +1,144 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-
-import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
-import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
-import org.apache.tika.parser.chm.accessor.ChmItspHeader;
-import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.junit.Before;
-import org.junit.Test;
-
-/**
- * Tests all public methods of ChmLzxcControlData block
- */
-public class TestChmLzxcControlData {
- private ChmLzxcControlData chmLzxcControlData = null;
-
- @Before
- public void setUp() throws Exception {
- byte[] data = TestParameters.chmData;
- /* Creates and parses itsf header */
- ChmItsfHeader chmItsHeader = new ChmItsfHeader();
- // chmItsHeader.parse(Arrays.copyOfRange(data, 0,
- // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
- chmItsHeader.parse(ChmCommons.copyOfRange(data, 0,
- ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
- /* Creates and parses itsp block */
- ChmItspHeader chmItspHeader = new ChmItspHeader();
- // chmItspHeader.parse(Arrays.copyOfRange( data, (int)
- // chmItsHeader.getDirOffset(),
- // (int) chmItsHeader.getDirOffset()
- // + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
- chmItspHeader.parse(ChmCommons.copyOfRange(data,
- (int) chmItsHeader.getDirOffset(),
- (int) chmItsHeader.getDirOffset()
- + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
- /* Creating instance of ChmDirListingContainer */
- ChmDirectoryListingSet chmDirListCont = new ChmDirectoryListingSet(
- data, chmItsHeader, chmItspHeader);
- int indexOfControlData = chmDirListCont.getControlDataIndex();
-
- int indexOfResetTable = ChmCommons.indexOfResetTableBlock(data,
- ChmConstants.LZXC.getBytes(UTF_8));
- byte[] dir_chunk = null;
- if (indexOfResetTable > 0) {
- // dir_chunk = Arrays.copyOfRange( data, indexOfResetTable,
- // indexOfResetTable
- // +
- // chmDirListCont.getDirectoryListingEntryList().get(indexOfControlData).getLength());
- dir_chunk = ChmCommons.copyOfRange(data, indexOfResetTable,
- indexOfResetTable
- + chmDirListCont.getDirectoryListingEntryList()
- .get(indexOfControlData).getLength());
- }
-
- /* Creates and parses control block */
- chmLzxcControlData = new ChmLzxcControlData();
- chmLzxcControlData.parse(dir_chunk, chmLzxcControlData);
-
- }
-
- @Test
- public void testConstructorNotNull() {
- assertNotNull(chmLzxcControlData);
- }
-
- @Test
- public void testGetResetInterval() {
- assertEquals(TestParameters.VP_RESET_INTERVAL,
- chmLzxcControlData.getResetInterval());
- }
-
- @Test
- public void testGetSize() {
- assertEquals(TestParameters.VP_CONTROL_DATA_SIZE,
- chmLzxcControlData.getSize());
- }
-
- @Test
- public void testGetUnknown_18() {
- assertEquals(TestParameters.VP_UNKNOWN_18,
- chmLzxcControlData.getUnknown_18());
- }
-
- @Test
- public void testGetVersion() {
- assertEquals(TestParameters.VP_CONTROL_DATA_VERSION,
- chmLzxcControlData.getVersion());
- }
-
- @Test
- public void testGetWindowSize() {
- assertEquals(TestParameters.VP_WINDOW_SIZE,
- chmLzxcControlData.getWindowSize());
- }
-
- @Test
- public void testGetWindowsPerReset() {
- assertEquals(TestParameters.VP_WINDOWS_PER_RESET,
- chmLzxcControlData.getWindowsPerReset());
- }
-
- @Test
- public void testGetToString() {
- assertTrue(chmLzxcControlData.toString().contains(
- TestParameters.VP_CONTROL_DATA_SIGNATURE));
- }
-
- @Test
- public void testGetSignature() {
- assertEquals(
- TestParameters.VP_CONTROL_DATA_SIGNATURE.getBytes(UTF_8).length,
- chmLzxcControlData.getSignature().length);
- }
-
- @Test
- public void testGetSignaure() {
- assertEquals(
- TestParameters.VP_CONTROL_DATA_SIGNATURE.getBytes(UTF_8).length,
- chmLzxcControlData.getSignature().length);
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
+import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
+import org.apache.tika.parser.chm.accessor.ChmItspHeader;
+import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Tests all public methods of ChmLzxcControlData block
+ */
+public class TestChmLzxcControlData {
+ private ChmLzxcControlData chmLzxcControlData = null;
+
+ @Before
+ public void setUp() throws Exception {
+ byte[] data = TestParameters.chmData;
+ /* Creates and parses itsf header */
+ ChmItsfHeader chmItsHeader = new ChmItsfHeader();
+ // chmItsHeader.parse(Arrays.copyOfRange(data, 0,
+ // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
+ chmItsHeader.parse(ChmCommons.copyOfRange(data, 0,
+ ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
+ /* Creates and parses itsp block */
+ ChmItspHeader chmItspHeader = new ChmItspHeader();
+ // chmItspHeader.parse(Arrays.copyOfRange( data, (int)
+ // chmItsHeader.getDirOffset(),
+ // (int) chmItsHeader.getDirOffset()
+ // + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
+ chmItspHeader.parse(ChmCommons.copyOfRange(data,
+ (int) chmItsHeader.getDirOffset(),
+ (int) chmItsHeader.getDirOffset()
+ + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
+ /* Creating instance of ChmDirListingContainer */
+ ChmDirectoryListingSet chmDirListCont = new ChmDirectoryListingSet(
+ data, chmItsHeader, chmItspHeader);
+ int indexOfControlData = chmDirListCont.getControlDataIndex();
+
+ int indexOfResetTable = ChmCommons.indexOfResetTableBlock(data,
+ ChmConstants.LZXC.getBytes(UTF_8));
+ byte[] dir_chunk = null;
+ if (indexOfResetTable > 0) {
+ // dir_chunk = Arrays.copyOfRange( data, indexOfResetTable,
+ // indexOfResetTable
+ // +
+ // chmDirListCont.getDirectoryListingEntryList().get(indexOfControlData).getLength());
+ dir_chunk = ChmCommons.copyOfRange(data, indexOfResetTable,
+ indexOfResetTable
+ + chmDirListCont.getDirectoryListingEntryList()
+ .get(indexOfControlData).getLength());
+ }
+
+ /* Creates and parses control block */
+ chmLzxcControlData = new ChmLzxcControlData();
+ chmLzxcControlData.parse(dir_chunk, chmLzxcControlData);
+
+ }
+
+ @Test
+ public void testConstructorNotNull() {
+ assertNotNull(chmLzxcControlData);
+ }
+
+ @Test
+ public void testGetResetInterval() {
+ assertEquals(TestParameters.VP_RESET_INTERVAL,
+ chmLzxcControlData.getResetInterval());
+ }
+
+ @Test
+ public void testGetSize() {
+ assertEquals(TestParameters.VP_CONTROL_DATA_SIZE,
+ chmLzxcControlData.getSize());
+ }
+
+ @Test
+ public void testGetUnknown_18() {
+ assertEquals(TestParameters.VP_UNKNOWN_18,
+ chmLzxcControlData.getUnknown_18());
+ }
+
+ @Test
+ public void testGetVersion() {
+ assertEquals(TestParameters.VP_CONTROL_DATA_VERSION,
+ chmLzxcControlData.getVersion());
+ }
+
+ @Test
+ public void testGetWindowSize() {
+ assertEquals(TestParameters.VP_WINDOW_SIZE,
+ chmLzxcControlData.getWindowSize());
+ }
+
+ @Test
+ public void testGetWindowsPerReset() {
+ assertEquals(TestParameters.VP_WINDOWS_PER_RESET,
+ chmLzxcControlData.getWindowsPerReset());
+ }
+
+ @Test
+ public void testGetToString() {
+ assertTrue(chmLzxcControlData.toString().contains(
+ TestParameters.VP_CONTROL_DATA_SIGNATURE));
+ }
+
+ @Test
+ public void testGetSignature() {
+ assertEquals(
+ TestParameters.VP_CONTROL_DATA_SIGNATURE.getBytes(UTF_8).length,
+ chmLzxcControlData.getSignature().length);
+ }
+
+ @Test
+ public void testGetSignaure() {
+ assertEquals(
+ TestParameters.VP_CONTROL_DATA_SIGNATURE.getBytes(UTF_8).length,
+ chmLzxcControlData.getSignature().length);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcResetTable.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcResetTable.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcResetTable.java
index d84f702..79c2804 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcResetTable.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmLzxcResetTable.java
@@ -1,156 +1,156 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.chm;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
-import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
-import org.apache.tika.parser.chm.accessor.ChmItspHeader;
-import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
-import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
-import org.apache.tika.parser.chm.assertion.ChmAssert;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.core.ChmConstants;
-import org.junit.Before;
-import org.junit.Test;
-
-public class TestChmLzxcResetTable {
- private ChmLzxcResetTable chmLzxcResetTable = null;
-
- @Before
- public void setUp() throws Exception {
- byte[] data = TestParameters.chmData;
- /* Creates and parses itsf header */
- ChmItsfHeader chmItsfHeader = new ChmItsfHeader();
- // chmItsfHeader.parse(Arrays.copyOfRange(data, 0,
- // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
- chmItsfHeader.parse(ChmCommons.copyOfRange(data, 0,
- ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
- /* Creates and parses itsp block */
- ChmItspHeader chmItspHeader = new ChmItspHeader();
- // chmItspHeader.parse(Arrays.copyOfRange( data, (int)
- // chmItsfHeader.getDirOffset(),
- // (int) chmItsfHeader.getDirOffset()
- // + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
- chmItspHeader.parse(ChmCommons.copyOfRange(data,
- (int) chmItsfHeader.getDirOffset(),
- (int) chmItsfHeader.getDirOffset()
- + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
- /* Creating instance of ChmDirListingContainer */
- ChmDirectoryListingSet chmDirListCont = new ChmDirectoryListingSet(
- data, chmItsfHeader, chmItspHeader);
- int indexOfControlData = chmDirListCont.getControlDataIndex();
-
- int indexOfResetTable = ChmCommons.indexOfResetTableBlock(data,
- ChmConstants.LZXC.getBytes(UTF_8));
- byte[] dir_chunk = null;
- if (indexOfResetTable > 0) {
- // dir_chunk = Arrays.copyOfRange( data, indexOfResetTable,
- // indexOfResetTable
- // +
- // chmDirListCont.getDirectoryListingEntryList().get(indexOfControlData).getLength());
- dir_chunk = ChmCommons.copyOfRange(data, indexOfResetTable,
- indexOfResetTable
- + chmDirListCont.getDirectoryListingEntryList()
- .get(indexOfControlData).getLength());
- }
-
- /* Creates and parses control block */
- ChmLzxcControlData chmLzxcControlData = new ChmLzxcControlData();
- chmLzxcControlData.parse(dir_chunk, chmLzxcControlData);
-
- indexOfResetTable = chmDirListCont.getResetTableIndex();
- chmLzxcResetTable = new ChmLzxcResetTable();
-
- int startIndex = (int) chmDirListCont.getDataOffset()
- + chmDirListCont.getDirectoryListingEntryList()
- .get(indexOfResetTable).getOffset();
-
- ChmAssert.assertCopyingDataIndex(startIndex, data.length);
-
- // dir_chunk = Arrays.copyOfRange(data, startIndex, startIndex
- // +
- // chmDirListCont.getDirectoryListingEntryList().get(indexOfResetTable).getLength());
- dir_chunk = ChmCommons.copyOfRange(
- data,
- startIndex,
- startIndex
- + chmDirListCont.getDirectoryListingEntryList()
- .get(indexOfResetTable).getLength());
-
- chmLzxcResetTable.parse(dir_chunk, chmLzxcResetTable);
- }
-
- @Test
- public void testGetBlockAddress() {
- assertEquals(TestParameters.VP_RESET_TABLE_BA,
- chmLzxcResetTable.getBlockAddress().length);
- }
-
- @Test
- public void testGetBlockCount() {
- assertEquals(TestParameters.VP_RESET_TABLE_BA,
- chmLzxcResetTable.getBlockCount());
- }
-
- @Test
- public void testGetBlockLen() {
- assertEquals(TestParameters.VP_RES_TBL_BLOCK_LENGTH,
- chmLzxcResetTable.getBlockLen());
- }
-
- @Test
- public void testGetCompressedLen() {
- assertEquals(TestParameters.VP_RES_TBL_COMPR_LENGTH,
- chmLzxcResetTable.getCompressedLen());
- }
-
- @Test
- public void testGetTableOffset() {
- assertEquals(TestParameters.VP_TBL_OFFSET,
- chmLzxcResetTable.getTableOffset());
- }
-
- @Test
- public void testGetUncompressedLen() {
- assertEquals(TestParameters.VP_RES_TBL_UNCOMP_LENGTH,
- chmLzxcResetTable.getUncompressedLen());
- }
-
- @Test
- public void testGetUnknown() {
- assertEquals(TestParameters.VP_RES_TBL_UNKNOWN,
- chmLzxcResetTable.getUnknown());
- }
-
- @Test
- public void testGetVersion() {
- assertEquals(TestParameters.VP_RES_TBL_VERSION,
- chmLzxcResetTable.getVersion());
- }
-
- @Test
- public void testToString() {
- assertTrue(chmLzxcResetTable.toString().length() > 0);
- }
-
- // TODO: add setters to be tested
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.chm;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
+import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
+import org.apache.tika.parser.chm.accessor.ChmItspHeader;
+import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
+import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestChmLzxcResetTable {
+ private ChmLzxcResetTable chmLzxcResetTable = null;
+
+ @Before
+ public void setUp() throws Exception {
+ byte[] data = TestParameters.chmData;
+ /* Creates and parses itsf header */
+ ChmItsfHeader chmItsfHeader = new ChmItsfHeader();
+ // chmItsfHeader.parse(Arrays.copyOfRange(data, 0,
+ // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
+ chmItsfHeader.parse(ChmCommons.copyOfRange(data, 0,
+ ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
+ /* Creates and parses itsp block */
+ ChmItspHeader chmItspHeader = new ChmItspHeader();
+ // chmItspHeader.parse(Arrays.copyOfRange( data, (int)
+ // chmItsfHeader.getDirOffset(),
+ // (int) chmItsfHeader.getDirOffset()
+ // + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
+ chmItspHeader.parse(ChmCommons.copyOfRange(data,
+ (int) chmItsfHeader.getDirOffset(),
+ (int) chmItsfHeader.getDirOffset()
+ + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
+ /* Creating instance of ChmDirListingContainer */
+ ChmDirectoryListingSet chmDirListCont = new ChmDirectoryListingSet(
+ data, chmItsfHeader, chmItspHeader);
+ int indexOfControlData = chmDirListCont.getControlDataIndex();
+
+ int indexOfResetTable = ChmCommons.indexOfResetTableBlock(data,
+ ChmConstants.LZXC.getBytes(UTF_8));
+ byte[] dir_chunk = null;
+ if (indexOfResetTable > 0) {
+ // dir_chunk = Arrays.copyOfRange( data, indexOfResetTable,
+ // indexOfResetTable
+ // +
+ // chmDirListCont.getDirectoryListingEntryList().get(indexOfControlData).getLength());
+ dir_chunk = ChmCommons.copyOfRange(data, indexOfResetTable,
+ indexOfResetTable
+ + chmDirListCont.getDirectoryListingEntryList()
+ .get(indexOfControlData).getLength());
+ }
+
+ /* Creates and parses control block */
+ ChmLzxcControlData chmLzxcControlData = new ChmLzxcControlData();
+ chmLzxcControlData.parse(dir_chunk, chmLzxcControlData);
+
+ indexOfResetTable = chmDirListCont.getResetTableIndex();
+ chmLzxcResetTable = new ChmLzxcResetTable();
+
+ int startIndex = (int) chmDirListCont.getDataOffset()
+ + chmDirListCont.getDirectoryListingEntryList()
+ .get(indexOfResetTable).getOffset();
+
+ ChmAssert.assertCopyingDataIndex(startIndex, data.length);
+
+ // dir_chunk = Arrays.copyOfRange(data, startIndex, startIndex
+ // +
+ // chmDirListCont.getDirectoryListingEntryList().get(indexOfResetTable).getLength());
+ dir_chunk = ChmCommons.copyOfRange(
+ data,
+ startIndex,
+ startIndex
+ + chmDirListCont.getDirectoryListingEntryList()
+ .get(indexOfResetTable).getLength());
+
+ chmLzxcResetTable.parse(dir_chunk, chmLzxcResetTable);
+ }
+
+ @Test
+ public void testGetBlockAddress() {
+ assertEquals(TestParameters.VP_RESET_TABLE_BA,
+ chmLzxcResetTable.getBlockAddress().length);
+ }
+
+ @Test
+ public void testGetBlockCount() {
+ assertEquals(TestParameters.VP_RESET_TABLE_BA,
+ chmLzxcResetTable.getBlockCount());
+ }
+
+ @Test
+ public void testGetBlockLen() {
+ assertEquals(TestParameters.VP_RES_TBL_BLOCK_LENGTH,
+ chmLzxcResetTable.getBlockLen());
+ }
+
+ @Test
+ public void testGetCompressedLen() {
+ assertEquals(TestParameters.VP_RES_TBL_COMPR_LENGTH,
+ chmLzxcResetTable.getCompressedLen());
+ }
+
+ @Test
+ public void testGetTableOffset() {
+ assertEquals(TestParameters.VP_TBL_OFFSET,
+ chmLzxcResetTable.getTableOffset());
+ }
+
+ @Test
+ public void testGetUncompressedLen() {
+ assertEquals(TestParameters.VP_RES_TBL_UNCOMP_LENGTH,
+ chmLzxcResetTable.getUncompressedLen());
+ }
+
+ @Test
+ public void testGetUnknown() {
+ assertEquals(TestParameters.VP_RES_TBL_UNKNOWN,
+ chmLzxcResetTable.getUnknown());
+ }
+
+ @Test
+ public void testGetVersion() {
+ assertEquals(TestParameters.VP_RES_TBL_VERSION,
+ chmLzxcResetTable.getVersion());
+ }
+
+ @Test
+ public void testToString() {
+ assertTrue(chmLzxcResetTable.toString().length() > 0);
+ }
+
+ // TODO: add setters to be tested
+}
[14/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
index d80842b..1b692bf 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
@@ -1,510 +1,510 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.rtf;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-
-import java.io.InputStream;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.commons.io.FilenameUtils;
-import org.apache.tika.Tika;
-import org.apache.tika.TikaTest;
-import org.apache.tika.extractor.ContainerExtractor;
-import org.apache.tika.extractor.ParserContainerExtractor;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLCore;
-import org.apache.tika.metadata.RTFMetadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.RecursiveParserWrapper;
-import org.apache.tika.sax.BasicContentHandlerFactory;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-/**
- * Junit test class for the Tika {@link RTFParser}
- */
-public class RTFParserTest extends TikaTest {
-
- private Tika tika = new Tika();
-
- @Test
- public void testBasicExtraction() throws Exception {
-
- XMLResult r = getXML("testRTF.rtf");
- assertEquals("application/rtf", r.metadata.get(Metadata.CONTENT_TYPE));
- assertEquals(1, r.metadata.getValues(Metadata.CONTENT_TYPE).length);
- assertContains("Test", r.xml);
- assertContains("indexation Word", r.xml);
- }
-
- @Test
- public void testUmlautSpacesExtraction2() throws Exception {
- assertContains("<p>\u00DCbersicht</p>",
- getXML("testRTFUmlautSpaces2.rtf").xml);
- }
-
- @Test
- public void testUnicodeUCNControlWordCharacterDoublingExtraction() throws Exception {
- XMLResult r = getXML("testRTFUnicodeUCNControlWordCharacterDoubling.rtf");
-
- assertContains("\u5E74", r.xml);
- assertContains("\u5ff5", r.xml);
- assertContains("0 ", r.xml);
- assertContains("abc", r.xml);
- assertNotContained("\u5E74\u5E74", r.xml);
- }
-
- @Test
- public void testHexEscapeInsideWord() throws Exception {
- XMLResult r = getXML("testRTFHexEscapeInsideWord.rtf");
- assertContains("ESP\u00cdRITO", r.xml);
- }
-
- @Test
- public void testWindowsCodepage1250() throws Exception {
- XMLResult r = getXML("testRTFWindowsCodepage1250.rtf");
- assertContains("za\u017c\u00f3\u0142\u0107 g\u0119\u015bl\u0105 ja\u017a\u0144", r.xml);
- assertContains("ZA\u017b\u00d3\u0141\u0106 G\u0118\u015aL\u0104 JA\u0179\u0143", r.xml);
- }
-
- @Test
- public void testTableCellSeparation() throws Exception {
- String content = getXML("testRTFTableCellSeparation.rtf").xml;
- content = content.replaceAll("(\\s|<\\/?p>)+", " ");
- assertContains("a b c d \u00E4 \u00EB \u00F6 \u00FC", content);
- }
-
- @Test
- public void testTableCellSeparation2() throws Exception {
- String content = getXML("testRTFTableCellSeparation2.rtf").xml.replaceAll("\\s+", " ");
- // TODO: why do we insert extra whitespace...?
- assertContains("Station</p> <p>Fax", content);
- }
-
- @Test
- public void testWordPadCzechCharactersExtraction() throws Exception {
- XMLResult r = getXML("testRTFWordPadCzechCharacters.rtf");
- assertContains("\u010Cl\u00E1nek t\u00FDdne", r.xml);
- assertContains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty", r.xml);
- }
-
- @Test
- public void testWord2010CzechCharactersExtraction() throws Exception {
- XMLResult r = getXML("testRTFWord2010CzechCharacters.rtf");
- assertContains("\u010Cl\u00E1nek t\u00FDdne", r.xml);
- assertContains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty", r.xml);
- }
-
- @Test
- public void testMS932Extraction() throws Exception {
- XMLResult r = getXML("testRTF-ms932.rtf");
- // Hello in Japanese
- assertContains("\u3053\u3093\u306b\u3061\u306f", r.xml);
-
- // Verify title, since it was also encoded with MS932:
- r = getXML("testRTF-ms932.rtf");
- assertEquals("\u30bf\u30a4\u30c8\u30eb", r.metadata.get(TikaCoreProperties.TITLE));
- }
-
- @Test
- public void testUmlautSpacesExtraction() throws Exception {
- XMLResult r = getXML("testRTFUmlautSpaces.rtf");
- assertContains("\u00DCbersicht", r.xml);
- }
-
- @Test
- public void testGothic() throws Exception {
- XMLResult r = getXML("testRTFUnicodeGothic.rtf");
- assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", r.xml);
- }
-
- @Test
- public void testJapaneseText() throws Exception {
- XMLResult r = getXML("testRTFJapanese.rtf");
-
- // Verify title -- this title uses upr escape inside
- // title info field:
- assertEquals("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f\u3000",
- r.metadata.get(TikaCoreProperties.TITLE));
- assertEquals("VMazel", r.metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("VMazel", r.metadata.get(Metadata.AUTHOR));
- assertEquals("StarWriter", r.metadata.get(TikaCoreProperties.COMMENTS));
-
- // Special version of (GHQ)
- assertContains("\uff08\uff27\uff28\uff31\uff09", r.xml);
-
- // 6 other characters
- assertContains("\u6771\u4eac\u90fd\u4e09\u9df9\u5e02", r.xml);
- }
-
- @Test
- public void testMaxLength() throws Exception {
- Metadata metadata = new Metadata();
- InputStream stream = TikaInputStream.get(
- getTestDocumentAsStream("testRTFJapanese.rtf"));
-
- // Test w/ default limit:
- Tika localTika = new Tika();
- String content = localTika.parseToString(stream, metadata);
- // parseToString closes for convenience:
- //stream.close();
- assertTrue(content.length() > 500);
-
- // Test setting max length on the instance:
- localTika.setMaxStringLength(200);
- stream = TikaInputStream.get(getTestDocumentAsStream("testRTFJapanese.rtf"));
- content = localTika.parseToString(stream, metadata);
-
- // parseToString closes for convenience:
- //stream.close();
- assertTrue(content.length() <= 200);
-
- // Test setting max length per-call:
- stream = TikaInputStream.get(getTestDocumentAsStream("testRTFJapanese.rtf"));
- content = localTika.parseToString(stream, metadata, 100);
- // parseToString closes for convenience:
- //stream.close();
- assertTrue(content.length() <= 100);
- }
-
- @Test
- public void testTextWithCurlyBraces() throws Exception {
- XMLResult r = getXML("testRTFWithCurlyBraces.rtf");
- assertContains("{ some text inside curly brackets }", r.xml);
- }
-
- @Test
- public void testControls() throws Exception {
- XMLResult r = getXML("testRTFControls.rtf");
- String content = r.xml;
- assertContains("Thiswordhasanem\u2014dash", content);
- assertContains("Thiswordhasanen\u2013dash", content);
- assertContains("Thiswordhasanon\u2011breakinghyphen", content);
- assertContains("Thiswordhasanonbreaking\u00a0space", content);
- assertContains("Thiswordhasanoptional\u00adhyphen", content);
- assertContains("\u2018Single quoted text\u2019", content);
- assertContains("\u201cDouble quoted text\u201d", content);
- assertContains("\u201cDouble quoted text again\u201d", content);
- }
-
- @Test
- public void testInvalidUnicode() throws Exception {
- XMLResult r = getXML("testRTFInvalidUnicode.rtf");
- String content = r.xml;
- assertContains("Unpaired hi \ufffd here", content);
- assertContains("Unpaired lo \ufffd here", content);
- assertContains("Mismatched pair \ufffd\ufffd here", content);
- }
-
- @Test
- public void testVarious() throws Exception {
- XMLResult r = getXML("testRTFVarious.rtf");
- String content = r.xml;
- assertContains("Footnote appears here", content);
- assertContains("This is a footnote.", content);
- assertContains("This is the header text.", content);
- assertContains("This is the footer text.", content);
- assertContains("Here is a text box", content);
- assertContains("Bold", content);
- assertContains("italic", content);
- assertContains("underline", content);
- assertContains("superscript", content);
- assertContains("subscript", content);
- assertContains("Here is a citation:", content);
- assertContains("Figure 1 This is a caption for Figure 1", content);
- assertContains("(Kramer)", content);
-
- // Table
- assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("(\\s|<\\/?p>)+", " "));
-
- // 2-columns
- assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("(\\s|<\\/?p>)+", " "));
- assertContains("This is a hyperlink", content);
- assertContains("Here is a list:", content);
- for (int row = 1; row <= 3; row++) {
- assertContains("Bullet " + row, content);
- }
- assertContains("Here is a numbered list:", content);
- for (int row = 1; row <= 3; row++) {
- assertContains("Number bullet " + row, content);
- }
-
- for (int row = 1; row <= 2; row++) {
- for (int col = 1; col <= 3; col++) {
- assertContains("Row " + row + " Col " + col, content);
- }
- }
-
- assertContains("Keyword1 Keyword2", content);
- assertEquals("Keyword1 Keyword2",
- r.metadata.get(TikaCoreProperties.KEYWORDS));
-
- assertContains("Subject is here", content);
- assertEquals("Subject is here",
- r.metadata.get(OfficeOpenXMLCore.SUBJECT));
- assertEquals("Subject is here",
- r.metadata.get(Metadata.SUBJECT));
-
- assertContains("Suddenly some Japanese text:", content);
- // Special version of (GHQ)
- assertContains("\uff08\uff27\uff28\uff31\uff09", content);
- // 6 other characters
- assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content);
-
- assertContains("And then some Gothic text:", content);
- assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
- }
-
- @Test
- public void testVariousStyle() throws Exception {
- String content = getXML("testRTFVarious.rtf").xml;
- assertContains("<b>Bold</b>", content);
- assertContains("<i>italic</i>", content);
- }
-
- @Test
- public void testBoldItalic() throws Exception {
- String content = getXML("testRTFBoldItalic.rtf").xml;
- assertContains("<b>bold</b>", content);
- assertContains("<b>bold </b><b><i>italic</i></b>", content);
- assertContains("<b><i>italic </i></b><b>bold</b>", content);
- assertContains("<i>italic</i>", content);
- assertContains("<b>bold then </b><b><i>italic then</i></b><i> not bold</i>", content);
- assertContains("<i>italic then </i><b><i>bold then</i></b><b> not italic</b>", content);
- }
-
- @Test
- public void testHyperlink() throws Exception {
- String content = getXML("testRTFHyperlink.rtf").xml;
- assertContains("our most <a href=\"http://r.office.microsoft.com/r/rlidwelcomeFAQ?clid=1033\">frequently asked questions</a>", content);
- assertEquals(-1, content.indexOf("<p>\t\t</p>"));
- }
-
- @Test
- public void testIgnoredControlWord() throws Exception {
- assertContains("<p>The quick brown fox jumps over the lazy dog</p>", getXML("testRTFIgnoredControlWord.rtf").xml);
- }
-
- @Test
- public void testFontAfterBufferedText() throws Exception {
- assertContains("\u0423\u0432\u0430\u0436\u0430\u0435\u043c\u044b\u0439 \u043a\u043b\u0438\u0435\u043d\u0442!",
- getXML("testFontAfterBufferedText.rtf").xml);
- }
-
- @Test
- public void testListMicrosoftWord() throws Exception {
- String content = getXML("testRTFListMicrosoftWord.rtf").xml;
- assertContains("<ol>\t<li>one</li>", content);
- assertContains("</ol>", content);
- assertContains("<ul>\t<li>first</li>", content);
- assertContains("</ul>", content);
- }
-
- @Test
- public void testListLibreOffice() throws Exception {
- String content = getXML("testRTFListLibreOffice.rtf").xml;
- assertContains("<ol>\t<li>one</li>", content);
- assertContains("</ol>", content);
- assertContains("<ul>\t<li>first</li>", content);
- assertContains("</ul>", content);
- }
-
- // TIKA-782
- @Test
- public void testBinControlWord() throws Exception {
- ByteCopyingHandler embHandler = new ByteCopyingHandler();
- try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testBinControlWord.rtf"))) {
- ContainerExtractor ex = new ParserContainerExtractor();
- assertEquals(true, ex.isSupported(tis));
- ex.extract(tis, ex, embHandler);
- }
- assertEquals(1, embHandler.bytes.size());
-
- byte[] bytes = embHandler.bytes.get(0);
- assertEquals(10, bytes.length);
- //}
- assertEquals(125, (int) bytes[4]);
- //make sure that at least the last value is correct
- assertEquals(-1, (int) bytes[9]);
- }
-
- // TIKA-999
- @Test
- public void testMetaDataCounts() throws Exception {
- XMLResult xml = getXML("test_embedded_package.rtf");
- assertEquals("1", xml.metadata.get(Office.PAGE_COUNT));
- assertEquals("7", xml.metadata.get(Office.WORD_COUNT));
- assertEquals("36", xml.metadata.get(Office.CHARACTER_COUNT));
- assertTrue(xml.metadata.get(Office.CREATION_DATE).startsWith("2012-09-02T"));
- }
-
- // TIKA-1192
- @Test
- public void testListOverride() throws Exception {
- assertContains("Body", getXML("testRTFListOverride.rtf").xml);
- }
-
- // TIKA-1305
- @Test
- public void testCorruptListOverride() throws Exception {
- assertContains("apple", getXML("testRTFCorruptListOverride.rtf").xml);
- }
-
- // TIKA-1010
- @Test
- public void testEmbeddedMonster() throws Exception {
-
- Map<Integer, Pair> expected = new HashMap<>();
- expected.put(2, new Pair("Hw.txt","text/plain; charset=ISO-8859-1"));
- expected.put(3, new Pair("file_0.doc", "application/msword"));
- expected.put(6, new Pair("file_1.xlsx",
- "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
- expected.put(9, new Pair("text.html", "text/html; charset=windows-1252"));
- expected.put(10, new Pair("html-within-zip.zip", "application/zip"));
- expected.put(11, new Pair("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip", "application/zip"));
- expected.put(14, new Pair("testHTML_utf8_\u666E\u6797\u65AF\u987F.html", "text/html; charset=UTF-8"));
- expected.put(17, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg"));
- expected.put(20, new Pair("file_2.xls", "application/vnd.ms-excel"));
- expected.put(23, new Pair("testMSG_\u666E\u6797\u65AF\u987F.msg", "application/vnd.ms-outlook"));
- expected.put(26, new Pair("file_3.pdf", "application/pdf"));
- expected.put(29, new Pair("file_4.ppt", "application/vnd.ms-powerpoint"));
- expected.put(33, new Pair("file_5.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"));
- expected.put(32, new Pair("thumbnail.jpeg", "image/jpeg"));
- expected.put(36, new Pair("file_6.doc", "application/msword"));
- expected.put(39, new Pair("file_7.doc", "application/msword"));
- expected.put(42, new Pair("file_8.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
- expected.put(45, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg"));
-
-
- List<Metadata> metadataList = getRecursiveJson("testRTFEmbeddedFiles.rtf");
- assertEquals(48, metadataList.size());
- for (Map.Entry<Integer, Pair> e : expected.entrySet()) {
- Metadata metadata = metadataList.get(e.getKey());
- Pair p = e.getValue();
- assertNotNull(metadata.get(Metadata.RESOURCE_NAME_KEY));
- //necessary to getName() because MSOffice extractor includes
- //directory: _1457338524/HW.txt
- assertEquals("filename equals ",
- p.fileName, FilenameUtils.getName(
- metadata.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)));
-
- assertEquals(p.mimeType, metadata.get(Metadata.CONTENT_TYPE));
- }
- assertEquals("C:\\Users\\tallison\\AppData\\Local\\Temp\\testJPEG_\u666e\u6797\u65af\u987f.jpg",
- metadataList.get(45).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
- }
-
- //TIKA-1010 test regular (not "embedded") images/picts
- @Test
- public void testRegularImages() throws Exception {
- Parser base = new AutoDetectParser();
- ParseContext ctx = new ParseContext();
- RecursiveParserWrapper parser = new RecursiveParserWrapper(base,
- new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
- ctx.set(org.apache.tika.parser.Parser.class, parser);
- ContentHandler handler = new BodyContentHandler();
- Metadata rootMetadata = new Metadata();
- rootMetadata.add(Metadata.RESOURCE_NAME_KEY, "testRTFRegularImages.rtf");
- try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFRegularImages.rtf"))) {
- parser.parse(tis, handler, rootMetadata, ctx);
- }
- List<Metadata> metadatas = parser.getMetadata();
-
- Metadata meta_jpg_exif = metadatas.get(1);//("testJPEG_EXIF_\u666E\u6797\u65AF\u987F.jpg");
- Metadata meta_jpg = metadatas.get(3);//("testJPEG_\u666E\u6797\u65AF\u987F.jpg");
-
- assertTrue(meta_jpg_exif != null);
- assertTrue(meta_jpg != null);
- // had to comment these out (when moving from 1.x to 2.x
- // because AutoDetectParser within this module does not include image parsing.
-
-// assertTrue(Arrays.asList(meta_jpg_exif.getValues("dc:subject")).contains("serbor"));
-// assertTrue(meta_jpg.get("Comments").contains("Licensed to the Apache"));
- //make sure old metadata doesn't linger between objects
-// assertFalse(Arrays.asList(meta_jpg.getValues("dc:subject")).contains("serbor"));
- assertEquals("false", meta_jpg.get(RTFMetadata.THUMBNAIL));
- assertEquals("false", meta_jpg_exif.get(RTFMetadata.THUMBNAIL));
-
- assertEquals(25, meta_jpg.names().length);
- assertEquals(25, meta_jpg_exif.names().length);
- }
-
- @Test
- public void testMultipleNewlines() throws Exception {
- String content = getXML("testRTFNewlines.rtf").xml;
- content = content.replaceAll("[\r\n]+", " ");
- assertContains("<body><p>one</p> " +
- "<p /> " +
- "<p>two</p> " +
- "<p /> " +
- "<p /> " +
- "<p>three</p> " +
- "<p /> " +
- "<p /> " +
- "<p /> " +
- "<p>four</p>", content);
- }
-
- //TIKA-1010 test linked embedded doc
- @Test
- public void testEmbeddedLinkedDocument() throws Exception {
- Set<MediaType> skipTypes = new HashSet<MediaType>();
- skipTypes.add(MediaType.parse("application/x-emf"));
- skipTypes.add(MediaType.parse("application/x-msmetafile"));
-
- TrackingHandler tracker = new TrackingHandler(skipTypes);
- try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) {
- ContainerExtractor ex = new ParserContainerExtractor();
- assertEquals(true, ex.isSupported(tis));
- ex.extract(tis, ex, tracker);
- }
- //should gracefully skip link and not throw NPE, IOEx, etc
- assertEquals(0, tracker.filenames.size());
-
- tracker = new TrackingHandler();
- try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) {
- ContainerExtractor ex = new ParserContainerExtractor();
- assertEquals(true, ex.isSupported(tis));
- ex.extract(tis, ex, tracker);
- }
- //should gracefully skip link and not throw NPE, IOEx, etc
- assertEquals(2, tracker.filenames.size());
- }
-
- private static class Pair {
- final String fileName;
- final String mimeType;
- Pair(String fileName, String mimeType) {
- this.fileName = fileName;
- this.mimeType = mimeType;
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.rtf;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.io.FilenameUtils;
+import org.apache.tika.Tika;
+import org.apache.tika.TikaTest;
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.RTFMetadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Junit test class for the Tika {@link RTFParser}
+ */
+public class RTFParserTest extends TikaTest {
+
+ private Tika tika = new Tika();
+
+ @Test
+ public void testBasicExtraction() throws Exception {
+
+ XMLResult r = getXML("testRTF.rtf");
+ assertEquals("application/rtf", r.metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals(1, r.metadata.getValues(Metadata.CONTENT_TYPE).length);
+ assertContains("Test", r.xml);
+ assertContains("indexation Word", r.xml);
+ }
+
+ @Test
+ public void testUmlautSpacesExtraction2() throws Exception {
+ assertContains("<p>\u00DCbersicht</p>",
+ getXML("testRTFUmlautSpaces2.rtf").xml);
+ }
+
+ @Test
+ public void testUnicodeUCNControlWordCharacterDoublingExtraction() throws Exception {
+ XMLResult r = getXML("testRTFUnicodeUCNControlWordCharacterDoubling.rtf");
+
+ assertContains("\u5E74", r.xml);
+ assertContains("\u5ff5", r.xml);
+ assertContains("0 ", r.xml);
+ assertContains("abc", r.xml);
+ assertNotContained("\u5E74\u5E74", r.xml);
+ }
+
+ @Test
+ public void testHexEscapeInsideWord() throws Exception {
+ XMLResult r = getXML("testRTFHexEscapeInsideWord.rtf");
+ assertContains("ESP\u00cdRITO", r.xml);
+ }
+
+ @Test
+ public void testWindowsCodepage1250() throws Exception {
+ XMLResult r = getXML("testRTFWindowsCodepage1250.rtf");
+ assertContains("za\u017c\u00f3\u0142\u0107 g\u0119\u015bl\u0105 ja\u017a\u0144", r.xml);
+ assertContains("ZA\u017b\u00d3\u0141\u0106 G\u0118\u015aL\u0104 JA\u0179\u0143", r.xml);
+ }
+
+ @Test
+ public void testTableCellSeparation() throws Exception {
+ String content = getXML("testRTFTableCellSeparation.rtf").xml;
+ content = content.replaceAll("(\\s|<\\/?p>)+", " ");
+ assertContains("a b c d \u00E4 \u00EB \u00F6 \u00FC", content);
+ }
+
+ @Test
+ public void testTableCellSeparation2() throws Exception {
+ String content = getXML("testRTFTableCellSeparation2.rtf").xml.replaceAll("\\s+", " ");
+ // TODO: why do we insert extra whitespace...?
+ assertContains("Station</p> <p>Fax", content);
+ }
+
+ @Test
+ public void testWordPadCzechCharactersExtraction() throws Exception {
+ XMLResult r = getXML("testRTFWordPadCzechCharacters.rtf");
+ assertContains("\u010Cl\u00E1nek t\u00FDdne", r.xml);
+ assertContains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty", r.xml);
+ }
+
+ @Test
+ public void testWord2010CzechCharactersExtraction() throws Exception {
+ XMLResult r = getXML("testRTFWord2010CzechCharacters.rtf");
+ assertContains("\u010Cl\u00E1nek t\u00FDdne", r.xml);
+ assertContains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty", r.xml);
+ }
+
+ @Test
+ public void testMS932Extraction() throws Exception {
+ XMLResult r = getXML("testRTF-ms932.rtf");
+ // Hello in Japanese
+ assertContains("\u3053\u3093\u306b\u3061\u306f", r.xml);
+
+ // Verify title, since it was also encoded with MS932:
+ r = getXML("testRTF-ms932.rtf");
+ assertEquals("\u30bf\u30a4\u30c8\u30eb", r.metadata.get(TikaCoreProperties.TITLE));
+ }
+
+ @Test
+ public void testUmlautSpacesExtraction() throws Exception {
+ XMLResult r = getXML("testRTFUmlautSpaces.rtf");
+ assertContains("\u00DCbersicht", r.xml);
+ }
+
+ @Test
+ public void testGothic() throws Exception {
+ XMLResult r = getXML("testRTFUnicodeGothic.rtf");
+ assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", r.xml);
+ }
+
+ @Test
+ public void testJapaneseText() throws Exception {
+ XMLResult r = getXML("testRTFJapanese.rtf");
+
+ // Verify title -- this title uses upr escape inside
+ // title info field:
+ assertEquals("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f\u3000",
+ r.metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("VMazel", r.metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("VMazel", r.metadata.get(Metadata.AUTHOR));
+ assertEquals("StarWriter", r.metadata.get(TikaCoreProperties.COMMENTS));
+
+ // Special version of (GHQ)
+ assertContains("\uff08\uff27\uff28\uff31\uff09", r.xml);
+
+ // 6 other characters
+ assertContains("\u6771\u4eac\u90fd\u4e09\u9df9\u5e02", r.xml);
+ }
+
+ @Test
+ public void testMaxLength() throws Exception {
+ Metadata metadata = new Metadata();
+ InputStream stream = TikaInputStream.get(
+ getTestDocumentAsStream("testRTFJapanese.rtf"));
+
+ // Test w/ default limit:
+ Tika localTika = new Tika();
+ String content = localTika.parseToString(stream, metadata);
+ // parseToString closes for convenience:
+ //stream.close();
+ assertTrue(content.length() > 500);
+
+ // Test setting max length on the instance:
+ localTika.setMaxStringLength(200);
+ stream = TikaInputStream.get(getTestDocumentAsStream("testRTFJapanese.rtf"));
+ content = localTika.parseToString(stream, metadata);
+
+ // parseToString closes for convenience:
+ //stream.close();
+ assertTrue(content.length() <= 200);
+
+ // Test setting max length per-call:
+ stream = TikaInputStream.get(getTestDocumentAsStream("testRTFJapanese.rtf"));
+ content = localTika.parseToString(stream, metadata, 100);
+ // parseToString closes for convenience:
+ //stream.close();
+ assertTrue(content.length() <= 100);
+ }
+
+ @Test
+ public void testTextWithCurlyBraces() throws Exception {
+ XMLResult r = getXML("testRTFWithCurlyBraces.rtf");
+ assertContains("{ some text inside curly brackets }", r.xml);
+ }
+
+ @Test
+ public void testControls() throws Exception {
+ XMLResult r = getXML("testRTFControls.rtf");
+ String content = r.xml;
+ assertContains("Thiswordhasanem\u2014dash", content);
+ assertContains("Thiswordhasanen\u2013dash", content);
+ assertContains("Thiswordhasanon\u2011breakinghyphen", content);
+ assertContains("Thiswordhasanonbreaking\u00a0space", content);
+ assertContains("Thiswordhasanoptional\u00adhyphen", content);
+ assertContains("\u2018Single quoted text\u2019", content);
+ assertContains("\u201cDouble quoted text\u201d", content);
+ assertContains("\u201cDouble quoted text again\u201d", content);
+ }
+
+ @Test
+ public void testInvalidUnicode() throws Exception {
+ XMLResult r = getXML("testRTFInvalidUnicode.rtf");
+ String content = r.xml;
+ assertContains("Unpaired hi \ufffd here", content);
+ assertContains("Unpaired lo \ufffd here", content);
+ assertContains("Mismatched pair \ufffd\ufffd here", content);
+ }
+
+ @Test
+ public void testVarious() throws Exception {
+ XMLResult r = getXML("testRTFVarious.rtf");
+ String content = r.xml;
+ assertContains("Footnote appears here", content);
+ assertContains("This is a footnote.", content);
+ assertContains("This is the header text.", content);
+ assertContains("This is the footer text.", content);
+ assertContains("Here is a text box", content);
+ assertContains("Bold", content);
+ assertContains("italic", content);
+ assertContains("underline", content);
+ assertContains("superscript", content);
+ assertContains("subscript", content);
+ assertContains("Here is a citation:", content);
+ assertContains("Figure 1 This is a caption for Figure 1", content);
+ assertContains("(Kramer)", content);
+
+ // Table
+ assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("(\\s|<\\/?p>)+", " "));
+
+ // 2-columns
+ assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("(\\s|<\\/?p>)+", " "));
+ assertContains("This is a hyperlink", content);
+ assertContains("Here is a list:", content);
+ for (int row = 1; row <= 3; row++) {
+ assertContains("Bullet " + row, content);
+ }
+ assertContains("Here is a numbered list:", content);
+ for (int row = 1; row <= 3; row++) {
+ assertContains("Number bullet " + row, content);
+ }
+
+ for (int row = 1; row <= 2; row++) {
+ for (int col = 1; col <= 3; col++) {
+ assertContains("Row " + row + " Col " + col, content);
+ }
+ }
+
+ assertContains("Keyword1 Keyword2", content);
+ assertEquals("Keyword1 Keyword2",
+ r.metadata.get(TikaCoreProperties.KEYWORDS));
+
+ assertContains("Subject is here", content);
+ assertEquals("Subject is here",
+ r.metadata.get(OfficeOpenXMLCore.SUBJECT));
+ assertEquals("Subject is here",
+ r.metadata.get(Metadata.SUBJECT));
+
+ assertContains("Suddenly some Japanese text:", content);
+ // Special version of (GHQ)
+ assertContains("\uff08\uff27\uff28\uff31\uff09", content);
+ // 6 other characters
+ assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content);
+
+ assertContains("And then some Gothic text:", content);
+ assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
+ }
+
+ @Test
+ public void testVariousStyle() throws Exception {
+ String content = getXML("testRTFVarious.rtf").xml;
+ assertContains("<b>Bold</b>", content);
+ assertContains("<i>italic</i>", content);
+ }
+
+ @Test
+ public void testBoldItalic() throws Exception {
+ String content = getXML("testRTFBoldItalic.rtf").xml;
+ assertContains("<b>bold</b>", content);
+ assertContains("<b>bold </b><b><i>italic</i></b>", content);
+ assertContains("<b><i>italic </i></b><b>bold</b>", content);
+ assertContains("<i>italic</i>", content);
+ assertContains("<b>bold then </b><b><i>italic then</i></b><i> not bold</i>", content);
+ assertContains("<i>italic then </i><b><i>bold then</i></b><b> not italic</b>", content);
+ }
+
+ @Test
+ public void testHyperlink() throws Exception {
+ String content = getXML("testRTFHyperlink.rtf").xml;
+ assertContains("our most <a href=\"http://r.office.microsoft.com/r/rlidwelcomeFAQ?clid=1033\">frequently asked questions</a>", content);
+ assertEquals(-1, content.indexOf("<p>\t\t</p>"));
+ }
+
+ @Test
+ public void testIgnoredControlWord() throws Exception {
+ assertContains("<p>The quick brown fox jumps over the lazy dog</p>", getXML("testRTFIgnoredControlWord.rtf").xml);
+ }
+
+ @Test
+ public void testFontAfterBufferedText() throws Exception {
+ assertContains("\u0423\u0432\u0430\u0436\u0430\u0435\u043c\u044b\u0439 \u043a\u043b\u0438\u0435\u043d\u0442!",
+ getXML("testFontAfterBufferedText.rtf").xml);
+ }
+
+ @Test
+ public void testListMicrosoftWord() throws Exception {
+ String content = getXML("testRTFListMicrosoftWord.rtf").xml;
+ assertContains("<ol>\t<li>one</li>", content);
+ assertContains("</ol>", content);
+ assertContains("<ul>\t<li>first</li>", content);
+ assertContains("</ul>", content);
+ }
+
+ @Test
+ public void testListLibreOffice() throws Exception {
+ String content = getXML("testRTFListLibreOffice.rtf").xml;
+ assertContains("<ol>\t<li>one</li>", content);
+ assertContains("</ol>", content);
+ assertContains("<ul>\t<li>first</li>", content);
+ assertContains("</ul>", content);
+ }
+
+ // TIKA-782
+ @Test
+ public void testBinControlWord() throws Exception {
+ ByteCopyingHandler embHandler = new ByteCopyingHandler();
+ try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testBinControlWord.rtf"))) {
+ ContainerExtractor ex = new ParserContainerExtractor();
+ assertEquals(true, ex.isSupported(tis));
+ ex.extract(tis, ex, embHandler);
+ }
+ assertEquals(1, embHandler.bytes.size());
+
+ byte[] bytes = embHandler.bytes.get(0);
+ assertEquals(10, bytes.length);
+ //}
+ assertEquals(125, (int) bytes[4]);
+ //make sure that at least the last value is correct
+ assertEquals(-1, (int) bytes[9]);
+ }
+
+ // TIKA-999
+ @Test
+ public void testMetaDataCounts() throws Exception {
+ XMLResult xml = getXML("test_embedded_package.rtf");
+ assertEquals("1", xml.metadata.get(Office.PAGE_COUNT));
+ assertEquals("7", xml.metadata.get(Office.WORD_COUNT));
+ assertEquals("36", xml.metadata.get(Office.CHARACTER_COUNT));
+ assertTrue(xml.metadata.get(Office.CREATION_DATE).startsWith("2012-09-02T"));
+ }
+
+ // TIKA-1192
+ @Test
+ public void testListOverride() throws Exception {
+ assertContains("Body", getXML("testRTFListOverride.rtf").xml);
+ }
+
+ // TIKA-1305
+ @Test
+ public void testCorruptListOverride() throws Exception {
+ assertContains("apple", getXML("testRTFCorruptListOverride.rtf").xml);
+ }
+
+ // TIKA-1010
+ @Test
+ public void testEmbeddedMonster() throws Exception {
+
+ Map<Integer, Pair> expected = new HashMap<>();
+ expected.put(2, new Pair("Hw.txt","text/plain; charset=ISO-8859-1"));
+ expected.put(3, new Pair("file_0.doc", "application/msword"));
+ expected.put(6, new Pair("file_1.xlsx",
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
+ expected.put(9, new Pair("text.html", "text/html; charset=windows-1252"));
+ expected.put(10, new Pair("html-within-zip.zip", "application/zip"));
+ expected.put(11, new Pair("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip", "application/zip"));
+ expected.put(14, new Pair("testHTML_utf8_\u666E\u6797\u65AF\u987F.html", "text/html; charset=UTF-8"));
+ expected.put(17, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg"));
+ expected.put(20, new Pair("file_2.xls", "application/vnd.ms-excel"));
+ expected.put(23, new Pair("testMSG_\u666E\u6797\u65AF\u987F.msg", "application/vnd.ms-outlook"));
+ expected.put(26, new Pair("file_3.pdf", "application/pdf"));
+ expected.put(29, new Pair("file_4.ppt", "application/vnd.ms-powerpoint"));
+ expected.put(33, new Pair("file_5.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"));
+ expected.put(32, new Pair("thumbnail.jpeg", "image/jpeg"));
+ expected.put(36, new Pair("file_6.doc", "application/msword"));
+ expected.put(39, new Pair("file_7.doc", "application/msword"));
+ expected.put(42, new Pair("file_8.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
+ expected.put(45, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg"));
+
+
+ List<Metadata> metadataList = getRecursiveJson("testRTFEmbeddedFiles.rtf");
+ assertEquals(48, metadataList.size());
+ for (Map.Entry<Integer, Pair> e : expected.entrySet()) {
+ Metadata metadata = metadataList.get(e.getKey());
+ Pair p = e.getValue();
+ assertNotNull(metadata.get(Metadata.RESOURCE_NAME_KEY));
+ //necessary to getName() because MSOffice extractor includes
+ //directory: _1457338524/HW.txt
+ assertEquals("filename equals ",
+ p.fileName, FilenameUtils.getName(
+ metadata.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)));
+
+ assertEquals(p.mimeType, metadata.get(Metadata.CONTENT_TYPE));
+ }
+ assertEquals("C:\\Users\\tallison\\AppData\\Local\\Temp\\testJPEG_\u666e\u6797\u65af\u987f.jpg",
+ metadataList.get(45).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
+ }
+
+ //TIKA-1010 test regular (not "embedded") images/picts
+ @Test
+ public void testRegularImages() throws Exception {
+ Parser base = new AutoDetectParser();
+ ParseContext ctx = new ParseContext();
+ RecursiveParserWrapper parser = new RecursiveParserWrapper(base,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
+ ctx.set(org.apache.tika.parser.Parser.class, parser);
+ ContentHandler handler = new BodyContentHandler();
+ Metadata rootMetadata = new Metadata();
+ rootMetadata.add(Metadata.RESOURCE_NAME_KEY, "testRTFRegularImages.rtf");
+ try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFRegularImages.rtf"))) {
+ parser.parse(tis, handler, rootMetadata, ctx);
+ }
+ List<Metadata> metadatas = parser.getMetadata();
+
+ Metadata meta_jpg_exif = metadatas.get(1);//("testJPEG_EXIF_\u666E\u6797\u65AF\u987F.jpg");
+ Metadata meta_jpg = metadatas.get(3);//("testJPEG_\u666E\u6797\u65AF\u987F.jpg");
+
+ assertTrue(meta_jpg_exif != null);
+ assertTrue(meta_jpg != null);
+ // had to comment these out (when moving from 1.x to 2.x
+ // because AutoDetectParser within this module does not include image parsing.
+
+// assertTrue(Arrays.asList(meta_jpg_exif.getValues("dc:subject")).contains("serbor"));
+// assertTrue(meta_jpg.get("Comments").contains("Licensed to the Apache"));
+ //make sure old metadata doesn't linger between objects
+// assertFalse(Arrays.asList(meta_jpg.getValues("dc:subject")).contains("serbor"));
+ assertEquals("false", meta_jpg.get(RTFMetadata.THUMBNAIL));
+ assertEquals("false", meta_jpg_exif.get(RTFMetadata.THUMBNAIL));
+
+ assertEquals(25, meta_jpg.names().length);
+ assertEquals(25, meta_jpg_exif.names().length);
+ }
+
+ @Test
+ public void testMultipleNewlines() throws Exception {
+ String content = getXML("testRTFNewlines.rtf").xml;
+ content = content.replaceAll("[\r\n]+", " ");
+ assertContains("<body><p>one</p> " +
+ "<p /> " +
+ "<p>two</p> " +
+ "<p /> " +
+ "<p /> " +
+ "<p>three</p> " +
+ "<p /> " +
+ "<p /> " +
+ "<p /> " +
+ "<p>four</p>", content);
+ }
+
+ //TIKA-1010 test linked embedded doc
+ @Test
+ public void testEmbeddedLinkedDocument() throws Exception {
+ Set<MediaType> skipTypes = new HashSet<MediaType>();
+ skipTypes.add(MediaType.parse("application/x-emf"));
+ skipTypes.add(MediaType.parse("application/x-msmetafile"));
+
+ TrackingHandler tracker = new TrackingHandler(skipTypes);
+ try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) {
+ ContainerExtractor ex = new ParserContainerExtractor();
+ assertEquals(true, ex.isSupported(tis));
+ ex.extract(tis, ex, tracker);
+ }
+ //should gracefully skip link and not throw NPE, IOEx, etc
+ assertEquals(0, tracker.filenames.size());
+
+ tracker = new TrackingHandler();
+ try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) {
+ ContainerExtractor ex = new ParserContainerExtractor();
+ assertEquals(true, ex.isSupported(tis));
+ ex.extract(tis, ex, tracker);
+ }
+ //should gracefully skip link and not throw NPE, IOEx, etc
+ assertEquals(2, tracker.filenames.size());
+ }
+
+ private static class Pair {
+ final String fileName;
+ final String mimeType;
+ Pair(String fileName, String mimeType) {
+ this.fileName = fileName;
+ this.mimeType = mimeType;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/pom.xml b/tika-parser-modules/tika-parser-package-module/pom.xml
index 8d1238d..2feb22b 100644
--- a/tika-parser-modules/tika-parser-package-module/pom.xml
+++ b/tika-parser-modules/tika-parser-package-module/pom.xml
@@ -1,79 +1,79 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-modules</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-package-module</artifactId>
- <name>Apache Tika parser package module</name>
- <url>http://tika.apache.org/</url>
-
- <properties>
- <!-- NOTE: sync tukaani version with commons-compress -->
- <tukaani.version>1.5</tukaani.version>
- </properties>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>org.tukaani</groupId>
- <artifactId>xz</artifactId>
- <version>${tukaani.version}</version>
- </dependency>
- <dependency>
- <groupId>com.github.junrar</groupId>
- <artifactId>junrar</artifactId>
- <version>0.7</version>
- </dependency>
- <dependency>
- <groupId>commons-io</groupId>
- <artifactId>commons-io</artifactId>
- <version>${commons.io.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-compress</artifactId>
- <version>${commons.compress.version}</version>
- </dependency>
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-package-module</artifactId>
+ <name>Apache Tika parser package module</name>
+ <url>http://tika.apache.org/</url>
+
+ <properties>
+ <!-- NOTE: sync tukaani version with commons-compress -->
+ <tukaani.version>1.5</tukaani.version>
+ </properties>
+
+ <dependencies>
<dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-text-module</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
- </dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.tukaani</groupId>
+ <artifactId>xz</artifactId>
+ <version>${tukaani.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.github.junrar</groupId>
+ <artifactId>junrar</artifactId>
+ <version>0.7</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>${commons.io.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-compress</artifactId>
+ <version>${commons.compress.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-text-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
<dependency>
- <groupId>commons-codec</groupId>
- <artifactId>commons-codec</artifactId>
- <version>${codec.version}</version>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ <version>${codec.version}</version>
</dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/module/pkg/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/module/pkg/internal/Activator.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/module/pkg/internal/Activator.java
index 0fb71fa..2345029 100644
--- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/module/pkg/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/module/pkg/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.pkg.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
- @Override
- public void start(BundleContext context) throws Exception {
-
- registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
- }
-
- @Override
- public void stop(BundleContext context) throws Exception {
-
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.pkg.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+ @Override
+ public void start(BundleContext context) throws Exception {
+
+ registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+ }
+
+ @Override
+ public void stop(BundleContext context) throws Exception {
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java
index 32f0126..4143932 100644
--- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java
+++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java
@@ -1,112 +1,112 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.iwork;
-
-import java.util.Locale;
-
-/**
- * Utility class to allow for conversion from an integer to Roman numerals
- * or alpha-numeric symbols in line with Pages auto numbering formats.
- */
- class AutoPageNumberUtils {
-
- private static final String ALPHABET[] = { "A", "B", "C", "D", "E", "F", "G",
- "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T",
- "U", "V", "W", "X", "Y", "Z" };
-
- private static final int MAX = 26;
-
- public static String asAlphaNumeric(int i) {
- StringBuffer sbuff = new StringBuffer();
- int index = i % MAX;
- int ratio = i / MAX;
-
- if (index == 0) {
- ratio--;
- index = MAX;
- }
-
- for(int j = 0; j <= ratio; j++) {
- sbuff.append(ALPHABET[index - 1]); }
- return sbuff.toString();
- }
-
- public static String asAlphaNumericLower(int i) {
- return asAlphaNumeric(i).toLowerCase(Locale.ROOT);
- }
-
- /*
- * Code copied from jena.apache.org.
- * @see com.hp.hpl.jena.sparql.util.RomanNumeral
- */
- public static String asRomanNumerals(int i) {
- if ( i <= 0 )
- throw new NumberFormatException("Roman numerals are 1-3999 ("+i+")") ;
- if ( i > 3999 )
- throw new NumberFormatException("Roman numerals are 1-3999 ("+i+")") ;
- StringBuffer sbuff = new StringBuffer() ;
-
- i = i2r(sbuff, i, "M", 1000, "CM", 900, "D", 500, "CD", 400 ) ;
- i = i2r(sbuff, i, "C", 100, "XC", 90, "L", 50, "XL", 40 ) ;
- i = i2r(sbuff, i, "X", 10, "IX", 9, "V", 5, "IV", 4) ;
-
- while ( i >= 1 )
- {
- sbuff.append("I") ;
- i -= 1 ;
- }
- return sbuff.toString() ;
-
-
- }
-
- public static String asRomanNumeralsLower(int i) {
- return asRomanNumerals(i).toLowerCase(Locale.ROOT);
- }
-
- private static int i2r(StringBuffer sbuff, int i,
- String tens, int iTens,
- String nines, int iNines,
- String fives, int iFives,
- String fours, int iFours)
- {
- while ( i >= iTens )
- {
- sbuff.append(tens) ;
- i -= iTens ;
- }
-
- if ( i >= iNines )
- {
- sbuff.append(nines) ;
- i -= iNines;
- }
-
- if ( i >= iFives )
- {
- sbuff.append(fives) ;
- i -= iFives ;
- }
- if ( i >= iFours )
- {
- sbuff.append(fours) ;
- i -= iFours ;
- }
- return i ;
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import java.util.Locale;
+
+/**
+ * Utility class to allow for conversion from an integer to Roman numerals
+ * or alpha-numeric symbols in line with Pages auto numbering formats.
+ */
+ class AutoPageNumberUtils {
+
+ private static final String ALPHABET[] = { "A", "B", "C", "D", "E", "F", "G",
+ "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T",
+ "U", "V", "W", "X", "Y", "Z" };
+
+ private static final int MAX = 26;
+
+ public static String asAlphaNumeric(int i) {
+ StringBuffer sbuff = new StringBuffer();
+ int index = i % MAX;
+ int ratio = i / MAX;
+
+ if (index == 0) {
+ ratio--;
+ index = MAX;
+ }
+
+ for(int j = 0; j <= ratio; j++) {
+ sbuff.append(ALPHABET[index - 1]); }
+ return sbuff.toString();
+ }
+
+ public static String asAlphaNumericLower(int i) {
+ return asAlphaNumeric(i).toLowerCase(Locale.ROOT);
+ }
+
+ /*
+ * Code copied from jena.apache.org.
+ * @see com.hp.hpl.jena.sparql.util.RomanNumeral
+ */
+ public static String asRomanNumerals(int i) {
+ if ( i <= 0 )
+ throw new NumberFormatException("Roman numerals are 1-3999 ("+i+")") ;
+ if ( i > 3999 )
+ throw new NumberFormatException("Roman numerals are 1-3999 ("+i+")") ;
+ StringBuffer sbuff = new StringBuffer() ;
+
+ i = i2r(sbuff, i, "M", 1000, "CM", 900, "D", 500, "CD", 400 ) ;
+ i = i2r(sbuff, i, "C", 100, "XC", 90, "L", 50, "XL", 40 ) ;
+ i = i2r(sbuff, i, "X", 10, "IX", 9, "V", 5, "IV", 4) ;
+
+ while ( i >= 1 )
+ {
+ sbuff.append("I") ;
+ i -= 1 ;
+ }
+ return sbuff.toString() ;
+
+
+ }
+
+ public static String asRomanNumeralsLower(int i) {
+ return asRomanNumerals(i).toLowerCase(Locale.ROOT);
+ }
+
+ private static int i2r(StringBuffer sbuff, int i,
+ String tens, int iTens,
+ String nines, int iNines,
+ String fives, int iFives,
+ String fours, int iFours)
+ {
+ while ( i >= iTens )
+ {
+ sbuff.append(tens) ;
+ i -= iTens ;
+ }
+
+ if ( i >= iNines )
+ {
+ sbuff.append(nines) ;
+ i -= iNines;
+ }
+
+ if ( i >= iFives )
+ {
+ sbuff.append(fives) ;
+ i -= iFives ;
+ }
+ if ( i >= iFours )
+ {
+ sbuff.append(fours) ;
+ i -= iFours ;
+ }
+ return i ;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
index 1861931..79d82e8 100644
--- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
+++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
@@ -1,219 +1,219 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.iwork;
-
-import java.io.BufferedInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Set;
-
-import javax.xml.namespace.QName;
-
-import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
-import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
-import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
-import org.apache.commons.compress.archivers.zip.ZipFile;
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.detect.XmlRootExtractor;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.OfflineContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * A parser for the IWork container files. This includes *.key, *.pages and *.numbers files.
- * This parser delegates the relevant entries to a {@link ContentHandler} that parsers the content.
- *
- * Currently supported formats:
- * <ol>
- * <li>Keynote format version 2.x. Currently only tested with Keynote version 5.x
- * <li>Pages format version 1.x. Currently only tested with Pages version 4.0.x
- * <li>Numbers format version 1.x. Currently only tested with Numbers version 2.0.x
- * </ol>
- */
-public class IWorkPackageParser extends AbstractParser {
-
- /** Serial version UID */
- private static final long serialVersionUID = -2160322853809682372L;
-
- /**
- * Which files within an iWork file contain the actual content?
- */
- public final static Set<String> IWORK_CONTENT_ENTRIES = Collections.unmodifiableSet(
- new HashSet<String>(Arrays.asList("index.apxl", "index.xml", "presentation.apxl"))
- );
- /**
- * All iWork files contain one of these, so we can detect based on it
- */
- public final static String IWORK_COMMON_ENTRY = "buildVersionHistory.plist";
-
- public enum IWORKDocumentType {
- KEYNOTE("http://developer.apple.com/namespaces/keynote2", "presentation", MediaType.application("vnd.apple.keynote")),
- NUMBERS("http://developer.apple.com/namespaces/ls", "document", MediaType.application("vnd.apple.numbers")),
- PAGES("http://developer.apple.com/namespaces/sl", "document", MediaType.application("vnd.apple.pages")),
- ENCRYPTED(null, null, MediaType.application("x-tika-iworks-protected"));
-
- private final String namespace;
- private final String part;
- private final MediaType type;
-
- IWORKDocumentType(String namespace, String part, MediaType type) {
- this.namespace = namespace;
- this.part = part;
- this.type = type;
- }
-
- public String getNamespace() {
- return namespace;
- }
-
- public String getPart() {
- return part;
- }
-
- public MediaType getType() {
- return type;
- }
-
- public static IWORKDocumentType detectType(ZipArchiveEntry entry, ZipFile zip) {
- try {
- if (entry == null) {
- return null;
- }
-
- try (InputStream stream = zip.getInputStream(entry)) {
- return detectType(stream);
- }
- } catch (IOException e) {
- return null;
- }
- }
-
- public static IWORKDocumentType detectType(ZipArchiveEntry entry, ZipArchiveInputStream zip) {
- if (entry == null) {
- return null;
- }
-
- return detectType(zip);
- }
-
- private static IWORKDocumentType detectType(InputStream stream) {
- QName qname = new XmlRootExtractor().extractRootElement(stream);
- if (qname != null) {
- String uri = qname.getNamespaceURI();
- String local = qname.getLocalPart();
-
- for (IWORKDocumentType type : values()) {
- if(type.getNamespace().equals(uri) &&
- type.getPart().equals(local)) {
- return type;
- }
- }
- } else {
- // There was a problem with extracting the root type
- // Password Protected iWorks files are funny, but we can usually
- // spot them because they encrypt part of the zip stream
- try {
- stream.read();
- } catch(UnsupportedZipFeatureException e) {
- // Compression field was likely encrypted
- return ENCRYPTED;
- } catch(Exception ignored) {
- }
- }
- return null;
- }
- }
-
- /**
- * This parser handles all iWorks formats.
- */
- private final static Set<MediaType> supportedTypes =
- Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- MediaType.application("vnd.apple.iwork"),
- IWORKDocumentType.KEYNOTE.getType(),
- IWORKDocumentType.NUMBERS.getType(),
- IWORKDocumentType.PAGES.getType()
- )));
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return supportedTypes;
- }
-
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- ZipArchiveInputStream zip = new ZipArchiveInputStream(stream);
- ZipArchiveEntry entry = zip.getNextZipEntry();
-
- while (entry != null) {
- if (!IWORK_CONTENT_ENTRIES.contains(entry.getName())) {
- entry = zip.getNextZipEntry();
- continue;
- }
-
- InputStream entryStream = new BufferedInputStream(zip, 4096);
- entryStream.mark(4096);
- IWORKDocumentType type = IWORKDocumentType.detectType(entryStream);
- entryStream.reset();
-
- if(type != null) {
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- ContentHandler contentHandler;
-
- switch(type) {
- case KEYNOTE:
- contentHandler = new KeynoteContentHandler(xhtml, metadata);
- break;
- case NUMBERS:
- contentHandler = new NumbersContentHandler(xhtml, metadata);
- break;
- case PAGES:
- contentHandler = new PagesContentHandler(xhtml, metadata);
- break;
- case ENCRYPTED:
- // We can't do anything for the file right now
- contentHandler = null;
- break;
- default:
- throw new TikaException("Unhandled iWorks file " + type);
- }
-
- metadata.add(Metadata.CONTENT_TYPE, type.getType().toString());
- xhtml.startDocument();
- if (contentHandler != null) {
- context.getSAXParser().parse(
- new CloseShieldInputStream(entryStream),
- new OfflineContentHandler(contentHandler)
- );
- }
- xhtml.endDocument();
- }
-
- entry = zip.getNextZipEntry();
- }
- // Don't close the zip InputStream (TIKA-1117).
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import javax.xml.namespace.QName;
+
+import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.ZipFile;
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.detect.XmlRootExtractor;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * A parser for the IWork container files. This includes *.key, *.pages and *.numbers files.
+ * This parser delegates the relevant entries to a {@link ContentHandler} that parsers the content.
+ *
+ * Currently supported formats:
+ * <ol>
+ * <li>Keynote format version 2.x. Currently only tested with Keynote version 5.x
+ * <li>Pages format version 1.x. Currently only tested with Pages version 4.0.x
+ * <li>Numbers format version 1.x. Currently only tested with Numbers version 2.0.x
+ * </ol>
+ */
+public class IWorkPackageParser extends AbstractParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = -2160322853809682372L;
+
+ /**
+ * Which files within an iWork file contain the actual content?
+ */
+ public final static Set<String> IWORK_CONTENT_ENTRIES = Collections.unmodifiableSet(
+ new HashSet<String>(Arrays.asList("index.apxl", "index.xml", "presentation.apxl"))
+ );
+ /**
+ * All iWork files contain one of these, so we can detect based on it
+ */
+ public final static String IWORK_COMMON_ENTRY = "buildVersionHistory.plist";
+
+ public enum IWORKDocumentType {
+ KEYNOTE("http://developer.apple.com/namespaces/keynote2", "presentation", MediaType.application("vnd.apple.keynote")),
+ NUMBERS("http://developer.apple.com/namespaces/ls", "document", MediaType.application("vnd.apple.numbers")),
+ PAGES("http://developer.apple.com/namespaces/sl", "document", MediaType.application("vnd.apple.pages")),
+ ENCRYPTED(null, null, MediaType.application("x-tika-iworks-protected"));
+
+ private final String namespace;
+ private final String part;
+ private final MediaType type;
+
+ IWORKDocumentType(String namespace, String part, MediaType type) {
+ this.namespace = namespace;
+ this.part = part;
+ this.type = type;
+ }
+
+ public String getNamespace() {
+ return namespace;
+ }
+
+ public String getPart() {
+ return part;
+ }
+
+ public MediaType getType() {
+ return type;
+ }
+
+ public static IWORKDocumentType detectType(ZipArchiveEntry entry, ZipFile zip) {
+ try {
+ if (entry == null) {
+ return null;
+ }
+
+ try (InputStream stream = zip.getInputStream(entry)) {
+ return detectType(stream);
+ }
+ } catch (IOException e) {
+ return null;
+ }
+ }
+
+ public static IWORKDocumentType detectType(ZipArchiveEntry entry, ZipArchiveInputStream zip) {
+ if (entry == null) {
+ return null;
+ }
+
+ return detectType(zip);
+ }
+
+ private static IWORKDocumentType detectType(InputStream stream) {
+ QName qname = new XmlRootExtractor().extractRootElement(stream);
+ if (qname != null) {
+ String uri = qname.getNamespaceURI();
+ String local = qname.getLocalPart();
+
+ for (IWORKDocumentType type : values()) {
+ if(type.getNamespace().equals(uri) &&
+ type.getPart().equals(local)) {
+ return type;
+ }
+ }
+ } else {
+ // There was a problem with extracting the root type
+ // Password Protected iWorks files are funny, but we can usually
+ // spot them because they encrypt part of the zip stream
+ try {
+ stream.read();
+ } catch(UnsupportedZipFeatureException e) {
+ // Compression field was likely encrypted
+ return ENCRYPTED;
+ } catch(Exception ignored) {
+ }
+ }
+ return null;
+ }
+ }
+
+ /**
+ * This parser handles all iWorks formats.
+ */
+ private final static Set<MediaType> supportedTypes =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.application("vnd.apple.iwork"),
+ IWORKDocumentType.KEYNOTE.getType(),
+ IWORKDocumentType.NUMBERS.getType(),
+ IWORKDocumentType.PAGES.getType()
+ )));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return supportedTypes;
+ }
+
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ ZipArchiveInputStream zip = new ZipArchiveInputStream(stream);
+ ZipArchiveEntry entry = zip.getNextZipEntry();
+
+ while (entry != null) {
+ if (!IWORK_CONTENT_ENTRIES.contains(entry.getName())) {
+ entry = zip.getNextZipEntry();
+ continue;
+ }
+
+ InputStream entryStream = new BufferedInputStream(zip, 4096);
+ entryStream.mark(4096);
+ IWORKDocumentType type = IWORKDocumentType.detectType(entryStream);
+ entryStream.reset();
+
+ if(type != null) {
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ ContentHandler contentHandler;
+
+ switch(type) {
+ case KEYNOTE:
+ contentHandler = new KeynoteContentHandler(xhtml, metadata);
+ break;
+ case NUMBERS:
+ contentHandler = new NumbersContentHandler(xhtml, metadata);
+ break;
+ case PAGES:
+ contentHandler = new PagesContentHandler(xhtml, metadata);
+ break;
+ case ENCRYPTED:
+ // We can't do anything for the file right now
+ contentHandler = null;
+ break;
+ default:
+ throw new TikaException("Unhandled iWorks file " + type);
+ }
+
+ metadata.add(Metadata.CONTENT_TYPE, type.getType().toString());
+ xhtml.startDocument();
+ if (contentHandler != null) {
+ context.getSAXParser().parse(
+ new CloseShieldInputStream(entryStream),
+ new OfflineContentHandler(contentHandler)
+ );
+ }
+ xhtml.endDocument();
+ }
+
+ entry = zip.getNextZipEntry();
+ }
+ // Don't close the zip InputStream (TIKA-1117).
+ }
+
+}
[11/39] tika git commit: Convert new lines from windows to unix
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
index dcc6508..6cda282 100644
--- a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
+++ b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
@@ -1,466 +1,466 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.iwork;
-
-import static org.apache.tika.TikaTest.assertContains;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Before;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-/**
- * Tests if the IWork parser parses the content and metadata properly of the supported formats.
- */
-public class IWorkParserTest {
-
- private IWorkPackageParser iWorkParser;
- private ParseContext parseContext;
-
- @Before
- public void setUp() {
- iWorkParser = new IWorkPackageParser();
- parseContext = new ParseContext();
- parseContext.set(Parser.class, new AutoDetectParser());
- }
-
- /**
- * Check the given InputStream is not closed by the Parser (TIKA-1117).
- *
- * @throws Exception
- */
- @Test
- public void testStreamNotClosed() throws Exception {
- InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testKeynote.key");
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- iWorkParser.parse(input, handler, metadata, parseContext);
- input.read(); // Will throw an Exception if the stream was already closed.
- }
-
- @Test
- public void testParseKeynote() throws Exception {
- InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testKeynote.key");
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- iWorkParser.parse(input, handler, metadata, parseContext);
-
- // Make sure enough keys came through
- // (Exact numbers will vary based on composites)
- assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 6);
- List<String> metadataKeys = Arrays.asList(metadata.names());
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.SLIDE_COUNT.getName()));
-// assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Office.SLIDE_COUNT.getName()));
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
-
- // Check the metadata values
- assertEquals("application/vnd.apple.keynote", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("3", metadata.get(Metadata.SLIDE_COUNT));
- assertEquals("1024", metadata.get(KeynoteContentHandler.PRESENTATION_WIDTH));
- assertEquals("768", metadata.get(KeynoteContentHandler.PRESENTATION_HEIGHT));
- assertEquals("Tika user", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Apache tika", metadata.get(TikaCoreProperties.TITLE));
-
- String content = handler.toString();
- assertContains("A sample presentation", content);
- assertContains("For the Apache Tika project", content);
- assertContains("Slide 1", content);
- assertContains("Some random text for the sake of testability.", content);
- assertContains("A nice comment", content);
- assertContains("A nice note", content);
-
- // test table data
- assertContains("Cell one", content);
- assertContains("Cell two", content);
- assertContains("Cell three", content);
- assertContains("Cell four", content);
- assertContains("Cell 5", content);
- assertContains("Cell six", content);
- assertContains("7", content);
- assertContains("Cell eight", content);
- assertContains("5/5/1985", content);
- }
-
- // TIKA-910
- @Test
- public void testKeynoteTextBoxes() throws Exception {
- InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testTextBoxes.key");
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- iWorkParser.parse(input, handler, metadata, parseContext);
-
- String content = handler.toString();
- assertTrue(content.replaceAll("\\s+", " ").contains("text1 text2 text3"));
- }
-
- // TIKA-910
- @Test
- public void testKeynoteBulletPoints() throws Exception {
- InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testBulletPoints.key");
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- iWorkParser.parse(input, handler, metadata, parseContext);
-
- String content = handler.toString();
- assertTrue(content.replaceAll("\\s+", " ").contains("bullet point 1 bullet point 2 bullet point 3"));
- }
-
- // TIKA-923
- @Test
- public void testKeynoteTables() throws Exception {
- InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testTables.key");
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- iWorkParser.parse(input, handler, metadata, parseContext);
-
- String content = handler.toString();
- content = content.replaceAll("\\s+", " ");
- assertContains("row 1 row 2 row 3", content);
- }
-
- // TIKA-923
- @Test
- public void testKeynoteMasterSlideTable() throws Exception {
- InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testMasterSlideTable.key");
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- iWorkParser.parse(input, handler, metadata, parseContext);
-
- String content = handler.toString();
- content = content.replaceAll("\\s+", " ");
- assertContains("master row 1", content);
- assertContains("master row 2", content);
- assertContains("master row 3", content);
- }
-
- @Test
- public void testParsePages() throws Exception {
- InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPages.pages");
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- iWorkParser.parse(input, handler, metadata, parseContext);
-
- // Make sure enough keys came through
- // (Exact numbers will vary based on composites)
- assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 50);
- List<String> metadataKeys = Arrays.asList(metadata.names());
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.PAGE_COUNT.getName()));
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.LAST_MODIFIED.getName()));
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.LANGUAGE));
-
- // Check the metadata values
- assertEquals("application/vnd.apple.pages", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Tika user", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Apache tika", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("2010-05-09T21:34:38+0200", metadata.get(Metadata.CREATION_DATE));
- assertEquals("2010-05-09T23:50:36+0200", metadata.get(Metadata.LAST_MODIFIED));
- assertEquals("en", metadata.get(TikaCoreProperties.LANGUAGE));
- assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
-
- String content = handler.toString();
-
- // text on page 1
- assertContains("Sample pages document", content);
- assertContains("Some plain text to parse.", content);
- assertContains("Cell one", content);
- assertContains("Cell two", content);
- assertContains("Cell three", content);
- assertContains("Cell four", content);
- assertContains("Cell five", content);
- assertContains("Cell six", content);
- assertContains("Cell seven", content);
- assertContains("Cell eight", content);
- assertContains("Cell nine", content);
- assertContains("Both Pages 1.x and Keynote 2.x", content); // ...
-
- // text on page 2
- assertContains("A second page....", content);
- assertContains("Extensible Markup Language", content); // ...
- }
-
- // TIKA-904
- @Test
- public void testPagesLayoutMode() throws Exception {
- InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesLayout.pages");
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
-
- iWorkParser.parse(input, handler, metadata, parseContext);
-
- String content = handler.toString();
- assertContains("text box 1 - here is some text", content);
- assertContains("created in a text box in layout mode", content);
- assertContains("text box 2 - more text!@!$@#", content);
- assertContains("this is text inside of a green box", content);
- assertContains("text inside of a green circle", content);
- }
-
- @Test
- public void testParseNumbers() throws Exception {
- InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testNumbers.numbers");
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
-
- iWorkParser.parse(input, handler, metadata, parseContext);
-
- // Make sure enough keys came through
- // (Exact numbers will vary based on composites)
- assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 8);
- List<String> metadataKeys = Arrays.asList(metadata.names());
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.PAGE_COUNT.getName()));
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.COMMENTS.getName()));
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.TITLE));
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
-
- // Check the metadata values
- assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
- assertEquals("Tika User", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Account checking", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("a comment", metadata.get(TikaCoreProperties.COMMENTS));
-
- String content = handler.toString();
- assertContains("Category", content);
- assertContains("Home", content);
- assertContains("-226", content);
- assertContains("-137.5", content);
- assertContains("Checking Account: 300545668", content);
- assertContains("4650", content);
- assertContains("Credit Card", content);
- assertContains("Groceries", content);
- assertContains("-210", content);
- assertContains("Food", content);
- assertContains("Try adding your own account transactions to this table.", content);
- }
-
- // TIKA- 924
- @Test
- public void testParseNumbersTableNames() throws Exception {
- InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/tableNames.numbers");
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- iWorkParser.parse(input, handler, metadata, parseContext);
- String content = handler.toString();
- assertContains("This is the main table", content);
- }
-
- @Test
- public void testParseNumbersTableHeaders() throws Exception {
- InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/tableHeaders.numbers");
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- iWorkParser.parse(input, handler, metadata, parseContext);
-
- String content = handler.toString();
- for(int header=1;header<=5;header++) {
- assertContains("header" + header, content);
- }
- for(int row=1;row<=3;row++) {
- assertContains("row" + row, content);
- }
- }
-
- /**
- * We don't currently support password protected Pages files, as
- * we don't know how the encryption works (it's not regular Zip
- * Encryption). See TIKA-903 for details
- */
- @Test
- public void testParsePagesPasswordProtected() throws Exception {
- // Document password is "tika", but we can't use that yet...
- InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesPwdProtected.pages");
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
-
- iWorkParser.parse(input, handler, metadata, parseContext);
-
- // Content will be empty
- String content = handler.toString();
- assertEquals("", content);
-
- // Will have been identified as encrypted
- assertEquals("application/x-tika-iworks-protected", metadata.get(Metadata.CONTENT_TYPE));
- }
-
- /**
- * Check we get headers, footers and footnotes from Pages
- */
- @Test
- public void testParsePagesHeadersFootersFootnotes() throws Exception {
- String footnote = "Footnote: Do a lot of people really use iWork?!?!";
- String header = "THIS IS SOME HEADER TEXT";
- String footer = "THIS IS SOME FOOTER TEXT\t1";
- String footer2 = "THIS IS SOME FOOTER TEXT\t2";
-
- InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersFootnotes.pages");
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
-
- iWorkParser.parse(input, handler, metadata, parseContext);
- String contents = handler.toString();
-
- // Check regular text
- assertContains("Both Pages 1.x", contents); // P1
- assertContains("understanding the Pages document", contents); // P1
- assertContains("should be page 2", contents); // P2
-
- // Check for headers, footers and footnotes
- assertContains(header, contents);
- assertContains(footer, contents);
- assertContains(footer2, contents);
- assertContains(footnote, contents);
- }
-
- /**
- * Check we get upper-case Roman numerals within the footer for AutoPageNumber.
- */
- @Test
- public void testParsePagesHeadersFootersRomanUpper() throws Exception {
- String header = "THIS IS SOME HEADER TEXT";
- String footer = "THIS IS SOME FOOTER TEXT\tI";
- String footer2 = "THIS IS SOME FOOTER TEXT\tII";
-
- InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersRomanUpper.pages");
- ContentHandler handler = new BodyContentHandler();
-
- iWorkParser.parse(input, handler, new Metadata(), parseContext);
- String contents = handler.toString();
-
- // Check for headers, footers and footnotes
- assertContains(header, contents);
- assertContains(footer, contents);
- assertContains(footer2, contents);
- }
-
- /**
- * Check we get lower-case Roman numerals within the footer for AutoPageNumber.
- */
- @Test
- public void testParsePagesHeadersFootersRomanLower() throws Exception {
- String header = "THIS IS SOME HEADER TEXT";
- String footer = "THIS IS SOME FOOTER TEXT\ti";
- String footer2 = "THIS IS SOME FOOTER TEXT\tii";
-
- InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersRomanLower.pages");
- ContentHandler handler = new BodyContentHandler();
-
- iWorkParser.parse(input, handler, new Metadata(), parseContext);
- String contents = handler.toString();
-
- // Check for headers, footers and footnotes
- assertContains(header, contents);
- assertContains(footer, contents);
- assertContains(footer2, contents);
- }
-
- /**
- * Check we get upper-case alpha-numeric letters within the footer for AutoPageNumber.
- */
- @Test
- public void testParsePagesHeadersAlphaUpper() throws Exception {
- String header = "THIS IS SOME HEADER TEXT\tA";
- String footer = "THIS IS SOME FOOTER TEXT\tA";
- String footer2 = "THIS IS SOME FOOTER TEXT\tB";
-
- InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersAlphaUpper.pages");
- ContentHandler handler = new BodyContentHandler();
-
- iWorkParser.parse(input, handler, new Metadata(), parseContext);
- String contents = handler.toString();
-
- // Check for headers, footers and footnotes
- assertContains(header, contents);
- assertContains(footer, contents);
- assertContains(footer2, contents);
- }
-
- /**
- * Check we get lower-case alpha-numeric letters within the footer for AutoPageNumber.
- */
- @Test
- public void testParsePagesHeadersAlphaLower() throws Exception {
- String header = "THIS IS SOME HEADER TEXT";
- String footer = "THIS IS SOME FOOTER TEXT\ta";
- String footer2 = "THIS IS SOME FOOTER TEXT\tb";
-
- InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersAlphaLower.pages");
- ContentHandler handler = new BodyContentHandler();
-
- iWorkParser.parse(input, handler, new Metadata(), parseContext);
- String contents = handler.toString();
-
- // Check for headers, footers and footnotes
- assertContains(header, contents);
- assertContains(footer, contents);
- assertContains(footer2, contents);
- }
-
- /**
- * Check we get annotations (eg comments) from Pages
- */
- @Test
- public void testParsePagesAnnotations() throws Exception {
- String commentA = "comment about the APXL file";
- String commentB = "comment about UIMA";
-
- InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesComments.pages");
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
-
- iWorkParser.parse(input, handler, metadata, parseContext);
- String contents = handler.toString();
-
- // Check regular text
- assertContains("Both Pages 1.x", contents); // P1
- assertContains("understanding the Pages document", contents); // P1
- assertContains("should be page 2", contents); // P2
-
- // Check for comments
- assertContains(commentA, contents);
- assertContains(commentB, contents);
- }
-
- // TIKA-918
- @Test
- public void testNumbersExtractChartNames() throws Exception {
- InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testNumbersCharts.numbers");
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- iWorkParser.parse(input, handler, metadata, parseContext);
- String contents = handler.toString();
- assertContains("Expenditure by Category", contents);
- assertContains("Currency Chart name", contents);
- assertContains("Chart 2", contents);
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Before;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Tests that the iWork parser correctly parses the content and metadata of the supported formats.
+ */
+public class IWorkParserTest {
+
+ private IWorkPackageParser iWorkParser;
+ private ParseContext parseContext;
+
+ @Before
+ public void setUp() {
+ iWorkParser = new IWorkPackageParser();
+ parseContext = new ParseContext();
+ parseContext.set(Parser.class, new AutoDetectParser());
+ }
+
+ /**
+ * Checks that the given InputStream is not closed by the parser (TIKA-1117).
+ *
+ * @throws Exception
+ */
+ @Test
+ public void testStreamNotClosed() throws Exception {
+ InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testKeynote.key");
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ iWorkParser.parse(input, handler, metadata, parseContext);
+ input.read(); // Will throw an Exception if the stream was already closed.
+ }
+
+ @Test
+ public void testParseKeynote() throws Exception {
+ InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testKeynote.key");
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ iWorkParser.parse(input, handler, metadata, parseContext);
+
+ // Make sure enough keys came through
+ // (Exact numbers will vary based on composites)
+ assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 6);
+ List<String> metadataKeys = Arrays.asList(metadata.names());
+ assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
+ assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.SLIDE_COUNT.getName()));
+// assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Office.SLIDE_COUNT.getName()));
+ assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
+ assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
+
+ // Check the metadata values
+ assertEquals("application/vnd.apple.keynote", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("3", metadata.get(Metadata.SLIDE_COUNT));
+ assertEquals("1024", metadata.get(KeynoteContentHandler.PRESENTATION_WIDTH));
+ assertEquals("768", metadata.get(KeynoteContentHandler.PRESENTATION_HEIGHT));
+ assertEquals("Tika user", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Apache tika", metadata.get(TikaCoreProperties.TITLE));
+
+ String content = handler.toString();
+ assertContains("A sample presentation", content);
+ assertContains("For the Apache Tika project", content);
+ assertContains("Slide 1", content);
+ assertContains("Some random text for the sake of testability.", content);
+ assertContains("A nice comment", content);
+ assertContains("A nice note", content);
+
+ // test table data
+ assertContains("Cell one", content);
+ assertContains("Cell two", content);
+ assertContains("Cell three", content);
+ assertContains("Cell four", content);
+ assertContains("Cell 5", content);
+ assertContains("Cell six", content);
+ assertContains("7", content);
+ assertContains("Cell eight", content);
+ assertContains("5/5/1985", content);
+ }
+
+ // TIKA-910
+ @Test
+ public void testKeynoteTextBoxes() throws Exception {
+ InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testTextBoxes.key");
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ iWorkParser.parse(input, handler, metadata, parseContext);
+
+ String content = handler.toString();
+ assertTrue(content.replaceAll("\\s+", " ").contains("text1 text2 text3"));
+ }
+
+ // TIKA-910
+ @Test
+ public void testKeynoteBulletPoints() throws Exception {
+ InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testBulletPoints.key");
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ iWorkParser.parse(input, handler, metadata, parseContext);
+
+ String content = handler.toString();
+ assertTrue(content.replaceAll("\\s+", " ").contains("bullet point 1 bullet point 2 bullet point 3"));
+ }
+
+ // TIKA-923
+ @Test
+ public void testKeynoteTables() throws Exception {
+ InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testTables.key");
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ iWorkParser.parse(input, handler, metadata, parseContext);
+
+ String content = handler.toString();
+ content = content.replaceAll("\\s+", " ");
+ assertContains("row 1 row 2 row 3", content);
+ }
+
+ // TIKA-923
+ @Test
+ public void testKeynoteMasterSlideTable() throws Exception {
+ InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testMasterSlideTable.key");
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ iWorkParser.parse(input, handler, metadata, parseContext);
+
+ String content = handler.toString();
+ content = content.replaceAll("\\s+", " ");
+ assertContains("master row 1", content);
+ assertContains("master row 2", content);
+ assertContains("master row 3", content);
+ }
+
+ @Test
+ public void testParsePages() throws Exception {
+ InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPages.pages");
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ iWorkParser.parse(input, handler, metadata, parseContext);
+
+ // Make sure enough keys came through
+ // (Exact numbers will vary based on composites)
+ assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 50);
+ List<String> metadataKeys = Arrays.asList(metadata.names());
+ assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
+ assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.PAGE_COUNT.getName()));
+ assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
+ assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
+ assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.LAST_MODIFIED.getName()));
+ assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.LANGUAGE));
+
+ // Check the metadata values
+ assertEquals("application/vnd.apple.pages", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Tika user", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Apache tika", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("2010-05-09T21:34:38+0200", metadata.get(Metadata.CREATION_DATE));
+ assertEquals("2010-05-09T23:50:36+0200", metadata.get(Metadata.LAST_MODIFIED));
+ assertEquals("en", metadata.get(TikaCoreProperties.LANGUAGE));
+ assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
+
+ String content = handler.toString();
+
+ // text on page 1
+ assertContains("Sample pages document", content);
+ assertContains("Some plain text to parse.", content);
+ assertContains("Cell one", content);
+ assertContains("Cell two", content);
+ assertContains("Cell three", content);
+ assertContains("Cell four", content);
+ assertContains("Cell five", content);
+ assertContains("Cell six", content);
+ assertContains("Cell seven", content);
+ assertContains("Cell eight", content);
+ assertContains("Cell nine", content);
+ assertContains("Both Pages 1.x and Keynote 2.x", content); // ...
+
+ // text on page 2
+ assertContains("A second page....", content);
+ assertContains("Extensible Markup Language", content); // ...
+ }
+
+ // TIKA-904
+ @Test
+ public void testPagesLayoutMode() throws Exception {
+ InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesLayout.pages");
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+
+ iWorkParser.parse(input, handler, metadata, parseContext);
+
+ String content = handler.toString();
+ assertContains("text box 1 - here is some text", content);
+ assertContains("created in a text box in layout mode", content);
+ assertContains("text box 2 - more text!@!$@#", content);
+ assertContains("this is text inside of a green box", content);
+ assertContains("text inside of a green circle", content);
+ }
+
+ @Test
+ public void testParseNumbers() throws Exception {
+ InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testNumbers.numbers");
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+
+ iWorkParser.parse(input, handler, metadata, parseContext);
+
+ // Make sure enough keys came through
+ // (Exact numbers will vary based on composites)
+ assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 8);
+ List<String> metadataKeys = Arrays.asList(metadata.names());
+ assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
+ assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.PAGE_COUNT.getName()));
+ assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
+ assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.COMMENTS.getName()));
+ assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.TITLE));
+ assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
+
+ // Check the metadata values
+ assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
+ assertEquals("Tika User", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Account checking", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("a comment", metadata.get(TikaCoreProperties.COMMENTS));
+
+ String content = handler.toString();
+ assertContains("Category", content);
+ assertContains("Home", content);
+ assertContains("-226", content);
+ assertContains("-137.5", content);
+ assertContains("Checking Account: 300545668", content);
+ assertContains("4650", content);
+ assertContains("Credit Card", content);
+ assertContains("Groceries", content);
+ assertContains("-210", content);
+ assertContains("Food", content);
+ assertContains("Try adding your own account transactions to this table.", content);
+ }
+
+ // TIKA-924
+ @Test
+ public void testParseNumbersTableNames() throws Exception {
+ InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/tableNames.numbers");
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ iWorkParser.parse(input, handler, metadata, parseContext);
+ String content = handler.toString();
+ assertContains("This is the main table", content);
+ }
+
+    /**
+     * Checks that the strings header1..header5 and row1..row3 all appear in
+     * the text extracted from tableHeaders.numbers.
+     */
+    @Test
+    public void testParseNumbersTableHeaders() throws Exception {
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        // Close the test resource once parsing is finished; nothing else in this method does.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/tableHeaders.numbers")) {
+            iWorkParser.parse(input, handler, metadata, parseContext);
+        }
+
+        String content = handler.toString();
+        for (int header = 1; header <= 5; header++) {
+            assertContains("header" + header, content);
+        }
+        for (int row = 1; row <= 3; row++) {
+            assertContains("row" + row, content);
+        }
+    }
+
+    /**
+     * We don't currently support password protected Pages files, as
+     * we don't know how the encryption works (it's not regular Zip
+     * Encryption). See TIKA-903 for details.
+     * <p>
+     * The test therefore expects empty content and the
+     * {@code application/x-tika-iworks-protected} content type.
+     */
+    @Test
+    public void testParsePagesPasswordProtected() throws Exception {
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+
+        // Document password is "tika", but we can't use that yet...
+        // Close the test resource once parsing is finished; nothing else in this method does.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesPwdProtected.pages")) {
+            iWorkParser.parse(input, handler, metadata, parseContext);
+        }
+
+        // Content will be empty
+        String content = handler.toString();
+        assertEquals("", content);
+
+        // Will have been identified as encrypted
+        assertEquals("application/x-tika-iworks-protected", metadata.get(Metadata.CONTENT_TYPE));
+    }
+
+    /**
+     * Check we get headers, footers and footnotes from Pages
+     */
+    @Test
+    public void testParsePagesHeadersFootersFootnotes() throws Exception {
+        String footnote = "Footnote: Do a lot of people really use iWork?!?!";
+        String header = "THIS IS SOME HEADER TEXT";
+        String footer = "THIS IS SOME FOOTER TEXT\t1";
+        String footer2 = "THIS IS SOME FOOTER TEXT\t2";
+
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+
+        // Close the test resource once parsing is finished; nothing else in this method does.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersFootnotes.pages")) {
+            iWorkParser.parse(input, handler, metadata, parseContext);
+        }
+        String contents = handler.toString();
+
+        // Check regular text
+        assertContains("Both Pages 1.x", contents); // P1
+        assertContains("understanding the Pages document", contents); // P1
+        assertContains("should be page 2", contents); // P2
+
+        // Check for headers, footers and footnotes
+        assertContains(header, contents);
+        assertContains(footer, contents);
+        assertContains(footer2, contents);
+        assertContains(footnote, contents);
+    }
+
+    /**
+     * Check we get upper-case Roman numerals within the footer for AutoPageNumber.
+     */
+    @Test
+    public void testParsePagesHeadersFootersRomanUpper() throws Exception {
+        String header = "THIS IS SOME HEADER TEXT";
+        String footer = "THIS IS SOME FOOTER TEXT\tI";
+        String footer2 = "THIS IS SOME FOOTER TEXT\tII";
+
+        ContentHandler handler = new BodyContentHandler();
+
+        // Close the test resource once parsing is finished; nothing else in this method does.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersRomanUpper.pages")) {
+            iWorkParser.parse(input, handler, new Metadata(), parseContext);
+        }
+        String contents = handler.toString();
+
+        // Check for the header and both footers
+        assertContains(header, contents);
+        assertContains(footer, contents);
+        assertContains(footer2, contents);
+    }
+
+    /**
+     * Check we get lower-case Roman numerals within the footer for AutoPageNumber.
+     */
+    @Test
+    public void testParsePagesHeadersFootersRomanLower() throws Exception {
+        String header = "THIS IS SOME HEADER TEXT";
+        String footer = "THIS IS SOME FOOTER TEXT\ti";
+        String footer2 = "THIS IS SOME FOOTER TEXT\tii";
+
+        ContentHandler handler = new BodyContentHandler();
+
+        // Close the test resource once parsing is finished; nothing else in this method does.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersRomanLower.pages")) {
+            iWorkParser.parse(input, handler, new Metadata(), parseContext);
+        }
+        String contents = handler.toString();
+
+        // Check for the header and both footers
+        assertContains(header, contents);
+        assertContains(footer, contents);
+        assertContains(footer2, contents);
+    }
+
+    /**
+     * Check we get upper-case letters (A, B, ...) within the footer for AutoPageNumber.
+     */
+    @Test
+    public void testParsePagesHeadersAlphaUpper() throws Exception {
+        // NOTE(review): unlike the sibling tests, the expected header here also
+        // carries a page label ("\tA") - confirm against the test document.
+        String header = "THIS IS SOME HEADER TEXT\tA";
+        String footer = "THIS IS SOME FOOTER TEXT\tA";
+        String footer2 = "THIS IS SOME FOOTER TEXT\tB";
+
+        ContentHandler handler = new BodyContentHandler();
+
+        // Close the test resource once parsing is finished; nothing else in this method does.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersAlphaUpper.pages")) {
+            iWorkParser.parse(input, handler, new Metadata(), parseContext);
+        }
+        String contents = handler.toString();
+
+        // Check for the header and both footers
+        assertContains(header, contents);
+        assertContains(footer, contents);
+        assertContains(footer2, contents);
+    }
+
+    /**
+     * Check we get lower-case letters (a, b, ...) within the footer for AutoPageNumber.
+     */
+    @Test
+    public void testParsePagesHeadersAlphaLower() throws Exception {
+        String header = "THIS IS SOME HEADER TEXT";
+        String footer = "THIS IS SOME FOOTER TEXT\ta";
+        String footer2 = "THIS IS SOME FOOTER TEXT\tb";
+
+        ContentHandler handler = new BodyContentHandler();
+
+        // Close the test resource once parsing is finished; nothing else in this method does.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersAlphaLower.pages")) {
+            iWorkParser.parse(input, handler, new Metadata(), parseContext);
+        }
+        String contents = handler.toString();
+
+        // Check for the header and both footers
+        assertContains(header, contents);
+        assertContains(footer, contents);
+        assertContains(footer2, contents);
+    }
+
+    /**
+     * Check we get annotations (eg comments) from Pages
+     */
+    @Test
+    public void testParsePagesAnnotations() throws Exception {
+        String commentA = "comment about the APXL file";
+        String commentB = "comment about UIMA";
+
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+
+        // Close the test resource once parsing is finished; nothing else in this method does.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesComments.pages")) {
+            iWorkParser.parse(input, handler, metadata, parseContext);
+        }
+        String contents = handler.toString();
+
+        // Check regular text
+        assertContains("Both Pages 1.x", contents); // P1
+        assertContains("understanding the Pages document", contents); // P1
+        assertContains("should be page 2", contents); // P2
+
+        // Check for comments
+        assertContains(commentA, contents);
+        assertContains(commentB, contents);
+    }
+
+    /**
+     * Checks that three chart names appear in the text extracted from
+     * testNumbersCharts.numbers (TIKA-918).
+     */
+    @Test
+    public void testNumbersExtractChartNames() throws Exception {
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        // Close the test resource once parsing is finished; nothing else in this method does.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testNumbersCharts.numbers")) {
+            iWorkParser.parse(input, handler, metadata, parseContext);
+        }
+        String contents = handler.toString();
+        assertContains("Expenditure by Category", contents);
+        assertContains("Currency Chart name", contents);
+        assertContains("Chart 2", contents);
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java
index 95bd87c..6fad531 100644
--- a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java
+++ b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java
@@ -1,93 +1,93 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.pkg;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.junit.Before;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Parent class for all Package based Test cases
- */
-public abstract class AbstractPkgTest extends TikaTest {
- protected ParseContext trackingContext;
- protected ParseContext recursingContext;
-
- protected Parser autoDetectParser;
- protected EmbeddedTrackingParser tracker;
-
- @Before
- public void setUp() throws Exception {
- tracker = new EmbeddedTrackingParser();
- trackingContext = new ParseContext();
- trackingContext.set(Parser.class, tracker);
-
- autoDetectParser = new AutoDetectParser();
- recursingContext = new ParseContext();
- recursingContext.set(Parser.class, autoDetectParser);
- }
-
-
- @SuppressWarnings("serial")
- protected static class EmbeddedTrackingParser extends AbstractParser {
- protected List<String> filenames = new ArrayList<String>();
- protected List<String> mediatypes = new ArrayList<String>();
- protected List<String> createdAts = new ArrayList<String>();
- protected List<String> modifiedAts = new ArrayList<String>();
- protected byte[] lastSeenStart;
-
- public void reset() {
- filenames.clear();
- mediatypes.clear();
- createdAts.clear();
- modifiedAts.clear();
- }
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- // Cheat!
- return (new AutoDetectParser()).getSupportedTypes(context);
- }
-
- public void parse(InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context) throws IOException,
- SAXException, TikaException {
- filenames.add(metadata.get(Metadata.RESOURCE_NAME_KEY));
- mediatypes.add(metadata.get(Metadata.CONTENT_TYPE));
- createdAts.add(metadata.get(TikaCoreProperties.CREATED));
- modifiedAts.add(metadata.get(TikaCoreProperties.MODIFIED));
-
- lastSeenStart = new byte[32];
- stream.read(lastSeenStart);
- }
-
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.junit.Before;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parent class for all Package based Test cases.
+ * <p>
+ * Before each test it builds two parse contexts: {@link #trackingContext},
+ * which has an {@link EmbeddedTrackingParser} registered under
+ * {@code Parser.class}, and {@link #recursingContext}, which has an
+ * {@link AutoDetectParser} registered under {@code Parser.class}.
+ */
+public abstract class AbstractPkgTest extends TikaTest {
+    /** Context whose {@code Parser.class} entry is {@link #tracker}. */
+    protected ParseContext trackingContext;
+    /** Context whose {@code Parser.class} entry is {@link #autoDetectParser}. */
+    protected ParseContext recursingContext;
+
+    protected Parser autoDetectParser;
+    protected EmbeddedTrackingParser tracker;
+
+    /**
+     * Creates a fresh tracker, auto-detect parser and both contexts for every test.
+     */
+    @Before
+    public void setUp() throws Exception {
+        tracker = new EmbeddedTrackingParser();
+        trackingContext = new ParseContext();
+        trackingContext.set(Parser.class, tracker);
+
+        autoDetectParser = new AutoDetectParser();
+        recursingContext = new ParseContext();
+        recursingContext.set(Parser.class, autoDetectParser);
+    }
+
+
+    /**
+     * Parser that records the metadata it is handed on each call (resource
+     * name, content type, created and modified values) together with the
+     * first bytes of the most recent stream, instead of extracting content.
+     */
+    @SuppressWarnings("serial")
+    protected static class EmbeddedTrackingParser extends AbstractParser {
+        protected List<String> filenames = new ArrayList<String>();
+        protected List<String> mediatypes = new ArrayList<String>();
+        protected List<String> createdAts = new ArrayList<String>();
+        protected List<String> modifiedAts = new ArrayList<String>();
+        /** Up to the first 32 bytes of the stream passed to the latest {@code parse} call. */
+        protected byte[] lastSeenStart;
+
+        /**
+         * Clears the four recorded lists; {@link #lastSeenStart} is left as it was.
+         */
+        public void reset() {
+            filenames.clear();
+            mediatypes.clear();
+            createdAts.clear();
+            modifiedAts.clear();
+        }
+
+        @Override
+        public Set<MediaType> getSupportedTypes(ParseContext context) {
+            // Cheat!
+            return (new AutoDetectParser()).getSupportedTypes(context);
+        }
+
+        /**
+         * Appends one entry to each recorded list (the value may be
+         * {@code null} when the metadata lacks that key) and captures the
+         * start of the stream in {@link #lastSeenStart}.
+         */
+        @Override
+        public void parse(InputStream stream, ContentHandler handler,
+                          Metadata metadata, ParseContext context) throws IOException,
+                SAXException, TikaException {
+            filenames.add(metadata.get(Metadata.RESOURCE_NAME_KEY));
+            mediatypes.add(metadata.get(Metadata.CONTENT_TYPE));
+            createdAts.add(metadata.get(TikaCoreProperties.CREATED));
+            modifiedAts.add(metadata.get(TikaCoreProperties.MODIFIED));
+
+            lastSeenStart = new byte[32];
+            // A single read() may deliver fewer bytes than requested, so keep
+            // reading until the buffer is full or the stream ends. Any bytes
+            // not filled stay zero, as with the array's initial contents.
+            int filled = 0;
+            while (filled < lastSeenStart.length) {
+                int count = stream.read(lastSeenStart, filled, lastSeenStart.length - filled);
+                if (count < 0) {
+                    break;
+                }
+                filled += count;
+            }
+        }
+
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
index 3dc01f6..42b60da 100644
--- a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
+++ b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
@@ -1,89 +1,89 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.pkg;
-
-import static java.nio.charset.StandardCharsets.US_ASCII;
-import static org.junit.Assert.assertEquals;
-
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-/**
- * Test case for parsing bzip2 files.
- */
-public class Bzip2ParserTest extends AbstractPkgTest {
-
- @Test
- public void testBzip2Parsing() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = Bzip2ParserTest.class.getResourceAsStream(
- "/test-documents/test-documents.tbz2")) {
- parser.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals("application/x-bzip2", metadata.get(Metadata.CONTENT_TYPE));
- String content = handler.toString();
- assertContains("test-documents/testEXCEL.xls", content);
- assertContains("test-documents/testHTML.html", content);
- assertContains("test-documents/testOpenOffice2.odt", content);
- assertContains("test-documents/testPDF.pdf", content);
- assertContains("test-documents/testPPT.ppt", content);
- assertContains("test-documents/testRTF.rtf", content);
- assertContains("test-documents/testTXT.txt", content);
- assertContains("test-documents/testWORD.doc", content);
- assertContains("test-documents/testXML.xml", content);
- }
-
-
- /**
- * Tests that the ParseContext parser is correctly
- * fired for all the embedded entries.
- */
- @Test
- public void testEmbedded() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = ZipParserTest.class.getResourceAsStream(
- "/test-documents/test-documents.tbz2")) {
- parser.parse(stream, handler, metadata, trackingContext);
- }
-
- // Should find a single entry, for the (compressed) tar file
- assertEquals(1, tracker.filenames.size());
- assertEquals(1, tracker.mediatypes.size());
- assertEquals(1, tracker.modifiedAts.size());
-
- assertEquals(null, tracker.filenames.get(0));
- assertEquals(null, tracker.mediatypes.get(0));
- assertEquals(null, tracker.createdAts.get(0));
- assertEquals(null, tracker.modifiedAts.get(0));
-
- // Tar file starts with the directory name
- assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, US_ASCII));
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing bzip2 files.
+ */
+public class Bzip2ParserTest extends AbstractPkgTest {
+
+    /**
+     * Parses test-documents.tbz2 with the recursing context and checks the
+     * detected content type and that the names of the contained documents
+     * show up in the extracted text.
+     */
+    @Test
+    public void testBzip2Parsing() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = Bzip2ParserTest.class.getResourceAsStream(
+                "/test-documents/test-documents.tbz2")) {
+            parser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertEquals("application/x-bzip2", metadata.get(Metadata.CONTENT_TYPE));
+        String content = handler.toString();
+        assertContains("test-documents/testEXCEL.xls", content);
+        assertContains("test-documents/testHTML.html", content);
+        assertContains("test-documents/testOpenOffice2.odt", content);
+        assertContains("test-documents/testPDF.pdf", content);
+        assertContains("test-documents/testPPT.ppt", content);
+        assertContains("test-documents/testRTF.rtf", content);
+        assertContains("test-documents/testTXT.txt", content);
+        assertContains("test-documents/testWORD.doc", content);
+        assertContains("test-documents/testXML.xml", content);
+    }
+
+
+    /**
+     * Tests that the ParseContext parser is correctly
+     * fired for all the embedded entries.
+     */
+    @Test
+    public void testEmbedded() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        // Load through this test's own class rather than a sibling test class
+        try (InputStream stream = Bzip2ParserTest.class.getResourceAsStream(
+                "/test-documents/test-documents.tbz2")) {
+            parser.parse(stream, handler, metadata, trackingContext);
+        }
+
+        // Should find a single entry, for the (compressed) tar file
+        assertEquals(1, tracker.filenames.size());
+        assertEquals(1, tracker.mediatypes.size());
+        assertEquals(1, tracker.createdAts.size());
+        assertEquals(1, tracker.modifiedAts.size());
+
+        assertEquals(null, tracker.filenames.get(0));
+        assertEquals(null, tracker.mediatypes.get(0));
+        assertEquals(null, tracker.createdAts.get(0));
+        assertEquals(null, tracker.modifiedAts.get(0));
+
+        // Tar file starts with the directory name
+        assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, US_ASCII));
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
index 0439a38..378a0fc 100644
--- a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
+++ b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
@@ -1,102 +1,102 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.pkg;
-
-import static java.nio.charset.StandardCharsets.US_ASCII;
-import static org.junit.Assert.assertEquals;
-
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-/**
- * Test case for parsing gzip files.
- */
-public class GzipParserTest extends AbstractPkgTest {
-
- @Test
- public void testGzipParsing() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = GzipParserTest.class.getResourceAsStream(
- "/test-documents/test-documents.tgz")) {
- parser.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals("application/gzip", metadata.get(Metadata.CONTENT_TYPE));
- String content = handler.toString();
- assertContains("test-documents/testEXCEL.xls", content);
- assertContains("test-documents/testHTML.html", content);
- assertContains("test-documents/testOpenOffice2.odt", content);
- assertContains("test-documents/testPDF.pdf", content);
- assertContains("test-documents/testPPT.ppt", content);
- assertContains("test-documents/testRTF.rtf", content);
- assertContains("test-documents/testTXT.txt", content);
- assertContains("test-documents/testWORD.doc", content);
- assertContains("test-documents/testXML.xml", content);
- }
-
- /**
- * Tests that the ParseContext parser is correctly
- * fired for all the embedded entries.
- */
- @Test
- public void testEmbedded() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = ZipParserTest.class.getResourceAsStream(
- "/test-documents/test-documents.tgz")) {
- parser.parse(stream, handler, metadata, trackingContext);
- }
-
- // Should find a single entry, for the (compressed) tar file
- assertEquals(1, tracker.filenames.size());
- assertEquals(1, tracker.mediatypes.size());
- assertEquals(1, tracker.modifiedAts.size());
-
- assertEquals(null, tracker.filenames.get(0));
- assertEquals(null, tracker.mediatypes.get(0));
- assertEquals(null, tracker.modifiedAts.get(0));
-
- // Tar file starts with the directory name
- assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, US_ASCII));
- }
-
- @Test
- public void testSvgzParsing() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = GzipParserTest.class.getResourceAsStream(
- "/test-documents/testSVG.svgz")) {
- parser.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals("application/gzip", metadata.get(Metadata.CONTENT_TYPE));
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing gzip files.
+ */
+public class GzipParserTest extends AbstractPkgTest {
+
+    /**
+     * Parses test-documents.tgz with the recursing context and checks the
+     * detected content type and that the names of the contained documents
+     * show up in the extracted text.
+     */
+    @Test
+    public void testGzipParsing() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = GzipParserTest.class.getResourceAsStream(
+                "/test-documents/test-documents.tgz")) {
+            parser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertEquals("application/gzip", metadata.get(Metadata.CONTENT_TYPE));
+        String content = handler.toString();
+        assertContains("test-documents/testEXCEL.xls", content);
+        assertContains("test-documents/testHTML.html", content);
+        assertContains("test-documents/testOpenOffice2.odt", content);
+        assertContains("test-documents/testPDF.pdf", content);
+        assertContains("test-documents/testPPT.ppt", content);
+        assertContains("test-documents/testRTF.rtf", content);
+        assertContains("test-documents/testTXT.txt", content);
+        assertContains("test-documents/testWORD.doc", content);
+        assertContains("test-documents/testXML.xml", content);
+    }
+
+    /**
+     * Tests that the ParseContext parser is correctly
+     * fired for all the embedded entries.
+     */
+    @Test
+    public void testEmbedded() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        // Load through this test's own class rather than a sibling test class
+        try (InputStream stream = GzipParserTest.class.getResourceAsStream(
+                "/test-documents/test-documents.tgz")) {
+            parser.parse(stream, handler, metadata, trackingContext);
+        }
+
+        // Should find a single entry, for the (compressed) tar file
+        assertEquals(1, tracker.filenames.size());
+        assertEquals(1, tracker.mediatypes.size());
+        assertEquals(1, tracker.createdAts.size());
+        assertEquals(1, tracker.modifiedAts.size());
+
+        assertEquals(null, tracker.filenames.get(0));
+        assertEquals(null, tracker.mediatypes.get(0));
+        assertEquals(null, tracker.modifiedAts.get(0));
+
+        // Tar file starts with the directory name
+        assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, US_ASCII));
+    }
+
+    /**
+     * Parses testSVG.svgz and checks it is reported as {@code application/gzip}.
+     */
+    @Test
+    public void testSvgzParsing() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = GzipParserTest.class.getResourceAsStream(
+                "/test-documents/testSVG.svgz")) {
+            parser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertEquals("application/gzip", metadata.get(Metadata.CONTENT_TYPE));
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
index 95126ed..35ab265 100644
--- a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
+++ b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
@@ -1,105 +1,105 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.pkg;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-/**
- * Test case for parsing tar files.
- */
-public class TarParserTest extends AbstractPkgTest {
-
- @Test
- public void testTarParsing() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = TarParserTest.class.getResourceAsStream(
- "/test-documents/test-documents.tar")) {
- parser.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals("application/x-tar", metadata.get(Metadata.CONTENT_TYPE));
- String content = handler.toString();
- assertContains("test-documents/testEXCEL.xls", content);
- assertContains("test-documents/testHTML.html", content);
- assertContains("test-documents/testOpenOffice2.odt", content);
- assertContains("test-documents/testPDF.pdf", content);
- assertContains("test-documents/testPPT.ppt", content);
- assertContains("test-documents/testRTF.rtf", content);
- assertContains("test-documents/testTXT.txt", content);
- assertContains("test-documents/testWORD.doc", content);
- assertContains("test-documents/testXML.xml", content);
- }
-
- /**
- * Tests that the ParseContext parser is correctly
- * fired for all the embedded entries.
- */
- @Test
- public void testEmbedded() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = ZipParserTest.class.getResourceAsStream(
- "/test-documents/test-documents.tar")) {
- parser.parse(stream, handler, metadata, trackingContext);
- }
-
- // Should have found all 9 documents, but not the directory
- assertEquals(9, tracker.filenames.size());
- assertEquals(9, tracker.mediatypes.size());
- assertEquals(9, tracker.modifiedAts.size());
-
- // Should have names but not content types, as tar doesn't
- // store the content types
- assertEquals("test-documents/testEXCEL.xls", tracker.filenames.get(0));
- assertEquals("test-documents/testHTML.html", tracker.filenames.get(1));
- assertEquals("test-documents/testOpenOffice2.odt", tracker.filenames.get(2));
- assertEquals("test-documents/testPDF.pdf", tracker.filenames.get(3));
- assertEquals("test-documents/testPPT.ppt", tracker.filenames.get(4));
- assertEquals("test-documents/testRTF.rtf", tracker.filenames.get(5));
- assertEquals("test-documents/testTXT.txt", tracker.filenames.get(6));
- assertEquals("test-documents/testWORD.doc", tracker.filenames.get(7));
- assertEquals("test-documents/testXML.xml", tracker.filenames.get(8));
-
- for(String type : tracker.mediatypes) {
- assertNull(type);
- }
- for(String crt : tracker.createdAts) {
- assertNull(crt);
- }
- for(String mod : tracker.modifiedAts) {
- assertNotNull(mod);
- assertTrue("Modified at " + mod, mod.startsWith("20"));
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing tar files.
+ */
+public class TarParserTest extends AbstractPkgTest {
+
+ @Test
+ public void testTarParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = TarParserTest.class.getResourceAsStream(
+ "/test-documents/test-documents.tar")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals("application/x-tar", metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertContains("test-documents/testEXCEL.xls", content);
+ assertContains("test-documents/testHTML.html", content);
+ assertContains("test-documents/testOpenOffice2.odt", content);
+ assertContains("test-documents/testPDF.pdf", content);
+ assertContains("test-documents/testPPT.ppt", content);
+ assertContains("test-documents/testRTF.rtf", content);
+ assertContains("test-documents/testTXT.txt", content);
+ assertContains("test-documents/testWORD.doc", content);
+ assertContains("test-documents/testXML.xml", content);
+ }
+
+ /**
+ * Tests that the ParseContext parser is correctly
+ * fired for all the embedded entries.
+ */
+ @Test
+ public void testEmbedded() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = ZipParserTest.class.getResourceAsStream(
+ "/test-documents/test-documents.tar")) {
+ parser.parse(stream, handler, metadata, trackingContext);
+ }
+
+ // Should have found all 9 documents, but not the directory
+ assertEquals(9, tracker.filenames.size());
+ assertEquals(9, tracker.mediatypes.size());
+ assertEquals(9, tracker.modifiedAts.size());
+
+ // Should have names but not content types, as tar doesn't
+ // store the content types
+ assertEquals("test-documents/testEXCEL.xls", tracker.filenames.get(0));
+ assertEquals("test-documents/testHTML.html", tracker.filenames.get(1));
+ assertEquals("test-documents/testOpenOffice2.odt", tracker.filenames.get(2));
+ assertEquals("test-documents/testPDF.pdf", tracker.filenames.get(3));
+ assertEquals("test-documents/testPPT.ppt", tracker.filenames.get(4));
+ assertEquals("test-documents/testRTF.rtf", tracker.filenames.get(5));
+ assertEquals("test-documents/testTXT.txt", tracker.filenames.get(6));
+ assertEquals("test-documents/testWORD.doc", tracker.filenames.get(7));
+ assertEquals("test-documents/testXML.xml", tracker.filenames.get(8));
+
+ for(String type : tracker.mediatypes) {
+ assertNull(type);
+ }
+ for(String crt : tracker.createdAts) {
+ assertNull(crt);
+ }
+ for(String mod : tracker.modifiedAts) {
+ assertNotNull(mod);
+ assertTrue("Modified at " + mod, mod.startsWith("20"));
+ }
+ }
+}