You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 11:11:16 UTC
[10/39] tika git commit: Convert new lines from windows to unix
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
index 01dd436..30f9c98 100644
--- a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
+++ b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
@@ -1,192 +1,192 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.pkg;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-
-import java.io.InputStream;
-import java.util.HashSet;
-import java.util.Set;
-
-import org.apache.commons.codec.binary.Base64;
-import org.apache.commons.compress.archivers.ArchiveStreamFactory;
-import org.apache.tika.Tika;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.helpers.DefaultHandler;
-
-/**
- * Test case for parsing zip files.
- */
-public class ZipParserTest extends AbstractPkgTest {
-
- @Test
- public void testZipParsing() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = ZipParserTest.class.getResourceAsStream(
- "/test-documents/test-documents.zip")) {
- parser.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals("application/zip", metadata.get(Metadata.CONTENT_TYPE));
- String content = handler.toString();
- assertContains("testEXCEL.xls", content);
- assertContains("testHTML.html", content);
- assertContains("testOpenOffice2.odt", content);
- assertContains("testPDF.pdf", content);
- assertContains("testPPT.ppt", content);
- assertContains("testRTF.rtf", content);
- assertContains("testTXT.txt", content);
- assertContains("testWORD.doc", content);
- assertContains("testXML.xml", content);
- }
-
- /**
- * Tests that the ParseContext parser is correctly
- * fired for all the embedded entries.
- */
- @Test
- public void testEmbedded() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = ZipParserTest.class.getResourceAsStream(
- "/test-documents/test-documents.zip")) {
- parser.parse(stream, handler, metadata, trackingContext);
- }
-
- // Should have found all 9 documents
- assertEquals(9, tracker.filenames.size());
- assertEquals(9, tracker.mediatypes.size());
- assertEquals(9, tracker.modifiedAts.size());
-
- // Should have names and modified dates, but not content types,
- // as zip doesn't store the content types
- assertEquals("testEXCEL.xls", tracker.filenames.get(0));
- assertEquals("testHTML.html", tracker.filenames.get(1));
- assertEquals("testOpenOffice2.odt", tracker.filenames.get(2));
- assertEquals("testPDF.pdf", tracker.filenames.get(3));
- assertEquals("testPPT.ppt", tracker.filenames.get(4));
- assertEquals("testRTF.rtf", tracker.filenames.get(5));
- assertEquals("testTXT.txt", tracker.filenames.get(6));
- assertEquals("testWORD.doc", tracker.filenames.get(7));
- assertEquals("testXML.xml", tracker.filenames.get(8));
-
- for(String type : tracker.mediatypes) {
- assertNull(type);
- }
- for(String crt : tracker.createdAts) {
- assertNull(crt);
- }
- for(String mod : tracker.modifiedAts) {
- assertNotNull(mod);
- assertTrue("Modified at " + mod, mod.startsWith("20"));
- }
- }
-
- /**
- * Test case for the ability of the ZIP parser to extract the name of
- * a ZIP entry even if the content of the entry is unreadable due to an
- * unsupported compression method.
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-346">TIKA-346</a>
- */
- @Test
- public void testUnsupportedZipCompressionMethod() throws Exception {
- String content = new Tika().parseToString(
- ZipParserTest.class.getResourceAsStream(
- "/test-documents/moby.zip"));
- assertContains("README", content);
- }
-
- private class GatherRelIDsDocumentExtractor implements EmbeddedDocumentExtractor {
- public Set<String> allRelIDs = new HashSet<String>();
- public boolean shouldParseEmbedded(Metadata metadata) {
- String relID = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID);
- if (relID != null) {
- allRelIDs.add(relID);
- }
- return false;
- }
-
- public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, boolean outputHtml) {
- throw new UnsupportedOperationException("should never be called");
- }
- }
-
- // TIKA-1036
- @Test
- public void testPlaceholders() throws Exception {
- String xml = getXML("testEmbedded.zip").xml;
- assertContains("<div class=\"embedded\" id=\"test1.txt\" />", xml);
- assertContains("<div class=\"embedded\" id=\"test2.txt\" />", xml);
-
- // Also make sure EMBEDDED_RELATIONSHIP_ID was
- // passed when parsing the embedded docs:
- Parser parser = new AutoDetectParser();
- ParseContext context = new ParseContext();
- context.set(Parser.class, parser);
- GatherRelIDsDocumentExtractor relIDs = new GatherRelIDsDocumentExtractor();
- context.set(EmbeddedDocumentExtractor.class, relIDs);
- try (InputStream input = getResourceAsStream("/test-documents/testEmbedded.zip")) {
- parser.parse(input,
- new BodyContentHandler(),
- new Metadata(),
- context);
- }
-
- assertTrue(relIDs.allRelIDs.contains("test1.txt"));
- assertTrue(relIDs.allRelIDs.contains("test2.txt"));
- }
-
- @Test // TIKA-936
- public void testCustomEncoding() throws Exception {
- ArchiveStreamFactory factory = new ArchiveStreamFactory();
- factory.setEntryEncoding("SJIS");
- trackingContext.set(ArchiveStreamFactory.class, factory);
-
- try (InputStream stream = TikaInputStream.get(Base64.decodeBase64(
- "UEsDBBQAAAAIAI+CvUCDo3+zIgAAACgAAAAOAAAAk/qWe4zqg4GDgi50"
- + "eHRr2tj0qulsc2pzRHN609Gm7Y1OvFxNYLHJv6ZV97yCiQEAUEsBAh"
- + "QLFAAAAAgAj4K9QIOjf7MiAAAAKAAAAA4AAAAAAAAAAAAgAAAAAAAA"
- + "AJP6lnuM6oOBg4IudHh0UEsFBgAAAAABAAEAPAAAAE4AAAAAAA=="))) {
- autoDetectParser.parse(
- stream, new DefaultHandler(),
- new Metadata(), trackingContext);
- }
-
- assertEquals(1, tracker.filenames.size());
- assertEquals(
- "\u65E5\u672C\u8A9E\u30E1\u30E2.txt",
- tracker.filenames.get(0));
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.compress.archivers.ArchiveStreamFactory;
+import org.apache.tika.Tika;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Test case for parsing zip files.
+ */
+public class ZipParserTest extends AbstractPkgTest {
+
+ @Test
+ public void testZipParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = ZipParserTest.class.getResourceAsStream(
+ "/test-documents/test-documents.zip")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals("application/zip", metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertContains("testEXCEL.xls", content);
+ assertContains("testHTML.html", content);
+ assertContains("testOpenOffice2.odt", content);
+ assertContains("testPDF.pdf", content);
+ assertContains("testPPT.ppt", content);
+ assertContains("testRTF.rtf", content);
+ assertContains("testTXT.txt", content);
+ assertContains("testWORD.doc", content);
+ assertContains("testXML.xml", content);
+ }
+
+ /**
+ * Tests that the ParseContext parser is correctly
+ * fired for all the embedded entries.
+ */
+ @Test
+ public void testEmbedded() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = ZipParserTest.class.getResourceAsStream(
+ "/test-documents/test-documents.zip")) {
+ parser.parse(stream, handler, metadata, trackingContext);
+ }
+
+ // Should have found all 9 documents
+ assertEquals(9, tracker.filenames.size());
+ assertEquals(9, tracker.mediatypes.size());
+ assertEquals(9, tracker.modifiedAts.size());
+
+ // Should have names and modified dates, but not content types,
+ // as zip doesn't store the content types
+ assertEquals("testEXCEL.xls", tracker.filenames.get(0));
+ assertEquals("testHTML.html", tracker.filenames.get(1));
+ assertEquals("testOpenOffice2.odt", tracker.filenames.get(2));
+ assertEquals("testPDF.pdf", tracker.filenames.get(3));
+ assertEquals("testPPT.ppt", tracker.filenames.get(4));
+ assertEquals("testRTF.rtf", tracker.filenames.get(5));
+ assertEquals("testTXT.txt", tracker.filenames.get(6));
+ assertEquals("testWORD.doc", tracker.filenames.get(7));
+ assertEquals("testXML.xml", tracker.filenames.get(8));
+
+ for(String type : tracker.mediatypes) {
+ assertNull(type);
+ }
+ for(String crt : tracker.createdAts) {
+ assertNull(crt);
+ }
+ for(String mod : tracker.modifiedAts) {
+ assertNotNull(mod);
+ assertTrue("Modified at " + mod, mod.startsWith("20"));
+ }
+ }
+
+ /**
+ * Test case for the ability of the ZIP parser to extract the name of
+ * a ZIP entry even if the content of the entry is unreadable due to an
+ * unsupported compression method.
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-346">TIKA-346</a>
+ */
+ @Test
+ public void testUnsupportedZipCompressionMethod() throws Exception {
+ String content = new Tika().parseToString(
+ ZipParserTest.class.getResourceAsStream(
+ "/test-documents/moby.zip"));
+ assertContains("README", content);
+ }
+
+ private class GatherRelIDsDocumentExtractor implements EmbeddedDocumentExtractor {
+ public Set<String> allRelIDs = new HashSet<String>();
+ public boolean shouldParseEmbedded(Metadata metadata) {
+ String relID = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID);
+ if (relID != null) {
+ allRelIDs.add(relID);
+ }
+ return false;
+ }
+
+ public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, boolean outputHtml) {
+ throw new UnsupportedOperationException("should never be called");
+ }
+ }
+
+ // TIKA-1036
+ @Test
+ public void testPlaceholders() throws Exception {
+ String xml = getXML("testEmbedded.zip").xml;
+ assertContains("<div class=\"embedded\" id=\"test1.txt\" />", xml);
+ assertContains("<div class=\"embedded\" id=\"test2.txt\" />", xml);
+
+ // Also make sure EMBEDDED_RELATIONSHIP_ID was
+ // passed when parsing the embedded docs:
+ Parser parser = new AutoDetectParser();
+ ParseContext context = new ParseContext();
+ context.set(Parser.class, parser);
+ GatherRelIDsDocumentExtractor relIDs = new GatherRelIDsDocumentExtractor();
+ context.set(EmbeddedDocumentExtractor.class, relIDs);
+ try (InputStream input = getResourceAsStream("/test-documents/testEmbedded.zip")) {
+ parser.parse(input,
+ new BodyContentHandler(),
+ new Metadata(),
+ context);
+ }
+
+ assertTrue(relIDs.allRelIDs.contains("test1.txt"));
+ assertTrue(relIDs.allRelIDs.contains("test2.txt"));
+ }
+
+ @Test // TIKA-936
+ public void testCustomEncoding() throws Exception {
+ ArchiveStreamFactory factory = new ArchiveStreamFactory();
+ factory.setEntryEncoding("SJIS");
+ trackingContext.set(ArchiveStreamFactory.class, factory);
+
+ try (InputStream stream = TikaInputStream.get(Base64.decodeBase64(
+ "UEsDBBQAAAAIAI+CvUCDo3+zIgAAACgAAAAOAAAAk/qWe4zqg4GDgi50"
+ + "eHRr2tj0qulsc2pzRHN609Gm7Y1OvFxNYLHJv6ZV97yCiQEAUEsBAh"
+ + "QLFAAAAAgAj4K9QIOjf7MiAAAAKAAAAA4AAAAAAAAAAAAgAAAAAAAA"
+ + "AJP6lnuM6oOBg4IudHh0UEsFBgAAAAABAAEAPAAAAE4AAAAAAA=="))) {
+ autoDetectParser.parse(
+ stream, new DefaultHandler(),
+ new Metadata(), trackingContext);
+ }
+
+ assertEquals(1, tracker.filenames.size());
+ assertEquals(
+ "\u65E5\u672C\u8A9E\u30E1\u30E2.txt",
+ tracker.filenames.get(0));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-pdf-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/pom.xml b/tika-parser-modules/tika-parser-pdf-module/pom.xml
index 11f259e..568303c 100644
--- a/tika-parser-modules/tika-parser-pdf-module/pom.xml
+++ b/tika-parser-modules/tika-parser-pdf-module/pom.xml
@@ -1,126 +1,126 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-modules</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-pdf-module</artifactId>
- <name>Apache Tika parser pdf module</name>
- <url>http://tika.apache.org/</url>
-
- <properties>
- <commons.logging.version>1.1.3</commons.logging.version>
- </properties>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-multimedia-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-xmp-commons</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>commons-io</groupId>
- <artifactId>commons-io</artifactId>
- <version>${commons.io.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.pdfbox</groupId>
- <artifactId>pdfbox</artifactId>
- <version>${pdfbox.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.pdfbox</groupId>
- <artifactId>pdfbox-tools</artifactId>
- <version>${pdfbox.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.pdfbox</groupId>
- <artifactId>jempbox</artifactId>
- <version>${jempbox.version}</version>
- </dependency>
- <!-- TIKA-370: PDFBox declares the Bouncy Castle dependencies
- as optional, but we prefer to have them always to avoid
- problems with encrypted PDFs. -->
- <dependency>
- <groupId>org.bouncycastle</groupId>
- <artifactId>bcmail-jdk15on</artifactId>
- <version>${bouncycastle.version}</version>
- </dependency>
- <dependency>
- <groupId>org.bouncycastle</groupId>
- <artifactId>bcprov-jdk15on</artifactId>
- <version>${bouncycastle.version}</version>
- </dependency>
- <dependency>
- <groupId>commons-logging</groupId>
- <artifactId>commons-logging</artifactId>
- <version>${commons.logging.version}</version>
- </dependency>
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-log4j12</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-package-module</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-text-module</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-office-module</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
- </dependency>
- <!-- Copied from PDFBox:
- For legal reasons (incompatible license), jai-imageio-core is to be used
- only in the tests and may not be distributed. See also LEGAL-195 -->
- <dependency>
- <groupId>com.github.jai-imageio</groupId>
- <artifactId>jai-imageio-core</artifactId>
- <scope>test</scope>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-pdf-module</artifactId>
+ <name>Apache Tika parser pdf module</name>
+ <url>http://tika.apache.org/</url>
+
+ <properties>
+ <commons.logging.version>1.1.3</commons.logging.version>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-multimedia-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-xmp-commons</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>${commons.io.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.pdfbox</groupId>
+ <artifactId>pdfbox</artifactId>
+ <version>${pdfbox.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.pdfbox</groupId>
+ <artifactId>pdfbox-tools</artifactId>
+ <version>${pdfbox.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.pdfbox</groupId>
+ <artifactId>jempbox</artifactId>
+ <version>${jempbox.version}</version>
+ </dependency>
+ <!-- TIKA-370: PDFBox declares the Bouncy Castle dependencies
+ as optional, but we prefer to have them always to avoid
+ problems with encrypted PDFs. -->
+ <dependency>
+ <groupId>org.bouncycastle</groupId>
+ <artifactId>bcmail-jdk15on</artifactId>
+ <version>${bouncycastle.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.bouncycastle</groupId>
+ <artifactId>bcprov-jdk15on</artifactId>
+ <version>${bouncycastle.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-logging</groupId>
+ <artifactId>commons-logging</artifactId>
+ <version>${commons.logging.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-package-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-text-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-office-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <!-- Copied from PDFBox:
+ For legal reasons (incompatible license), jai-imageio-core is to be used
+ only in the tests and may not be distributed. See also LEGAL-195 -->
+ <dependency>
+ <groupId>com.github.jai-imageio</groupId>
+ <artifactId>jai-imageio-core</artifactId>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/module/pdf/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/module/pdf/internal/Activator.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/module/pdf/internal/Activator.java
index 9860934..d38a96d 100644
--- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/module/pdf/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/module/pdf/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.pdf.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
- @Override
- public void start(BundleContext context) throws Exception {
-
- registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
- }
-
- @Override
- public void stop(BundleContext context) throws Exception {
-
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.pdf.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+ @Override
+ public void start(BundleContext context) throws Exception {
+
+ registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+ }
+
+ @Override
+ public void stop(BundleContext context) throws Exception {
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-scientific-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/pom.xml b/tika-parser-modules/tika-parser-scientific-module/pom.xml
index 7afe2d6..1b3eb96 100644
--- a/tika-parser-modules/tika-parser-scientific-module/pom.xml
+++ b/tika-parser-modules/tika-parser-scientific-module/pom.xml
@@ -1,136 +1,136 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-modules</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-scientific-module</artifactId>
- <name>Apache Tika parser scientific module</name>
- <url>http://tika.apache.org/</url>
-
- <properties>
- <netcdf-java.version>4.5.5</netcdf-java.version>
- <sis.version>0.6</sis.version>
- </properties>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-exec</artifactId>
- <version>${commons.exec}</version>
- </dependency>
- <dependency>
- <groupId>com.googlecode.json-simple</groupId>
- <artifactId>json-simple</artifactId>
- <version>1.1.1</version>
- <exclusions>
- <exclusion>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- <dependency>
- <groupId>org.apache.sis.core</groupId>
- <artifactId>sis-utility</artifactId>
- <version>${sis.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.sis.storage</groupId>
- <artifactId>sis-netcdf</artifactId>
- <version>${sis.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.sis.core</groupId>
- <artifactId>sis-metadata</artifactId>
- <version>${sis.version}</version>
- </dependency>
- <!-- edu.ucar dependencies -->
- <dependency>
- <groupId>edu.ucar</groupId>
- <artifactId>netcdf4</artifactId>
- <version>${netcdf-java.version}</version>
- </dependency>
- <dependency>
- <groupId>edu.ucar</groupId>
- <artifactId>grib</artifactId>
- <version>${netcdf-java.version}</version>
- </dependency>
- <dependency>
- <groupId>edu.ucar</groupId>
- <artifactId>cdm</artifactId>
- <version>${netcdf-java.version}</version>
- <exclusions>
- <exclusion>
- <groupId>org.slf4j</groupId>
- <artifactId>jcl-over-slf4j</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- <dependency>
- <groupId>edu.ucar</groupId>
- <artifactId>httpservices</artifactId>
- <version>${netcdf-java.version}</version>
- </dependency>
- <!-- Apache cTAKES -->
- <dependency>
- <groupId>org.apache.ctakes</groupId>
- <artifactId>ctakes-core</artifactId>
- <version>3.2.2</version>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>commons-io</groupId>
- <artifactId>commons-io</artifactId>
- <version>${commons.io.version}</version>
- </dependency>
- <!-- Upstream parser libraries -->
- <dependency>
- <groupId>net.sourceforge.jmatio</groupId>
- <artifactId>jmatio</artifactId>
- <version>1.0</version>
- </dependency>
- <!-- Apache Commons CSV -->
- <dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-csv</artifactId>
- <version>1.0</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-text-module</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-scientific-module</artifactId>
+ <name>Apache Tika parser scientific module</name>
+ <url>http://tika.apache.org/</url>
+
+ <properties>
+ <netcdf-java.version>4.5.5</netcdf-java.version>
+ <sis.version>0.6</sis.version>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-exec</artifactId>
+ <version>${commons.exec}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.googlecode.json-simple</groupId>
+ <artifactId>json-simple</artifactId>
+ <version>1.1.1</version>
+ <exclusions>
+ <exclusion>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.sis.core</groupId>
+ <artifactId>sis-utility</artifactId>
+ <version>${sis.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.sis.storage</groupId>
+ <artifactId>sis-netcdf</artifactId>
+ <version>${sis.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.sis.core</groupId>
+ <artifactId>sis-metadata</artifactId>
+ <version>${sis.version}</version>
+ </dependency>
+ <!-- edu.ucar dependencies -->
+ <dependency>
+ <groupId>edu.ucar</groupId>
+ <artifactId>netcdf4</artifactId>
+ <version>${netcdf-java.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>edu.ucar</groupId>
+ <artifactId>grib</artifactId>
+ <version>${netcdf-java.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>edu.ucar</groupId>
+ <artifactId>cdm</artifactId>
+ <version>${netcdf-java.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>org.slf4j</groupId>
+ <artifactId>jcl-over-slf4j</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>edu.ucar</groupId>
+ <artifactId>httpservices</artifactId>
+ <version>${netcdf-java.version}</version>
+ </dependency>
+ <!-- Apache cTAKES -->
+ <dependency>
+ <groupId>org.apache.ctakes</groupId>
+ <artifactId>ctakes-core</artifactId>
+ <version>3.2.2</version>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>${commons.io.version}</version>
+ </dependency>
+ <!-- Upstream parser libraries -->
+ <dependency>
+ <groupId>net.sourceforge.jmatio</groupId>
+ <artifactId>jmatio</artifactId>
+ <version>1.0</version>
+ </dependency>
+ <!-- Apache Commons CSV -->
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-csv</artifactId>
+ <version>1.0</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-text-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/module/scientific/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/module/scientific/internal/Activator.java b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/module/scientific/internal/Activator.java
index 0195b63..741b64e 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/module/scientific/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/module/scientific/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.scientific.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
- @Override
- public void start(BundleContext context) throws Exception {
-
- registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
- }
-
- @Override
- public void stop(BundleContext context) throws Exception {
-
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.scientific.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+ @Override
+ public void start(BundleContext context) throws Exception {
+
+ registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+ }
+
+ @Override
+ public void stop(BundleContext context) throws Exception {
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/hdf/HDFParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/hdf/HDFParser.java b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/hdf/HDFParser.java
index 0a3121b..821493b 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/hdf/HDFParser.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/hdf/HDFParser.java
@@ -1,122 +1,122 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.hdf;
-
-//JDK imports
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.netcdf.NetCDFParser;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import ucar.nc2.Attribute;
-import ucar.nc2.Group;
-import ucar.nc2.NetcdfFile;
-
-/**
- *
- * Since the {@link NetCDFParser} depends on the <a
- * href="http://www.unidata.ucar.edu/software/netcdf-java" >NetCDF-Java</a> API,
- * we are able to use it to parse HDF files as well. See <a href=
- * "http://www.unidata.ucar.edu/software/netcdf-java/formats/FileTypes.html"
- * >this link</a> for more information.
- */
-public class HDFParser extends AbstractParser {
-
- /** Serial version UID */
- private static final long serialVersionUID = 1091208208003437549L;
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.singleton(MediaType.application("x-hdf"));
-
- /*
- * (non-Javadoc)
- *
- * @see
- * org.apache.tika.parser.netcdf.NetCDFParser#getSupportedTypes(org.apache
- * .tika.parser.ParseContext)
- */
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- /*
- * (non-Javadoc)
- *
- * @see
- * org.apache.tika.parser.netcdf.NetCDFParser#parse(java.io.InputStream,
- * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata,
- * org.apache.tika.parser.ParseContext)
- */
- public void parse(InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context) throws IOException,
- SAXException, TikaException {
- ByteArrayOutputStream os = new ByteArrayOutputStream();
- IOUtils.copy(stream, os);
-
- String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
- if (name == null) {
- name = "";
- }
- try {
- NetcdfFile ncFile = NetcdfFile.openInMemory(name, os.toByteArray());
- unravelStringMet(ncFile, null, metadata);
- } catch (IOException e) {
- throw new TikaException("HDF parse error", e);
- }
-
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
- xhtml.endDocument();
- }
-
- protected void unravelStringMet(NetcdfFile ncFile, Group group, Metadata met) {
- if (group == null) {
- group = ncFile.getRootGroup();
- }
-
- // get file type
- met.set("File-Type-Description", ncFile.getFileTypeDescription());
- // unravel its string attrs
- for (Attribute attribute : group.getAttributes()) {
- if (attribute.isString()) {
- met.add(attribute.getFullName(), attribute.getStringValue());
- } else {
- // try and cast its value to a string
- met.add(attribute.getFullName(), String.valueOf(attribute
- .getNumericValue()));
- }
- }
-
- for (Group g : group.getGroups()) {
- unravelStringMet(ncFile, g, met);
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.hdf;
+
+//JDK imports
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.netcdf.NetCDFParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import ucar.nc2.Attribute;
+import ucar.nc2.Group;
+import ucar.nc2.NetcdfFile;
+
+/**
+ *
+ * Since the {@link NetCDFParser} depends on the <a
+ * href="http://www.unidata.ucar.edu/software/netcdf-java" >NetCDF-Java</a> API,
+ * we are able to use it to parse HDF files as well. See <a href=
+ * "http://www.unidata.ucar.edu/software/netcdf-java/formats/FileTypes.html"
+ * >this link</a> for more information.
+ */
+public class HDFParser extends AbstractParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = 1091208208003437549L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.singleton(MediaType.application("x-hdf"));
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see
+ * org.apache.tika.parser.netcdf.NetCDFParser#getSupportedTypes(org.apache
+ * .tika.parser.ParseContext)
+ */
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see
+ * org.apache.tika.parser.netcdf.NetCDFParser#parse(java.io.InputStream,
+ * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata,
+ * org.apache.tika.parser.ParseContext)
+ */
+ public void parse(InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context) throws IOException,
+ SAXException, TikaException {
+ ByteArrayOutputStream os = new ByteArrayOutputStream();
+ IOUtils.copy(stream, os);
+
+ String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+ if (name == null) {
+ name = "";
+ }
+ try {
+ NetcdfFile ncFile = NetcdfFile.openInMemory(name, os.toByteArray());
+ unravelStringMet(ncFile, null, metadata);
+ } catch (IOException e) {
+ throw new TikaException("HDF parse error", e);
+ }
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.endDocument();
+ }
+
+ protected void unravelStringMet(NetcdfFile ncFile, Group group, Metadata met) {
+ if (group == null) {
+ group = ncFile.getRootGroup();
+ }
+
+ // get file type
+ met.set("File-Type-Description", ncFile.getFileTypeDescription());
+ // unravel its string attrs
+ for (Attribute attribute : group.getAttributes()) {
+ if (attribute.isString()) {
+ met.add(attribute.getFullName(), attribute.getStringValue());
+ } else {
+ // try and cast its value to a string
+ met.add(attribute.getFullName(), String.valueOf(attribute
+ .getNumericValue()));
+ }
+ }
+
+ for (Group g : group.getGroups()) {
+ unravelStringMet(ncFile, g, met);
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java
index 1ee4dc7..d54754b 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java
@@ -1,72 +1,72 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.hdf;
-
-//JDK imports
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-
-import org.apache.tika.TikaTest;
-import org.junit.Test;
-
-//TIKA imports
-
-/**
- *
- * Test suite for the {@link HDFParser}.
- *
- */
-public class HDFParserTest extends TikaTest {
-
- @Test
- public void testParseGlobalMetadata() throws Exception {
- if(System.getProperty("java.version").startsWith("1.5")) {
- return;
- }
- /*
- * this is a publicly available HDF5 file from the MLS mission:
- *
- *
- * ftp://acdisc.gsfc.nasa.gov/data/s4pa///Aura_MLS_Level2/ML2O3.002//2009
- * /MLS-Aura_L2GP-O3_v02-23-c01_2009d122.he5
- */
-
- XMLResult r = getXML("test.he5", new HDFParser());
- assertNotNull(r.metadata);
- assertEquals("5", r.metadata.get("GranuleMonth"));
- }
-
- @Test
- public void testHDF4() throws Exception {
- if(System.getProperty("java.version").startsWith("1.5")) {
- return;
- }
-
- /*
- * this is a publicly available HDF4 file from the HD4 examples:
- *
- * http://www.hdfgroup.org/training/hdf4_chunking/Chunkit/bin/input54kmdata.hdf
- */
- XMLResult r = getXML("test.hdf", new HDFParser());
- assertNotNull(r.metadata);
- assertEquals("Direct read of HDF4 file through CDM library", r.metadata.get("_History"));
- assertEquals("Ascending", r.metadata.get("Pass"));
- assertEquals("Hierarchical Data Format, version 4",
- r.metadata.get("File-Type-Description"));
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.hdf;
+
+//JDK imports
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+import org.apache.tika.TikaTest;
+import org.junit.Test;
+
+//TIKA imports
+
+/**
+ *
+ * Test suite for the {@link HDFParser}.
+ *
+ */
+public class HDFParserTest extends TikaTest {
+
+ @Test
+ public void testParseGlobalMetadata() throws Exception {
+ if(System.getProperty("java.version").startsWith("1.5")) {
+ return;
+ }
+ /*
+ * this is a publicly available HDF5 file from the MLS mission:
+ *
+ *
+ * ftp://acdisc.gsfc.nasa.gov/data/s4pa///Aura_MLS_Level2/ML2O3.002//2009
+ * /MLS-Aura_L2GP-O3_v02-23-c01_2009d122.he5
+ */
+
+ XMLResult r = getXML("test.he5", new HDFParser());
+ assertNotNull(r.metadata);
+ assertEquals("5", r.metadata.get("GranuleMonth"));
+ }
+
+ @Test
+ public void testHDF4() throws Exception {
+ if(System.getProperty("java.version").startsWith("1.5")) {
+ return;
+ }
+
+ /*
+ * this is a publicly available HDF4 file from the HD4 examples:
+ *
+ * http://www.hdfgroup.org/training/hdf4_chunking/Chunkit/bin/input54kmdata.hdf
+ */
+ XMLResult r = getXML("test.hdf", new HDFParser());
+ assertNotNull(r.metadata);
+ assertEquals("Direct read of HDF4 file through CDM library", r.metadata.get("_History"));
+ assertEquals("Ascending", r.metadata.get("Pass"));
+ assertEquals("Hierarchical Data Format, version 4",
+ r.metadata.get("File-Type-Description"));
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java
index 7d0f2e8..77a8cc8 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java
@@ -1,61 +1,61 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.netcdf;
-
-//JDK imports
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.junit.Test;
-
-//TIKA imports
-
-/**
- * Test cases to exercise the {@link NetCDFParser}.
- */
-public class NetCDFParserTest extends TikaTest {
-
- @Test
- public void testParseGlobalMetadata() throws Exception {
-
- XMLResult r = getXML("sresa1b_ncar_ccsm3_0_run1_200001.nc", new NetCDFParser());
- assertEquals(r.metadata.get(TikaCoreProperties.TITLE),
- "model output prepared for IPCC AR4");
- assertEquals(r.metadata.get(Metadata.CONTACT), "ccsm@ucar.edu");
- assertEquals(r.metadata.get(Metadata.PROJECT_ID),
- "IPCC Fourth Assessment");
- assertEquals(r.metadata.get(Metadata.CONVENTIONS), "CF-1.0");
- assertEquals(r.metadata.get(Metadata.REALIZATION), "1");
- assertEquals(r.metadata.get(Metadata.EXPERIMENT_ID),
- "720 ppm stabilization experiment (SRESA1B)");
- assertEquals(r.metadata.get("File-Type-Description"),
- "NetCDF-3/CDM");
-
- assertContains("long_name = \"Surface area\"", r.xml);
- assertContains("float area(lat=128, lon=256)", r.xml);
- assertContains("float lat(lat=128)", r.xml);
- assertContains("double lat_bnds(lat=128, bnds=2)", r.xml);
- assertContains("double lon_bnds(lon=256, bnds=2)", r.xml);
-
-
-
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.netcdf;
+
+//JDK imports
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Test;
+
+//TIKA imports
+
+/**
+ * Test cases to exercise the {@link NetCDFParser}.
+ */
+public class NetCDFParserTest extends TikaTest {
+
+ @Test
+ public void testParseGlobalMetadata() throws Exception {
+
+ XMLResult r = getXML("sresa1b_ncar_ccsm3_0_run1_200001.nc", new NetCDFParser());
+ assertEquals(r.metadata.get(TikaCoreProperties.TITLE),
+ "model output prepared for IPCC AR4");
+ assertEquals(r.metadata.get(Metadata.CONTACT), "ccsm@ucar.edu");
+ assertEquals(r.metadata.get(Metadata.PROJECT_ID),
+ "IPCC Fourth Assessment");
+ assertEquals(r.metadata.get(Metadata.CONVENTIONS), "CF-1.0");
+ assertEquals(r.metadata.get(Metadata.REALIZATION), "1");
+ assertEquals(r.metadata.get(Metadata.EXPERIMENT_ID),
+ "720 ppm stabilization experiment (SRESA1B)");
+ assertEquals(r.metadata.get("File-Type-Description"),
+ "NetCDF-3/CDM");
+
+ assertContains("long_name = \"Surface area\"", r.xml);
+ assertContains("float area(lat=128, lon=256)", r.xml);
+ assertContains("float lat(lat=128)", r.xml);
+ assertContains("double lat_bnds(lat=128, bnds=2)", r.xml);
+ assertContains("double lon_bnds(lon=256, bnds=2)", r.xml);
+
+
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/pom.xml b/tika-parser-modules/tika-parser-text-module/pom.xml
index 1389d08..aca729b 100644
--- a/tika-parser-modules/tika-parser-text-module/pom.xml
+++ b/tika-parser-modules/tika-parser-text-module/pom.xml
@@ -1,67 +1,67 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-modules</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-text-module</artifactId>
- <name>Apache Tika parser text module</name>
- <url>http://tika.apache.org/</url>
-
- <properties>
- <commons.logging.version>1.1.3</commons.logging.version>
- </properties>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>com.googlecode.juniversalchardet</groupId>
- <artifactId>juniversalchardet</artifactId>
- <version>1.0.3</version>
- </dependency>
- <dependency>
- <groupId>commons-io</groupId>
- <artifactId>commons-io</artifactId>
- <version>${commons.io.version}</version>
- </dependency>
- <dependency>
- <groupId>commons-codec</groupId>
- <artifactId>commons-codec</artifactId>
- <version>${codec.version}</version>
- </dependency>
- <dependency>
- <groupId>commons-logging</groupId>
- <artifactId>commons-logging</artifactId>
- <version>${commons.logging.version}</version>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-text-module</artifactId>
+ <name>Apache Tika parser text module</name>
+ <url>http://tika.apache.org/</url>
+
+ <properties>
+ <commons.logging.version>1.1.3</commons.logging.version>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.googlecode.juniversalchardet</groupId>
+ <artifactId>juniversalchardet</artifactId>
+ <version>1.0.3</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>${commons.io.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ <version>${codec.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-logging</groupId>
+ <artifactId>commons-logging</artifactId>
+ <version>${commons.logging.version}</version>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/module/text/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/module/text/internal/Activator.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/module/text/internal/Activator.java
index 80716d8..59836c6 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/module/text/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/module/text/internal/Activator.java
@@ -1,20 +1,20 @@
-package org.apache.tika.module.text.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-/** Bundle activator for the Tika text parser module. */
-public class Activator extends TikaAbstractBundleActivator {
-
-    /** Calls registerTikaParserServiceLoader with this class's class loader. */
-    @Override
-    public void start(BundleContext context) throws Exception {
-        final ClassLoader moduleLoader = Activator.class.getClassLoader();
-        registerTikaParserServiceLoader(context, moduleLoader);
-    }
-
-    /** Intentionally empty: this activator does nothing when the bundle stops. */
-    @Override
-    public void stop(BundleContext context) throws Exception {
-    }
-}
+package org.apache.tika.module.text.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+/** Bundle activator for the Tika text parser module. */
+public class Activator extends TikaAbstractBundleActivator {
+
+    /** Calls registerTikaParserServiceLoader with this class's class loader. */
+    @Override
+    public void start(BundleContext context) throws Exception {
+        final ClassLoader moduleLoader = Activator.class.getClassLoader();
+        registerTikaParserServiceLoader(context, moduleLoader);
+    }
+
+    /** Intentionally empty: this activator does nothing when the bundle stops. */
+    @Override
+    public void stop(BundleContext context) throws Exception {
+    }
+}