You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/06 04:50:57 UTC

svn commit: r1723223 [22/32] - in /tika/branches/2.x: tika-core/src/test/resources/META-INF/ tika-core/src/test/resources/META-INF/services/ tika-parser-modules/ tika-parser-modules/tika-advanced-module/ tika-parser-modules/tika-advanced-module/src/ ti...

Added: tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,1377 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.AccessPermissionException;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.DocumentSelector;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.apache.tika.sax.ToXMLContentHandler;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing pdf files.
+ */
+public class PDFParserTest extends TikaTest {
+
+    // Media type constants available to the assertions in this test class.
+    public static final MediaType TYPE_TEXT = MediaType.TEXT_PLAIN;
+    public static final MediaType TYPE_EMF = MediaType.application("x-emf");
+    public static final MediaType TYPE_PDF = MediaType.application("pdf");
+    public static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
+    public static final MediaType TYPE_DOC = MediaType.application("msword");
+    // Level of the "org.apache.pdfbox" logger as recorded by setup(); tearDown() restores it.
+    // Level.INFO is only the value held before setup() has run.
+    public static Level PDFBOX_LOG_LEVEL = Level.INFO;
+
+    /**
+     * Switches PDFBox logging off for this test class, recording the level
+     * that was configured before so that {@link #tearDown()} can reinstate it.
+     */
+    @BeforeClass
+    public static void setup() {
+        final Logger pdfBoxLogger = Logger.getLogger("org.apache.pdfbox");
+        // capture the current level before overwriting it
+        PDFBOX_LOG_LEVEL = pdfBoxLogger.getLevel();
+        pdfBoxLogger.setLevel(Level.OFF);
+    }
+
+    /**
+     * Puts the PDFBox logger back to the level recorded in {@link #setup()}.
+     */
+    @AfterClass
+    public static void tearDown() {
+        final Logger pdfBoxLogger = Logger.getLogger("org.apache.pdfbox");
+        pdfBoxLogger.setLevel(PDFBOX_LOG_LEVEL);
+    }
+
+    /**
+     * Counts how often {@code needle} occurs in {@code haystack}.
+     * Overlapping occurrences are counted individually, e.g. "aa" is found
+     * three times in "aaaa".
+     *
+     * @param needle   the text to look for
+     * @param haystack the text to search in
+     * @return the number of (possibly overlapping) occurrences; 0 for an empty needle
+     */
+    private static int substringCount(String needle, String haystack) {
+        // An empty needle matches at every index, so indexOf would never
+        // return -1 and the scan below would not terminate.
+        if (needle.isEmpty()) {
+            return 0;
+        }
+        int count = 0;
+        int next = haystack.indexOf(needle);
+        while (next != -1) {
+            count++;
+            // step forward by a single char so overlapping matches still count
+            next = haystack.indexOf(needle, next + 1);
+        }
+        return count;
+    }
+
+    /**
+     * Runs testPDF.pdf through the auto-detecting parser and verifies the
+     * reported metadata plus several phrases of the extracted text, including
+     * that headline/paragraph boundaries do not glue words together.
+     */
+    @Test
+    public void testPdfParsing() throws Exception {
+        Metadata meta = new Metadata();
+        Parser autoDetect = new AutoDetectParser(); // Should auto-detect!
+        InputStream pdf = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDF.pdf");
+
+        String text = getText(pdf, autoDetect, meta);
+
+        final String expectedAuthor = "Bertrand Delacr\u00e9taz";
+        final String expectedTitle = "Apache Tika - Apache Tika";
+        assertEquals("application/pdf", meta.get(Metadata.CONTENT_TYPE));
+        assertEquals(expectedAuthor, meta.get(TikaCoreProperties.CREATOR));
+        assertEquals(expectedAuthor, meta.get(Metadata.AUTHOR));
+        assertEquals("Firefox", meta.get(TikaCoreProperties.CREATOR_TOOL));
+        assertEquals(expectedTitle, meta.get(TikaCoreProperties.TITLE));
+
+        // Dates are deliberately not asserted: they can't be tested reliably yet - see TIKA-451
+//        assertEquals("Sat Sep 15 10:02:31 BST 2007", metadata.get(Metadata.CREATION_DATE));
+//        assertEquals("Sat Sep 15 10:02:31 BST 2007", metadata.get(Metadata.LAST_MODIFIED));
+
+        assertContains("Apache Tika", text);
+        assertContains("Tika - Content Analysis Toolkit", text);
+        assertContains("incubator", text);
+        assertContains("Apache Software Foundation", text);
+
+        // the end of one paragraph must be separated from the start of the next one
+        assertFalse("should have word boundary after headline",
+                text.contains("ToolkitApache"));
+        assertFalse("should have word boundary between paragraphs",
+                text.contains("libraries.Apache"));
+    }
+
+    /**
+     * Parses testPDF.pdf with a {@code null} content handler and checks that
+     * the metadata is still populated.
+     */
+    @Test
+    public void testPdfParsingMetadataOnly() throws Exception {
+        Metadata meta = new Metadata();
+        Parser autoDetect = new AutoDetectParser(); // Should auto-detect!
+        ParseContext ctx = new ParseContext();
+
+        try (InputStream pdf = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDF.pdf")) {
+            // no handler is supplied: only the metadata is inspected afterwards
+            autoDetect.parse(pdf, null, meta, ctx);
+        }
+
+        assertEquals("application/pdf", meta.get(Metadata.CONTENT_TYPE));
+        assertEquals("Bertrand Delacr\u00e9taz", meta.get(TikaCoreProperties.CREATOR));
+        assertEquals("Firefox", meta.get(TikaCoreProperties.CREATOR_TOOL));
+        assertEquals("Apache Tika - Apache Tika", meta.get(TikaCoreProperties.TITLE));
+    }
+
+    @Test
+    public void testCustomMetadata() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        Metadata metadata = new Metadata();
+
+        InputStream stream = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDF-custommetadata.pdf");
+
+        String content = getText(stream, parser, metadata);
+
+        assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Document author", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Document author", metadata.get(Metadata.AUTHOR));
+        assertEquals("Document title", metadata.get(TikaCoreProperties.TITLE));
+
+        assertEquals("Custom Value", metadata.get("Custom Property"));
+
+        assertEquals("Array Entry 1", metadata.get("Custom Array"));
+        assertEquals(2, metadata.getValues("Custom Array").length);
+        assertEquals("Array Entry 1", metadata.getValues("Custom Array")[0]);
+        assertEquals("Array Entry 2", metadata.getValues("Custom Array")[1]);
+
+        assertContains("Hello World!", content);
+    }
+
+    /**
+     * PDFs can be "protected" with the default password. This means
+     * they're encrypted (potentially both text and metadata),
+     * but we can decrypt them easily.
+     */
+    @Test
+    public void testProtectedPDF() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        ParseContext context = new ParseContext();
+
+        try (InputStream stream = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDF_protected.pdf")) {
+            parser.parse(stream, handler, metadata, context);
+        }
+
+        assertEquals("true", metadata.get("pdf:encrypted"));
+        assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("The Bank of England", metadata.get(Metadata.AUTHOR));
+        assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT));
+        assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT));
+        assertEquals("Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", metadata.get(TikaCoreProperties.TITLE));
+
+        String content = handler.toString();
+        assertContains("RETHINKING THE FINANCIAL NETWORK", content);
+        assertContains("On 16 November 2002", content);
+        assertContains("In many important respects", content);
+
+
+        // Try again with an explicit empty password
+        handler = new BodyContentHandler();
+        metadata = new Metadata();
+
+        context = new ParseContext();
+        context.set(PasswordProvider.class, new PasswordProvider() {
+            public String getPassword(Metadata metadata) {
+                return "";
+            }
+        });
+
+        try (InputStream stream = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDF_protected.pdf")) {
+            parser.parse(stream, handler, metadata, context);
+        }
+        assertEquals("true", metadata.get("pdf:encrypted"));
+
+        assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT));
+        assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT));
+        assertEquals("Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", metadata.get(TikaCoreProperties.TITLE));
+
+        assertContains("RETHINKING THE FINANCIAL NETWORK", content);
+        assertContains("On 16 November 2002", content);
+        assertContains("In many important respects", content);
+
+        //now test wrong password
+        handler = new BodyContentHandler();
+        metadata = new Metadata();
+        context = new ParseContext();
+        context.set(PasswordProvider.class, new PasswordProvider() {
+            public String getPassword(Metadata metadata) {
+                return "WRONG!!!!";
+            }
+        });
+
+        boolean ex = false;
+        try (InputStream stream = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDF_protected.pdf")) {
+            parser.parse(stream, handler, metadata, context);
+        } catch (EncryptedDocumentException e) {
+            ex = true;
+        }
+        content = handler.toString();
+
+        assertTrue("encryption exception", ex);
+        assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("true", metadata.get("pdf:encrypted"));
+        //pdf:encrypted, X-Parsed-By and Content-Type
+        assertEquals("very little metadata should be parsed", 3, metadata.names().length);
+        assertEquals(0, content.length());
+
+        //now test wrong password with non sequential parser
+        handler = new BodyContentHandler();
+        metadata = new Metadata();
+        context = new ParseContext();
+        context.set(PasswordProvider.class, new PasswordProvider() {
+            public String getPassword(Metadata metadata) {
+                return "WRONG!!!!";
+            }
+        });
+        PDFParserConfig config = new PDFParserConfig();
+        config.setUseNonSequentialParser(true);
+        context.set(PDFParserConfig.class, config);
+
+        ;
+        ex = false;
+        try (InputStream stream = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDF_protected.pdf")) {
+            parser.parse(stream, handler, metadata, context);
+        } catch (EncryptedDocumentException e) {
+            ex = true;
+        }
+        content = handler.toString();
+        assertTrue("encryption exception", ex);
+        assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("true", metadata.get("pdf:encrypted"));
+
+        //pdf:encrypted, X-Parsed-By and Content-Type
+        assertEquals("very little metadata should be parsed", 3, metadata.names().length);
+        assertEquals(0, content.length());
+    }
+
+    /**
+     * With default settings the two text boxes of testPDFTwoTextBoxes.pdf
+     * must come out one after the other (left box first, then right box).
+     */
+    @Test
+    public void testTwoTextBoxes() throws Exception {
+        InputStream pdf = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDFTwoTextBoxes.pdf");
+        Parser autoDetect = new AutoDetectParser(); // Should auto-detect!
+        // collapse every run of whitespace so line breaks do not matter
+        String normalized = getText(pdf, autoDetect).replaceAll("\\s+", " ");
+        assertContains("Left column line 1 Left column line 2 Right column line 1 Right column line 2", normalized);
+    }
+
+    /**
+     * Broad smoke test over testPDFVarious.pdf: footnotes, header/footer,
+     * text box, formatting samples, tables, lists, keyword/subject metadata
+     * and non-Latin text must all show up in the extracted content.
+     */
+    @Test
+    public void testVarious() throws Exception {
+        Metadata meta = new Metadata();
+        Parser autoDetect = new AutoDetectParser(); // Should auto-detect!
+        InputStream pdf = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDFVarious.pdf");
+
+        String text = getText(pdf, autoDetect, meta);
+        // whitespace-collapsed view, used only for the table assertions
+        String collapsed = text.replaceAll("\\s+"," ");
+
+        assertContains("Footnote appears here", text);
+        assertContains("This is a footnote.", text);
+        assertContains("This is the header text.", text);
+        assertContains("This is the footer text.", text);
+        assertContains("Here is a text box", text);
+        assertContains("Bold", text);
+        assertContains("italic", text);
+        assertContains("underline", text);
+        assertContains("superscript", text);
+        assertContains("subscript", text);
+        assertContains("Here is a citation:", text);
+        assertContains("Figure 1 This is a caption for Figure 1", text);
+        assertContains("(Kramer)", text);
+        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", collapsed);
+        assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", collapsed);
+        assertContains("This is a hyperlink", text);
+
+        assertContains("Here is a list:", text);
+        for (int bullet = 1; bullet <= 3; bullet++) {
+            // the bullet glyph itself is not asserted:
+            //assertContains("·\tBullet " + row, content);
+            //assertContains("\u00b7\tBullet " + row, content);
+            assertContains("Bullet " + bullet, text);
+        }
+
+        assertContains("Here is a numbered list:", text);
+        for (int number = 1; number <= 3; number++) {
+            // a space, not a tab, is expected after the closing parenthesis:
+            //assertContains(row + ")\tNumber bullet " + row, content);
+            assertContains(number + ") Number bullet " + number, text);
+        }
+
+        for (int r = 1; r <= 2; r++) {
+            for (int c = 1; c <= 3; c++) {
+                assertContains("Row " + r + " Col " + c, text);
+            }
+        }
+
+        assertContains("Keyword1 Keyword2", text);
+        assertEquals("Keyword1 Keyword2", meta.get(Metadata.KEYWORDS));
+
+        assertContains("Subject is here", text);
+        assertEquals("Subject is here", meta.get(OfficeOpenXMLCore.SUBJECT));
+        assertEquals("Subject is here", meta.get(Metadata.SUBJECT));
+
+        assertContains("Suddenly some Japanese text:", text);
+        // Special version of (GHQ)
+        assertContains("\uff08\uff27\uff28\uff31\uff09", text);
+        // 6 other characters
+        assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", text);
+
+        assertContains("And then some Gothic text:", text);
+        // TODO: I saved the word doc as a PDF, but that
+        // process somehow, apparently lost the gothic
+        // chars, so we cannot test this here:
+        //assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
+    }
+
+    @Test
+    public void testAnnotations() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        InputStream stream = getResourceAsStream("/test-documents/testAnnotations.pdf");
+        String content = getText(stream, parser);
+        content = content.replaceAll("[\\s\u00a0]+", " ");
+        assertContains("Here is some text", content);
+        assertContains("Here is a comment", content);
+
+        // Test w/ annotation text disabled:
+        PDFParser pdfParser = new PDFParser();
+        pdfParser.getPDFParserConfig().setExtractAnnotationText(false);
+        stream = getResourceAsStream("/test-documents/testAnnotations.pdf");
+        content = getText(stream, pdfParser);
+        content = content.replaceAll("[\\s\u00a0]+", " ");
+        assertContains("Here is some text", content);
+        assertEquals(-1, content.indexOf("Here is a comment"));
+
+        // annotation text disabled through parsecontext
+        ParseContext context = new ParseContext();
+        PDFParserConfig config = new PDFParserConfig();
+        config.setExtractAnnotationText(false);
+        context.set(PDFParserConfig.class, config);
+        stream = getResourceAsStream("/test-documents/testAnnotations.pdf");
+        content = getText(stream, parser, context);
+        content = content.replaceAll("[\\s\u00a0]+", " ");
+        assertContains("Here is some text", content);
+        assertEquals(-1, content.indexOf("Here is a comment"));
+
+
+        // TIKA-738: make sure no extra </p> tags
+        String xml = getXML("testAnnotations.pdf").xml;
+        assertEquals(substringCount("<p>", xml),
+                substringCount("</p>", xml));
+    }
+
+    // TIKA-981
+    /**
+     * The note text and the author name of the popup annotation in
+     * testPopupAnnotation.pdf must both appear in the extracted text.
+     */
+    @Test
+    public void testPopupAnnotation() throws Exception {
+        InputStream in = getResourceAsStream("/test-documents/testPopupAnnotation.pdf");
+        Parser autoDetect = new AutoDetectParser(); // Should auto-detect!
+        String text = getText(in, autoDetect);
+        assertContains("this is the note", text);
+        assertContains("igalsh", text);
+    }
+
+    /**
+     * The XHTML produced for testPDFPackage.pdf must mention both
+     * "PDF1" and "PDF2".
+     */
+    @Test
+    public void testEmbeddedPDFs() throws Exception {
+        final XMLResult parsed = getXML("testPDFPackage.pdf");
+        String xhtml = parsed.xml;
+        assertContains("PDF1", xhtml);
+        assertContains("PDF2", xhtml);
+    }
+
+    @Test
+    public void testPageNumber() throws Exception {
+        final XMLResult result = getXML("testPageNumber.pdf");
+        final String content = result.xml.replaceAll("\\s+", "");
+        assertContains("<p>1</p>", content);
+    }
+
+    /**
+     * Test to ensure that Links are extracted from the text
+     * <p/>
+     * Note - the PDF contains the text "This is a hyperlink" which
+     * a hyperlink annotation, linking to the tika site, on it. This
+     * test will need updating when we're able to apply the annotation
+     * to the text itself, rather than following on afterwards as now
+     */
+    @Test
+    public void testLinks() throws Exception {
+        final XMLResult result = getXML("testPDFVarious.pdf");
+        assertContains("<div class=\"annotation\"><a href=\"http://tika.apache.org/\" /></div>", result.xml);
+    }
+
+    /**
+     * Verifies the auto-space setting against testExtraSpaces.pdf: with
+     * auto-space off the sample sentence is extracted verbatim, with
+     * auto-space on (the default) extra spaces break it up. The setting is
+     * exercised both directly on a {@link PDFParser} and through a
+     * {@link PDFParserConfig} handed to an {@link AutoDetectParser} via the
+     * {@link ParseContext}.
+     */
+    @Test
+    public void testDisableAutoSpace() throws Exception {
+        PDFParser parser = new PDFParser();
+        parser.getPDFParserConfig().setEnableAutoSpace(false);
+        InputStream stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
+        String content = getText(stream, parser);
+        content = content.replaceAll("[\\s\u00a0]+", " ");
+        // Text is correct when autoSpace is off:
+        assertContains("Here is some formatted text", content);
+
+        parser.getPDFParserConfig().setEnableAutoSpace(true);
+        stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
+        content = getText(stream, parser);
+        content = content.replaceAll("[\\s\u00a0]+", " ");
+        // Text has extra spaces when autoSpace is on
+        assertEquals(-1, content.indexOf("Here is some formatted text"));
+
+        //now try with autodetect
+        Parser autoParser = new AutoDetectParser();
+        ParseContext context = new ParseContext();
+        PDFParserConfig config = new PDFParserConfig();
+        context.set(PDFParserConfig.class, config);
+        //default is true
+        stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
+        content = getText(stream, autoParser, context);
+        content = content.replaceAll("[\\s\u00a0]+", " ");
+        // Text has extra spaces when autoSpace is on
+        assertEquals(-1, content.indexOf("Here is some formatted text"));
+
+        config.setEnableAutoSpace(false);
+
+        // Go through the auto-detecting parser again (not the direct PDFParser),
+        // so that this stanza really covers the autodetect + ParseContext path.
+        stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
+        content = getText(stream, autoParser, context);
+        content = content.replaceAll("[\\s\u00a0]+", " ");
+        // Text is correct when autoSpace is off:
+        assertContains("Here is some formatted text", content);
+
+    }
+
+    /**
+     * Overlapping duplicate text in testOverlappingText.pdf is kept by
+     * default and removed once suppression is switched on - both when the
+     * option is set on the {@link PDFParser} directly and when it is passed
+     * to an {@link AutoDetectParser} through the {@link ParseContext}.
+     */
+    @Test
+    public void testDuplicateOverlappingText() throws Exception {
+        final String keptDuplicates = "Text the first timeText the second time";
+        final String suppressedDuplicates = "Text the first timesecond time";
+
+        PDFParser direct = new PDFParser();
+        InputStream in = getResourceAsStream("/test-documents/testOverlappingText.pdf");
+        // Default is false (keep overlapping text):
+        String text = getText(in, direct);
+        assertContains(keptDuplicates, text);
+
+        direct.getPDFParserConfig().setSuppressDuplicateOverlappingText(true);
+        in = getResourceAsStream("/test-documents/testOverlappingText.pdf");
+        text = getText(in, direct);
+        // "Text the first" was dedup'd:
+        assertContains(suppressedDuplicates, text);
+
+        //now try with autodetect
+        Parser autoDetect = new AutoDetectParser();
+        PDFParserConfig cfg = new PDFParserConfig();
+        ParseContext ctx = new ParseContext();
+        ctx.set(PDFParserConfig.class, cfg);
+        in = getResourceAsStream("/test-documents/testOverlappingText.pdf");
+        // Default is false (keep overlapping text):
+        text = getText(in, autoDetect, ctx);
+        assertContains(keptDuplicates, text);
+
+        cfg.setSuppressDuplicateOverlappingText(true);
+        in = getResourceAsStream("/test-documents/testOverlappingText.pdf");
+        text = getText(in, autoDetect, ctx);
+        // "Text the first" was dedup'd:
+        assertContains(suppressedDuplicates, text);
+
+    }
+
+    /**
+     * Verifies the sort-by-position setting against testPDFTwoTextBoxes.pdf:
+     * unsorted (default) output keeps each text box together, sorted output
+     * interleaves the two columns line by line. The setting is exercised
+     * both directly on a {@link PDFParser} and through a
+     * {@link PDFParserConfig} handed to an {@link AutoDetectParser} via the
+     * {@link ParseContext}.
+     */
+    @Test
+    public void testSortByPosition() throws Exception {
+        PDFParser parser = new PDFParser();
+        parser.getPDFParserConfig().setEnableAutoSpace(false);
+        InputStream stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
+        // Default is false (do not sort):
+        String content = getText(stream, parser);
+        content = content.replaceAll("\\s+", " ");
+        assertContains("Left column line 1 Left column line 2 Right column line 1 Right column line 2", content);
+
+        parser.getPDFParserConfig().setSortByPosition(true);
+        stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
+        content = getText(stream, parser);
+        content = content.replaceAll("\\s+", " ");
+        // Column text is now interleaved:
+        assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", content);
+
+        //now try setting autodetect via parsecontext
+        AutoDetectParser autoParser = new AutoDetectParser();
+        ParseContext context = new ParseContext();
+        PDFParserConfig config = new PDFParserConfig();
+        context.set(PDFParserConfig.class, config);
+        stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
+        // Default is false (do not sort):
+        content = getText(stream, autoParser, context);
+        content = content.replaceAll("\\s+", " ");
+        assertContains("Left column line 1 Left column line 2 Right column line 1 Right column line 2", content);
+
+        // The config object is already registered in the context, so changing it
+        // is enough. Auto-space is switched off as well so that the context
+        // config mirrors the direct-parser settings under which the expected
+        // interleaved string (with its "colu mn" split) was produced.
+        config.setEnableAutoSpace(false);
+        config.setSortByPosition(true);
+        stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
+        // parse via the auto-detecting parser AND the context, otherwise the
+        // context-supplied setting is never exercised
+        content = getText(stream, autoParser, context);
+        content = content.replaceAll("\\s+", " ");
+        // Column text is now interleaved:
+        assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", content);
+
+    }
+
+    // TIKA-1035
+    /**
+     * The bookmark text of testPDF_bookmarks.pdf must be present in the
+     * XHTML output and must appear before the closing body tag.
+     */
+    @Test
+    public void testBookmarks() throws Exception {
+        final String xhtml = getXML("testPDF_bookmarks.pdf").xml;
+        final int bookmarkAt = xhtml.indexOf("Denmark bookmark is here");
+        final int bodyEndAt = xhtml.indexOf("</body>");
+        assertTrue(bookmarkAt != -1);
+        assertTrue(bodyEndAt != -1);
+        assertTrue(bookmarkAt < bodyEndAt);
+    }
+
+    //TIKA-1124
+    /**
+     * Parses a docx that embeds a PDF which in turn embeds another docx:
+     * <pre>
+     *   docx/
+     *      pdf/
+     *         docx
+     * </pre>
+     * First checks that the text of all three levels is extracted in nesting
+     * order, then that a {@link ParserContainerExtractor} reports the three
+     * embedded resources with the expected names and media types.
+     */
+    @Test
+    public void testEmbeddedPDFEmbeddingAnotherDocument() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        ParseContext context = new ParseContext();
+        // make the parser available through the context as well
+        context.set(org.apache.tika.parser.Parser.class, parser);
+
+        String content;
+        // try-with-resources closes the stream and copes with a null resource,
+        // so a setup failure is not masked by a NullPointerException on close
+        try (InputStream stream = getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx")) {
+            parser.parse(stream, handler, metadata, context);
+            content = handler.toString();
+        }
+        int outerHaystack = content.indexOf("Outer_haystack");
+        int pdfHaystack = content.indexOf("pdf_haystack");
+        int needle = content.indexOf("Needle");
+        assertTrue(outerHaystack > -1);
+        assertTrue(pdfHaystack > -1);
+        assertTrue(needle > -1);
+        // innermost text last: outer docx, then pdf, then inner docx
+        assertTrue(needle > pdfHaystack && pdfHaystack > outerHaystack);
+
+        TrackingHandler tracker = new TrackingHandler();
+        ContainerExtractor ex = new ParserContainerExtractor();
+        // close the TikaInputStream that is actually used here (it was leaked before)
+        try (TikaInputStream tis = TikaInputStream.get(
+                getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx"))) {
+            ex.extract(tis, ex, tracker);
+            // queried while the stream is still open, as before
+            assertTrue(ex.isSupported(tis));
+        }
+        assertEquals(3, tracker.filenames.size());
+        assertEquals(3, tracker.mediaTypes.size());
+        assertEquals("image1.emf", tracker.filenames.get(0));
+        assertNull(tracker.filenames.get(1));
+        assertEquals("Test.docx", tracker.filenames.get(2));
+        assertEquals(TYPE_EMF, tracker.mediaTypes.get(0));
+        assertEquals(TYPE_PDF, tracker.mediaTypes.get(1));
+        assertEquals(TYPE_DOCX, tracker.mediaTypes.get(2));
+    }
+
+    /**
+     * Tests for equality between the traditional sequential parser
+     * and the newer non-sequential parser.
+     * <p/>
+     * Every *.pdf in the test-documents directory is parsed with both
+     * configurations; text and metadata must match, except for the files
+     * listed as known differences, which must differ. Files for which the
+     * sequential parse raises an {@link EncryptedDocumentException} are skipped.
+     * <p/>
+     * TODO: more testing
+     */
+    @Test
+    public void testSequentialParser() throws Exception {
+
+        Parser sequentialParser = new AutoDetectParser();
+        Parser nonSequentialParser = new AutoDetectParser();
+
+        ParseContext seqContext = new ParseContext();
+        PDFParserConfig seqConfig = new PDFParserConfig();
+        seqConfig.setUseNonSequentialParser(false);
+        seqContext.set(PDFParserConfig.class, seqConfig);
+
+        ParseContext nonSeqContext = new ParseContext();
+        PDFParserConfig nonSeqConfig = new PDFParserConfig();
+        nonSeqConfig.setUseNonSequentialParser(true);
+        nonSeqContext.set(PDFParserConfig.class, nonSeqConfig);
+
+        File testDocs = new File(this.getClass().getResource("/test-documents").toURI());
+        int pdfs = 0;
+        Set<String> knownMetadataDiffs = new HashSet<>();
+        //PDFBox-1792/Tika-1203
+        knownMetadataDiffs.add("testAnnotations.pdf");
+        // Added for TIKA-93.
+        knownMetadataDiffs.add("testOCR.pdf");
+        // Added for TIKA-1085
+        knownMetadataDiffs.add("testPDF_bom.pdf");
+
+        //empty for now
+        Set<String> knownContentDiffs = new HashSet<>();
+
+        // listFiles() returns null when the path is not a directory or cannot
+        // be read; fail with a clear message instead of a NullPointerException
+        File[] candidates = testDocs.listFiles();
+        assertTrue("Could not list test documents in " + testDocs, candidates != null);
+
+        for (File f : candidates) {
+            if (!f.getName().toLowerCase(Locale.ROOT).endsWith(".pdf")) {
+                continue;
+            }
+
+            String sequentialContent = null;
+            Metadata sequentialMetadata = new Metadata();
+            try {
+                sequentialContent = getText(new FileInputStream(f),
+                        sequentialParser, seqContext, sequentialMetadata);
+            } catch (EncryptedDocumentException e) {
+                //silently skip a file that requires a user password
+                continue;
+            } catch (Exception e) {
+                throw new TikaException("Sequential Parser failed on test file " + f, e);
+            }
+
+            // only files that the sequential parser could read are counted
+            pdfs++;
+
+            String nonSequentialContent = null;
+            Metadata nonSequentialMetadata = new Metadata();
+            try {
+                nonSequentialContent = getText(new FileInputStream(f),
+                        nonSequentialParser, nonSeqContext, nonSequentialMetadata);
+            } catch (Exception e) {
+                throw new TikaException("Non-Sequential Parser failed on test file " + f, e);
+            }
+
+            if (knownContentDiffs.contains(f.getName())) {
+                assertFalse(f.getName(), sequentialContent.equals(nonSequentialContent));
+            } else {
+                assertEquals(f.getName(), sequentialContent, nonSequentialContent);
+            }
+
+            // files with known metadata differences must really differ,
+            // so the exception list cannot silently go stale
+            if (knownMetadataDiffs.contains(f.getName())) {
+                assertFalse(f.getName(), sequentialMetadata.equals(nonSequentialMetadata));
+            } else {
+                assertEquals(f.getName(), sequentialMetadata, nonSequentialMetadata);
+            }
+        }
+        //make sure nothing went wrong with getting the resource to test-documents
+        //must have tested >= 15 pdfs
+        assertTrue("Number of pdf files tested >= 15 in non-sequential parser test", pdfs >= 15);
+    }
+
+
+    // TIKA-973
+    //commented out until test documents that are unambiguously
+    //consistent with Apache License v2.0 are contributed.
+    //TODO: add back test for AcroForm extraction; test document should include
+    //recursive forms
+/*    public void testAcroForm() throws Exception{
+       Parser p = new AutoDetectParser();
+       ParseContext context = new ParseContext();
+       InputStream stream = getResourceAsStream("/test-documents/testPDF_acroForm1.pdf");
+       String txt = getText(stream, p, context);
+       stream.close();
+
+       //simple first level form contents
+       assertContains("to: John Doe", txt);
+       //checkbox
+       assertContains("xpackaging: Yes", txt);
+       
+       //this guarantees that the form processor
+       //worked recursively at least once...i.e. it didn't just
+       //take the first form
+       stream = getResourceAsStream("/test-documents/testPDF_acroForm2.pdf");
+       txt = getText(stream, p, context);
+       stream.close();
+       assertContains("123 Main St.", txt);
+       
+       
+       //now test with nonsequential parser
+       PDFParserConfig config = new PDFParserConfig();
+       config.setUseNonSequentialParser(true);
+       context.set(PDFParserConfig.class, config);
+       stream = getResourceAsStream("/test-documents/testPDF_acroForm1.pdf");
+       txt = getText(stream, p, context);
+       stream.close();
+       
+       //simple first level form contents
+       assertContains("to: John Doe", txt);
+       //checkbox
+       assertContains("xpackaging: Yes", txt);
+       
+       //this guarantees that the form processor
+       //worked recursively at least once...i.e. it didn't just
+       //take the first form
+       stream = getResourceAsStream("/test-documents/testPDF_acroForm2.pdf");
+       txt = getText(stream, p, context);
+       assertContains("123 Main St.", txt);
+       stream.close();     
+    }
+*/
+
+    //TIKA-1226
+    @Test
+    public void testSignatureInAcroForm() throws Exception {
+        //The current test doc does not contain any content in the signature area.
+        //This just tests that a RuntimeException is not thrown.
+        //TODO: find a better test file for this issue.
+        String xml = getXML("/testPDF_acroform3.pdf").xml;
+        assertTrue("found", (xml.contains("<li>aTextField: TIKA-1226</li>")));
+    }
+
+    /**
+     * TIKA-1228, TIKA-1268: embedded files of testPDF_childAttachments.pdf must be
+     * reached. Checked once through the XML output and once through the
+     * per-document metadata collected by a {@link RecursiveParserWrapper}.
+     */
+    @Test
+    public void testEmbeddedFilesInChildren() throws Exception {
+        //"regressiveness" exists only in Unit10.doc not in the container pdf document
+        String containerXml = getXML("/testPDF_childAttachments.pdf").xml;
+        assertTrue(containerXml.contains("regressiveness"));
+
+        //extract every inline image, including repeated ones
+        PDFParserConfig pdfConfig = new PDFParserConfig();
+        pdfConfig.setExtractInlineImages(true);
+        pdfConfig.setExtractUniqueInlineImagesOnly(false);
+
+        BasicContentHandlerFactory handlerFactory = new BasicContentHandlerFactory(
+                BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1);
+        RecursiveParserWrapper wrapper =
+                new RecursiveParserWrapper(new AutoDetectParser(), handlerFactory);
+
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(PDFParserConfig.class, pdfConfig);
+        parseContext.set(Parser.class, wrapper);
+
+        try (TikaInputStream input = TikaInputStream.get(
+                getResourceAsStream("/test-documents/testPDF_childAttachments.pdf"))) {
+            wrapper.parse(input, new BodyContentHandler(-1), new Metadata(), parseContext);
+        }
+
+        List<Metadata> results = wrapper.getMetadata();
+        assertEquals(5, results.size());
+
+        Metadata first = results.get(0);
+        Metadata second = results.get(1);
+        Metadata third = results.get(2);
+        Metadata fourth = results.get(3);
+        Metadata fifth = results.get(4);
+
+        //resource names
+        assertNull(first.get(Metadata.RESOURCE_NAME_KEY));
+        assertEquals("image0.jpg", second.get(Metadata.RESOURCE_NAME_KEY));
+        assertEquals("Press Quality(1).joboptions", fourth.get(Metadata.RESOURCE_NAME_KEY));
+        assertEquals("Unit10.doc", fifth.get(Metadata.RESOURCE_NAME_KEY));
+
+        //content types
+        String jpeg = MediaType.image("jpeg").toString();
+        assertEquals(jpeg, second.get(Metadata.CONTENT_TYPE));
+        String tiff = MediaType.image("tiff").toString();
+        assertEquals(tiff, third.get(Metadata.CONTENT_TYPE));
+        assertEquals("text/plain; charset=ISO-8859-1", fourth.get(Metadata.CONTENT_TYPE));
+        assertEquals(TYPE_DOC.toString(), fifth.get(Metadata.CONTENT_TYPE));
+    }
+
+
+    @Test
+    public void testEmbeddedFilesInAnnotations() throws Exception {
+        String xml = getXML("/testPDFFileEmbInAnnotation.pdf").xml;
+
+        assertTrue(xml.contains("This is a Excel"));
+    }
+
+    /**
+     * TIKA-1341: parsing a PDF must fire exactly one endDocument event on the
+     * content handler.
+     */
+    @Test
+    public void testSingleCloseDoc() throws Exception {
+        Parser p = new AutoDetectParser();
+        Metadata m = new Metadata();
+        ParseContext c = new ParseContext();
+        //keep the concrete type so the counter can be read without a cast
+        EventCountingHandler h = new EventCountingHandler();
+        //the stream used to be left open; close it whether or not the parse succeeds
+        try (InputStream is = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDFTripleLangTitle.pdf")) {
+            p.parse(is, h, m, c);
+        }
+        assertEquals(1, h.getEndDocument());
+    }
+
+    @Test
+    public void testVersions() throws Exception {
+
+        Map<String, String> dcFormat = new HashMap<String, String>();
+        dcFormat.put("4.x", "application/pdf; version=1.3");
+        dcFormat.put("5.x", "application/pdf; version=1.4");
+        dcFormat.put("6.x", "application/pdf; version=1.5");
+        dcFormat.put("7.x", "application/pdf; version=1.6");
+        dcFormat.put("8.x", "application/pdf; version=1.7");
+        dcFormat.put("9.x", "application/pdf; version=1.7");
+        dcFormat.put("10.x", "application/pdf; version=1.7");
+        dcFormat.put("11.x.PDFA-1b", "application/pdf; version=1.7");
+
+        Map<String, String> pdfVersions = new HashMap<String, String>();
+        pdfVersions.put("4.x", "1.3");
+        pdfVersions.put("5.x", "1.4");
+        pdfVersions.put("6.x", "1.5");
+        pdfVersions.put("7.x", "1.6");
+        pdfVersions.put("8.x", "1.7");
+        pdfVersions.put("9.x", "1.7");
+        pdfVersions.put("10.x", "1.7");
+        pdfVersions.put("11.x.PDFA-1b", "1.7");
+
+        Map<String, String> pdfExtensionVersions = new HashMap<String, String>();
+        pdfExtensionVersions.put("9.x", "1.7 Adobe Extension Level 3");
+        pdfExtensionVersions.put("10.x", "1.7 Adobe Extension Level 8");
+        pdfExtensionVersions.put("11.x.PDFA-1b", "1.7 Adobe Extension Level 8");
+
+        Parser p = new AutoDetectParser();
+        for (Map.Entry<String, String> e : dcFormat.entrySet()) {
+            String fName = "testPDF_Version." + e.getKey() + ".pdf";
+            InputStream is = PDFParserTest.class.getResourceAsStream(
+                    "/test-documents/" + fName);
+            Metadata m = new Metadata();
+            ContentHandler h = new BodyContentHandler();
+            ParseContext c = new ParseContext();
+            p.parse(is, h, m, c);
+            is.close();
+            boolean foundDC = false;
+            String[] vals = m.getValues("dc:format");
+            for (String v : vals) {
+                if (v.equals(e.getValue())) {
+                    foundDC = true;
+                }
+            }
+            assertTrue("dc:format ::" + e.getValue(), foundDC);
+            String extensionVersionTruth = pdfExtensionVersions.get(e.getKey());
+            if (extensionVersionTruth != null) {
+                assertEquals("pdf:PDFExtensionVersion :: " + extensionVersionTruth,
+                        extensionVersionTruth,
+                        m.get("pdf:PDFExtensionVersion"));
+            }
+            assertEquals("pdf:PDFVersion", pdfVersions.get(e.getKey()),
+                    m.get("pdf:PDFVersion"));
+        }
+        //now test full 11.x
+        String fName = "testPDF_Version.11.x.PDFA-1b.pdf";
+        InputStream is = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/" + fName);
+        Metadata m = new Metadata();
+        ParseContext c = new ParseContext();
+        ContentHandler h = new BodyContentHandler();
+        p.parse(is, h, m, c);
+        is.close();
+        Set<String> versions = new HashSet<String>();
+        for (String fmt : m.getValues("dc:format")) {
+            versions.add(fmt);
+        }
+
+        for (String hit : new String[]{"application/pdf; version=1.7",
+                "application/pdf; version=\"A-1b\"",
+                "application/pdf; version=\"1.7 Adobe Extension Level 8\""
+        }) {
+            assertTrue(hit, versions.contains(hit));
+        }
+
+        assertEquals("pdfaid:conformance", m.get("pdfaid:conformance"), "B");
+        assertEquals("pdfaid:part", m.get("pdfaid:part"), "1");
+    }
+
+    /**
+     * Checks that testPDF_twoAuthors.pdf reports exactly two author values,
+     * "Sample Author 1" and "Sample Author 2", under each of the author keys.
+     */
+    @Test
+    public void testMultipleAuthors() throws Exception {
+        String fName = "testPDF_twoAuthors.pdf";
+        Parser p = new AutoDetectParser();
+        Metadata m = new Metadata();
+        ParseContext c = new ParseContext();
+        ContentHandler h = new BodyContentHandler();
+        //close the stream even when the parse throws
+        try (InputStream is = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/" + fName)) {
+            p.parse(is, h, m, c);
+        }
+
+        String[] keys = new String[]{
+                "dc:creator",
+                "meta:author",
+                "creator",
+                "Author"
+        };
+
+        for (String k : keys) {
+            String[] vals = m.getValues(k);
+            assertEquals("number of authors == 2 for key: " + k, 2, vals.length);
+            //order of the two values is not asserted, only membership
+            Set<String> set = new HashSet<String>();
+            set.add(vals[0]);
+            set.add(vals[1]);
+            assertTrue("Sample Author 1", set.contains("Sample Author 1"));
+            assertTrue("Sample Author 2", set.contains("Sample Author 2"));
+        }
+    }
+
+    /**
+     * STUB test for once TIKA-1295 is fixed. For now it only checks that
+     * dc:title of testPDFTripleLangTitle.pdf is "Hello World".
+     */
+    @Test
+    public void testMultipleTitles() throws Exception {
+        Parser p = new AutoDetectParser();
+        Metadata m = new Metadata();
+        ParseContext c = new ParseContext();
+        ContentHandler h = new BodyContentHandler();
+        //close the stream even when the parse throws
+        try (InputStream is = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDFTripleLangTitle.pdf")) {
+            p.parse(is, h, m, c);
+        }
+        //TODO: add other tests as part of TIKA-1295
+        //dc:title-fr-ca (or whatever we decide) should be "Bonjour World"
+        //dc:title-zh-ch is currently hosed...bug in PDFBox while injecting xmp?
+        //
+        assertEquals("Hello World", m.get("dc:title"));
+    }
+
+    /**
+     * Parses testPDF_childAttachments.pdf twice with inline-image extraction
+     * enabled: first without a DocumentSelector (expects 2 inline and 2
+     * attachment resources), then with {@link AvoidInlineSelector} registered
+     * (expects 0 inline and 2 attachment resources).
+     */
+    @Test
+    public void testInlineSelector() throws Exception {
+
+        PDFParserConfig config = new PDFParserConfig();
+        config.setExtractInlineImages(true);
+        config.setExtractUniqueInlineImagesOnly(false);
+
+        Parser defaultParser = new AutoDetectParser();
+
+        RecursiveParserWrapper p = new RecursiveParserWrapper(defaultParser,
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
+        ParseContext context = new ParseContext();
+        context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config);
+        context.set(org.apache.tika.parser.Parser.class, p);
+        String path = "/test-documents/testPDF_childAttachments.pdf";
+
+        //first pass: no selector; close the stream even when the parse throws
+        try (InputStream stream = TikaInputStream.get(this.getClass().getResource(path))) {
+            p.parse(stream, new BodyContentHandler(-1), new Metadata(), context);
+        }
+
+        List<Metadata> metadatas = p.getMetadata();
+        int inline = 0;
+        int attach = 0;
+        for (Metadata m : metadatas) {
+            String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
+            if (v != null) {
+                if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())) {
+                    inline++;
+                } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())) {
+                    attach++;
+                }
+            }
+        }
+        assertEquals(2, inline);
+        assertEquals(2, attach);
+
+        //clear the wrapper's collected metadata before the second pass
+        p.reset();
+
+        //now try turning off inline
+        context.set(org.apache.tika.extractor.DocumentSelector.class, new AvoidInlineSelector());
+        //this second stream was previously never closed
+        try (InputStream stream = TikaInputStream.get(this.getClass().getResource(path))) {
+            p.parse(stream, new BodyContentHandler(-1), new Metadata(), context);
+        }
+
+        inline = 0;
+        attach = 0;
+        metadatas = p.getMetadata();
+        for (Metadata m : metadatas) {
+            String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
+            if (v != null) {
+                if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())) {
+                    inline++;
+                } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())) {
+                    attach++;
+                }
+            }
+        }
+        assertEquals(0, inline);
+        assertEquals(2, attach);
+
+    }
+
+
+    /**
+     * Parses testPDF_childAttachments.pdf twice: first with no PDFParserConfig in
+     * the context (expects 0 inline and 2 attachment resources), then with a
+     * config that enables inline-image extraction (expects 2 inline and 2
+     * attachment resources).
+     */
+    @Test
+    public void testInlineConfig() throws Exception {
+
+        Parser defaultParser = new AutoDetectParser();
+        RecursiveParserWrapper p = new RecursiveParserWrapper(defaultParser,
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
+        ParseContext context = new ParseContext();
+        context.set(org.apache.tika.parser.Parser.class, p);
+        String path = "/test-documents/testPDF_childAttachments.pdf";
+
+        //first pass: default configuration; close the stream even when the parse throws
+        try (InputStream stream = TikaInputStream.get(this.getClass().getResource(path))) {
+            p.parse(stream, new BodyContentHandler(-1), new Metadata(), context);
+        }
+
+        List<Metadata> metadatas = p.getMetadata();
+        int inline = 0;
+        int attach = 0;
+        for (Metadata m : metadatas) {
+            String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
+            if (v != null) {
+                if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())) {
+                    inline++;
+                } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())) {
+                    attach++;
+                }
+            }
+        }
+        assertEquals(0, inline);
+        assertEquals(2, attach);
+
+        //clear the wrapper's collected metadata before the second pass
+        p.reset();
+
+        //now try turning ON inline image extraction
+        PDFParserConfig config = new PDFParserConfig();
+        config.setExtractInlineImages(true);
+        config.setExtractUniqueInlineImagesOnly(false);
+        context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config);
+
+        //this second stream was previously never closed
+        try (InputStream stream = TikaInputStream.get(this.getClass().getResource(path))) {
+            p.parse(stream, new BodyContentHandler(-1), new Metadata(), context);
+        }
+
+        inline = 0;
+        attach = 0;
+        metadatas = p.getMetadata();
+        for (Metadata m : metadatas) {
+            String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
+            if (v != null) {
+                if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())) {
+                    inline++;
+                } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())) {
+                    attach++;
+                }
+            }
+        }
+        assertEquals(2, inline);
+        assertEquals(2, attach);
+    }
+
+    /**
+     * TIKA-1376: the name of an embedded file must be extracted. Checks that
+     * testPDF_multiFormatEmbFiles.pdf yields five metadata entries and that the
+     * entry at index 1 is named "Test.txt".
+     */
+    @Test
+    public void testEmbeddedFileNameExtraction() throws Exception {
+        RecursiveParserWrapper p = new RecursiveParserWrapper(
+                new AutoDetectParser(),
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
+        Metadata m = new Metadata();
+        ParseContext c = new ParseContext();
+        c.set(org.apache.tika.parser.Parser.class, p);
+        ContentHandler h = new BodyContentHandler();
+        //close the stream even when the parse throws
+        try (InputStream is = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDF_multiFormatEmbFiles.pdf")) {
+            p.parse(is, h, m, c);
+        }
+        List<Metadata> metadatas = p.getMetadata();
+        assertEquals("metadata size", 5, metadatas.size());
+        Metadata firstAttachment = metadatas.get(1);
+        assertEquals("attachment file name", "Test.txt", firstAttachment.get(Metadata.RESOURCE_NAME_KEY));
+    }
+
+    /**
+     * TIKA-1374: checks the name and the extracted text of each of the four
+     * embedded files (Test, TestMac, TestDos, TestUnix) in
+     * testPDF_multiFormatEmbFiles.pdf.
+     */
+    @Test
+    public void testOSSpecificEmbeddedFileExtraction() throws Exception {
+        //TEXT handlers so that each embedded file's content ends up in TIKA_CONTENT
+        RecursiveParserWrapper p = new RecursiveParserWrapper(
+                new AutoDetectParser(),
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
+        Metadata m = new Metadata();
+        ParseContext c = new ParseContext();
+        c.set(org.apache.tika.parser.Parser.class, p);
+        ContentHandler h = new BodyContentHandler();
+        //close the stream even when the parse throws
+        try (InputStream is = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDF_multiFormatEmbFiles.pdf")) {
+            p.parse(is, h, m, c);
+        }
+        List<Metadata> metadatas = p.getMetadata();
+        assertEquals("metadata size", 5, metadatas.size());
+
+        assertEquals("file name", "Test.txt", metadatas.get(1).get(Metadata.RESOURCE_NAME_KEY));
+        assertContains("os specific", metadatas.get(1).get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertEquals("file name", "TestMac.txt", metadatas.get(2).get(Metadata.RESOURCE_NAME_KEY));
+        assertContains("mac embedded", metadatas.get(2).get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertEquals("file name", "TestDos.txt", metadatas.get(3).get(Metadata.RESOURCE_NAME_KEY));
+        assertContains("dos embedded", metadatas.get(3).get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertEquals("file name", "TestUnix.txt", metadatas.get(4).get(Metadata.RESOURCE_NAME_KEY));
+        assertContains("unix embedded", metadatas.get(4).get(RecursiveParserWrapper.TIKA_CONTENT));
+
+    }
+
+    /**
+     * TIKA-1427: checks the XHTML markup emitted for embedded resources: a div
+     * for a regular attachment, an img for an inline image, and a div for a
+     * document embedded inside an annotation.
+     */
+    @Test
+    public void testEmbeddedFileMarkup() throws Exception {
+        //extract every inline image, including repeated ones
+        PDFParserConfig pdfConfig = new PDFParserConfig();
+        pdfConfig.setExtractInlineImages(true);
+        pdfConfig.setExtractUniqueInlineImagesOnly(false);
+
+        Parser autoDetect = new AutoDetectParser();
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(Parser.class, autoDetect);
+        parseContext.set(PDFParserConfig.class, pdfConfig);
+
+        ContentHandler xmlHandler = new ToXMLContentHandler();
+        InputStream input = TikaInputStream.get(
+                this.getClass().getResource("/test-documents/testPDF_childAttachments.pdf"));
+        try {
+            autoDetect.parse(input, xmlHandler, new Metadata(), parseContext);
+        } finally {
+            IOUtils.closeQuietly(input);
+        }
+
+        String attachmentsXml = xmlHandler.toString();
+        //regular attachment
+        assertContains("<div class=\"embedded\" id=\"Unit10.doc\" />", attachmentsXml);
+        //inline image
+        assertContains("<img src=\"embedded:image1.tif\" alt=\"image1.tif\" />", attachmentsXml);
+
+        //doc embedded inside an annotation
+        String annotationXml = getXML("testPDFFileEmbInAnnotation.pdf").xml;
+        assertContains("<div class=\"embedded\" id=\"Excel.xlsx\" />", annotationXml);
+    }
+
+    //Access checker tests
+
+    @Test
+    public void testLegacyAccessChecking() throws Exception {
+        //test that default behavior doesn't throw AccessPermissionException
+        for (String file : new String[]{
+                "testPDF_no_extract_no_accessibility_owner_empty.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_empty.pdf",
+        }) {
+            String xml = getXML(file).xml;
+            assertContains("Hello World", xml);
+        }
+
+        //now try with the user password
+        PasswordProvider provider = new PasswordProvider() {
+            @Override
+            public String getPassword(Metadata metadata) {
+                return "user";
+            }
+        };
+
+        ParseContext context = new ParseContext();
+        context.set(PasswordProvider.class, provider);
+        Parser parser = new AutoDetectParser();
+
+        for (String path : new String[]{
+                "testPDF_no_extract_no_accessibility_owner_user.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_user.pdf",
+        }) {
+            InputStream stream = null;
+            try {
+                stream = TikaInputStream.get(this.getClass().getResource("/test-documents/" + path));
+                String text = getText(stream, parser, context);
+                assertContains("Hello World", text);
+            } finally {
+                IOUtils.closeQuietly(stream);
+            }
+        }
+    }
+
+    /**
+     * Exercises the AccessChecker on the "owner_empty" files with no password
+     * supplied. With AccessChecker(false) both files must raise
+     * AccessPermissionException; with AccessChecker(true) only the
+     * "no_accessibility" file still does, and the "yes_accessibility" file
+     * yields its text.
+     */
+    @Test
+    public void testAccessCheckingEmptyPassword() throws Exception {
+        final String noAccessibility =
+                "/test-documents/testPDF_no_extract_no_accessibility_owner_empty.pdf";
+        final String yesAccessibility =
+                "/test-documents/testPDF_no_extract_yes_accessibility_owner_empty.pdf";
+
+        PDFParserConfig pdfConfig = new PDFParserConfig();
+        //don't allow extraction, not even for accessibility
+        pdfConfig.setAccessChecker(new AccessChecker(false));
+
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(PDFParserConfig.class, pdfConfig);
+        Parser autoDetect = new AutoDetectParser();
+
+        //test exception for empty password
+        assertException(noAccessibility, autoDetect, parseContext, AccessPermissionException.class);
+        assertException(yesAccessibility, autoDetect, parseContext, AccessPermissionException.class);
+
+        //switch the checker flag to true: the no_accessibility file must still fail
+        pdfConfig.setAccessChecker(new AccessChecker(true));
+        assertException(noAccessibility, autoDetect, parseContext, AccessPermissionException.class);
+
+        //...while the yes_accessibility file can now be read
+        InputStream input = null;
+        try {
+            input = getResourceAsStream(yesAccessibility);
+            String extracted = getText(input, autoDetect, parseContext);
+            assertContains("Hello World", extracted);
+        } finally {
+            IOUtils.closeQuietly(input);
+        }
+    }
+
+    @Test
+    public void testAccessCheckingUserPassword() throws Exception {
+        ParseContext context = new ParseContext();
+
+        PDFParserConfig config = new PDFParserConfig();
+        //don't allow extraction, not even for accessibility
+        config.setAccessChecker(new AccessChecker(false));
+        PasswordProvider passwordProvider = new PasswordProvider() {
+            @Override
+            public String getPassword(Metadata metadata) {
+                return "user";
+            }
+        };
+
+        context.set(PasswordProvider.class, passwordProvider);
+        context.set(PDFParserConfig.class, config);
+
+        Parser parser = new AutoDetectParser();
+
+        //test bad passwords
+        for (String path : new String[]{
+                "testPDF_no_extract_no_accessibility_owner_empty.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_empty.pdf",
+        }) {
+            assertException("/test-documents/" + path, parser, context, EncryptedDocumentException.class);
+        }
+
+        //bad password is still a bad password
+        config.setAccessChecker(new AccessChecker(true));
+        for (String path : new String[]{
+                "testPDF_no_extract_no_accessibility_owner_empty.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_empty.pdf",
+        }) {
+            assertException("/test-documents/" + path, parser, context, EncryptedDocumentException.class);
+        }
+
+        //now test documents that require this "user" password
+        assertException("/test-documents/" + "testPDF_no_extract_no_accessibility_owner_user.pdf",
+                parser, context, AccessPermissionException.class);
+
+
+        InputStream is = null;
+        try {
+            is = getResourceAsStream("/test-documents/" + "testPDF_no_extract_yes_accessibility_owner_user.pdf");
+            assertContains("Hello World", getText(is, parser, context));
+        } finally {
+            IOUtils.closeQuietly(is);
+        }
+
+        config.setAccessChecker(new AccessChecker(false));
+        for (String path : new String[]{
+                "testPDF_no_extract_no_accessibility_owner_user.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_user.pdf",
+        }) {
+            assertException("/test-documents/" + path, parser, context, AccessPermissionException.class);
+        }
+    }
+
+    @Test
+    public void testAccessCheckingOwnerPassword() throws Exception {
+        ParseContext context = new ParseContext();
+
+        PDFParserConfig config = new PDFParserConfig();
+        //don't allow extraction, not even for accessibility
+        config.setAccessChecker(new AccessChecker(true));
+        PasswordProvider passwordProvider = new PasswordProvider() {
+            @Override
+            public String getPassword(Metadata metadata) {
+                return "owner";
+            }
+        };
+
+        context.set(PasswordProvider.class, passwordProvider);
+        context.set(PDFParserConfig.class, config);
+
+        Parser parser = new AutoDetectParser();
+        //with owner's password, text can be extracted, no matter the AccessibilityChecker's settings
+        for (String path : new String[]{
+                "testPDF_no_extract_no_accessibility_owner_user.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_user.pdf",
+                "testPDF_no_extract_no_accessibility_owner_empty.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_empty.pdf",
+        }) {
+
+            InputStream is = null;
+            try {
+                is = getResourceAsStream("/test-documents/" + "testPDF_no_extract_yes_accessibility_owner_user.pdf");
+                assertContains("Hello World", getText(is, parser, context));
+            } finally {
+                IOUtils.closeQuietly(is);
+            }
+        }
+
+        //really, with owner's password, all extraction is allowed
+        config.setAccessChecker(new AccessChecker(false));
+        for (String path : new String[]{
+                "testPDF_no_extract_no_accessibility_owner_user.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_user.pdf",
+                "testPDF_no_extract_no_accessibility_owner_empty.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_empty.pdf",
+        }) {
+
+            InputStream is = null;
+            try {
+                is = getResourceAsStream("/test-documents/" + "testPDF_no_extract_yes_accessibility_owner_user.pdf");
+                assertContains("Hello World", getText(is, parser, context));
+            } finally {
+                IOUtils.closeQuietly(is);
+            }
+        }
+    }
+
+    /**
+     * TIKA-1678: checks that the title read from
+     * testPDF_PDFEncodedStringInXMP.pdf is "Microsoft".
+     */
+    @Test
+    public void testPDFEncodedStringsInXMP() throws Exception {
+        XMLResult parsed = getXML("testPDF_PDFEncodedStringInXMP.pdf");
+        String title = parsed.metadata.get(TikaCoreProperties.TITLE);
+        assertEquals("Microsoft", title);
+    }
+
+    /**
+     * Asserts that extracting text from the given resource throws an exception
+     * whose class is exactly {@code expected}.
+     *
+     * @param path     resource path of the test document
+     * @param parser   parser to run
+     * @param context  parse context handed to the parser
+     * @param expected exact class the thrown exception must have
+     */
+    private void assertException(String path, Parser parser, ParseContext context, Class expected) {
+        InputStream is = getResourceAsStream(path);
+        boolean threw = true;
+        try {
+            getText(is, parser, context);
+            //only reached when no exception was thrown
+            threw = false;
+        } catch (Exception e) {
+            assertEquals("Not the right exception: " + path, expected, e.getClass());
+        } finally {
+            IOUtils.closeQuietly(is);
+        }
+        assertTrue(path + " should have thrown exception", threw);
+    }
+
+    /**
+     * Content handler that does nothing but count endDocument events; the event
+     * is not passed on to the decorated handler. If this proves useful
+     * elsewhere, move it to org.apache.tika in src/test.
+     */
+    private class EventCountingHandler extends ContentHandlerDecorator {
+        /** Number of endDocument events seen so far. */
+        private int endDocumentCount = 0;
+
+        @Override
+        public void endDocument() {
+            endDocumentCount += 1;
+        }
+
+        /** @return how many times endDocument has been called */
+        public int getEndDocument() {
+            return endDocumentCount;
+        }
+    }
+
+    /**
+     * Document selector that rejects resources whose embedded resource type is
+     * INLINE and accepts everything else, including resources with no type set.
+     */
+    private class AvoidInlineSelector implements DocumentSelector {
+
+        @Override
+        public boolean select(Metadata metadata) {
+            String inlineType = TikaCoreProperties.EmbeddedResourceType.INLINE.toString();
+            String resourceType = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
+            //equals on the constant is null-safe, so a missing type is accepted
+            return !inlineType.equals(resourceType);
+        }
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/pom.xml?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-module/pom.xml (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/pom.xml Wed Jan  6 03:50:50 2016
@@ -0,0 +1,143 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <!-- Build descriptor for the scientific parser module; groupId and version
+       are taken from the tika-parser-modules parent. -->
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-modules</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-scientific-module</artifactId>
+  <name>Apache Tika Scientific Module</name>
+  <url>http://tika.apache.org/</url>
+  
+  <properties>
+    <!-- Single version shared by every edu.ucar artifact declared below. -->
+    <netcdf-java.version>4.5.5</netcdf-java.version>
+  </properties>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+
+    <!-- Test classes of tika-core (test-jar), on the classpath for tests only. -->
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-exec</artifactId>
+      <version>1.3</version>
+    </dependency>
+    <!-- junit is excluded so it is not pulled in transitively via json-simple. -->
+    <dependency>
+      <groupId>com.googlecode.json-simple</groupId>
+      <artifactId>json-simple</artifactId>
+      <version>1.1.1</version>
+      <exclusions>
+        <exclusion>
+          <groupId>junit</groupId>
+          <artifactId>junit</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <!-- Apache SIS artifacts; the three entries are pinned to the same version
+         and should be bumped together. -->
+    <dependency>
+      <groupId>org.apache.sis.core</groupId>
+      <artifactId>sis-utility</artifactId>
+      <version>0.5</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.sis.storage</groupId>
+      <artifactId>sis-netcdf</artifactId>
+      <version>0.5</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.sis.core</groupId>
+      <artifactId>sis-metadata</artifactId>
+      <version>0.5</version>
+    </dependency>
+    <!-- edu.ucar dependencies -->
+    <dependency>
+      <groupId>edu.ucar</groupId>
+      <artifactId>netcdf4</artifactId>
+      <version>${netcdf-java.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>edu.ucar</groupId>
+      <artifactId>grib</artifactId>
+      <version>${netcdf-java.version}</version>
+    </dependency>
+    <!-- jcl-over-slf4j is excluded from the transitive dependencies of cdm. -->
+    <dependency>
+      <groupId>edu.ucar</groupId>
+      <artifactId>cdm</artifactId>
+      <version>${netcdf-java.version}</version>
+      <exclusions>
+        <exclusion>
+          <groupId>org.slf4j</groupId>
+          <artifactId>jcl-over-slf4j</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>edu.ucar</groupId>
+      <artifactId>httpservices</artifactId>
+      <version>${netcdf-java.version}</version>
+    </dependency>
+    <!-- Apache cTAKES -->
+    <!-- Scope "provided": available at compile time but not packaged with this
+         module, so the runtime environment has to supply it. -->
+    <dependency>
+      <groupId>org.apache.ctakes</groupId>
+      <artifactId>ctakes-core</artifactId>
+      <version>3.2.2</version>
+      <scope>provided</scope>
+    </dependency>
+    <!-- commons.io.version is not declared in this file; presumably it is
+         inherited from a parent POM (TODO confirm). -->
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <version>${commons.io.version}</version>
+    </dependency>
+    <!-- Upstream parser libraries -->
+    <dependency>
+      <groupId>net.sourceforge.jmatio</groupId>
+      <artifactId>jmatio</artifactId>
+      <version>1.0</version>
+    </dependency>
+    <!-- Apache Commons CSV -->
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-csv</artifactId>
+      <version>1.0</version>
+    </dependency>
+    <!-- Sibling Tika module, on the classpath for tests only. -->
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-text-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+  
+  <build>
+    <plugins>
+      <!-- No version or configuration is given here; presumably both are
+           inherited from the parent POM (TODO confirm). -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
+</project>
\ No newline at end of file

Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+
+/**
+ * Enumerates the properties that an {@link IdentifiedAnnotation} object can provide.
+ *
+ * <p>Each constant carries a string name, available through {@link #getName()}.</p>
+ */
+public enum CTAKESAnnotationProperty {
+    BEGIN("start"),
+    END("end"),
+    CONDITIONAL("conditional"),
+    CONFIDENCE("confidence"),
+    // NOTE: the constant name is misspelled ("TECNIQUE"). It is kept unchanged because
+    // renaming a public enum constant would break existing references and valueOf() lookups.
+    DISCOVERY_TECNIQUE("discoveryTechnique"),
+    GENERIC("generic"),
+    HISTORY_OF("historyOf"),
+    ID("id"),
+    ONTOLOGY_CONCEPT_ARR("ontologyConceptArr"),
+    POLARITY("polarity");
+
+    // Assigned once in the constructor and never modified afterwards.
+    private final String name;
+
+    /**
+     * @param name the string name associated with this property.
+     */
+    CTAKESAnnotationProperty(String name) {
+        this.name = name;
+    }
+
+    /**
+     * Returns the string name associated with this property.
+     * @return the string name associated with this property.
+     */
+    public String getName() {
+        return name;
+    }
+}
\ No newline at end of file

Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,336 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.Serializable;
+import java.util.Properties;
+
+import static org.apache.commons.io.output.NullOutputStream.NULL_OUTPUT_STREAM;
+
+/**
+ * Configuration for {@see CTAKESContentHandler}.
+ * 
+ * This class allows to enable cTAKES and set its parameters.
+ */
+public class CTAKESConfig implements Serializable {
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -1599741171775528923L;
+
+    // Path to XML descriptor for AnalysisEngine
+    private String aeDescriptorPath = "/ctakes-core/desc/analysis_engine/SentencesAndTokensAggregate.xml";
+
+    // UMLS username
+    private String UMLSUser = "";
+
+    // UMLS password
+    private String UMLSPass = "";
+
+    // Enables formatted output
+    private boolean prettyPrint = true; 
+
+    // Type of cTAKES (UIMA) serializer
+    private CTAKESSerializer serializerType = CTAKESSerializer.XMI;
+
+    // OutputStream object used for CAS serialization
+    private OutputStream stream = NULL_OUTPUT_STREAM;
+
+    // Enables CAS serialization
+    private boolean serialize = false;
+
+    // Enables text analysis using cTAKES
+    private boolean text = true;
+
+    // List of metadata to analyze using cTAKES
+    private String[] metadata = null;
+
+    // List of annotation properties to add to metadata in addition to text covered by an annotation
+    private CTAKESAnnotationProperty[] annotationProps = null;
+
+    // Character used to separate the annotation properties into metadata
+    private char separatorChar = ':';
+
+    /**
+     * Default constructor.
+     */
+    public CTAKESConfig() {
+        init(this.getClass().getResourceAsStream("CTAKESConfig.properties"));
+    }
+
+    /**
+     * Loads properties from InputStream and then tries to close InputStream.
+     * @param stream {@see InputStream} object used to read properties.
+     */
+    public CTAKESConfig(InputStream stream) {
+        init(stream);
+    }
+
+    private void init(InputStream stream) {
+        if (stream == null) {
+            return;
+        }
+        Properties props = new Properties();
+
+        try {
+            props.load(stream);
+        } catch (IOException e) {
+            // TODO warning
+        } finally {
+            if (stream != null) {
+                try {
+                    stream.close();
+                } catch (IOException ioe) {
+                    // TODO warning
+                }
+            }
+        }
+
+        setAeDescriptorPath(props.getProperty("aeDescriptorPath", getAeDescriptorPath()));
+        setUMLSUser(props.getProperty("UMLSUser", getUMLSUser()));
+        setUMLSPass(props.getProperty("UMLSPass", getUMLSPass()));
+        setText(Boolean.valueOf(props.getProperty("text", Boolean.toString(isText()))));
+        setMetadata(props.getProperty("metadata", getMetadataAsString()).split(","));
+        setAnnotationProps(props.getProperty("annotationProps", getAnnotationPropsAsString()).split(","));
+        setSeparatorChar(props.getProperty("separatorChar", Character.toString(getSeparatorChar())).charAt(0));
+    }
+
+    /**
+     * Returns the path to XML descriptor for AnalysisEngine.
+     * @return the path to XML descriptor for AnalysisEngine.
+     */
+    public String getAeDescriptorPath() {
+        return aeDescriptorPath;
+    }
+
+    /**
+     * Returns the UMLS username.
+     * @return the UMLS username.
+     */
+    public String getUMLSUser() {
+        return UMLSUser;
+    }
+
+    /**
+     * Returns the UMLS password.
+     * @return the UMLS password.
+     */
+    public String getUMLSPass() {
+        return UMLSPass;
+    }
+
+    /**
+     * Returns {@code true} if formatted output is enabled, {@code false} otherwise.
+     * @return {@code true} if formatted output is enabled, {@code false} otherwise.
+     */
+    public boolean isPrettyPrint() {
+        return prettyPrint;
+    }
+
+    /**
+     * Returns the type of cTAKES (UIMA) serializer used to write the CAS.
+     * @return the type of cTAKES serializer.
+     */
+    public CTAKESSerializer getSerializerType() {
+        return serializerType;
+    }
+
+    /**
+     * Returns an {@see OutputStream} object used write the CAS.
+     * @return {@see OutputStream} object used write the CAS.
+     */
+    public OutputStream getOutputStream() {
+        return stream;
+    }
+
+    /**
+     * Returns {@code true} if CAS serialization is enabled, {@code false} otherwise.
+     * @return {@code true} if CAS serialization output is enabled, {@code false} otherwise.
+     */
+    public boolean isSerialize() {
+        return serialize;
+    }
+
+    /**
+     * Returns {@code true} if content text analysis is enabled {@code false} otherwise.
+     * @return {@code true} if content text analysis is enabled {@code false} otherwise.
+     */
+    public boolean isText() {
+        return text;
+    }
+
+    /**
+     * Returns an array of metadata whose values will be analyzed using cTAKES.
+     * @return an array of metadata whose values will be analyzed using cTAKES.
+     */
+    public String[] getMetadata() {
+        return metadata;
+    }
+
+    /**
+     * Returns a string containing a comma-separated list of metadata whose values will be analyzed using cTAKES.
+     * @return a string containing a comma-separated list of metadata whose values will be analyzed using cTAKES.
+     */
+    public String getMetadataAsString() {
+        if (metadata == null) {
+            return "";
+        }
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < metadata.length; i++) {
+            sb.append(metadata[i]);
+            if (i < metadata.length-1) {
+                sb.append(",");
+            }
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Returns an array of {@see CTAKESAnnotationProperty}'s that will be included into cTAKES metadata.
+     * @return an array of {@see CTAKESAnnotationProperty}'s that will be included into cTAKES metadata.
+     */
+    public CTAKESAnnotationProperty[] getAnnotationProps() {
+        return annotationProps;
+    }
+
+    /**
+     * Returns a string containing a comma-separated list of {@see CTAKESAnnotationProperty} names that will be included into cTAKES metadata.
+     * @return
+     */
+    public String getAnnotationPropsAsString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("coveredText");
+        if (annotationProps != null) {
+            for (CTAKESAnnotationProperty property : annotationProps) {
+                sb.append(separatorChar);
+                sb.append(property.getName());
+            }
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Returns the separator character used for annotation properties.
+     * @return the separator character used for annotation properties.
+     */
+    public char getSeparatorChar() {
+        return separatorChar;
+    }
+
+    /**
+     * Sets the path to XML descriptor for AnalysisEngine.
+     * @param aeDescriptorPath the path to XML descriptor for AnalysisEngine.
+     */
+    public void setAeDescriptorPath(String aeDescriptorPath) {
+        this.aeDescriptorPath = aeDescriptorPath;
+    }
+
+    /**
+     * Sets the UMLS username.
+     * @param uMLSUser the UMLS username.
+     */
+    public void setUMLSUser(String uMLSUser) {
+        this.UMLSUser = uMLSUser;
+    }
+
+    /**
+     * Sets the UMLS password.
+     * @param uMLSPass the UMLS password.
+     */
+    public void setUMLSPass(String uMLSPass) {
+        this.UMLSPass = uMLSPass;
+    }
+
+    /**
+     * Enables the formatted output for serializer.
+     * @param prettyPrint {@true} to enable formatted output, {@code false} otherwise.
+     */
+    public void setPrettyPrint(boolean prettyPrint) {
+        this.prettyPrint = prettyPrint;
+    }
+
+    /**
+     * Sets the type of cTAKES (UIMA) serializer used to write CAS. 
+     * @param serializerType the type of cTAKES serializer.
+     */
+    public void setSerializerType(CTAKESSerializer serializerType) {
+        this.serializerType = serializerType;
+    }
+
+    /**
+     * Sets the {@see OutputStream} object used to write the CAS.
+     * @param stream the {@see OutputStream} object used to write the CAS.
+     */
+    public void setOutputStream(OutputStream stream) {
+        this.stream = stream;
+    }
+
+    /**
+     * Enables CAS serialization.
+     * @param serialize {@true} to enable CAS serialization, {@code false} otherwise.
+     */
+    public void setSerialize(boolean serialize) {
+        this.serialize = serialize;
+    }
+
+    /**
+     * Enables content text analysis using cTAKES.
+     * @param text {@true} to enable content text analysis, {@code false} otherwise.
+     */
+    public void setText(boolean text) {
+        this.text = text;
+    }
+
+    /**
+     * Sets the metadata whose values will be analyzed using cTAKES.
+     * @param metadata the metadata whose values will be analyzed using cTAKES.
+     */
+    public void setMetadata(String[] metadata) {
+        this.metadata = metadata;
+    }
+
+    /**
+     * Sets the {@see CTAKESAnnotationProperty}'s that will be included into cTAKES metadata.
+     * @param annotationProps the {@see CTAKESAnnotationProperty}'s that will be included into cTAKES metadata.
+     */
+    public void setAnnotationProps(CTAKESAnnotationProperty[] annotationProps) {
+        this.annotationProps = annotationProps;
+    }
+
+    /**
+     * ets the {@see CTAKESAnnotationProperty}'s that will be included into cTAKES metadata.
+     * @param annotationProps the {@see CTAKESAnnotationProperty}'s that will be included into cTAKES metadata.
+     */
+    public void setAnnotationProps(String[] annotationProps) {
+        CTAKESAnnotationProperty[] properties = new CTAKESAnnotationProperty[annotationProps.length];
+        for (int i = 0; i < annotationProps.length; i++) {
+            properties[i] = CTAKESAnnotationProperty.valueOf(annotationProps[i]);
+        }
+        setAnnotationProps(properties);
+    }
+
+    /**
+     * Sets the separator character used for annotation properties.
+     * @param separatorChar the separator character used for annotation properties.
+     */
+    public void setSeparatorChar(char separatorChar) {
+        this.separatorChar = separatorChar;
+    }
+}
\ No newline at end of file

Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import java.util.Collection;
+import java.util.Iterator;
+
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Class used to extract biomedical information while parsing.
+ *
+ * <p>
+ * This class relies on <a href="http://ctakes.apache.org/">Apache cTAKES</a>
+ * that is a natural language processing system for extraction of information
+ * from electronic medical record clinical free-text.
+ * </p>
+ *
+ * <p>
+ * Character data is buffered while the document is parsed; the analysis itself
+ * runs in {@link #endDocument()} and its results are added to the metadata
+ * object under keys starting with {@link #CTAKES_META_PREFIX}.
+ * </p>
+ */
+public class CTAKESContentHandler extends ContentHandlerDecorator {
+    // Prefix used for metadata including cTAKES annotations
+    // NOTE(review): public and non-final, so any code can reassign it; left as is
+    // because making it final could break existing writers - confirm none exist.
+    public static String CTAKES_META_PREFIX = "ctakes:";
+
+    // Configuration object for CTAKESContentHandler
+    private CTAKESConfig config = null;
+
+    // StringBuilder object used to build the clinical free-text for cTAKES
+    // NOTE(review): never cleared, so text accumulates if endDocument() runs more
+    // than once on the same handler - confirm whether reuse is intended.
+    private StringBuilder sb = null;
+
+    // Metadata object used for cTAKES annotations
+    private Metadata metadata = null;
+
+    // UIMA Analysis Engine
+    private AnalysisEngine ae = null;
+
+    // JCas object for working with the CAS (Common Analysis System)
+    private JCas jcas = null;
+
+    /**
+     * Creates a new {@link CTAKESContentHandler} for the given
+     * {@link ContentHandler} and Metadata objects.
+     *
+     * @param handler
+     *            the {@link ContentHandler} object to be decorated.
+     * @param metadata
+     *            the {@link Metadata} object that will be populated using
+     *            biomedical information extracted by cTAKES.
+     * @param config
+     *            the {@link CTAKESConfig} object used to configure the handler.
+     */
+    public CTAKESContentHandler(ContentHandler handler, Metadata metadata,
+            CTAKESConfig config) {
+        super(handler);
+        this.metadata = metadata;
+        this.config = config;
+        this.sb = new StringBuilder();
+    }
+
+    /**
+     * Creates a new {@link CTAKESContentHandler} for the given
+     * {@link ContentHandler} and Metadata objects, using a default
+     * {@link CTAKESConfig}.
+     *
+     * @param handler
+     *            the {@link ContentHandler} object to be decorated.
+     * @param metadata
+     *            the {@link Metadata} object that will be populated using
+     *            biomedical information extracted by cTAKES.
+     */
+    public CTAKESContentHandler(ContentHandler handler, Metadata metadata) {
+        this(handler, metadata, new CTAKESConfig());
+    }
+
+    /**
+     * Default constructor.  Decorates a {@link DefaultHandler} and collects
+     * the annotations into a new, empty {@link Metadata} object.
+     */
+    public CTAKESContentHandler() {
+        this(new DefaultHandler(), new Metadata());
+    }
+
+    /**
+     * Buffers the characters for the later analysis when text analysis is
+     * enabled in the configuration, then forwards the event.
+     */
+    @Override
+    public void characters(char[] ch, int start, int length)
+            throws SAXException {
+        if (config.isText()) {
+            sb.append(ch, start, length);
+        }
+        super.characters(ch, start, length);
+    }
+
+    /**
+     * Runs cTAKES over the configured metadata values followed by the buffered
+     * text, adds the resulting annotations to the metadata object and, when
+     * enabled, serializes the CAS.
+     *
+     * <p>NOTE(review): this override does not call {@code super.endDocument()},
+     * so the event is presumably not forwarded to the decorated handler -
+     * confirm that this is intended.</p>
+     *
+     * @throws SAXException if anything fails during the analysis; the original
+     *             exception is attached as the cause.
+     */
+    @Override
+    public void endDocument() throws SAXException {
+        try {
+            // create an Analysis Engine (kept for subsequent calls)
+            if (ae == null) {
+                ae = CTAKESUtils.getAnalysisEngine(config.getAeDescriptorPath(), config.getUMLSUser(), config.getUMLSPass());
+            }
+
+            // create a JCas, given an AE (kept for subsequent calls)
+            if (jcas == null) {
+                jcas = CTAKESUtils.getJCas(ae);
+            }
+
+            // get metadata to process: one line per metadata value
+            StringBuilder metaText = new StringBuilder();
+            String[] metadataToProcess = config.getMetadata();
+            if (metadataToProcess != null) {
+                for (String name : metadataToProcess) {
+                    for (String value : metadata.getValues(name)) {
+                        metaText.append(value);
+                        metaText.append(System.lineSeparator());
+                    }
+                }
+            }
+
+            // analyze text: metadata values first, then the buffered content
+            jcas.setDocumentText(metaText.toString() + sb.toString());
+            ae.process(jcas);
+
+            // add annotations to metadata; the "schema" entry describes the layout
+            // of the values added for each annotation below
+            metadata.add(CTAKES_META_PREFIX + "schema", config.getAnnotationPropsAsString());
+            CTAKESAnnotationProperty[] annotationProps = config.getAnnotationProps();
+            for (IdentifiedAnnotation annotation : JCasUtil.select(jcas, IdentifiedAnnotation.class)) {
+                StringBuilder annotationBuilder = new StringBuilder();
+                annotationBuilder.append(annotation.getCoveredText());
+                if (annotationProps != null) {
+                    for (CTAKESAnnotationProperty property : annotationProps) {
+                        annotationBuilder.append(config.getSeparatorChar());
+                        annotationBuilder.append(CTAKESUtils.getAnnotationProperty(annotation, property));
+                    }
+                }
+                metadata.add(CTAKES_META_PREFIX + annotation.getType().getShortName(), annotationBuilder.toString());
+            }
+
+            if (config.isSerialize()) {
+                // serialize data
+                CTAKESUtils.serialize(jcas, config.getSerializerType(), config.isPrettyPrint(), config.getOutputStream());
+            }
+        } catch (Exception e) {
+            // Keep the original exception as the cause so its stack trace is not lost.
+            throw new SAXException(e.getMessage(), e);
+        } finally {
+            // jcas is still null when creating the engine or the JCas failed; skip
+            // the reset then so that it is never handed a null reference.
+            if (jcas != null) {
+                CTAKESUtils.resetCAS(jcas);
+            }
+        }
+    }
+
+    /**
+     * Returns metadata that includes cTAKES annotations.
+     *
+     * @return {@link Metadata} object that includes cTAKES annotations.
+     */
+    public Metadata getMetadata() {
+        return metadata;
+    }
+}