You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/03/22 02:19:17 UTC

[05/13] tika git commit: TIKA-1855 -- first pass. Need to turn back on the forbidden-apis testCheck. More clean up remains.

http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
deleted file mode 100644
index c3d13b7..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ /dev/null
@@ -1,1047 +0,0 @@
-/*
-* Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.mime;
-
-// Junit imports
-import static java.nio.charset.StandardCharsets.UTF_16BE;
-import static java.nio.charset.StandardCharsets.UTF_16LE;
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNotSame;
-
-import java.io.ByteArrayInputStream;
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.net.URL;
-
-import org.apache.tika.Tika;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.metadata.Metadata;
-import org.junit.Before;
-import org.junit.Test;
-
-/**
- * 
- * Test Suite for the {@link MimeTypes} repository.
- * 
- */
-public class TestMimeTypes {
-
-    private Tika tika;
-
-    private MimeTypes repo;
-
-    private URL u;
-
-    private static final File f = new File("/a/b/c/x.pdf");
-
-    @Before
-    public void setUp() throws Exception{
-        TikaConfig config = TikaConfig.getDefaultConfig();
-        repo = config.getMimeRepository();
-        tika = new Tika(config);
-        u = new URL("http://mydomain.com/x.pdf?x=y");
-    }
-
-    /**
-     * Detection by file name must yield the same type regardless of the
-     * letter case used in the extension.
-     */
-    @Test
-    public void testCaseSensitivity() {
-        final String upperCaseType = tika.detect("test.PDF");
-        assertNotNull(upperCaseType);
-        // Every other spelling of the extension has to agree with the upper-case one
-        for (String variant : new String[] {"test.pdf", "test.PdF", "test.pdF"}) {
-            assertEquals(upperCaseType, tika.detect(variant));
-        }
-    }
-
-    @Test
-    public void testLoadMimeTypes() throws MimeTypeException {
-        assertNotNull(repo.forName("application/octet-stream"));
-        assertNotNull(repo.forName("text/x-tex"));
-    }
-
-    /**
-     * Tests MIME type determination based solely on the URL's extension.
-     */
-    @Test
-    public void testGuessMimeTypes() throws Exception {
-        assertTypeByName("application/pdf", "x.pdf");
-        assertEquals("application/pdf", tika.detect(u.toExternalForm()));
-        assertEquals("application/pdf", tika.detect(f.getPath()));
-        assertTypeByName("text/plain", "x.txt");
-        assertTypeByName("text/html", "x.htm");
-        assertTypeByName("text/html", "x.html");
-        assertTypeByName("application/xhtml+xml", "x.xhtml");
-        assertTypeByName("application/xml", "x.xml");
-        assertTypeByName("application/zip", "x.zip");
-        assertTypeByName("application/vnd.oasis.opendocument.text", "x.odt");
-        assertTypeByName("application/octet-stream", "x.unknown");
-
-        // Test for the MS Office media types and file extensions listed in
-        // http://blogs.msdn.com/vsofficedeveloper/pages/Office-2007-Open-XML-MIME-Types.aspx
-        assertTypeByName("application/msword", "x.doc");
-        assertTypeByName("application/msword", "x.dot");
-        assertTypeByName("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "x.docx");
-        assertTypeByName("application/vnd.openxmlformats-officedocument.wordprocessingml.template", "x.dotx");
-        assertTypeByName("application/vnd.ms-word.document.macroenabled.12", "x.docm");
-        assertTypeByName("application/vnd.ms-word.template.macroenabled.12", "x.dotm");
-        assertTypeByName("application/vnd.ms-excel", "x.xls");
-        assertTypeByName("application/vnd.ms-excel", "x.xlt");
-        assertTypeByName("application/vnd.ms-excel", "x.xla");
-        assertTypeByName("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "x.xlsx");
-        assertTypeByName("application/vnd.openxmlformats-officedocument.spreadsheetml.template", "x.xltx");
-        assertTypeByName("application/vnd.ms-excel.sheet.macroenabled.12", "x.xlsm");
-        assertTypeByName("application/vnd.ms-excel.template.macroenabled.12", "x.xltm");
-        assertTypeByName("application/vnd.ms-excel.addin.macroenabled.12", "x.xlam");
-        assertTypeByName("application/vnd.ms-excel.sheet.binary.macroenabled.12", "x.xlsb");
-        assertTypeByName("application/vnd.ms-powerpoint", "x.ppt");
-        assertTypeByName("application/vnd.ms-powerpoint", "x.pot");
-        assertTypeByName("application/vnd.ms-powerpoint", "x.pps");
-        assertTypeByName("application/vnd.ms-powerpoint", "x.ppa");
-        assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.presentation", "x.pptx");
-        assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.template", "x.potx");
-        assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.slideshow", "x.ppsx");
-        assertTypeByName("application/vnd.ms-powerpoint.addin.macroenabled.12", "x.ppam");
-        assertTypeByName("application/vnd.ms-powerpoint.presentation.macroenabled.12", "x.pptm");
-        assertTypeByName("application/vnd.ms-powerpoint.template.macroenabled.12", "x.potm");
-        assertTypeByName("application/vnd.ms-powerpoint.slideshow.macroenabled.12", "x.ppsm");
-    }
-
-    /**
-     * Note - detecting container formats by mime magic is very very
-     *  iffy, as we can't be sure where things will end up.
-     * People really ought to use the container aware detection...
-     */
-    @Test
-    public void testOLE2Detection() throws Exception {
-        // This one has the properties block near the start, so our mime
-        //  magic will spot it
-        assertTypeByData("application/vnd.ms-excel", "testEXCEL.xls");
-        
-        // These quite legitimately don't have their properties block
-        //  as one of the first couple of entries
-        // As such, our mime magic can't figure them out...
-        assertTypeByData("application/x-tika-msoffice", "testWORD.doc");
-        assertTypeByData("application/x-tika-msoffice", "testPPT.ppt");
-        
-        
-        // By name + data:
-        
-        // Those we got right to start with are fine
-        assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL.xls");
-        
-        // And the name lets us specialise the generic OLE2
-        //  ones to their actual type
-        assertTypeByNameAndData("application/vnd.ms-powerpoint", "testPPT.ppt");
-        assertTypeByNameAndData("application/msword", "testWORD.doc");
-    }
-    
-    /**
-     * Files generated by Works 7.0 Spreadsheet application use the OLE2
-     * structure and resemble Excel files (they contain a "Workbook"). They are
-     * not Excel though. They are distinguished from Excel files by an
-     * additional top-level entry below the root of the POI filesystem.
-     * 
-     * @throws Exception if the detection helper throws
-     */
-    @Test
-    public void testWorksSpreadsheetDetection() throws Exception {
-        assertTypeDetection("testWORKSSpreadsheet7.0.xlr",
-                // with name-only, everything should be all right 
-                "application/x-tika-msworks-spreadsheet",
-                // this is possible due to MimeTypes guessing the type
-                // based on the WksSSWorkBook near the beginning of the
-                // file
-                "application/x-tika-msworks-spreadsheet",
-                // this is right, the magic-based detection works, there is
-                // no need for the name-based detection to refine it
-                "application/x-tika-msworks-spreadsheet");
-    }
-    
-    @Test
-    public void testStarOfficeDetection() throws Exception {
-        assertTypeDetection("testVORCalcTemplate.vor",
-                "application/x-staroffice-template",
-                "application/vnd.stardivision.calc",
-                "application/vnd.stardivision.calc");
-        assertTypeDetection("testVORDrawTemplate.vor",
-                "application/x-staroffice-template",
-                "application/vnd.stardivision.draw",
-                "application/vnd.stardivision.draw");
-        assertTypeDetection("testVORImpressTemplate.vor",
-                "application/x-staroffice-template",
-                "application/vnd.stardivision.impress",
-                "application/vnd.stardivision.impress");
-        assertTypeDetection("testVORWriterTemplate.vor",
-                "application/x-staroffice-template",
-                "application/vnd.stardivision.writer",
-                "application/vnd.stardivision.writer");
-        
-        assertTypeDetection("testStarOffice-5.2-calc.sdc",
-                "application/vnd.stardivision.calc",
-                "application/vnd.stardivision.calc",
-                "application/vnd.stardivision.calc");
-        assertTypeDetection("testStarOffice-5.2-draw.sda",
-                "application/vnd.stardivision.draw",
-                "application/vnd.stardivision.draw",
-                "application/vnd.stardivision.draw");
-        assertTypeDetection("testStarOffice-5.2-impress.sdd",
-                "application/vnd.stardivision.impress",
-                "application/vnd.stardivision.impress",
-                "application/vnd.stardivision.impress");
-        assertTypeDetection("testStarOffice-5.2-writer.sdw",
-                "application/vnd.stardivision.writer",
-                "application/vnd.stardivision.writer",
-                "application/vnd.stardivision.writer");
-    }
-    
-    /**
-     * Files generated by Works Word Processor versions 3.0 and 4.0 use the
-     * OLE2 structure. They don't resemble Word though.
-     * 
-     * @throws Exception
-     */
-    @Test
-    public void testOldWorksWordProcessorDetection() throws Exception {
-        assertTypeDetection(
-                "testWORKSWordProcessor3.0.wps",
-                // .wps is just like any other works extension
-                "application/vnd.ms-works",
-                // this is due to MatOST substring
-                "application/vnd.ms-works",
-                // magic-based detection works, no need to refine it
-                "application/vnd.ms-works");
-        
-        // files in version 4.0 are no different from those in version 3.0
-        assertTypeDetection(
-                "testWORKSWordProcessor4.0.wps",
-                "application/vnd.ms-works",
-                "application/vnd.ms-works",
-                "application/vnd.ms-works");
-    }
-    
-    /**
-     * Files from Excel 2 through 4 are based on the BIFF record
-     *  structure, but without a wrapping OLE2 structure.
-     * Excel 5 and Excel 95+ work on OLE2
-     */
-    @Test
-    public void testOldExcel() throws Exception {
-        // With just a name, we'll think everything's a new Excel file
-        assertTypeByName("application/vnd.ms-excel","testEXCEL_4.xls");
-        assertTypeByName("application/vnd.ms-excel","testEXCEL_5.xls");
-        assertTypeByName("application/vnd.ms-excel","testEXCEL_95.xls");
-        
-        // With data, we can work out if it's old or new style
-        assertTypeByData("application/vnd.ms-excel.sheet.4","testEXCEL_4.xls");
-        assertTypeByData("application/x-tika-msoffice","testEXCEL_5.xls");
-        assertTypeByData("application/x-tika-msoffice","testEXCEL_95.xls");
-        
-        assertTypeByNameAndData("application/vnd.ms-excel.sheet.4","testEXCEL_4.xls");
-        assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL_5.xls");
-        assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL_95.xls");
-    }
-    
-    /**
-     * Note - detecting container formats by mime magic is very very
-     *  iffy, as we can't be sure where things will end up.
-     * People really ought to use the container aware detection...
-     */
-    @Test
-    public void testOoxmlDetection() throws Exception {
-        // These two do luckily have [Content_Types].xml near the start,
-        //  so our mime magic will spot them
-        assertTypeByData("application/x-tika-ooxml", "testEXCEL.xlsx");
-        assertTypeByData("application/x-tika-ooxml", "testPPT.pptx");
-        
-        // This one quite legitimately doesn't have its [Content_Types].xml
-        //  file as one of the first couple of entries
-        // As such, our mime magic can't figure it out...
-        assertTypeByData("application/zip", "testWORD.docx");
-        
-        // If we give the filename as well as the data, we can
-        //  specialise the ooxml generic one to the correct type
-        assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "testEXCEL.xlsx");
-        assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.presentationml.presentation", "testPPT.pptx");
-        assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "testWORD.docx");
-        
-        // Test a few of the less usual ones
-        assertTypeByNameAndData("application/vnd.ms-excel.sheet.binary.macroenabled.12","testEXCEL.xlsb");
-        assertTypeByNameAndData("application/vnd.ms-powerpoint.presentation.macroenabled.12", "testPPT.pptm");
-        assertTypeByNameAndData("application/vnd.ms-powerpoint.template.macroenabled.12", "testPPT.potm");
-        assertTypeByNameAndData("application/vnd.ms-powerpoint.slideshow.macroenabled.12", "testPPT.ppsm");
-    }
-    
-    /**
-     * Note - container based formats, needs container detection
-     *  to be properly correct
-     */
-    @Test
-    public void testVisioDetection() throws Exception {
-        // By Name, should get it right
-        assertTypeByName("application/vnd.visio", "testVISIO.vsd");
-        assertTypeByName("application/vnd.ms-visio.drawing.macroenabled.12", "testVISIO.vsdm");
-        assertTypeByName("application/vnd.ms-visio.drawing", "testVISIO.vsdx");
-        assertTypeByName("application/vnd.ms-visio.stencil.macroenabled.12", "testVISIO.vssm");
-        assertTypeByName("application/vnd.ms-visio.stencil", "testVISIO.vssx");
-        assertTypeByName("application/vnd.ms-visio.template.macroenabled.12", "testVISIO.vstm");
-        assertTypeByName("application/vnd.ms-visio.template", "testVISIO.vstx");
-        
-        // By Name and Data, should get it right
-        assertTypeByNameAndData("application/vnd.visio", "testVISIO.vsd");
-        assertTypeByNameAndData("application/vnd.ms-visio.drawing.macroenabled.12", "testVISIO.vsdm");
-        assertTypeByNameAndData("application/vnd.ms-visio.drawing", "testVISIO.vsdx");
-        assertTypeByNameAndData("application/vnd.ms-visio.stencil.macroenabled.12", "testVISIO.vssm");
-        assertTypeByNameAndData("application/vnd.ms-visio.stencil", "testVISIO.vssx");
-        assertTypeByNameAndData("application/vnd.ms-visio.template.macroenabled.12", "testVISIO.vstm");
-        assertTypeByNameAndData("application/vnd.ms-visio.template", "testVISIO.vstx");
-        
-        // By Data only, will get the container parent
-        assertTypeByData("application/x-tika-msoffice", "testVISIO.vsd");
-        assertTypeByData("application/x-tika-ooxml", "testVISIO.vsdm");
-        assertTypeByData("application/x-tika-ooxml", "testVISIO.vsdx");
-        assertTypeByData("application/x-tika-ooxml", "testVISIO.vssm");
-        assertTypeByData("application/x-tika-ooxml", "testVISIO.vssx");
-        assertTypeByData("application/x-tika-ooxml", "testVISIO.vstm");
-        assertTypeByData("application/x-tika-ooxml", "testVISIO.vstx");
-    }
-
-    /**
-     * Note - detecting container formats by mime magic is very very
-     *  iffy, as we can't be sure where things will end up.
-     * People really ought to use the container aware detection...
-     */
-    @Test
-    public void testIWorkDetection() throws Exception {
-        // By name is easy
-       assertTypeByName("application/vnd.apple.keynote", "testKeynote.key");
-       assertTypeByName("application/vnd.apple.numbers", "testNumbers.numbers");
-       assertTypeByName("application/vnd.apple.pages", "testPages.pages");
-       
-       // We can't do it by data, as we'd need to unpack
-       //  the zip file to check the XML 
-       assertTypeByData("application/zip", "testKeynote.key");
-       
-       assertTypeByNameAndData("application/vnd.apple.keynote", "testKeynote.key");
-       assertTypeByNameAndData("application/vnd.apple.numbers", "testNumbers.numbers");
-       assertTypeByNameAndData("application/vnd.apple.pages", "testPages.pages");
-    }
-    
-    /**
-     * Checks archive formats first by file name alone, then by the
-     * content of the sample files (mime magic).
-     */
-    @Test
-    public void testArchiveDetection() throws Exception {
-       assertTypeByName("application/x-archive", "test.ar");
-       assertTypeByName("application/zip",    "test.zip");
-       assertTypeByName("application/x-tar",  "test.tar");
-       assertTypeByName("application/gzip", "test.tgz"); // Reported as GZIP, not as the tar it wraps
-       assertTypeByName("application/x-cpio", "test.cpio");
-       
-       // TODO Add an example .deb and .udeb, then check these
-       
-       // Check the mime magic patterns for them work too
-       assertTypeByData("application/x-archive", "testARofText.ar");
-       assertTypeByData("application/x-archive", "testARofSND.ar"); 
-       assertTypeByData("application/zip",    "test-documents.zip");
-       assertTypeByData("application/x-gtar",  "test-documents.tar"); // GNU TAR
-       assertTypeByData("application/gzip", "test-documents.tgz"); // Reported as GZIP, not as the tar it wraps
-       assertTypeByData("application/x-cpio", "test-documents.cpio");
-       
-       // For spanned zip files, the .zip file doesn't have the header, it's the other parts
-       assertTypeByData("application/octet-stream", "test-documents-spanned.zip");
-       assertTypeByData("application/zip",          "test-documents-spanned.z01");
-    }
-    
-    @Test
-    public void testFeedsDetection() throws Exception {
-        assertType("application/rss+xml",  "rsstest.rss");
-        assertType("application/atom+xml", "testATOM.atom");
-        assertTypeByData("application/rss+xml",  "rsstest.rss");
-        assertTypeByName("application/rss+xml",  "rsstest.rss");
-        assertTypeByData("application/atom+xml", "testATOM.atom");
-        assertTypeByName("application/atom+xml", "testATOM.atom");
-    }
-    
-    @Test
-    public void testFitsDetection() throws Exception {
-        // FITS image created using imagemagick convert of testJPEG.jpg
-        assertType("application/fits", "testFITS.fits");
-        assertTypeByData("application/fits", "testFITS.fits");
-        assertTypeByName("application/fits", "testFITS.fits");
-    }
-
-    @Test
-    public void testJpegDetection() throws Exception {
-        assertType("image/jpeg", "testJPEG.jpg");
-        assertTypeByData("image/jpeg", "testJPEG.jpg");
-        assertTypeByName("image/jpeg", "x.jpg");
-        assertTypeByName("image/jpeg", "x.JPG");
-        assertTypeByName("image/jpeg", "x.jpeg");
-        assertTypeByName("image/jpeg", "x.JPEG");
-        assertTypeByName("image/jpeg", "x.jpe");
-        assertTypeByName("image/jpeg", "x.jif");
-        assertTypeByName("image/jpeg", "x.jfif");
-        assertTypeByName("image/jpeg", "x.jfi");
-        
-        assertType("image/jp2", "testJPEG.jp2");
-        assertTypeByData("image/jp2", "testJPEG.jp2");
-        assertTypeByName("image/jp2", "x.jp2");
-    }
-
-    @Test
-    public void testBpgDetection() throws Exception {
-        assertType("image/x-bpg", "testBPG.bpg");
-        assertTypeByData("image/x-bpg", "testBPG.bpg");
-        assertTypeByData("image/x-bpg", "testBPG_commented.bpg");
-        assertTypeByName("image/x-bpg", "x.bpg");
-    }
-    
-    @Test
-    public void testTiffDetection() throws Exception {
-        assertType("image/tiff", "testTIFF.tif");
-        assertTypeByData("image/tiff", "testTIFF.tif");
-        assertTypeByName("image/tiff", "x.tiff");
-        assertTypeByName("image/tiff", "x.tif");
-        assertTypeByName("image/tiff", "x.TIF");
-    }
-
-    @Test
-    public void testGifDetection() throws Exception {
-        assertType("image/gif", "testGIF.gif");
-        assertTypeByData("image/gif", "testGIF.gif");
-        assertTypeByName("image/gif", "x.gif");
-        assertTypeByName("image/gif", "x.GIF");
-    }
-
-    @Test
-    public void testPngDetection() throws Exception {
-        assertType("image/png", "testPNG.png");
-        assertTypeByData("image/png", "testPNG.png");
-        assertTypeByName("image/png", "x.png");
-        assertTypeByName("image/png", "x.PNG");
-    }
-
-    @Test
-    public void testWEBPDetection() throws Exception {
-        assertType("image/webp", "testWEBP.webp");
-        assertTypeByData("image/webp", "testWEBP.webp");
-        assertTypeByName("image/webp", "x.webp");
-        assertTypeByName("image/webp", "x.WEBP");
-    }
-
-    @Test
-    public void testBmpDetection() throws Exception {
-        assertType("image/x-ms-bmp", "testBMP.bmp");
-        assertTypeByData("image/x-ms-bmp", "testBMP.bmp");
-        assertTypeByName("image/x-ms-bmp", "x.bmp");
-        assertTypeByName("image/x-ms-bmp", "x.BMP");
-        assertTypeByName("image/x-ms-bmp", "x.dib");
-        assertTypeByName("image/x-ms-bmp", "x.DIB");
-        //false positive check -- contains part of BMP signature
-        assertType("text/plain", "testBMPfp.txt");
-    }
-
-    @Test
-    public void testPnmDetection() throws Exception {
-        assertType("image/x-portable-bitmap", "testPBM.pbm");
-        assertType("image/x-portable-graymap", "testPGM.pgm");
-        assertType("image/x-portable-pixmap", "testPPM.ppm");
-        assertTypeByData("image/x-portable-bitmap", "testPBM.pbm");
-        assertTypeByData("image/x-portable-graymap", "testPGM.pgm");
-        assertTypeByData("image/x-portable-pixmap", "testPPM.ppm");
-        assertTypeByName("image/x-portable-anymap", "x.pnm");
-        assertTypeByName("image/x-portable-anymap", "x.PNM");
-        assertTypeByName("image/x-portable-bitmap", "x.pbm");
-        assertTypeByName("image/x-portable-bitmap", "x.PBM");
-        assertTypeByName("image/x-portable-graymap", "x.pgm");
-        assertTypeByName("image/x-portable-graymap", "x.PGM");
-        assertTypeByName("image/x-portable-pixmap", "x.ppm");
-        assertTypeByName("image/x-portable-pixmap", "x.PPM");
-    }
-
-    @Test
-    public void testPictDetection() throws Exception {
-        assertType("image/x-pict", "testPICT.pct");
-        assertTypeByData("image/x-pict", "testPICT.pct");
-        assertTypeByName("image/x-pict", "x.pic");
-        assertTypeByName("image/x-pict", "x.PCT");
-    }
-
-    @Test
-    public void testCgmDetection() throws Exception {
-        // TODO: Need a test image file
-        assertTypeByName("image/cgm", "x.cgm");
-        assertTypeByName("image/cgm", "x.CGM");
-    }
-
-    @Test
-    public void testRdfXmlDetection() throws Exception {
-        assertTypeByName("application/rdf+xml", "x.rdf");
-        assertTypeByName("application/rdf+xml", "x.owl");
-    }
-
-    /**
-     * Covers SVG detection, including the .svgz extension, where the
-     * expected type differs between the name-only check and the checks
-     * that are given the sample file.
-     */
-    @Test
-    public void testSvgDetection() throws Exception {
-        assertType("image/svg+xml", "testSVG.svg");
-        assertTypeByData("image/svg+xml", "testSVG.svg");
-        assertTypeByName("image/svg+xml", "x.svg");
-        assertTypeByName("image/svg+xml", "x.SVG");
-
-        // Open question: should *.svgz be svg or gzip? As pinned here, assertType
-        //  and assertTypeByData expect gzip, while assertTypeByName expects svg
-        assertType("application/gzip", "testSVG.svgz");
-        assertTypeByData("application/gzip", "testSVG.svgz");
-        assertTypeByName("image/svg+xml", "x.svgz");
-        assertTypeByName("image/svg+xml", "x.SVGZ");
-    }
-
-    @Test
-    public void testPdfDetection() throws Exception {
-        // PDF extension by name is enough
-        assertTypeByName("application/pdf", "x.pdf");
-        assertTypeByName("application/pdf", "x.PDF");
-
-        // For normal PDFs, can get by name or data or both
-        assertType("application/pdf", "testPDF.pdf");
-        assertTypeByData("application/pdf", "testPDF.pdf");
-
-        // PDF with a BoM works both ways too
-        assertType("application/pdf", "testPDF_bom.pdf");
-        assertTypeByData("application/pdf", "testPDF_bom.pdf");
-    }
-
-    @Test
-    public void testSwfDetection() throws Exception {
-        assertTypeByName("application/x-shockwave-flash", "x.swf");
-        assertTypeByName("application/x-shockwave-flash", "x.SWF");
-        assertTypeByName("application/x-shockwave-flash", "test1.swf");
-        assertTypeByName("application/x-shockwave-flash", "test2.swf");
-        assertTypeByName("application/x-shockwave-flash", "test3.swf");
-    }
-
-    @Test
-    public void testDwgDetection() throws Exception {
-        assertTypeByName("image/vnd.dwg", "x.dwg");
-        assertTypeByData("image/vnd.dwg", "testDWG2004.dwg");
-        assertTypeByData("image/vnd.dwg", "testDWG2007.dwg");
-        assertTypeByData("image/vnd.dwg", "testDWG2010.dwg");
-    }
-
-    @Test
-    public void testprtDetection() throws Exception {
-       assertTypeByName("application/x-prt", "x.prt");
-       assertTypeByData("application/x-prt", "testCADKEY.prt");
-   }
-    
-    /**
-     * Formats which are based on plain text
-     */
-    @Test
-    public void testTextBasedFormatsDetection() throws Exception {
-       assertTypeByName("text/plain", "testTXT.txt");
-       assertType(      "text/plain", "testTXT.txt");
-       
-       assertTypeByName("text/css", "testCSS.css");
-       assertType(      "text/css", "testCSS.css");
-       
-       assertTypeByName("text/csv", "testCSV.csv");
-       assertType(      "text/csv", "testCSV.csv");
-       
-       assertTypeByName("text/html", "testHTML.html");
-       assertType(      "text/html", "testHTML.html");
-       
-       assertTypeByName("application/javascript", "testJS.js");
-       assertType(      "application/javascript", "testJS.js");
-    }
-    
-    @Test
-    public void testJavaDetection() throws Exception {
-        // TODO Classloader doesn't seem to find the .class file in test-documents
-        //assertTypeDetection("AutoDetectParser.class", "application/java-vm");
-        
-        // OSX Native Extension
-        assertTypeDetection("testJNILIB.jnilib", "application/x-java-jnilib");
-    }
-
-    @Test
-    public void testXmlAndHtmlDetection() throws Exception {
-        assertTypeByData("application/xml", "<?xml version=\"1.0\" encoding=\"UTF-8\"?><records><record/></records>"
-                .getBytes(UTF_8));
-        assertTypeByData("application/xml", "\uFEFF<?xml version=\"1.0\" encoding=\"UTF-16\"?><records><record/></records>"
-                .getBytes(UTF_16LE));
-        assertTypeByData("application/xml", "\uFEFF<?xml version=\"1.0\" encoding=\"UTF-16\"?><records><record/></records>"
-                .getBytes(UTF_16BE));
-        assertTypeByData("application/xml", "<!-- XML without processing instructions --><records><record/></records>"
-                .getBytes(UTF_8));
-        assertTypeByData("text/html", "<html><body>HTML</body></html>"
-                .getBytes(UTF_8));
-        assertTypeByData("text/html", "<!-- HTML comment --><html><body>HTML</body></html>"
-                .getBytes(UTF_8));
-    }
-
-    @Test
-    public void testWmfDetection() throws Exception {
-        assertTypeByName("application/x-msmetafile", "x.wmf");
-        assertTypeByData("application/x-msmetafile", "testWMF.wmf");
-        assertTypeByName("application/x-msmetafile", "x.WMF");
-
-        assertTypeByName("application/x-emf", "x.emf");
-        assertTypeByData("application/x-emf","testEMF.emf");
-        assertTypeByName("application/x-emf", "x.EMF");
-        // TODO: Need a test wmz file
-        assertTypeByName("application/x-ms-wmz", "x.wmz");
-        assertTypeByName("application/x-ms-wmz", "x.WMZ");
-        // TODO: Need a test emz file
-        assertTypeByName("application/gzip", "x.emz");
-        assertTypeByName("application/gzip", "x.EMZ");
-    }
-
-    @Test
-    public void testPsDetection() throws Exception {
-        // TODO: Need a test postscript file
-        assertTypeByName("application/postscript", "x.ps");
-        assertTypeByName("application/postscript", "x.PS");
-        assertTypeByName("application/postscript", "x.eps");
-        assertTypeByName("application/postscript", "x.epsf");
-        assertTypeByName("application/postscript", "x.epsi");
-    }
-    
-    @Test
-    public void testMicrosoftMultiMediaDetection() throws Exception {
-       assertTypeByName("video/x-ms-asf", "x.asf");
-       assertTypeByName("video/x-ms-wmv", "x.wmv");
-       assertTypeByName("audio/x-ms-wma", "x.wma");
-       
-       assertTypeByData("video/x-ms-asf", "testASF.asf");
-       assertTypeByData("video/x-ms-wmv", "testWMV.wmv");
-       assertTypeByData("audio/x-ms-wma", "testWMA.wma");
-    }
-    
-    /**
-     * All 3 DITA types are in theory handled by the same mimetype,
-     *  but we specialise them 
-     */
-    @Test
-    public void testDITADetection() throws Exception {
-       assertTypeByName("application/dita+xml; format=topic", "test.dita");
-       assertTypeByName("application/dita+xml; format=map", "test.ditamap");
-       assertTypeByName("application/dita+xml; format=val", "test.ditaval");
-       
-       assertTypeByData("application/dita+xml; format=task", "testDITA.dita");
-       assertTypeByData("application/dita+xml; format=concept", "testDITA2.dita");
-       assertTypeByData("application/dita+xml; format=map", "testDITA.ditamap");
-       
-       assertTypeByNameAndData("application/dita+xml; format=task", "testDITA.dita");
-       assertTypeByNameAndData("application/dita+xml; format=concept", "testDITA2.dita");
-       assertTypeByNameAndData("application/dita+xml; format=map", "testDITA.ditamap");
-       
-       // These are all children of the official type
-       assertEquals("application/dita+xml", 
-             repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA.ditamap")).toString());
-       assertEquals("application/dita+xml", 
-             repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA.dita")).toString());
-       // Concept inherits from topic
-       assertEquals("application/dita+xml; format=topic", 
-             repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA2.dita")).toString());
-    }
-
-    /**
-     * A name pattern registered with the flag {@code true} must drive
-     * detection of a matching file name, while the same pattern text
-     * registered for another type with {@code false} must not make that
-     * other type the detected one.
-     * NOTE(review): the flag presumably means "treat the pattern as a Java
-     * regex" -- confirm against {@code MimeTypes.addPattern}.
-     *
-     * @since TIKA-194
-     */
-    @Test
-    public void testJavaRegex() throws Exception{
-        MimeType testType = new MimeType(MediaType.parse("foo/bar"));
-        this.repo.add(testType);
-        assertNotNull(repo.forName("foo/bar"));
-        String pattern = "rtg_sst_grb_0\\.5\\.\\d{8}";
-        this.repo.addPattern(testType, pattern, true);
-        String testFileName = "rtg_sst_grb_0.5.12345678";
-        assertEquals("foo/bar", tika.detect(testFileName));
-
-        MimeType testType2 = new MimeType(MediaType.parse("foo/bar2"));
-        this.repo.add(testType2);
-        assertNotNull(repo.forName("foo/bar2"));
-        this.repo.addPattern(testType2, pattern, false);
-        // Compare by value: assertNotSame only checks reference identity, which a
-        // String literal and a freshly detected String practically always satisfy,
-        // so it could never catch a wrong detection result.
-        org.junit.Assert.assertNotEquals("foo/bar2", tika.detect(testFileName));
-    }
-    
-    @Test
-    public void testRawDetection() throws Exception {
-        assertTypeByName("image/x-raw-adobe", "x.dng");
-        assertTypeByName("image/x-raw-adobe", "x.DNG");
-        assertTypeByName("image/x-raw-hasselblad", "x.3fr");
-        assertTypeByName("image/x-raw-fuji", "x.raf");
-        assertTypeByName("image/x-raw-canon", "x.crw");
-        assertTypeByName("image/x-raw-canon", "x.cr2");
-        assertTypeByName("image/x-raw-kodak", "x.k25");
-        assertTypeByName("image/x-raw-kodak", "x.kdc");
-        assertTypeByName("image/x-raw-kodak", "x.dcs");
-        assertTypeByName("image/x-raw-kodak", "x.drf");
-        assertTypeByName("image/x-raw-minolta", "x.mrw");
-        assertTypeByName("image/x-raw-nikon", "x.nef");
-        assertTypeByName("image/x-raw-nikon", "x.nrw");
-        assertTypeByName("image/x-raw-olympus", "x.orf");
-        assertTypeByName("image/x-raw-pentax", "x.ptx");
-        assertTypeByName("image/x-raw-pentax", "x.pef");
-        assertTypeByName("image/x-raw-sony", "x.arw");
-        assertTypeByName("image/x-raw-sony", "x.srf");
-        assertTypeByName("image/x-raw-sony", "x.sr2");
-        assertTypeByName("image/x-raw-sigma", "x.x3f");
-        assertTypeByName("image/x-raw-epson", "x.erf");
-        assertTypeByName("image/x-raw-mamiya", "x.mef");
-        assertTypeByName("image/x-raw-leaf", "x.mos");
-        assertTypeByName("image/x-raw-panasonic", "x.raw");
-        assertTypeByName("image/x-raw-panasonic", "x.rw2");
-        assertTypeByName("image/x-raw-phaseone", "x.iiq");
-        assertTypeByName("image/x-raw-red", "x.r3d");
-        assertTypeByName("image/x-raw-imacon", "x.fff");
-        assertTypeByName("image/x-raw-logitech", "x.pxn");
-        assertTypeByName("image/x-raw-casio", "x.bay");
-        assertTypeByName("image/x-raw-rawzor", "x.rwz");
-    }
-    
-    /**
-     * Tests that we correctly detect the font types
-     */
-    @Test
-    public void testFontDetection() throws Exception {
-       assertTypeByName("application/x-font-adobe-metric", "x.afm");
-       assertTypeByData("application/x-font-adobe-metric", "testAFM.afm");
-       
-       assertTypeByName("application/x-font-printer-metric", "x.pfm");
-       // TODO Get a sample .pfm file
-       assertTypeByData(
-             "application/x-font-printer-metric", 
-             new byte[] {0x00, 0x01, 256-0xb1, 0x0a, 0x00, 0x00, 0x43, 0x6f,  
-                         0x70, 0x79, 0x72, 0x69, 0x67, 0x68, 0x74, 0x20}
-       );
-       
-       assertTypeByName("application/x-font-type1", "x.pfa");
-       // TODO Get a sample .pfa file
-       assertTypeByData(
-             "application/x-font-type1", 
-             new byte[] {0x25, 0x21, 0x50, 0x53, 0x2d, 0x41, 0x64, 0x6f,
-                         0x62, 0x65, 0x46, 0x6f, 0x6e, 0x74, 0x2d, 0x31,
-                         0x2e, 0x30, 0x20, 0x20, 0x2d, 0x2a, 0x2d, 0x20}
-       );
-       
-       assertTypeByName("application/x-font-type1", "x.pfb");
-       // TODO Get a sample .pfm file
-       assertTypeByData(
-             "application/x-font-type1", 
-             new byte[] {-0x80, 0x01, 0x09, 0x05, 0x00, 0x00, 0x25, 0x21,
-                          0x50, 0x53, 0x2d, 0x41, 0x64, 0x6f, 0x62, 0x65,
-                          0x46, 0x6f, 0x6e, 0x74, 0x2d, 0x31, 0x2e, 0x30 }
-       );
-    }
-
-    /**
-     * Tests MimeTypes.getMimeType(URL), which examines both the byte header
-     * and, if necessary, the URL's extension.
-     */
-    @Test
-    public void testMimeDeterminationForTestDocuments() throws Exception {
-        assertType("text/html", "testHTML.html");
-        assertType("application/zip", "test-documents.zip");
-
-        assertType("text/html", "testHTML_utf8.html");
-        assertType(
-                "application/vnd.oasis.opendocument.text",
-                "testOpenOffice2.odt");
-        assertType("application/pdf", "testPDF.pdf");
-        assertType("application/rtf", "testRTF.rtf");
-        assertType("text/plain", "testTXT.txt");
-        assertType("application/xml", "testXML.xml");
-        assertType("audio/basic", "testAU.au");
-        assertType("audio/x-aiff", "testAIFF.aif");
-        assertType("audio/x-wav", "testWAV.wav");
-        assertType("audio/midi", "testMID.mid");
-        assertType("application/x-msaccess", "testACCESS.mdb");
-        assertType("application/x-font-ttf", "testTrueType3.ttf");
-    }
-    
-    @Test
-    public void test7ZipDetection() throws Exception {
-       assertTypeByName("application/x-7z-compressed","test-documents.7z");
-       assertTypeByData("application/x-7z-compressed","test-documents.7z");
-       assertTypeByNameAndData("application/x-7z-compressed", "test-documents.7z");
-   }
-
-    @Test
-    public void testWebArchiveDetection() throws Exception {
-        assertTypeByName("application/x-webarchive","x.webarchive");
-        assertTypeByData("application/x-bplist","testWEBARCHIVE.webarchive");
-        assertTypeByNameAndData("application/x-webarchive", "testWEBARCHIVE.webarchive");
-    }
-
-    /**
-     * KML, and KMZ (zipped KML)
-     */
-    @Test
-    public void testKMLZDetection() throws Exception {
-       assertTypeByName("application/vnd.google-earth.kml+xml","testKML.kml");
-       assertTypeByData("application/vnd.google-earth.kml+xml","testKML.kml");
-       assertTypeByNameAndData("application/vnd.google-earth.kml+xml", "testKML.kml");
-       
-       assertTypeByName("application/vnd.google-earth.kmz","testKMZ.kmz");
-       assertTypeByNameAndData("application/vnd.google-earth.kmz", "testKMZ.kmz");
-       
-       // By data only, mimetype magic only gets us to a .zip
-       // We need to use the Zip Aware detector to get the full type
-       assertTypeByData("application/zip","testKMZ.kmz");
-   }
-
-    @Test
-    public void testCreativeSuite() throws IOException {
-        assertTypeDetection("testINDD.indd", "application/x-adobe-indesign");
-        assertTypeDetection("testPSD.psd", "image/vnd.adobe.photoshop");
-    }
-    
-    @Test
-    public void testAMR() throws IOException {
-        // AMR matches on name, data or both
-        assertTypeDetection("testAMR.amr", "audio/amr");
-        
-        // AMR-WB subtype shares extension, so needs data to identify
-        assertTypeDetection("testAMR-WB.amr", "audio/amr", "audio/amr-wb", "audio/amr-wb");
-        
-        // Ditto for the AMR-WB+ subtype, which we don't have a sample file of yet
-        //assertTypeDetection("testAMR-WB+.amr", "audio/amr", "audio/amr-wb+", "audio/amr-wb+");
-    }
-    
-    @Test
-    public void testEmail() throws IOException {
-        // EMLX
-        assertTypeDetection("testEMLX.emlx", "message/x-emlx");
-        
-        // Groupwise
-        assertTypeDetection("testGroupWiseEml.eml", "message/rfc822");
-        
-        // Lotus
-        assertTypeDetection("testLotusEml.eml", "message/rfc822");
-        
-        // Thunderbird - doesn't currently work by name
-        assertTypeByNameAndData("message/rfc822", "testThunderbirdEml.eml");
-    }
-    
-    @Test
-    public void testAxCrypt() throws Exception {
-        // test-TXT.txt encrypted with a key of "tika"
-        assertTypeDetection("testTXT-tika.axx", "application/x-axcrypt");
-    }
-    
-    @Test
-    public void testWindowsEXE() throws Exception {
-        assertTypeByName("application/x-msdownload", "x.dll");
-        assertTypeByName("application/x-ms-installer", "x.msi");
-        assertTypeByName("application/x-dosexec", "x.exe");
-        
-        assertTypeByData("application/x-msdownload; format=pe", "testTinyPE.exe");
-        assertTypeByNameAndData("application/x-msdownload; format=pe", "testTinyPE.exe");
-        
-        // A jar file with part of a PE header, but not a full one
-        //  should still be detected as a zip or jar (without/with name)
-        assertTypeByData("application/zip", "testJAR_with_PEHDR.jar");
-        assertTypeByNameAndData("application/java-archive", "testJAR_with_PEHDR.jar");
-    }
-    
-    @Test
-    public void testMatroskaDetection() throws Exception {
-        assertType("video/x-matroska", "testMKV.mkv");
-        // TODO: Need custom detector data detection, see TIKA-1180
-        assertTypeByData("application/x-matroska", "testMKV.mkv");
-        assertTypeByNameAndData("video/x-matroska", "testMKV.mkv");
-        assertTypeByName("video/x-matroska", "x.mkv");
-        assertTypeByName("video/x-matroska", "x.MKV");
-        assertTypeByName("audio/x-matroska", "x.mka");
-        assertTypeByName("audio/x-matroska", "x.MKA");
-    }
-    
-    @Test
-    public void testWebMDetection() throws Exception {
-        assertType("video/webm", "testWEBM.webm");
-        // TODO: Need custom detector data detection, see TIKA-1180
-        assertTypeByData("application/x-matroska", "testWEBM.webm");
-        assertTypeByNameAndData("video/webm", "testWEBM.webm");
-        assertTypeByName("video/webm", "x.webm");
-        assertTypeByName("video/webm", "x.WEBM");
-    }
-
-    /** Test getMimeType(byte[]) */
-    @Test
-    public void testGetMimeType_byteArray() throws IOException {
-        // Plain text detection
-        assertText(new byte[] { (byte) 0xFF, (byte) 0xFE });
-        assertText(new byte[] { (byte) 0xFF, (byte) 0xFE });
-        assertText(new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF });
-        assertText(new byte[] { 'a', 'b', 'c' });
-        assertText(new byte[] { '\t', '\r', '\n', 0x0C, 0x1B });
-        assertNotText(new byte[] { '\t', '\r', '\n', 0x0E, 0x1C });
-    }
-    
-    @Test
-    public void testBerkeleyDB() throws IOException {
-        assertTypeByData(
-                "application/x-berkeley-db; format=btree; version=2", 
-                "testBDB_btree_2.db");
-        assertTypeByData(
-                "application/x-berkeley-db; format=btree; version=3", 
-                "testBDB_btree_3.db");
-        assertTypeByData(
-                "application/x-berkeley-db; format=btree; version=4", 
-                "testBDB_btree_4.db");
-        // V4 and V5 share the same btree format
-        assertTypeByData(
-                "application/x-berkeley-db; format=btree; version=4", 
-                "testBDB_btree_5.db");
-        
-        assertTypeByData(
-                "application/x-berkeley-db; format=hash; version=2", 
-                "testBDB_hash_2.db");
-        assertTypeByData(
-                "application/x-berkeley-db; format=hash; version=3", 
-                "testBDB_hash_3.db");
-        assertTypeByData(
-                "application/x-berkeley-db; format=hash; version=4", 
-                "testBDB_hash_4.db");
-        assertTypeByData(
-                "application/x-berkeley-db; format=hash; version=5", 
-                "testBDB_hash_5.db");
-    }
-    
-    /**
-     * CBOR typically contains HTML
-     */
-    @Test
-    public void testCBOR() throws IOException {
-        assertTypeByNameAndData("application/cbor", "NUTCH-1997.cbor");
-        assertTypeByData("application/cbor", "NUTCH-1997.cbor");
-    }
-    
-    @Test
-    public void testZLIB() throws IOException {
-        // ZLIB encoded versions of testTXT.txt
-        assertTypeByData("application/zlib", "testTXT.zlib");
-        assertTypeByData("application/zlib", "testTXT.zlib0");
-        assertTypeByData("application/zlib", "testTXT.zlib5");
-        assertTypeByData("application/zlib", "testTXT.zlib9");
-    }
-    
-    @Test
-    public void testTextFormats() throws Exception {
-        assertType("application/x-bibtex-text-file", "testBIBTEX.bib");
-        assertTypeByData("application/x-bibtex-text-file", "testBIBTEX.bib");
-    }
-    
-    @Test
-    public void testCodeFormats() throws Exception {
-        assertType("text/x-csrc", "testC.c");
-        assertType("text/x-chdr", "testH.h");
-        assertTypeByData("text/x-csrc", "testC.c");
-        assertTypeByData("text/x-chdr", "testH.h");
-        
-        assertTypeByName("text/x-java-source", "testJAVA.java");
-        assertType("text/x-java-properties", "testJAVAPROPS.properties");
-        
-        assertType("text/x-matlab", "testMATLAB.m");
-        assertType("text/x-matlab", "testMATLAB_wtsgaus.m");
-        assertType("text/x-matlab", "testMATLAB_barcast.m");
-        assertTypeByData("text/x-matlab", "testMATLAB.m");
-        assertTypeByData("text/x-matlab", "testMATLAB_wtsgaus.m");
-        assertTypeByData("text/x-matlab", "testMATLAB_barcast.m");
-    }
-
-    @Test
-    public void testWebVTT() throws Exception {
-        assertType("text/vtt", "testWebVTT.vtt");
-        assertTypeByData("text/vtt", "testWebVTT.vtt");
-    }
-    
-    private void assertText(byte[] prefix) throws IOException {
-        assertMagic("text/plain", prefix);
-    }
-
-    private void assertNotText(byte[] prefix) throws IOException {
-        assertMagic("application/octet-stream", prefix);
-    }
-
-    private void assertMagic(String expected, byte[] prefix) throws IOException {
-        MediaType type =
-                repo.detect(new ByteArrayInputStream(prefix), new Metadata());
-        assertNotNull(type);
-        assertEquals(expected, type.toString());
-    }
-
-    private void assertType(String expected, String filename) throws Exception {
-        try (InputStream stream = TestMimeTypes.class.getResourceAsStream(
-                "/test-documents/" + filename)) {
-            assertNotNull("Test file not found: " + filename, stream);
-            Metadata metadata = new Metadata();
-            metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
-            assertEquals(expected, repo.detect(stream, metadata).toString());
-        }
-    }
-
-    private void assertTypeByName(String expected, String filename)
-            throws IOException {
-        Metadata metadata = new Metadata();
-        metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
-        assertEquals(expected, repo.detect(null, metadata).toString());
-    }
-
-    private void assertTypeByData(String expected, String filename)
-            throws IOException {
-        try (InputStream stream = TestMimeTypes.class.getResourceAsStream(
-                "/test-documents/" + filename)) {
-            assertNotNull("Test file not found: " + filename, stream);
-            Metadata metadata = new Metadata();
-            assertEquals(expected, repo.detect(stream, metadata).toString());
-        }
-    }
-    
-    private void assertTypeByData(String expected, byte[] data)
-            throws IOException {
-        try (InputStream stream = new ByteArrayInputStream(data)) {
-            Metadata metadata = new Metadata();
-            assertEquals(expected, repo.detect(stream, metadata).toString());
-        }
-    }
-
-    private void assertTypeDetection(String filename, String type)
-            throws IOException {
-        assertTypeDetection(filename, type, type, type);
-    }
-
-    private void assertTypeDetection(String filename, String byName, String byData, 
-            String byNameAndData) throws IOException {
-        assertTypeByName(byName, filename);
-        assertTypeByData(byData, filename);
-        assertTypeByNameAndData(byNameAndData, filename);
-    }
-
-    private void assertTypeByNameAndData(String expected, String filename)
-        throws IOException {
-       assertEquals(expected, getTypeByNameAndData(filename).toString());
-    }
-
-    private MediaType getTypeByNameAndData(String filename) throws IOException {
-        try (InputStream stream = TestMimeTypes.class.getResourceAsStream(
-                "/test-documents/" + filename)) {
-            assertNotNull("Test document not found: " + filename, stream);
-            Metadata metadata = new Metadata();
-            metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
-            return repo.detect(stream, metadata);
-        }
-    }
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
deleted file mode 100644
index 91b054e..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
+++ /dev/null
@@ -1,459 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipOutputStream;
-
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.XMPDM;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.sax.BodyContentHandler;
-import org.gagravarr.tika.FlacParser;
-import org.gagravarr.tika.OpusParser;
-import org.gagravarr.tika.VorbisParser;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class AutoDetectParserTest {
-    private TikaConfig tika = TikaConfig.getDefaultConfig();
-
-    // Easy to read constants for the MIME types:
-    private static final String RAW        = "application/octet-stream";
-    private static final String EXCEL      = "application/vnd.ms-excel";
-    private static final String HTML       = "text/html; charset=ISO-8859-1";
-    private static final String PDF        = "application/pdf";
-    private static final String POWERPOINT = "application/vnd.ms-powerpoint";
-    private static final String KEYNOTE    = "application/vnd.apple.keynote";
-    private static final String PAGES      = "application/vnd.apple.pages";
-    private static final String NUMBERS    = "application/vnd.apple.numbers";
-    private static final String CHM        = "application/vnd.ms-htmlhelp";
-    private static final String RTF        = "application/rtf";
-    private static final String PLAINTEXT  = "text/plain; charset=ISO-8859-1";
-    private static final String UTF8TEXT   = "text/plain; charset=UTF-8";
-    private static final String WORD       = "application/msword";
-    private static final String XML        = "application/xml";
-    private static final String RSS        = "application/rss+xml";
-    private static final String BMP        = "image/x-ms-bmp";
-    private static final String GIF        = "image/gif";
-    private static final String JPEG       = "image/jpeg";
-    private static final String PNG        = "image/png";
-    private static final String OGG_VORBIS = "audio/vorbis";
-    private static final String OGG_OPUS   = "audio/opus";
-    private static final String OGG_FLAC   = "audio/x-oggflac"; 
-    private static final String FLAC_NATIVE= "audio/x-flac";
-    private static final String OPENOFFICE
-            = "application/vnd.oasis.opendocument.text";
-
-
-    /**
-     * This is where a single test is done.
-     * @param tp the parameters encapsulated in a TestParams instance
-     * @throws IOException
-     */
-    private void assertAutoDetect(TestParams tp) throws Exception {
-        try (InputStream input = AutoDetectParserTest.class.getResourceAsStream(tp.resourceRealName)) {
-            if (input == null) {
-                fail("Could not open stream from specified resource: "
-                        + tp.resourceRealName);
-            }
-            Metadata metadata = new Metadata();
-            metadata.set(Metadata.RESOURCE_NAME_KEY, tp.resourceStatedName);
-            metadata.set(Metadata.CONTENT_TYPE, tp.statedType);
-            ContentHandler handler = new BodyContentHandler();
-            new AutoDetectParser(tika).parse(input, handler, metadata);
-
-            assertEquals("Bad content type: " + tp,
-                    tp.realType, metadata.get(Metadata.CONTENT_TYPE));
-
-            if (tp.expectedContentFragment != null) {
-                assertTrue("Expected content not found: " + tp,
-                        handler.toString().contains(tp.expectedContentFragment));
-            }
-        }
-    }
-
-    /**
-     * Convenience method -- its sole purpose of existence is to make the
-     * call to it more readable than it would be if a TestParams instance
-     * would need to be instantiated there.
-     *
-     * @param resourceRealName real name of resource
-     * @param resourceStatedName stated name -- will a bad name fool us?
-     * @param realType - the real MIME type
-     * @param statedType - stated MIME type - will a wrong one fool us?
-     * @param expectedContentFragment - something expected in the text
-     * @throws Exception
-     */
-    private void assertAutoDetect(String resourceRealName,
-                                  String resourceStatedName,
-                                  String realType,
-                                  String statedType,
-                                  String expectedContentFragment)
-            throws Exception {
-
-        assertAutoDetect(new TestParams(resourceRealName, resourceStatedName,
-                realType, statedType, expectedContentFragment));
-    }
-
-    private void assertAutoDetect(
-            String resource, String type, String content) throws Exception {
-
-        resource = "/test-documents/" + resource;
-
-        // TODO !!!!  The disabled tests below should work!
-        // The correct MIME type should be determined regardless of the
-        // stated type (ContentType hint) and the stated URL name.
-
-
-        // Try different combinations of correct and incorrect arguments:
-        final String wrongMimeType = RAW;
-        assertAutoDetect(resource, resource, type, type,          content);
-        assertAutoDetect(resource, resource, type, null,          content);
-        assertAutoDetect(resource, resource, type, wrongMimeType, content);
-
-        assertAutoDetect(resource, null, type, type,          content);
-        assertAutoDetect(resource, null, type, null,          content);
-        assertAutoDetect(resource, null, type, wrongMimeType, content);
-
-        final String badResource = "a.xyz";
-        assertAutoDetect(resource, badResource, type, type,          content);
-        assertAutoDetect(resource, badResource, type, null,          content);
-        assertAutoDetect(resource, badResource, type, wrongMimeType, content);
-    }
-
-    @Test
-    public void testKeynote() throws Exception {
-        assertAutoDetect("testKeynote.key", KEYNOTE, "A sample presentation");
-    }
-
-    @Test
-    public void testPages() throws Exception {
-        assertAutoDetect("testPages.pages", PAGES, "Sample pages document");
-    }
-
-    @Test
-    public void testNumbers() throws Exception {
-        assertAutoDetect("testNumbers.numbers", NUMBERS, "Checking Account: 300545668");
-    }
-
-    @Test
-    public void testChm() throws Exception {
-        assertAutoDetect("testChm.chm", CHM, "If you do not specify a window type or a window name, the main window is used.");
-    }
-
-    @Test
-    public void testEpub() throws Exception {
-        assertAutoDetect(
-                "testEPUB.epub", "application/epub+zip",
-                "The previous headings were subchapters");
-    }
-
-    @Test
-    public void testExcel() throws Exception {
-        assertAutoDetect("testEXCEL.xls", EXCEL, "Sample Excel Worksheet");
-    }
-
-    @Test
-    public void testHTML() throws Exception {
-        assertAutoDetect("testHTML.html", HTML, "Test Indexation Html");
-    }
-
-    @Test
-    public void testOpenOffice() throws Exception {
-        assertAutoDetect("testOpenOffice2.odt", OPENOFFICE,
-                "This is a sample Open Office document");
-    }
-
-    @Test
-    public void testPDF() throws Exception {
-        assertAutoDetect("testPDF.pdf", PDF, "Content Analysis Toolkit");
-
-    }
-
-    @Test
-    public void testPowerpoint() throws Exception {
-        assertAutoDetect("testPPT.ppt", POWERPOINT, "Sample Powerpoint Slide");
-    }
-
-    @Test
-    public void testRdfXml() throws Exception {
-        assertAutoDetect("testRDF.rdf", "application/rdf+xml", "");
-    }
-
-    @Test
-    public void testRTF() throws Exception {
-        assertAutoDetect("testRTF.rtf", RTF, "indexation Word");
-    }
-
-    @Test
-    public void testText() throws Exception {
-        assertAutoDetect("testTXT.txt", PLAINTEXT, "indexation de Txt");
-    }
-    
-    @Test
-    public void testTextNonASCIIUTF8() throws Exception {
-        assertAutoDetect("testTXTNonASCIIUTF8.txt", UTF8TEXT, "The quick brown fox jumps over the lazy dog");
-    }
-
-    @Test
-    public void testWord() throws Exception {
-        assertAutoDetect("testWORD.doc", WORD, "Sample Word Document");
-    }
-
-    @Test
-    public void testXML() throws Exception {
-        assertAutoDetect("testXML.xml", XML, "Lius");
-    }
-
-    @Test
-    public void testRss() throws Exception {
-        assertAutoDetect("/test-documents/rsstest.rss", "feed", RSS, "application/rss+xml", "Sample RSS File for Junit test");
-    }
-    
-    @Test
-    public void testImages() throws Exception {
-       assertAutoDetect("testBMP.bmp", BMP, null);
-       assertAutoDetect("testGIF.gif", GIF, null);
-       assertAutoDetect("testJPEG.jpg", JPEG, null);
-       assertAutoDetect("testPNG.png", PNG, null);
-   }
-
-    /**
-     * Make sure that zip bomb attacks are prevented.
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-216">TIKA-216</a>
-     */
-    @Test
-    public void testZipBombPrevention() throws Exception {
-        try (InputStream tgz = AutoDetectParserTest.class.getResourceAsStream(
-                "/test-documents/TIKA-216.tgz")) {
-            Metadata metadata = new Metadata();
-            ContentHandler handler = new BodyContentHandler(-1);
-            new AutoDetectParser(tika).parse(tgz, handler, metadata);
-            fail("Zip bomb was not detected");
-        } catch (TikaException e) {
-            // expected
-        }
-    }
-
-    /**
-     * Make sure XML parse errors don't trigger ZIP bomb detection.
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-1322">TIKA-1322</a>
-     */
-    @Test
-    public void testNoBombDetectedForInvalidXml() throws Exception {
-        // create zip with ten empty / invalid XML files, 1.xml .. 10.xml
-        ByteArrayOutputStream baos = new ByteArrayOutputStream();
-        ZipOutputStream zos = new ZipOutputStream(baos);
-        for (int i = 1; i <= 10; i++) {
-            zos.putNextEntry(new ZipEntry(i + ".xml"));
-            zos.closeEntry();
-        }
-        zos.finish();
-        zos.close();
-        new AutoDetectParser(tika).parse(new ByteArrayInputStream(baos.toByteArray()), new BodyContentHandler(-1),
-                new Metadata());
-    }
-
-    /**
-     * Test to ensure that the Ogg Audio parsers (Vorbis, Opus, Flac etc)
-     *  have been correctly included, and are available
-     */
-    @SuppressWarnings("deprecation")
-    @Test
-    public void testOggFlacAudio() throws Exception {
-       // The three test files should all have similar test data
-       String[] testFiles = new String[] {
-             "testVORBIS.ogg", "testFLAC.flac", "testFLAC.oga",
-             "testOPUS.opus"
-       };
-       MediaType[] mediaTypes = new MediaType[] {
-               MediaType.parse(OGG_VORBIS), MediaType.parse(FLAC_NATIVE),
-               MediaType.parse(OGG_FLAC), MediaType.parse(OGG_OPUS)
-       };
-       
-       // Check we can load the parsers, and they claim to do the right things
-       VorbisParser vParser = new VorbisParser();
-       assertNotNull("Parser not found for " + mediaTypes[0], 
-                     vParser.getSupportedTypes(new ParseContext()));
-       
-       FlacParser fParser = new FlacParser();
-       assertNotNull("Parser not found for " + mediaTypes[1], 
-                     fParser.getSupportedTypes(new ParseContext()));
-       assertNotNull("Parser not found for " + mediaTypes[2], 
-                     fParser.getSupportedTypes(new ParseContext()));
-       
-       OpusParser oParser = new OpusParser();
-       assertNotNull("Parser not found for " + mediaTypes[3], 
-                     oParser.getSupportedTypes(new ParseContext()));
-       
-       // Check we found the parser
-       CompositeParser parser = (CompositeParser)tika.getParser();
-       for (MediaType mt : mediaTypes) {
-          assertNotNull("Parser not found for " + mt, parser.getParsers().get(mt) );
-       }
-       
-       // Have each file parsed, and check
-       for (int i=0; i<testFiles.length; i++) {
-           String file = testFiles[i];
-           try (InputStream input = AutoDetectParserTest.class.getResourceAsStream(
-                   "/test-documents/" + file)) {
-               if (input == null) {
-                   fail("Could not find test file " + file);
-               }
-               Metadata metadata = new Metadata();
-               ContentHandler handler = new BodyContentHandler();
-               new AutoDetectParser(tika).parse(input, handler, metadata);
-
-               assertEquals("Incorrect content type for " + file,
-                       mediaTypes[i].toString(), metadata.get(Metadata.CONTENT_TYPE));
-
-               // Check some of the common metadata
-               // Old style metadata
-               assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
-               assertEquals("Test Title", metadata.get(Metadata.TITLE));
-               // New style metadata
-               assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
-               assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
-
-               // Check some of the XMPDM metadata
-               if (!file.endsWith(".opus")) {
-                   assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
-               }
-               assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
-               assertEquals("Stereo", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE));
-               assertEquals("44100", metadata.get(XMPDM.AUDIO_SAMPLE_RATE));
-
-               // Check some of the text
-               String content = handler.toString();
-               assertTrue(content.contains("Test Title"));
-               assertTrue(content.contains("Test Artist"));
-           }
-       }
-    }
-    
-    /**
-     * Test case for TIKA-514. Provide constructor for AutoDetectParser that has explicit
-     * list of supported parsers.
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-514">TIKA-514</a>
-     */
-    @Test
-    public void testSpecificParserList() throws Exception {
-        AutoDetectParser parser = new AutoDetectParser(new MyDetector(), new MyParser());
-        
-        InputStream is = new ByteArrayInputStream("test".getBytes(UTF_8));
-        Metadata metadata = new Metadata();
-        parser.parse(is, new BodyContentHandler(), metadata, new ParseContext());
-        
-        assertEquals("value", metadata.get("MyParser"));
-    }
-
-    private static final MediaType MY_MEDIA_TYPE = new MediaType("application", "x-myparser");
-    
-    /**
-     * A test detector which always returns the type supported
-     *  by the test parser
-     */
-    @SuppressWarnings("serial")
-    private static class MyDetector implements Detector {
-        public MediaType detect(InputStream input, Metadata metadata) throws IOException {
-            return MY_MEDIA_TYPE;
-        }
-    }
-    
-    @SuppressWarnings("serial")
-    private static class MyParser extends AbstractParser {
-        public Set<MediaType> getSupportedTypes(ParseContext context) {
-            Set<MediaType> supportedTypes = new HashSet<MediaType>();
-            supportedTypes.add(MY_MEDIA_TYPE);
-            return supportedTypes;
-        }
-
-        public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) {
-            metadata.add("MyParser", "value");
-        }
-
-    }
-    
-    /**
-     * Minimal class to encapsulate all parameters -- the main reason for
-     * its existence is to aid in debugging via its toString() method.
-     *
-     * Getters and setters intentionally not provided.
-     */
-    private static class TestParams {
-
-        public String resourceRealName;
-        public String resourceStatedName;
-        public String realType;
-        public String statedType;
-        public String expectedContentFragment;
-
-
-        private TestParams(String resourceRealName,
-                           String resourceStatedName,
-                           String realType,
-                           String statedType,
-                           String expectedContentFragment) {
-            this.resourceRealName = resourceRealName;
-            this.resourceStatedName = resourceStatedName;
-            this.realType = realType;
-            this.statedType = statedType;
-            this.expectedContentFragment = expectedContentFragment;
-        }
-
-
-        /**
-         * Produces a string like the following:
-         *
-         * <pre>
-         * Test parameters:
-         *   resourceRealName        = /test-documents/testEXCEL.xls
-         *   resourceStatedName      = null
-         *   realType                = application/vnd.ms-excel
-         *   statedType              = null
-         *   expectedContentFragment = Sample Excel Worksheet
-         * </pre>
-         */
-        public String toString() {
-            return "Test parameters:\n"
-                + "  resourceRealName        = " + resourceRealName + "\n"
-                + "  resourceStatedName      = " + resourceStatedName + "\n"
-                + "  realType                = " + realType + "\n"
-                + "  statedType              = " + statedType + "\n"
-                + "  expectedContentFragment = " + expectedContentFragment + "\n";
-        }
-    }
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
deleted file mode 100644
index 68edfc2..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-
-import java.io.InputStream;
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.utils.CommonsDigester;
-import org.junit.Test;
-
-
-public class DigestingParserTest extends TikaTest {
-
-    private final static String P = TikaCoreProperties.TIKA_META_PREFIX+
-            "digest"+Metadata.NAMESPACE_PREFIX_DELIMITER;
-
-    private final int UNLIMITED = 1000000;//well, not really, but longer than input file
-    private final Parser p = new AutoDetectParser();
-
-    @Test
-    public void testBasic() throws Exception {
-        Map<CommonsDigester.DigestAlgorithm, String> expected =
-                new HashMap<CommonsDigester.DigestAlgorithm, String>();
-
-        expected.put(CommonsDigester.DigestAlgorithm.MD2,"d768c8e27b0b52c6eaabfaa7122d1d4f");
-        expected.put(CommonsDigester.DigestAlgorithm.MD5,"59f626e09a8c16ab6dbc2800c685f772");
-        expected.put(CommonsDigester.DigestAlgorithm.SHA1,"7a1f001d163ac90d8ea54c050faf5a38079788a6");
-        expected.put(CommonsDigester.DigestAlgorithm.SHA256,"c4b7fab030a8b6a9d6691f6699ac8e6f" +
-                                                            "82bc53764a0f1430d134ae3b70c32654");
-        expected.put(CommonsDigester.DigestAlgorithm.SHA384,"ebe368b9326fef44408290724d187553"+
-                                                            "8b8a6923fdf251ddab72c6e4b5d54160" +
-                                                            "9db917ba4260d1767995a844d8d654df");
-        expected.put(CommonsDigester.DigestAlgorithm.SHA512,"ee46d973ee1852c018580c242955974d"+
-                                                            "da4c21f36b54d7acd06fcf68e974663b"+
-                                                            "fed1d256875be58d22beacf178154cc3"+
-                                                            "a1178cb73443deaa53aa0840324708bb");
-
-        //test each one
-        for (CommonsDigester.DigestAlgorithm algo : CommonsDigester.DigestAlgorithm.values()) {
-            Metadata m = new Metadata();
-            XMLResult xml = getXML("test_recursive_embedded.docx",
-                    new DigestingParser(p, new CommonsDigester(UNLIMITED, algo)), m);
-            assertEquals(algo.toString(), expected.get(algo), m.get(P + algo.toString()));
-        }
-
-
-        //test comma separated
-        CommonsDigester.DigestAlgorithm[] algos = CommonsDigester.parse("md5,sha256,sha384,sha512");
-        Metadata m = new Metadata();
-        XMLResult xml = getXML("test_recursive_embedded.docx",
-                new DigestingParser(p, new CommonsDigester(UNLIMITED, algos)), m);
-        for (CommonsDigester.DigestAlgorithm algo : new CommonsDigester.DigestAlgorithm[]{
-                CommonsDigester.DigestAlgorithm.MD5,
-                CommonsDigester.DigestAlgorithm.SHA256,
-                CommonsDigester.DigestAlgorithm.SHA384,
-                CommonsDigester.DigestAlgorithm.SHA512}) {
-            assertEquals(algo.toString(), expected.get(algo), m.get(P + algo.toString()));
-        }
-
-        assertNull(m.get(P+CommonsDigester.DigestAlgorithm.MD2.toString()));
-        assertNull(m.get(P+CommonsDigester.DigestAlgorithm.SHA1.toString()));
-
-    }
-
-    @Test
-    public void testLimitedRead() throws Exception {
-        CommonsDigester.DigestAlgorithm algo = CommonsDigester.DigestAlgorithm.MD5;
-        int limit = 100;
-        byte[] bytes = new byte[limit];
-        InputStream is = getResourceAsStream("/test-documents/test_recursive_embedded.docx");
-        is.read(bytes, 0, limit);
-        is.close();
-        Metadata m = new Metadata();
-        try {
-            XMLResult xml = getXML(TikaInputStream.get(bytes),
-                    new DigestingParser(p, new CommonsDigester(100, algo)), m);
-        } catch (TikaException e) {
-            //thrown because this is just a file fragment
-            assertContains("Unexpected RuntimeException from org.apache.tika.parser.microsoft.ooxml.OOXMLParser",
-                    e.getMessage());
-        }
-        String expectedMD5 = m.get(P+"MD5");
-
-        m = new Metadata();
-        XMLResult xml = getXML("test_recursive_embedded.docx",
-                new DigestingParser(p, new CommonsDigester(100, algo)), m);
-        assertEquals(expectedMD5, m.get(P+"MD5"));
-    }
-
-    @Test
-    public void testReset() throws Exception {
-        String expectedMD5 = "1643c2cef21e36720c54f4f6cb3349d0";
-        Metadata m = new Metadata();
-        XMLResult xml = getXML("test_recursive_embedded.docx",
-                new DigestingParser(p, new CommonsDigester(100, CommonsDigester.DigestAlgorithm.MD5)), m);
-        assertEquals(expectedMD5, m.get(P+"MD5"));
-    }
-
-    @Test
-    public void testNegativeMaxMarkLength() throws Exception {
-        Metadata m = new Metadata();
-        boolean ex = false;
-        try {
-            XMLResult xml = getXML("test_recursive_embedded.docx",
-                    new DigestingParser(p, new CommonsDigester(-1, CommonsDigester.DigestAlgorithm.MD5)), m);
-        } catch (IllegalArgumentException e) {
-            ex = true;
-        }
-        assertTrue("Exception not thrown", ex);
-    }
-
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
deleted file mode 100644
index 2fcd1c3..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser;
-
-import java.io.ByteArrayInputStream;
-import java.io.InputStream;
-import java.io.Reader;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.junit.Test;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-
-public class ParsingReaderTest {
-
-    @Test
-    public void testPlainText() throws Exception {
-        String data = "test content";
-        InputStream stream = new ByteArrayInputStream(data.getBytes(UTF_8));
-        Reader reader = new ParsingReader(stream, "test.txt");
-        assertEquals('t', reader.read());
-        assertEquals('e', reader.read());
-        assertEquals('s', reader.read());
-        assertEquals('t', reader.read());
-        assertEquals(' ', reader.read());
-        assertEquals('c', reader.read());
-        assertEquals('o', reader.read());
-        assertEquals('n', reader.read());
-        assertEquals('t', reader.read());
-        assertEquals('e', reader.read());
-        assertEquals('n', reader.read());
-        assertEquals('t', reader.read());
-        assertEquals('\n', reader.read());
-        assertEquals(-1, reader.read());
-        reader.close();
-        assertEquals(-1, stream.read());
-    }
-
-    @Test
-    public void testXML() throws Exception {
-        String data = "<p>test <span>content</span></p>";
-        InputStream stream = new ByteArrayInputStream(data.getBytes(UTF_8));
-        Reader reader = new ParsingReader(stream, "test.xml");
-        assertEquals(' ', (char) reader.read());
-        assertEquals('t', (char) reader.read());
-        assertEquals('e', (char) reader.read());
-        assertEquals('s', (char) reader.read());
-        assertEquals('t', (char) reader.read());
-        assertEquals(' ', (char) reader.read());
-        assertEquals(' ', (char) reader.read());
-        assertEquals('c', (char) reader.read());
-        assertEquals('o', (char) reader.read());
-        assertEquals('n', (char) reader.read());
-        assertEquals('t', (char) reader.read());
-        assertEquals('e', (char) reader.read());
-        assertEquals('n', (char) reader.read());
-        assertEquals('t', (char) reader.read());
-        assertEquals('\n', (char) reader.read());
-        assertEquals(-1, reader.read());
-        reader.close();
-        assertEquals(-1, stream.read());
-    }
-
-    /**
-     * Test case for TIKA-203
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-203">TIKA-203</a>
-     */
-    @Test
-    public void testMetadata() throws Exception {
-        Metadata metadata = new Metadata();
-        InputStream stream = ParsingReaderTest.class.getResourceAsStream(
-                "/test-documents/testEXCEL.xls");
-        try (Reader reader = new ParsingReader(
-                new AutoDetectParser(), stream, metadata, new ParseContext())) {
-            // Metadata should already be available
-            assertEquals("Simple Excel document", metadata.get(TikaCoreProperties.TITLE));
-            // Check that the internal buffering isn't broken
-            assertEquals('F', (char) reader.read());
-            assertEquals('e', (char) reader.read());
-            assertEquals('u', (char) reader.read());
-            assertEquals('i', (char) reader.read());
-            assertEquals('l', (char) reader.read());
-            assertEquals('1', (char) reader.read());
-        }
-    }
-
-}