You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/03/22 02:19:23 UTC
[11/13] tika git commit: TIKA-1855 -- first pass. Need to turn back
on the forbidden-apis testCheck. More clean up remains.
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java
new file mode 100644
index 0000000..b852de0
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -0,0 +1,1044 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+// Junit imports
+
+import static java.nio.charset.StandardCharsets.UTF_16BE;
+import static java.nio.charset.StandardCharsets.UTF_16LE;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNotSame;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+
+import org.apache.tika.Tika;
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ *
+ * Test Suite for the {@link MimeTypes} repository.
+ *
+ */
+public class TestMimeTypes extends TikaTest {
+
+ private Tika tika;
+
+ private MimeTypes repo;
+
+ private URL u;
+
+ private static final File f = new File("/a/b/c/x.pdf");
+
+ @Before
+ public void setUp() throws Exception{ // runs before each test: rebuilds repo, tika and the sample URL
+ TikaConfig config = TikaConfig.getDefaultConfig();
+ repo = config.getMimeRepository();
+ tika = new Tika(config);
+ u = new URL("http://mydomain.com/x.pdf?x=y"); // testGuessMimeTypes expects application/pdf despite the ?x=y suffix
+ }
+
+ @Test
+ public void testCaseSensitivity() { // the extension's letter case must not change the detected type
+ String type = tika.detect("test.PDF");
+ assertNotNull(type);
+ assertEquals(type, tika.detect("test.pdf"));
+ assertEquals(type, tika.detect("test.PdF"));
+ assertEquals(type, tika.detect("test.pdF"));
+ }
+
+ @Test
+ public void testLoadMimeTypes() throws MimeTypeException { // both names must resolve to a non-null type in the repository
+ assertNotNull(repo.forName("application/octet-stream"));
+ assertNotNull(repo.forName("text/x-tex"));
+ }
+
+ /**
+ * Tests MIME type determination from a name alone: bare file names, a URL string and a file path.
+ */
+ @Test
+ public void testGuessMimeTypes() throws Exception {
+ assertTypeByName("application/pdf", "x.pdf");
+ assertEquals("application/pdf", tika.detect(u.toExternalForm())); // URL string with a query part (built in setUp)
+ assertEquals("application/pdf", tika.detect(f.getPath())); // path of the static File f
+ assertTypeByName("text/plain", "x.txt");
+ assertTypeByName("text/html", "x.htm");
+ assertTypeByName("text/html", "x.html");
+ assertTypeByName("application/xhtml+xml", "x.xhtml");
+ assertTypeByName("application/xml", "x.xml");
+ assertTypeByName("application/zip", "x.zip");
+ assertTypeByName("application/vnd.oasis.opendocument.text", "x.odt");
+ assertTypeByName("application/octet-stream", "x.unknown"); // unrecognised extension: application/octet-stream is expected
+
+ // Test for the MS Office media types and file extensions listed in
+ // http://blogs.msdn.com/vsofficedeveloper/pages/Office-2007-Open-XML-MIME-Types.aspx
+ assertTypeByName("application/msword", "x.doc");
+ assertTypeByName("application/msword", "x.dot");
+ assertTypeByName("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "x.docx");
+ assertTypeByName("application/vnd.openxmlformats-officedocument.wordprocessingml.template", "x.dotx");
+ assertTypeByName("application/vnd.ms-word.document.macroenabled.12", "x.docm");
+ assertTypeByName("application/vnd.ms-word.template.macroenabled.12", "x.dotm");
+ assertTypeByName("application/vnd.ms-excel", "x.xls");
+ assertTypeByName("application/vnd.ms-excel", "x.xlt");
+ assertTypeByName("application/vnd.ms-excel", "x.xla");
+ assertTypeByName("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "x.xlsx");
+ assertTypeByName("application/vnd.openxmlformats-officedocument.spreadsheetml.template", "x.xltx");
+ assertTypeByName("application/vnd.ms-excel.sheet.macroenabled.12", "x.xlsm");
+ assertTypeByName("application/vnd.ms-excel.template.macroenabled.12", "x.xltm");
+ assertTypeByName("application/vnd.ms-excel.addin.macroenabled.12", "x.xlam");
+ assertTypeByName("application/vnd.ms-excel.sheet.binary.macroenabled.12", "x.xlsb");
+ assertTypeByName("application/vnd.ms-powerpoint", "x.ppt");
+ assertTypeByName("application/vnd.ms-powerpoint", "x.pot");
+ assertTypeByName("application/vnd.ms-powerpoint", "x.pps");
+ assertTypeByName("application/vnd.ms-powerpoint", "x.ppa");
+ assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.presentation", "x.pptx");
+ assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.template", "x.potx");
+ assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.slideshow", "x.ppsx");
+ assertTypeByName("application/vnd.ms-powerpoint.addin.macroenabled.12", "x.ppam");
+ assertTypeByName("application/vnd.ms-powerpoint.presentation.macroenabled.12", "x.pptm");
+ assertTypeByName("application/vnd.ms-powerpoint.template.macroenabled.12", "x.potm");
+ assertTypeByName("application/vnd.ms-powerpoint.slideshow.macroenabled.12", "x.ppsm");
+ }
+
+ /**
+ * Note - detecting container formats by mime magic is very very
+ * iffy, as we can't be sure where things will end up.
+ * People really ought to use the container aware detection...
+ */
+ @Test
+ public void testOLE2Detection() throws Exception {
+ // This one has the properties block near the start, so our mime
+ // magic will spot it
+ assertTypeByData("application/vnd.ms-excel", "testEXCEL.xls");
+
+ // These quite legitimately don't have their properties block
+ // as one of the first couple of entries
+ // As such, our mime magic can't figure them out...
+ assertTypeByData("application/x-tika-msoffice", "testWORD.doc");
+ assertTypeByData("application/x-tika-msoffice", "testPPT.ppt");
+
+
+ // By name + data:
+
+ // Those we got right to start with are fine
+ assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL.xls");
+
+ // And the name lets us specialise the generic OLE2
+ // (x-tika-msoffice) ones to their actual type
+ assertTypeByNameAndData("application/vnd.ms-powerpoint", "testPPT.ppt");
+ assertTypeByNameAndData("application/msword", "testWORD.doc");
+ }
+
+ /**
+ * Files generated by Works 7.0 Spreadsheet application use the OLE2
+ * structure and resemble Excel files (they contain a "Workbook"). They are
+ * not Excel though. They are distinguished from Excel files by an
+ * additional top-level entry below the root of the POI filesystem.
+ *
+ * @throws Exception propagated from the detection helper
+ */
+ @Test
+ public void testWorksSpreadsheetDetection() throws Exception {
+ assertTypeDetection("testWORKSSpreadsheet7.0.xlr",
+ // with name-only, everything should be all right
+ "application/x-tika-msworks-spreadsheet",
+ // this is possible due to MimeTypes guessing the type
+ // based on the WksSSWorkBook near the beginning of the
+ // file
+ "application/x-tika-msworks-spreadsheet",
+ // this is right, the magic-based detection works, there is
+ // no need for the name-based detection to refine it
+ "application/x-tika-msworks-spreadsheet");
+ }
+
+ @Test
+ public void testStarOfficeDetection() throws Exception { // per file: expected type by name, by data, by name+data (order assumed from the comments in testWorksSpreadsheetDetection)
+ assertTypeDetection("testVORCalcTemplate.vor",
+ "application/x-staroffice-template", // first expectation for every .vor: only the generic template type
+ "application/vnd.stardivision.calc",
+ "application/vnd.stardivision.calc");
+ assertTypeDetection("testVORDrawTemplate.vor",
+ "application/x-staroffice-template",
+ "application/vnd.stardivision.draw",
+ "application/vnd.stardivision.draw");
+ assertTypeDetection("testVORImpressTemplate.vor",
+ "application/x-staroffice-template",
+ "application/vnd.stardivision.impress",
+ "application/vnd.stardivision.impress");
+ assertTypeDetection("testVORWriterTemplate.vor",
+ "application/x-staroffice-template",
+ "application/vnd.stardivision.writer",
+ "application/vnd.stardivision.writer");
+
+ assertTypeDetection("testStarOffice-5.2-calc.sdc",
+ "application/vnd.stardivision.calc",
+ "application/vnd.stardivision.calc",
+ "application/vnd.stardivision.calc");
+ assertTypeDetection("testStarOffice-5.2-draw.sda",
+ "application/vnd.stardivision.draw",
+ "application/vnd.stardivision.draw",
+ "application/vnd.stardivision.draw");
+ assertTypeDetection("testStarOffice-5.2-impress.sdd",
+ "application/vnd.stardivision.impress",
+ "application/vnd.stardivision.impress",
+ "application/vnd.stardivision.impress");
+ assertTypeDetection("testStarOffice-5.2-writer.sdw",
+ "application/vnd.stardivision.writer",
+ "application/vnd.stardivision.writer",
+ "application/vnd.stardivision.writer");
+ }
+
+ /**
+ * Files generated by Works Word Processor versions 3.0 and 4.0 use the
+ * OLE2 structure. They don't resemble Word though.
+ *
+ * @throws Exception propagated from the detection helper
+ */
+ @Test
+ public void testOldWorksWordProcessorDetection() throws Exception {
+ assertTypeDetection(
+ "testWORKSWordProcessor3.0.wps",
+ // .wps is just like any other works extension
+ "application/vnd.ms-works",
+ // this is due to MatOST substring
+ "application/vnd.ms-works",
+ // magic-based detection works, no need to refine it
+ "application/vnd.ms-works");
+
+ // files in version 4.0 are no different from those in version 3.0
+ assertTypeDetection(
+ "testWORKSWordProcessor4.0.wps",
+ "application/vnd.ms-works",
+ "application/vnd.ms-works",
+ "application/vnd.ms-works");
+ }
+
+ /**
+ * Files from Excel 2 through 4 are based on the BIFF record
+ * structure, but without a wrapping OLE2 structure.
+ * Excel 5 and Excel 95+ work on OLE2
+ */
+ @Test
+ public void testOldExcel() throws Exception {
+ // With just a name, we'll think everything's a new Excel file
+ assertTypeByName("application/vnd.ms-excel","testEXCEL_4.xls");
+ assertTypeByName("application/vnd.ms-excel","testEXCEL_5.xls");
+ assertTypeByName("application/vnd.ms-excel","testEXCEL_95.xls");
+
+ // With data, we can work out if it's old or new style
+ assertTypeByData("application/vnd.ms-excel.sheet.4","testEXCEL_4.xls");
+ assertTypeByData("application/x-tika-msoffice","testEXCEL_5.xls");
+ assertTypeByData("application/x-tika-msoffice","testEXCEL_95.xls");
+
+ assertTypeByNameAndData("application/vnd.ms-excel.sheet.4","testEXCEL_4.xls"); // the data-only result is kept
+ assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL_5.xls"); // generic x-tika-msoffice is refined to Excel
+ assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL_95.xls");
+ }
+
+ /**
+ * Note - detecting container formats by mime magic is very very
+ * iffy, as we can't be sure where things will end up.
+ * People really ought to use the container aware detection...
+ */
+ @Test
+ public void testOoxmlDetection() throws Exception { // magic-only expectations first, then name + data
+ // These two do luckily have [Content_Types].xml near the start,
+ // so our mime magic will spot them
+ assertTypeByData("application/x-tika-ooxml", "testEXCEL.xlsx");
+ assertTypeByData("application/x-tika-ooxml", "testPPT.pptx");
+
+ // This one quite legitimately doesn't have its [Content_Types].xml
+ // file as one of the first couple of entries
+ // As such, our mime magic can't figure it out...
+ assertTypeByData("application/zip", "testWORD.docx");
+
+ // If we give the filename as well as the data, we can
+ // specialise the ooxml generic one to the correct type
+ assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "testEXCEL.xlsx");
+ assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.presentationml.presentation", "testPPT.pptx");
+ assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "testWORD.docx");
+
+ // Test a few of the less usual ones
+ assertTypeByNameAndData("application/vnd.ms-excel.sheet.binary.macroenabled.12","testEXCEL.xlsb");
+ assertTypeByNameAndData("application/vnd.ms-powerpoint.presentation.macroenabled.12", "testPPT.pptm");
+ assertTypeByNameAndData("application/vnd.ms-powerpoint.template.macroenabled.12", "testPPT.potm");
+ assertTypeByNameAndData("application/vnd.ms-powerpoint.slideshow.macroenabled.12", "testPPT.ppsm");
+ }
+
+ /**
+ * Note - these are container based formats, so container aware
+ * detection is needed to be fully correct
+ */
+ @Test
+ public void testVisioDetection() throws Exception {
+ // By Name, should get it right
+ assertTypeByName("application/vnd.visio", "testVISIO.vsd");
+ assertTypeByName("application/vnd.ms-visio.drawing.macroenabled.12", "testVISIO.vsdm");
+ assertTypeByName("application/vnd.ms-visio.drawing", "testVISIO.vsdx");
+ assertTypeByName("application/vnd.ms-visio.stencil.macroenabled.12", "testVISIO.vssm");
+ assertTypeByName("application/vnd.ms-visio.stencil", "testVISIO.vssx");
+ assertTypeByName("application/vnd.ms-visio.template.macroenabled.12", "testVISIO.vstm");
+ assertTypeByName("application/vnd.ms-visio.template", "testVISIO.vstx");
+
+ // By Name and Data, should get it right
+ assertTypeByNameAndData("application/vnd.visio", "testVISIO.vsd");
+ assertTypeByNameAndData("application/vnd.ms-visio.drawing.macroenabled.12", "testVISIO.vsdm");
+ assertTypeByNameAndData("application/vnd.ms-visio.drawing", "testVISIO.vsdx");
+ assertTypeByNameAndData("application/vnd.ms-visio.stencil.macroenabled.12", "testVISIO.vssm");
+ assertTypeByNameAndData("application/vnd.ms-visio.stencil", "testVISIO.vssx");
+ assertTypeByNameAndData("application/vnd.ms-visio.template.macroenabled.12", "testVISIO.vstm");
+ assertTypeByNameAndData("application/vnd.ms-visio.template", "testVISIO.vstx");
+
+ // By Data only, will get the container parent
+ assertTypeByData("application/x-tika-msoffice", "testVISIO.vsd");
+ assertTypeByData("application/x-tika-ooxml", "testVISIO.vsdm");
+ assertTypeByData("application/x-tika-ooxml", "testVISIO.vsdx");
+ assertTypeByData("application/x-tika-ooxml", "testVISIO.vssm");
+ assertTypeByData("application/x-tika-ooxml", "testVISIO.vssx");
+ assertTypeByData("application/x-tika-ooxml", "testVISIO.vstm");
+ assertTypeByData("application/x-tika-ooxml", "testVISIO.vstx");
+ }
+
+ /**
+ * Note - detecting container formats by mime magic is very very
+ * iffy, as we can't be sure where things will end up.
+ * People really ought to use the container aware detection...
+ */
+ @Test
+ public void testIWorkDetection() throws Exception {
+ // By name is easy
+ assertTypeByName("application/vnd.apple.keynote", "testKeynote.key");
+ assertTypeByName("application/vnd.apple.numbers", "testNumbers.numbers");
+ assertTypeByName("application/vnd.apple.pages", "testPages.pages");
+
+ // We can't do it by data, as we'd need to unpack
+ // the zip file to check the XML
+ assertTypeByData("application/zip", "testKeynote.key"); // so data alone is expected to stop at plain zip
+
+ assertTypeByNameAndData("application/vnd.apple.keynote", "testKeynote.key");
+ assertTypeByNameAndData("application/vnd.apple.numbers", "testNumbers.numbers");
+ assertTypeByNameAndData("application/vnd.apple.pages", "testPages.pages");
+ }
+
+ @Test
+ public void testArchiveDetection() throws Exception {
+ assertTypeByName("application/x-archive", "test.ar");
+ assertTypeByName("application/zip", "test.zip");
+ assertTypeByName("application/x-tar", "test.tar");
+ assertTypeByName("application/gzip", "test.tgz"); // Sees GZIP, not the tar inside it
+ assertTypeByName("application/x-cpio", "test.cpio");
+
+ // TODO Add an example .deb and .udeb, then check these
+
+ // Check the mime magic patterns for them work too
+ assertTypeByData("application/x-archive", "testARofText.ar");
+ assertTypeByData("application/x-archive", "testARofSND.ar");
+ assertTypeByData("application/zip", "test-documents.zip");
+ assertTypeByData("application/x-gtar", "test-documents.tar"); // GNU TAR
+ assertTypeByData("application/gzip", "test-documents.tgz"); // Sees GZIP, not the tar inside it
+ assertTypeByData("application/x-cpio", "test-documents.cpio");
+
+ // For spanned zip files, the .zip file doesn't have the header, it's the other parts
+ assertTypeByData("application/octet-stream", "test-documents-spanned.zip");
+ assertTypeByData("application/zip", "test-documents-spanned.z01");
+ }
+
+ @Test
+ public void testFeedsDetection() throws Exception { // RSS and Atom samples, each checked via assertType, by data and by name
+ assertType("application/rss+xml", "rsstest.rss");
+ assertType("application/atom+xml", "testATOM.atom");
+ assertTypeByData("application/rss+xml", "rsstest.rss");
+ assertTypeByName("application/rss+xml", "rsstest.rss");
+ assertTypeByData("application/atom+xml", "testATOM.atom");
+ assertTypeByName("application/atom+xml", "testATOM.atom");
+ }
+
+ @Test
+ public void testFitsDetection() throws Exception { // one FITS sample, checked through all three helpers
+ // FITS image created using imagemagick convert of testJPEG.jpg
+ assertType("application/fits", "testFITS.fits");
+ assertTypeByData("application/fits", "testFITS.fits");
+ assertTypeByName("application/fits", "testFITS.fits");
+ }
+
+ @Test
+ public void testJpegDetection() throws Exception { // JPEG sample plus every extension spelling expected to give image/jpeg
+ assertType("image/jpeg", "testJPEG.jpg");
+ assertTypeByData("image/jpeg", "testJPEG.jpg");
+ assertTypeByName("image/jpeg", "x.jpg");
+ assertTypeByName("image/jpeg", "x.JPG");
+ assertTypeByName("image/jpeg", "x.jpeg");
+ assertTypeByName("image/jpeg", "x.JPEG");
+ assertTypeByName("image/jpeg", "x.jpe");
+ assertTypeByName("image/jpeg", "x.jif");
+ assertTypeByName("image/jpeg", "x.jfif");
+ assertTypeByName("image/jpeg", "x.jfi");
+
+ assertType("image/jp2", "testJPEG.jp2"); // .jp2 is expected to get its own type, not image/jpeg
+ assertTypeByData("image/jp2", "testJPEG.jp2");
+ assertTypeByName("image/jp2", "x.jp2");
+ }
+
+ @Test
+ public void testBpgDetection() throws Exception { // BPG: two samples by data, one name check
+ assertType("image/x-bpg", "testBPG.bpg");
+ assertTypeByData("image/x-bpg", "testBPG.bpg");
+ assertTypeByData("image/x-bpg", "testBPG_commented.bpg");
+ assertTypeByName("image/x-bpg", "x.bpg");
+ }
+
+ @Test
+ public void testTiffDetection() throws Exception { // TIFF sample plus the .tiff/.tif/.TIF names
+ assertType("image/tiff", "testTIFF.tif");
+ assertTypeByData("image/tiff", "testTIFF.tif");
+ assertTypeByName("image/tiff", "x.tiff");
+ assertTypeByName("image/tiff", "x.tif");
+ assertTypeByName("image/tiff", "x.TIF");
+ }
+
+ @Test
+ public void testGifDetection() throws Exception { // GIF sample plus the .gif/.GIF names
+ assertType("image/gif", "testGIF.gif");
+ assertTypeByData("image/gif", "testGIF.gif");
+ assertTypeByName("image/gif", "x.gif");
+ assertTypeByName("image/gif", "x.GIF");
+ }
+
+ @Test
+ public void testPngDetection() throws Exception { // PNG sample plus the .png/.PNG names
+ assertType("image/png", "testPNG.png");
+ assertTypeByData("image/png", "testPNG.png");
+ assertTypeByName("image/png", "x.png");
+ assertTypeByName("image/png", "x.PNG");
+ }
+
+ @Test
+ public void testWEBPDetection() throws Exception { // WebP sample plus the .webp/.WEBP names
+ assertType("image/webp", "testWEBP.webp");
+ assertTypeByData("image/webp", "testWEBP.webp");
+ assertTypeByName("image/webp", "x.webp");
+ assertTypeByName("image/webp", "x.WEBP");
+ }
+
+ @Test
+ public void testBmpDetection() throws Exception {
+ assertType("image/x-ms-bmp", "testBMP.bmp");
+ assertTypeByData("image/x-ms-bmp", "testBMP.bmp");
+ assertTypeByName("image/x-ms-bmp", "x.bmp");
+ assertTypeByName("image/x-ms-bmp", "x.BMP");
+ assertTypeByName("image/x-ms-bmp", "x.dib");
+ assertTypeByName("image/x-ms-bmp", "x.DIB");
+ // false positive check -- this text file contains part of the BMP signature
+ assertType("text/plain", "testBMPfp.txt");
+ }
+
+ @Test
+ public void testPnmDetection() throws Exception { // PBM/PGM/PPM samples, then the extension names in both cases
+ assertType("image/x-portable-bitmap", "testPBM.pbm");
+ assertType("image/x-portable-graymap", "testPGM.pgm");
+ assertType("image/x-portable-pixmap", "testPPM.ppm");
+ assertTypeByData("image/x-portable-bitmap", "testPBM.pbm");
+ assertTypeByData("image/x-portable-graymap", "testPGM.pgm");
+ assertTypeByData("image/x-portable-pixmap", "testPPM.ppm");
+ assertTypeByName("image/x-portable-anymap", "x.pnm"); // .pnm alone only gives the generic anymap type
+ assertTypeByName("image/x-portable-anymap", "x.PNM");
+ assertTypeByName("image/x-portable-bitmap", "x.pbm");
+ assertTypeByName("image/x-portable-bitmap", "x.PBM");
+ assertTypeByName("image/x-portable-graymap", "x.pgm");
+ assertTypeByName("image/x-portable-graymap", "x.PGM");
+ assertTypeByName("image/x-portable-pixmap", "x.ppm");
+ assertTypeByName("image/x-portable-pixmap", "x.PPM");
+ }
+
+ @Test
+ public void testPictDetection() throws Exception {
+ assertType("image/x-pict", "testPICT.pct");
+ assertTypeByData("image/x-pict", "testPICT.pct");
+ assertTypeByName("image/x-pict", "x.pic"); // note: .pic here and .PCT below are two different extensions
+ assertTypeByName("image/x-pict", "x.PCT");
+ }
+
+ @Test
+ public void testCgmDetection() throws Exception { // name-only checks for now, see the TODO
+ // TODO: Need a test image file
+ assertTypeByName("image/cgm", "x.cgm");
+ assertTypeByName("image/cgm", "x.CGM");
+ }
+
+ @Test
+ public void testRdfXmlDetection() throws Exception { // .rdf and .owl names are both expected to give application/rdf+xml
+ assertTypeByName("application/rdf+xml", "x.rdf");
+ assertTypeByName("application/rdf+xml", "x.owl");
+ }
+
+ @Test
+ public void testSvgDetection() throws Exception {
+ assertType("image/svg+xml", "testSVG.svg");
+ assertTypeByData("image/svg+xml", "testSVG.svg");
+ assertTypeByName("image/svg+xml", "x.svg");
+ assertTypeByName("image/svg+xml", "x.SVG");
+
+ // Should *.svgz be svg or gzip? As asserted here: gzip for the sample file, svg+xml for the bare name
+ assertType("application/gzip", "testSVG.svgz");
+ assertTypeByData("application/gzip", "testSVG.svgz");
+ assertTypeByName("image/svg+xml", "x.svgz");
+ assertTypeByName("image/svg+xml", "x.SVGZ");
+ }
+
+ @Test
+ public void testPdfDetection() throws Exception { // names first, then a normal sample and one with a byte order mark
+ // PDF extension by name is enough
+ assertTypeByName("application/pdf", "x.pdf");
+ assertTypeByName("application/pdf", "x.PDF");
+
+ // For normal PDFs, can get by name or data or both
+ assertType("application/pdf", "testPDF.pdf");
+ assertTypeByData("application/pdf", "testPDF.pdf");
+
+ // PDF with a BoM works both ways too
+ assertType("application/pdf", "testPDF_bom.pdf");
+ assertTypeByData("application/pdf", "testPDF_bom.pdf");
+ }
+
+ @Test
+ public void testSwfDetection() throws Exception { // name-only checks; no data-based Flash detection is asserted here
+ assertTypeByName("application/x-shockwave-flash", "x.swf");
+ assertTypeByName("application/x-shockwave-flash", "x.SWF");
+ assertTypeByName("application/x-shockwave-flash", "test1.swf");
+ assertTypeByName("application/x-shockwave-flash", "test2.swf");
+ assertTypeByName("application/x-shockwave-flash", "test3.swf");
+ }
+
+ @Test
+ public void testDwgDetection() throws Exception { // one name check, then three sample drawings by data
+ assertTypeByName("image/vnd.dwg", "x.dwg");
+ assertTypeByData("image/vnd.dwg", "testDWG2004.dwg");
+ assertTypeByData("image/vnd.dwg", "testDWG2007.dwg");
+ assertTypeByData("image/vnd.dwg", "testDWG2010.dwg");
+ }
+
+ @Test
+ public void testprtDetection() throws Exception { // .prt: one name check, one sample (testCADKEY.prt) by data
+ assertTypeByName("application/x-prt", "x.prt");
+ assertTypeByData("application/x-prt", "testCADKEY.prt");
+ }
+
+ /**
+ * Formats which are based on plain text, each checked by name and via assertType
+ */
+ @Test
+ public void testTextBasedFormatsDetection() throws Exception {
+ assertTypeByName("text/plain", "testTXT.txt");
+ assertType( "text/plain", "testTXT.txt");
+
+ assertTypeByName("text/css", "testCSS.css");
+ assertType( "text/css", "testCSS.css");
+
+ assertTypeByName("text/csv", "testCSV.csv");
+ assertType( "text/csv", "testCSV.csv");
+
+ assertTypeByName("text/html", "testHTML.html");
+ assertType( "text/html", "testHTML.html");
+
+ assertTypeByName("application/javascript", "testJS.js");
+ assertType( "application/javascript", "testJS.js");
+ }
+
+ @Test
+ public void testJavaDetection() throws Exception {
+ // TODO Classloader doesn't seem to find the .class file in test-documents
+ //assertTypeDetection("AutoDetectParser.class", "application/java-vm");
+
+ // OSX Native Extension
+ assertTypeDetection("testJNILIB.jnilib", "application/x-java-jnilib"); // two-argument form: same type expected by name, data or both (cf. the comment in testAMR)
+ }
+
+ @Test
+ public void testXmlAndHtmlDetection() throws Exception { // in-memory byte arrays only, no sample files
+ assertTypeByData("application/xml", "<?xml version=\"1.0\" encoding=\"UTF-8\"?><records><record/></records>"
+ .getBytes(UTF_8));
+ assertTypeByData("application/xml", "\uFEFF<?xml version=\"1.0\" encoding=\"UTF-16\"?><records><record/></records>"
+ .getBytes(UTF_16LE)); // UTF-16 little endian; the leading U+FEFF is the byte order mark
+ assertTypeByData("application/xml", "\uFEFF<?xml version=\"1.0\" encoding=\"UTF-16\"?><records><record/></records>"
+ .getBytes(UTF_16BE)); // same document, UTF-16 big endian
+ assertTypeByData("application/xml", "<!-- XML without processing instructions --><records><record/></records>"
+ .getBytes(UTF_8));
+ assertTypeByData("text/html", "<html><body>HTML</body></html>"
+ .getBytes(UTF_8));
+ assertTypeByData("text/html", "<!-- HTML comment --><html><body>HTML</body></html>"
+ .getBytes(UTF_8));
+ }
+
+ @Test
+ public void testWmfDetection() throws Exception {
+ assertTypeByName("application/x-msmetafile", "x.wmf");
+ assertTypeByData("application/x-msmetafile", "testWMF.wmf");
+ assertTypeByName("application/x-msmetafile", "x.WMF");
+
+ assertTypeByName("application/x-emf", "x.emf");
+ assertTypeByData("application/x-emf","testEMF.emf");
+ assertTypeByName("application/x-emf", "x.EMF");
+ // TODO: Need a test wmz file
+ assertTypeByName("application/x-ms-wmz", "x.wmz");
+ assertTypeByName("application/x-ms-wmz", "x.WMZ");
+ // TODO: Need a test emz file
+ assertTypeByName("application/gzip", "x.emz"); // unlike .wmz, the .emz name is expected to give plain gzip
+ assertTypeByName("application/gzip", "x.EMZ");
+ }
+
+ @Test
+ public void testPsDetection() throws Exception { // name-only checks for now, see the TODO
+ // TODO: Need a test postscript file
+ assertTypeByName("application/postscript", "x.ps");
+ assertTypeByName("application/postscript", "x.PS");
+ assertTypeByName("application/postscript", "x.eps");
+ assertTypeByName("application/postscript", "x.epsf");
+ assertTypeByName("application/postscript", "x.epsi");
+ }
+
+ @Test
+ public void testMicrosoftMultiMediaDetection() throws Exception { // ASF, WMV and WMA: by name, then the samples by data
+ assertTypeByName("video/x-ms-asf", "x.asf");
+ assertTypeByName("video/x-ms-wmv", "x.wmv");
+ assertTypeByName("audio/x-ms-wma", "x.wma");
+
+ assertTypeByData("video/x-ms-asf", "testASF.asf");
+ assertTypeByData("video/x-ms-wmv", "testWMV.wmv");
+ assertTypeByData("audio/x-ms-wma", "testWMA.wma");
+ }
+
+ /**
+ * All 3 DITA types are in theory handled by the same mimetype,
+ * but we specialise them
+ */
+ @Test
+ public void testDITADetection() throws Exception {
+ assertTypeByName("application/dita+xml; format=topic", "test.dita"); // by name a .dita is only expected to be the topic format
+ assertTypeByName("application/dita+xml; format=map", "test.ditamap");
+ assertTypeByName("application/dita+xml; format=val", "test.ditaval");
+
+ assertTypeByData("application/dita+xml; format=task", "testDITA.dita"); // by data the more specific format is expected
+ assertTypeByData("application/dita+xml; format=concept", "testDITA2.dita");
+ assertTypeByData("application/dita+xml; format=map", "testDITA.ditamap");
+
+ assertTypeByNameAndData("application/dita+xml; format=task", "testDITA.dita");
+ assertTypeByNameAndData("application/dita+xml; format=concept", "testDITA2.dita");
+ assertTypeByNameAndData("application/dita+xml; format=map", "testDITA.ditamap");
+
+ // These are all children of the official type
+ assertEquals("application/dita+xml",
+ repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA.ditamap")).toString());
+ assertEquals("application/dita+xml",
+ repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA.dita")).toString());
+ // Concept inherits from topic
+ assertEquals("application/dita+xml; format=topic",
+ repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA2.dita")).toString());
+ }
+
+ /**
+ * @since TIKA-194
+ */
+ @Test
+ public void testJavaRegex() throws Exception{ // a name pattern added with the flag set to true must be used for detection
+ MimeType testType = new MimeType(MediaType.parse("foo/bar"));
+ this.repo.add(testType);
+ assertNotNull(repo.forName("foo/bar"));
+ String pattern = "rtg_sst_grb_0\\.5\\.\\d{8}";
+ this.repo.addPattern(testType, pattern, true); // true: presumably "treat the pattern as a Java regex" - confirm against MimeTypes.addPattern
+ String testFileName = "rtg_sst_grb_0.5.12345678";
+ assertEquals("foo/bar", tika.detect(testFileName));
+
+ MimeType testType2 = new MimeType(MediaType.parse("foo/bar2"));
+ this.repo.add(testType2);
+ assertNotNull(repo.forName("foo/bar2"));
+ this.repo.addPattern(testType2, pattern, false); // same pattern text, this time with the flag set to false
+ org.junit.Assert.assertFalse("foo/bar2".equals(tika.detect(testFileName))); // compare by value; a reference check (assertNotSame) passes for any distinct String instance
+ }
+
+ @Test
+ public void testRawDetection() throws Exception { // name-only checks for the image/x-raw-* types, one vendor suffix per group
+ assertTypeByName("image/x-raw-adobe", "x.dng");
+ assertTypeByName("image/x-raw-adobe", "x.DNG");
+ assertTypeByName("image/x-raw-hasselblad", "x.3fr");
+ assertTypeByName("image/x-raw-fuji", "x.raf");
+ assertTypeByName("image/x-raw-canon", "x.crw");
+ assertTypeByName("image/x-raw-canon", "x.cr2");
+ assertTypeByName("image/x-raw-kodak", "x.k25");
+ assertTypeByName("image/x-raw-kodak", "x.kdc");
+ assertTypeByName("image/x-raw-kodak", "x.dcs");
+ assertTypeByName("image/x-raw-kodak", "x.drf");
+ assertTypeByName("image/x-raw-minolta", "x.mrw");
+ assertTypeByName("image/x-raw-nikon", "x.nef");
+ assertTypeByName("image/x-raw-nikon", "x.nrw");
+ assertTypeByName("image/x-raw-olympus", "x.orf");
+ assertTypeByName("image/x-raw-pentax", "x.ptx");
+ assertTypeByName("image/x-raw-pentax", "x.pef");
+ assertTypeByName("image/x-raw-sony", "x.arw");
+ assertTypeByName("image/x-raw-sony", "x.srf");
+ assertTypeByName("image/x-raw-sony", "x.sr2");
+ assertTypeByName("image/x-raw-sigma", "x.x3f");
+ assertTypeByName("image/x-raw-epson", "x.erf");
+ assertTypeByName("image/x-raw-mamiya", "x.mef");
+ assertTypeByName("image/x-raw-leaf", "x.mos");
+ assertTypeByName("image/x-raw-panasonic", "x.raw");
+ assertTypeByName("image/x-raw-panasonic", "x.rw2");
+ assertTypeByName("image/x-raw-phaseone", "x.iiq");
+ assertTypeByName("image/x-raw-red", "x.r3d");
+ assertTypeByName("image/x-raw-imacon", "x.fff");
+ assertTypeByName("image/x-raw-logitech", "x.pxn");
+ assertTypeByName("image/x-raw-casio", "x.bay");
+ assertTypeByName("image/x-raw-rawzor", "x.rwz");
+ }
+
+ /**
+ * Tests that we correctly detect the font types
+ */
+ @Test
+ public void testFontDetection() throws Exception {
+ assertTypeByName("application/x-font-adobe-metric", "x.afm");
+ assertTypeByData("application/x-font-adobe-metric", "testAFM.afm");
+
+ assertTypeByName("application/x-font-printer-metric", "x.pfm");
+ // TODO Get a sample .pfm file
+ assertTypeByData(
+ "application/x-font-printer-metric",
+ new byte[] {0x00, 0x01, 256-0xb1, 0x0a, 0x00, 0x00, 0x43, 0x6f, // NOTE(review): 256-0xb1 evaluates to 0x4f; the byte 0xb1 would be 0xb1-256 - confirm which was meant
+ 0x70, 0x79, 0x72, 0x69, 0x67, 0x68, 0x74, 0x20} // bytes 6-15 spell "Copyright " in ASCII
+ );
+
+ assertTypeByName("application/x-font-type1", "x.pfa");
+ // TODO Get a sample .pfa file
+ assertTypeByData(
+ "application/x-font-type1",
+ new byte[] {0x25, 0x21, 0x50, 0x53, 0x2d, 0x41, 0x64, 0x6f,
+ 0x62, 0x65, 0x46, 0x6f, 0x6e, 0x74, 0x2d, 0x31,
+ 0x2e, 0x30, 0x20, 0x20, 0x2d, 0x2a, 0x2d, 0x20} // ASCII "%!PS-AdobeFont-1.0  -*- "
+ );
+
+ assertTypeByName("application/x-font-type1", "x.pfb");
+ // TODO Get a sample .pfb file
+ assertTypeByData(
+ "application/x-font-type1",
+ new byte[] {-0x80, 0x01, 0x09, 0x05, 0x00, 0x00, 0x25, 0x21, // -0x80 is the byte 0x80
+ 0x50, 0x53, 0x2d, 0x41, 0x64, 0x6f, 0x62, 0x65,
+ 0x46, 0x6f, 0x6e, 0x74, 0x2d, 0x31, 0x2e, 0x30 } // ASCII "%!PS-AdobeFont-1.0" after the first six bytes
+ );
+ }
+
+ /**
+ * Tests MimeTypes.getMimeType(URL), which examines both the byte header
+ * and, if necessary, the URL's extension.
+ */
+ @Test
+ public void testMimeDeterminationForTestDocuments() throws Exception { // NOTE(review): the body only calls assertType; confirm it still exercises getMimeType(URL)
+ assertType("text/html", "testHTML.html");
+ assertType("application/zip", "test-documents.zip");
+
+ assertType("text/html", "testHTML_utf8.html");
+ assertType(
+ "application/vnd.oasis.opendocument.text",
+ "testOpenOffice2.odt");
+ assertType("application/pdf", "testPDF.pdf");
+ assertType("application/rtf", "testRTF.rtf");
+ assertType("text/plain", "testTXT.txt");
+ assertType("application/xml", "testXML.xml");
+ assertType("audio/basic", "testAU.au");
+ assertType("audio/x-aiff", "testAIFF.aif");
+ assertType("audio/x-wav", "testWAV.wav");
+ assertType("audio/midi", "testMID.mid");
+ assertType("application/x-msaccess", "testACCESS.mdb");
+ assertType("application/x-font-ttf", "testTrueType3.ttf");
+ }
+
+ @Test
+ public void test7ZipDetection() throws Exception {
+ assertTypeByName("application/x-7z-compressed","test-documents.7z");
+ assertTypeByData("application/x-7z-compressed","test-documents.7z");
+ assertTypeByNameAndData("application/x-7z-compressed", "test-documents.7z");
+ }
+
+ @Test
+ public void testWebArchiveDetection() throws Exception {
+ assertTypeByName("application/x-webarchive","x.webarchive");
+ assertTypeByData("application/x-bplist","testWEBARCHIVE.webarchive");
+ assertTypeByNameAndData("application/x-webarchive", "testWEBARCHIVE.webarchive");
+ }
+
+ /**
+ * KML, and KMZ (zipped KML)
+ */
+ @Test
+ public void testKMLZDetection() throws Exception {
+ assertTypeByName("application/vnd.google-earth.kml+xml","testKML.kml");
+ assertTypeByData("application/vnd.google-earth.kml+xml","testKML.kml");
+ assertTypeByNameAndData("application/vnd.google-earth.kml+xml", "testKML.kml");
+
+ assertTypeByName("application/vnd.google-earth.kmz","testKMZ.kmz");
+ assertTypeByNameAndData("application/vnd.google-earth.kmz", "testKMZ.kmz");
+
+ // By data only, mimetype magic only gets us to a .zip
+ // We need to use the Zip Aware detector to get the full type
+ assertTypeByData("application/zip","testKMZ.kmz");
+ }
+
+ @Test
+ public void testCreativeSuite() throws IOException {
+ assertTypeDetection("testINDD.indd", "application/x-adobe-indesign");
+ assertTypeDetection("testPSD.psd", "image/vnd.adobe.photoshop");
+ }
+
+ @Test
+ public void testAMR() throws IOException {
+ // AMR matches on name, data or both
+ assertTypeDetection("testAMR.amr", "audio/amr");
+
+ // AMR-WB subtype shares extension, so needs data to identify
+ assertTypeDetection("testAMR-WB.amr", "audio/amr", "audio/amr-wb", "audio/amr-wb");
+
+ // Ditto for the AMR-WB+ subtype, which we don't have a sample file of yet
+ //assertTypeDetection("testAMR-WB+.amr", "audio/amr", "audio/amr-wb+", "audio/amr-wb+");
+ }
+
+ @Test
+ public void testEmail() throws IOException {
+ // EMLX
+ assertTypeDetection("testEMLX.emlx", "message/x-emlx");
+
+ // Groupwise
+ assertTypeDetection("testGroupWiseEml.eml", "message/rfc822");
+
+ // Lotus
+ assertTypeDetection("testLotusEml.eml", "message/rfc822");
+
+ // Thunderbird - doesn't currently work by name
+ assertTypeByNameAndData("message/rfc822", "testThunderbirdEml.eml");
+ }
+
+ @Test
+ public void testAxCrypt() throws Exception {
+ // test-TXT.txt encrypted with a key of "tika"
+ assertTypeDetection("testTXT-tika.axx", "application/x-axcrypt");
+ }
+
+ @Test
+ public void testWindowsEXE() throws Exception {
+ assertTypeByName("application/x-msdownload", "x.dll");
+ assertTypeByName("application/x-ms-installer", "x.msi");
+ assertTypeByName("application/x-dosexec", "x.exe");
+
+ assertTypeByData("application/x-msdownload; format=pe", "testTinyPE.exe");
+ assertTypeByNameAndData("application/x-msdownload; format=pe", "testTinyPE.exe");
+
+ // A jar file with part of a PE header, but not a full one
+ // should still be detected as a zip or jar (without/with name)
+ assertTypeByData("application/zip", "testJAR_with_PEHDR.jar");
+ assertTypeByNameAndData("application/java-archive", "testJAR_with_PEHDR.jar");
+ }
+
+ @Test
+ public void testMatroskaDetection() throws Exception {
+ assertType("video/x-matroska", "testMKV.mkv");
+ // TODO: Need custom detector data detection, see TIKA-1180
+ assertTypeByData("application/x-matroska", "testMKV.mkv");
+ assertTypeByNameAndData("video/x-matroska", "testMKV.mkv");
+ assertTypeByName("video/x-matroska", "x.mkv");
+ assertTypeByName("video/x-matroska", "x.MKV");
+ assertTypeByName("audio/x-matroska", "x.mka");
+ assertTypeByName("audio/x-matroska", "x.MKA");
+ }
+
+ @Test
+ public void testWebMDetection() throws Exception {
+ assertType("video/webm", "testWEBM.webm");
+ // TODO: Need custom detector data detection, see TIKA-1180
+ assertTypeByData("application/x-matroska", "testWEBM.webm");
+ assertTypeByNameAndData("video/webm", "testWEBM.webm");
+ assertTypeByName("video/webm", "x.webm");
+ assertTypeByName("video/webm", "x.WEBM");
+ }
+
+    /**
+     * Test getMimeType(byte[]): checks which raw byte prefixes are sniffed
+     * as plain text and which fall back to application/octet-stream.
+     */
+    @Test
+    public void testGetMimeType_byteArray() throws IOException {
+        // Plain text detection from byte-order marks. The original asserted
+        // the UTF-16LE mark twice and never exercised the UTF-16BE one.
+        assertText(new byte[] { (byte) 0xFF, (byte) 0xFE });              // UTF-16LE BOM
+        assertText(new byte[] { (byte) 0xFE, (byte) 0xFF });              // UTF-16BE BOM
+        assertText(new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF }); // UTF-8 BOM
+
+        // Plain text detection from content
+        assertText(new byte[] { 'a', 'b', 'c' });
+        assertText(new byte[] { '\t', '\r', '\n', 0x0C, 0x1B });
+        assertNotText(new byte[] { '\t', '\r', '\n', 0x0E, 0x1C });
+    }
+
+ @Test
+ public void testBerkeleyDB() throws IOException {
+ assertTypeByData(
+ "application/x-berkeley-db; format=btree; version=2",
+ "testBDB_btree_2.db");
+ assertTypeByData(
+ "application/x-berkeley-db; format=btree; version=3",
+ "testBDB_btree_3.db");
+ assertTypeByData(
+ "application/x-berkeley-db; format=btree; version=4",
+ "testBDB_btree_4.db");
+ // V4 and V5 share the same btree format
+ assertTypeByData(
+ "application/x-berkeley-db; format=btree; version=4",
+ "testBDB_btree_5.db");
+
+ assertTypeByData(
+ "application/x-berkeley-db; format=hash; version=2",
+ "testBDB_hash_2.db");
+ assertTypeByData(
+ "application/x-berkeley-db; format=hash; version=3",
+ "testBDB_hash_3.db");
+ assertTypeByData(
+ "application/x-berkeley-db; format=hash; version=4",
+ "testBDB_hash_4.db");
+ assertTypeByData(
+ "application/x-berkeley-db; format=hash; version=5",
+ "testBDB_hash_5.db");
+ }
+
+ /**
+ * CBOR typically contains HTML
+ */
+ @Test
+ public void testCBOR() throws IOException {
+ assertTypeByNameAndData("application/cbor", "NUTCH-1997.cbor");
+ assertTypeByData("application/cbor", "NUTCH-1997.cbor");
+ }
+
+ @Test
+ public void testZLIB() throws IOException {
+ // ZLIB encoded versions of testTXT.txt
+ assertTypeByData("application/zlib", "testTXT.zlib");
+ assertTypeByData("application/zlib", "testTXT.zlib0");
+ assertTypeByData("application/zlib", "testTXT.zlib5");
+ assertTypeByData("application/zlib", "testTXT.zlib9");
+ }
+
+ @Test
+ public void testTextFormats() throws Exception {
+ assertType("application/x-bibtex-text-file", "testBIBTEX.bib");
+ assertTypeByData("application/x-bibtex-text-file", "testBIBTEX.bib");
+ }
+
+ @Test
+ public void testCodeFormats() throws Exception {
+ assertType("text/x-csrc", "testC.c");
+ assertType("text/x-chdr", "testH.h");
+ assertTypeByData("text/x-csrc", "testC.c");
+ assertTypeByData("text/x-chdr", "testH.h");
+
+ assertTypeByName("text/x-java-source", "testJAVA.java");
+ assertType("text/x-java-properties", "testJAVAPROPS.properties");
+
+ assertType("text/x-matlab", "testMATLAB.m");
+ assertType("text/x-matlab", "testMATLAB_wtsgaus.m");
+ assertType("text/x-matlab", "testMATLAB_barcast.m");
+ assertTypeByData("text/x-matlab", "testMATLAB.m");
+ assertTypeByData("text/x-matlab", "testMATLAB_wtsgaus.m");
+ assertTypeByData("text/x-matlab", "testMATLAB_barcast.m");
+ }
+
+ @Test
+ public void testWebVTT() throws Exception {
+ assertType("text/vtt", "testWebVTT.vtt");
+ assertTypeByData("text/vtt", "testWebVTT.vtt");
+ }
+
+ private void assertText(byte[] prefix) throws IOException {
+ assertMagic("text/plain", prefix);
+ }
+
+ private void assertNotText(byte[] prefix) throws IOException {
+ assertMagic("application/octet-stream", prefix);
+ }
+
+    /**
+     * Runs detection on the given bytes alone (empty metadata, so no file
+     * name hint) and checks the detected type's string form.
+     */
+    private void assertMagic(String expected, byte[] prefix) throws IOException {
+        InputStream prefixStream = new ByteArrayInputStream(prefix);
+        MediaType detected = repo.detect(prefixStream, new Metadata());
+        assertNotNull(detected);
+        assertEquals(expected, detected.toString());
+    }
+
+    /**
+     * Asserts the type detected for a test document when both its content
+     * and its file name are supplied to the detector.
+     *
+     * @param expected the expected media type, in string form
+     * @param filename name of the test document, also passed as the resource name
+     */
+    private void assertType(String expected, String filename) throws Exception {
+        try (InputStream stream = getTestDocumentAsStream(filename)) {
+            // Fail clearly on a missing fixture instead of silently falling
+            // back to detection with a null stream.
+            assertNotNull("Test document not found: " + filename, stream);
+            Metadata metadata = new Metadata();
+            metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
+            assertEquals(expected, repo.detect(stream, metadata).toString());
+        }
+    }
+
+    /**
+     * Asserts the type detected from the file name alone; a null stream is
+     * passed to the detector, so no content is read.
+     */
+    private void assertTypeByName(String expected, String filename)
+            throws IOException {
+        Metadata nameOnly = new Metadata();
+        nameOnly.set(Metadata.RESOURCE_NAME_KEY, filename);
+        MediaType detected = repo.detect(null, nameOnly);
+        assertEquals(expected, detected.toString());
+    }
+
+    /**
+     * Asserts the type detected from a test document's content alone; the
+     * metadata is left empty so the file name gives no hint.
+     *
+     * @param expected the expected media type, in string form
+     * @param filename name of the test document to load
+     */
+    private void assertTypeByData(String expected, String filename)
+            throws IOException {
+        try (InputStream stream = getTestDocumentAsStream(filename)) {
+            // Without this check a missing fixture would be "detected" from
+            // a null stream rather than reported as missing.
+            assertNotNull("Test document not found: " + filename, stream);
+            Metadata metadata = new Metadata();
+            assertEquals(expected, repo.detect(stream, metadata).toString());
+        }
+    }
+
+    /**
+     * Asserts the type detected from an in-memory byte array, with empty
+     * metadata.
+     */
+    private void assertTypeByData(String expected, byte[] data)
+            throws IOException {
+        Metadata noHints = new Metadata();
+        try (InputStream bytes = new ByteArrayInputStream(data)) {
+            MediaType detected = repo.detect(bytes, noHints);
+            assertEquals(expected, detected.toString());
+        }
+    }
+
+ /**
+ * Asserts that the same type is detected by name, by data, and by
+ * name and data together.
+ */
+ private void assertTypeDetection(String filename, String type)
+ throws IOException {
+ assertTypeDetection(filename, type, type, type);
+ }
+
+ /**
+ * Runs the three detection modes on one test document, each with its
+ * own expected type.
+ *
+ * @param filename name of the test document
+ * @param byName type expected from the file name alone
+ * @param byData type expected from the content alone
+ * @param byNameAndData type expected when both are supplied
+ */
+ private void assertTypeDetection(String filename, String byName, String byData,
+ String byNameAndData) throws IOException {
+ assertTypeByName(byName, filename);
+ assertTypeByData(byData, filename);
+ assertTypeByNameAndData(byNameAndData, filename);
+ }
+
+    /**
+     * Asserts the type detected when both the document's content and its
+     * file name are available to the detector.
+     */
+    private void assertTypeByNameAndData(String expected, String filename)
+            throws IOException {
+        MediaType detected = getTypeByNameAndData(filename);
+        assertEquals(expected, detected.toString());
+    }
+
+    /**
+     * Detects the type of a test document using its content together with
+     * its file name, failing if the document cannot be found.
+     */
+    private MediaType getTypeByNameAndData(String filename) throws IOException {
+        Metadata named = new Metadata();
+        named.set(Metadata.RESOURCE_NAME_KEY, filename);
+        try (InputStream doc = getTestDocumentAsStream(filename)) {
+            assertNotNull("Test document not found: " + filename, doc);
+            return repo.detect(doc, named);
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-app/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
new file mode 100644
index 0000000..91b054e
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
@@ -0,0 +1,459 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipOutputStream;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMPDM;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.sax.BodyContentHandler;
+import org.gagravarr.tika.FlacParser;
+import org.gagravarr.tika.OpusParser;
+import org.gagravarr.tika.VorbisParser;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class AutoDetectParserTest {
+ private TikaConfig tika = TikaConfig.getDefaultConfig();
+
+ // Easy to read constants for the MIME types:
+ private static final String RAW = "application/octet-stream";
+ private static final String EXCEL = "application/vnd.ms-excel";
+ private static final String HTML = "text/html; charset=ISO-8859-1";
+ private static final String PDF = "application/pdf";
+ private static final String POWERPOINT = "application/vnd.ms-powerpoint";
+ private static final String KEYNOTE = "application/vnd.apple.keynote";
+ private static final String PAGES = "application/vnd.apple.pages";
+ private static final String NUMBERS = "application/vnd.apple.numbers";
+ private static final String CHM = "application/vnd.ms-htmlhelp";
+ private static final String RTF = "application/rtf";
+ private static final String PLAINTEXT = "text/plain; charset=ISO-8859-1";
+ private static final String UTF8TEXT = "text/plain; charset=UTF-8";
+ private static final String WORD = "application/msword";
+ private static final String XML = "application/xml";
+ private static final String RSS = "application/rss+xml";
+ private static final String BMP = "image/x-ms-bmp";
+ private static final String GIF = "image/gif";
+ private static final String JPEG = "image/jpeg";
+ private static final String PNG = "image/png";
+ private static final String OGG_VORBIS = "audio/vorbis";
+ private static final String OGG_OPUS = "audio/opus";
+ private static final String OGG_FLAC = "audio/x-oggflac";
+ private static final String FLAC_NATIVE= "audio/x-flac";
+ private static final String OPENOFFICE
+ = "application/vnd.oasis.opendocument.text";
+
+
+ /**
+ * This is where a single test is done: the resource is parsed with an
+ * AutoDetectParser, then the resulting content type and, when a fragment
+ * is given, the extracted text are checked.
+ * @param tp the parameters encapsulated in a TestParams instance
+ * @throws Exception if the resource cannot be read or parsed
+ */
+ private void assertAutoDetect(TestParams tp) throws Exception {
+ try (InputStream input = AutoDetectParserTest.class.getResourceAsStream(tp.resourceRealName)) {
+ if (input == null) {
+ fail("Could not open stream from specified resource: "
+ + tp.resourceRealName);
+ }
+ // The stated name and type are hints only; they may be wrong or null
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, tp.resourceStatedName);
+ metadata.set(Metadata.CONTENT_TYPE, tp.statedType);
+ ContentHandler handler = new BodyContentHandler();
+ new AutoDetectParser(tika).parse(input, handler, metadata);
+
+ // After parsing, the content type in the metadata must be the real one
+ assertEquals("Bad content type: " + tp,
+ tp.realType, metadata.get(Metadata.CONTENT_TYPE));
+
+ // A null fragment means the extracted text is not checked
+ if (tp.expectedContentFragment != null) {
+ assertTrue("Expected content not found: " + tp,
+ handler.toString().contains(tp.expectedContentFragment));
+ }
+ }
+ }
+
+ /**
+ * Convenience method -- its sole purpose of existence is to make the
+ * call to it more readable than it would be if a TestParams instance
+ * would need to be instantiated there.
+ *
+ * @param resourceRealName real name of resource
+ * @param resourceStatedName stated name -- will a bad name fool us?
+ * @param realType - the real MIME type
+ * @param statedType - stated MIME type - will a wrong one fool us?
+ * @param expectedContentFragment - something expected in the text
+ * @throws Exception
+ */
+ private void assertAutoDetect(String resourceRealName,
+ String resourceStatedName,
+ String realType,
+ String statedType,
+ String expectedContentFragment)
+ throws Exception {
+
+ assertAutoDetect(new TestParams(resourceRealName, resourceStatedName,
+ realType, statedType, expectedContentFragment));
+ }
+
+ /**
+ * Checks one test document under nine combinations of stated resource
+ * name (correct, null, wrong) and stated MIME type (correct, null, wrong).
+ *
+ * @param resource file name under /test-documents/
+ * @param type the MIME type expected after parsing
+ * @param content text expected in the output, or null to skip that check
+ */
+ private void assertAutoDetect(
+ String resource, String type, String content) throws Exception {
+
+ resource = "/test-documents/" + resource;
+
+ // TODO !!!! The disabled tests below should work!
+ // The correct MIME type should be determined regardless of the
+ // stated type (ContentType hint) and the stated URL name.
+ // NOTE(review): no disabled combinations are visible in this method --
+ // confirm whether the TODO above is stale.
+
+
+ // Try different combinations of correct and incorrect arguments:
+ final String wrongMimeType = RAW;
+ assertAutoDetect(resource, resource, type, type, content);
+ assertAutoDetect(resource, resource, type, null, content);
+ assertAutoDetect(resource, resource, type, wrongMimeType, content);
+
+ assertAutoDetect(resource, null, type, type, content);
+ assertAutoDetect(resource, null, type, null, content);
+ assertAutoDetect(resource, null, type, wrongMimeType, content);
+
+ final String badResource = "a.xyz";
+ assertAutoDetect(resource, badResource, type, type, content);
+ assertAutoDetect(resource, badResource, type, null, content);
+ assertAutoDetect(resource, badResource, type, wrongMimeType, content);
+ }
+
+ @Test
+ public void testKeynote() throws Exception {
+ assertAutoDetect("testKeynote.key", KEYNOTE, "A sample presentation");
+ }
+
+ @Test
+ public void testPages() throws Exception {
+ assertAutoDetect("testPages.pages", PAGES, "Sample pages document");
+ }
+
+ @Test
+ public void testNumbers() throws Exception {
+ assertAutoDetect("testNumbers.numbers", NUMBERS, "Checking Account: 300545668");
+ }
+
+ @Test
+ public void testChm() throws Exception {
+ assertAutoDetect("testChm.chm", CHM, "If you do not specify a window type or a window name, the main window is used.");
+ }
+
+ @Test
+ public void testEpub() throws Exception {
+ assertAutoDetect(
+ "testEPUB.epub", "application/epub+zip",
+ "The previous headings were subchapters");
+ }
+
+ @Test
+ public void testExcel() throws Exception {
+ assertAutoDetect("testEXCEL.xls", EXCEL, "Sample Excel Worksheet");
+ }
+
+ @Test
+ public void testHTML() throws Exception {
+ assertAutoDetect("testHTML.html", HTML, "Test Indexation Html");
+ }
+
+ @Test
+ public void testOpenOffice() throws Exception {
+ assertAutoDetect("testOpenOffice2.odt", OPENOFFICE,
+ "This is a sample Open Office document");
+ }
+
+ @Test
+ public void testPDF() throws Exception {
+ assertAutoDetect("testPDF.pdf", PDF, "Content Analysis Toolkit");
+
+ }
+
+ @Test
+ public void testPowerpoint() throws Exception {
+ assertAutoDetect("testPPT.ppt", POWERPOINT, "Sample Powerpoint Slide");
+ }
+
+ @Test
+ public void testRdfXml() throws Exception {
+ assertAutoDetect("testRDF.rdf", "application/rdf+xml", "");
+ }
+
+ @Test
+ public void testRTF() throws Exception {
+ assertAutoDetect("testRTF.rtf", RTF, "indexation Word");
+ }
+
+ @Test
+ public void testText() throws Exception {
+ assertAutoDetect("testTXT.txt", PLAINTEXT, "indexation de Txt");
+ }
+
+ @Test
+ public void testTextNonASCIIUTF8() throws Exception {
+ assertAutoDetect("testTXTNonASCIIUTF8.txt", UTF8TEXT, "The quick brown fox jumps over the lazy dog");
+ }
+
+ @Test
+ public void testWord() throws Exception {
+ assertAutoDetect("testWORD.doc", WORD, "Sample Word Document");
+ }
+
+ @Test
+ public void testXML() throws Exception {
+ assertAutoDetect("testXML.xml", XML, "Lius");
+ }
+
+ @Test
+ public void testRss() throws Exception {
+ assertAutoDetect("/test-documents/rsstest.rss", "feed", RSS, "application/rss+xml", "Sample RSS File for Junit test");
+ }
+
+ @Test
+ public void testImages() throws Exception {
+ assertAutoDetect("testBMP.bmp", BMP, null);
+ assertAutoDetect("testGIF.gif", GIF, null);
+ assertAutoDetect("testJPEG.jpg", JPEG, null);
+ assertAutoDetect("testPNG.png", PNG, null);
+ }
+
+    /**
+     * Make sure that zip bomb attacks are prevented: parsing the TIKA-216
+     * archive must end in a TikaException rather than complete normally.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-216">TIKA-216</a>
+     */
+    @Test
+    public void testZipBombPrevention() throws Exception {
+        try (InputStream tgz = AutoDetectParserTest.class.getResourceAsStream(
+                "/test-documents/TIKA-216.tgz")) {
+            // Report a missing fixture explicitly, as the other helpers in
+            // this class do, instead of failing obscurely inside the parser.
+            assertNotNull("Could not find test file TIKA-216.tgz", tgz);
+            Metadata metadata = new Metadata();
+            // -1 removes the handler's write limit
+            ContentHandler handler = new BodyContentHandler(-1);
+            new AutoDetectParser(tika).parse(tgz, handler, metadata);
+            fail("Zip bomb was not detected");
+        } catch (TikaException e) {
+            // expected
+        }
+    }
+
+    /**
+     * Make sure XML parse errors don't trigger ZIP bomb detection. The test
+     * passes if parsing completes without throwing.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-1322">TIKA-1322</a>
+     */
+    @Test
+    public void testNoBombDetectedForInvalidXml() throws Exception {
+        // create zip with ten empty / invalid XML files, 1.xml .. 10.xml
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        // try-with-resources finishes and closes the zip stream even if
+        // writing an entry fails
+        try (ZipOutputStream zos = new ZipOutputStream(baos)) {
+            for (int i = 1; i <= 10; i++) {
+                zos.putNextEntry(new ZipEntry(i + ".xml"));
+                zos.closeEntry();
+            }
+        }
+        new AutoDetectParser(tika).parse(new ByteArrayInputStream(baos.toByteArray()),
+                new BodyContentHandler(-1), new Metadata());
+    }
+
+ /**
+ * Test to ensure that the Ogg Audio parsers (Vorbis, Opus, Flac etc)
+ * have been correctly included, and are available
+ */
+ @SuppressWarnings("deprecation")
+ @Test
+ public void testOggFlacAudio() throws Exception {
+ // The four test files should all have similar test data
+ String[] testFiles = new String[] {
+ "testVORBIS.ogg", "testFLAC.flac", "testFLAC.oga",
+ "testOPUS.opus"
+ };
+ // mediaTypes[i] is the content type expected for testFiles[i]
+ MediaType[] mediaTypes = new MediaType[] {
+ MediaType.parse(OGG_VORBIS), MediaType.parse(FLAC_NATIVE),
+ MediaType.parse(OGG_FLAC), MediaType.parse(OGG_OPUS)
+ };
+
+ // Check we can load the parsers, and they claim to do the right things
+ // NOTE(review): these assertions only verify that getSupportedTypes()
+ // returns non-null, not that the named media type is in the set.
+ VorbisParser vParser = new VorbisParser();
+ assertNotNull("Parser not found for " + mediaTypes[0],
+ vParser.getSupportedTypes(new ParseContext()));
+
+ FlacParser fParser = new FlacParser();
+ assertNotNull("Parser not found for " + mediaTypes[1],
+ fParser.getSupportedTypes(new ParseContext()));
+ assertNotNull("Parser not found for " + mediaTypes[2],
+ fParser.getSupportedTypes(new ParseContext()));
+
+ OpusParser oParser = new OpusParser();
+ assertNotNull("Parser not found for " + mediaTypes[3],
+ oParser.getSupportedTypes(new ParseContext()));
+
+ // Check we found the parser
+ CompositeParser parser = (CompositeParser)tika.getParser();
+ for (MediaType mt : mediaTypes) {
+ assertNotNull("Parser not found for " + mt, parser.getParsers().get(mt) );
+ }
+
+ // Have each file parsed, and check
+ for (int i=0; i<testFiles.length; i++) {
+ String file = testFiles[i];
+ try (InputStream input = AutoDetectParserTest.class.getResourceAsStream(
+ "/test-documents/" + file)) {
+ if (input == null) {
+ fail("Could not find test file " + file);
+ }
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new AutoDetectParser(tika).parse(input, handler, metadata);
+
+ assertEquals("Incorrect content type for " + file,
+ mediaTypes[i].toString(), metadata.get(Metadata.CONTENT_TYPE));
+
+ // Check some of the common metadata
+ // Old style metadata
+ assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
+ assertEquals("Test Title", metadata.get(Metadata.TITLE));
+ // New style metadata
+ assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
+
+ // Check some of the XMPDM metadata
+ // The album is not checked for the Opus sample
+ if (!file.endsWith(".opus")) {
+ assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
+ }
+ assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
+ assertEquals("Stereo", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE));
+ assertEquals("44100", metadata.get(XMPDM.AUDIO_SAMPLE_RATE));
+
+ // Check some of the text
+ String content = handler.toString();
+ assertTrue(content.contains("Test Title"));
+ assertTrue(content.contains("Test Artist"));
+ }
+ }
+ }
+
+ /**
+ * Test case for TIKA-514. Provide constructor for AutoDetectParser that has explicit
+ * list of supported parsers.
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-514">TIKA-514</a>
+ */
+ @Test
+ public void testSpecificParserList() throws Exception {
+ AutoDetectParser parser = new AutoDetectParser(new MyDetector(), new MyParser());
+
+ InputStream is = new ByteArrayInputStream("test".getBytes(UTF_8));
+ Metadata metadata = new Metadata();
+ parser.parse(is, new BodyContentHandler(), metadata, new ParseContext());
+
+ assertEquals("value", metadata.get("MyParser"));
+ }
+
+ private static final MediaType MY_MEDIA_TYPE = new MediaType("application", "x-myparser");
+
+    /**
+     * A test detector which always returns the type supported
+     * by the test parser, ignoring both the stream and the metadata.
+     */
+    @SuppressWarnings("serial")
+    private static class MyDetector implements Detector {
+        @Override
+        public MediaType detect(InputStream input, Metadata metadata) throws IOException {
+            return MY_MEDIA_TYPE;
+        }
+    }
+
+    /**
+     * A test parser that supports only MY_MEDIA_TYPE and, instead of
+     * reading the stream, records a marker entry in the metadata so the
+     * test can tell that it was invoked.
+     */
+    @SuppressWarnings("serial")
+    private static class MyParser extends AbstractParser {
+        @Override
+        public Set<MediaType> getSupportedTypes(ParseContext context) {
+            Set<MediaType> supportedTypes = new HashSet<>();
+            supportedTypes.add(MY_MEDIA_TYPE);
+            return supportedTypes;
+        }
+
+        @Override
+        public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) {
+            metadata.add("MyParser", "value");
+        }
+    }
+
+    /**
+     * Minimal class to encapsulate all parameters -- the main reason for
+     * its existence is to aid in debugging via its toString() method.
+     *
+     * Instances are immutable: every field is set once in the constructor.
+     * Getters and setters intentionally not provided; the enclosing test
+     * class reads the fields directly.
+     */
+    private static class TestParams {
+
+        private final String resourceRealName;
+        private final String resourceStatedName;
+        private final String realType;
+        private final String statedType;
+        private final String expectedContentFragment;
+
+        /**
+         * @param resourceRealName real name of resource
+         * @param resourceStatedName stated name of resource, may be null or wrong
+         * @param realType the real MIME type
+         * @param statedType stated MIME type, may be null or wrong
+         * @param expectedContentFragment text expected in the output, or null
+         */
+        private TestParams(String resourceRealName,
+                           String resourceStatedName,
+                           String realType,
+                           String statedType,
+                           String expectedContentFragment) {
+            this.resourceRealName = resourceRealName;
+            this.resourceStatedName = resourceStatedName;
+            this.realType = realType;
+            this.statedType = statedType;
+            this.expectedContentFragment = expectedContentFragment;
+        }
+
+        /**
+         * Produces a string like the following:
+         *
+         * <pre>
+         * Test parameters:
+         *   resourceRealName = /test-documents/testEXCEL.xls
+         *   resourceStatedName = null
+         *   realType = application/vnd.ms-excel
+         *   statedType = null
+         *   expectedContentFragment = Sample Excel Worksheet
+         * </pre>
+         */
+        @Override
+        public String toString() {
+            return "Test parameters:\n"
+                    + " resourceRealName = " + resourceRealName + "\n"
+                    + " resourceStatedName = " + resourceStatedName + "\n"
+                    + " realType = " + realType + "\n"
+                    + " statedType = " + statedType + "\n"
+                    + " expectedContentFragment = " + expectedContentFragment + "\n";
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/parser/DigestingParserTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/parser/DigestingParserTest.java b/tika-app/src/test/java/org/apache/tika/parser/DigestingParserTest.java
new file mode 100644
index 0000000..66323d3
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/parser/DigestingParserTest.java
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.DigestingParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.digesting.CommonsDigester;
+import org.junit.Test;
+
+
+public class DigestingParserTest extends TikaTest {
+
+ private final static String P = TikaCoreProperties.TIKA_META_PREFIX+
+ "digest"+Metadata.NAMESPACE_PREFIX_DELIMITER;
+
+ private final int UNLIMITED = 1000000;//well, not really, but longer than input file
+ private final Parser p = new AutoDetectParser();
+
+ @Test
+ public void testBasic() throws Exception {
+ Map<CommonsDigester.DigestAlgorithm, String> expected =
+ new HashMap<CommonsDigester.DigestAlgorithm, String>();
+
+ expected.put(CommonsDigester.DigestAlgorithm.MD2,"d768c8e27b0b52c6eaabfaa7122d1d4f");
+ expected.put(CommonsDigester.DigestAlgorithm.MD5,"59f626e09a8c16ab6dbc2800c685f772");
+ expected.put(CommonsDigester.DigestAlgorithm.SHA1,"7a1f001d163ac90d8ea54c050faf5a38079788a6");
+ expected.put(CommonsDigester.DigestAlgorithm.SHA256,"c4b7fab030a8b6a9d6691f6699ac8e6f" +
+ "82bc53764a0f1430d134ae3b70c32654");
+ expected.put(CommonsDigester.DigestAlgorithm.SHA384,"ebe368b9326fef44408290724d187553"+
+ "8b8a6923fdf251ddab72c6e4b5d54160" +
+ "9db917ba4260d1767995a844d8d654df");
+ expected.put(CommonsDigester.DigestAlgorithm.SHA512,"ee46d973ee1852c018580c242955974d"+
+ "da4c21f36b54d7acd06fcf68e974663b"+
+ "fed1d256875be58d22beacf178154cc3"+
+ "a1178cb73443deaa53aa0840324708bb");
+
+ //test each one
+ for (CommonsDigester.DigestAlgorithm algo : CommonsDigester.DigestAlgorithm.values()) {
+ Metadata m = new Metadata();
+ XMLResult xml = getXML("test_recursive_embedded.docx",
+ new DigestingParser(p, new CommonsDigester(UNLIMITED, algo)), m);
+ assertEquals(algo.toString(), expected.get(algo), m.get(P + algo.toString()));
+ }
+
+
+ //test comma separated
+ CommonsDigester.DigestAlgorithm[] algos = CommonsDigester.parse("md5,sha256,sha384,sha512");
+ Metadata m = new Metadata();
+ XMLResult xml = getXML("test_recursive_embedded.docx",
+ new DigestingParser(p, new CommonsDigester(UNLIMITED, algos)), m);
+ for (CommonsDigester.DigestAlgorithm algo : new CommonsDigester.DigestAlgorithm[]{
+ CommonsDigester.DigestAlgorithm.MD5,
+ CommonsDigester.DigestAlgorithm.SHA256,
+ CommonsDigester.DigestAlgorithm.SHA384,
+ CommonsDigester.DigestAlgorithm.SHA512}) {
+ assertEquals(algo.toString(), expected.get(algo), m.get(P + algo.toString()));
+ }
+
+ assertNull(m.get(P+CommonsDigester.DigestAlgorithm.MD2.toString()));
+ assertNull(m.get(P+CommonsDigester.DigestAlgorithm.SHA1.toString()));
+
+ }
+
+    /**
+     * Checks that a digester with a read limit hashes only the first
+     * {@code limit} bytes: the MD5 recorded for a {@code limit}-byte prefix
+     * of the document must equal the MD5 recorded for the whole document.
+     */
+    @Test
+    public void testLimitedRead() throws Exception {
+        CommonsDigester.DigestAlgorithm algo = CommonsDigester.DigestAlgorithm.MD5;
+        int limit = 100;
+        byte[] bytes = new byte[limit];
+        try (InputStream is = getResourceAsStream("/test-documents/test_recursive_embedded.docx")) {
+            // A single read() may return fewer bytes than requested, so keep
+            // reading until the prefix buffer is full or the stream ends.
+            int filled = 0;
+            while (filled < limit) {
+                int n = is.read(bytes, filled, limit - filled);
+                if (n < 0) {
+                    break;
+                }
+                filled += n;
+            }
+            assertEquals("Test document is shorter than the digest limit", limit, filled);
+        }
+        Metadata m = new Metadata();
+        try {
+            getXML(TikaInputStream.get(bytes),
+                    new DigestingParser(p, new CommonsDigester(limit, algo)), m);
+        } catch (TikaException e) {
+            //thrown because this is just a file fragment
+            assertContains("Unexpected RuntimeException from org.apache.tika.parser.microsoft.ooxml.OOXMLParser",
+                    e.getMessage());
+        }
+        String expectedMD5 = m.get(P+"MD5");
+        // Guard against a vacuous pass where neither run records a digest
+        assertTrue("No digest recorded for the truncated input", expectedMD5 != null);
+
+        m = new Metadata();
+        getXML("test_recursive_embedded.docx",
+                new DigestingParser(p, new CommonsDigester(limit, algo)), m);
+        assertEquals(expectedMD5, m.get(P+"MD5"));
+    }
+
+ @Test
+ public void testReset() throws Exception {
+ String expectedMD5 = "1643c2cef21e36720c54f4f6cb3349d0";
+ Metadata m = new Metadata();
+ XMLResult xml = getXML("test_recursive_embedded.docx",
+ new DigestingParser(p, new CommonsDigester(100, CommonsDigester.DigestAlgorithm.MD5)), m);
+ assertEquals(expectedMD5, m.get(P+"MD5"));
+ }
+
+ @Test
+ public void testNegativeMaxMarkLength() throws Exception {
+ Metadata m = new Metadata();
+ boolean ex = false;
+ try {
+ XMLResult xml = getXML("test_recursive_embedded.docx",
+ new DigestingParser(p, new CommonsDigester(-1, CommonsDigester.DigestAlgorithm.MD5)), m);
+ } catch (IllegalArgumentException e) {
+ ex = true;
+ }
+ assertTrue("Exception not thrown", ex);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/parser/ParsingReaderTest.java b/tika-app/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
new file mode 100644
index 0000000..71c07b7
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.io.Reader;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Test;
+
+/**
+ * Tests that {@link ParsingReader} exposes the text extracted from a document
+ * as a character stream.
+ */
+public class ParsingReaderTest {
+
+    /**
+     * Reads {@code expected.length()} characters from the reader and checks each
+     * one against the corresponding character of {@code expected}.
+     *
+     * @param expected the characters the reader must return next, in order
+     * @param reader   the reader under test
+     * @throws Exception if reading from the reader fails
+     */
+    private static void assertReads(String expected, Reader reader) throws Exception {
+        for (int i = 0; i < expected.length(); i++) {
+            assertEquals("character at offset " + i, expected.charAt(i), reader.read());
+        }
+    }
+
+    /**
+     * Plain text is expected to come back unchanged, followed by a newline and
+     * then end-of-stream.
+     */
+    @Test
+    public void testPlainText() throws Exception {
+        String data = "test content";
+        InputStream stream = new ByteArrayInputStream(data.getBytes(UTF_8));
+        // try-with-resources closes the reader even when an assertion fails
+        try (Reader reader = new ParsingReader(stream, "test.txt")) {
+            assertReads(data + "\n", reader);
+            assertEquals(-1, reader.read());
+        }
+        // nothing may be left in the source stream once the reader is closed
+        assertEquals(-1, stream.read());
+    }
+
+    /**
+     * For XML input the expected output is the element text with extra
+     * whitespace around it, followed by a newline and then end-of-stream.
+     */
+    @Test
+    public void testXML() throws Exception {
+        String data = "<p>test <span>content</span></p>";
+        InputStream stream = new ByteArrayInputStream(data.getBytes(UTF_8));
+        // try-with-resources closes the reader even when an assertion fails
+        try (Reader reader = new ParsingReader(stream, "test.xml")) {
+            assertReads(" test " + " content\n", reader);
+            assertEquals(-1, reader.read());
+        }
+        // nothing may be left in the source stream once the reader is closed
+        assertEquals(-1, stream.read());
+    }
+
+    /**
+     * Test case for TIKA-203: metadata must be populated as soon as the reader
+     * has been constructed, before any text is read from it.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-203">TIKA-203</a>
+     */
+    @Test
+    public void testMetadata() throws Exception {
+        Metadata metadata = new Metadata();
+        InputStream stream = ParsingReaderTest.class.getResourceAsStream(
+                "/test-documents/testEXCEL.xls");
+        try (Reader reader = new ParsingReader(
+                new AutoDetectParser(), stream, metadata, new ParseContext())) {
+            // Metadata should already be available
+            assertEquals("Simple Excel document", metadata.get(TikaCoreProperties.TITLE));
+            // Check that the internal buffering isn't broken
+            assertReads("Feuil1", reader);
+        }
+    }
+
+}