You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/03/22 02:19:17 UTC
[05/13] tika git commit: TIKA-1855 -- first pass. Need to turn back
on the forbidden-apis testCheck. More clean up remains.
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
deleted file mode 100644
index c3d13b7..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ /dev/null
@@ -1,1047 +0,0 @@
-/*
-* Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.mime;
-
-// Junit imports
-import static java.nio.charset.StandardCharsets.UTF_16BE;
-import static java.nio.charset.StandardCharsets.UTF_16LE;
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNotSame;
-
-import java.io.ByteArrayInputStream;
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.net.URL;
-
-import org.apache.tika.Tika;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.metadata.Metadata;
-import org.junit.Before;
-import org.junit.Test;
-
-/**
- *
- * Test Suite for the {@link MimeTypes} repository.
- *
- */
-public class TestMimeTypes {
-
- private Tika tika;
-
- private MimeTypes repo;
-
- private URL u;
-
- private static final File f = new File("/a/b/c/x.pdf");
-
- @Before
- public void setUp() throws Exception{
- TikaConfig config = TikaConfig.getDefaultConfig();
- repo = config.getMimeRepository();
- tika = new Tika(config);
- u = new URL("http://mydomain.com/x.pdf?x=y");
- }
-
- @Test
- public void testCaseSensitivity() {
- String type = tika.detect("test.PDF");
- assertNotNull(type);
- assertEquals(type, tika.detect("test.pdf"));
- assertEquals(type, tika.detect("test.PdF"));
- assertEquals(type, tika.detect("test.pdF"));
- }
-
- @Test
- public void testLoadMimeTypes() throws MimeTypeException {
- assertNotNull(repo.forName("application/octet-stream"));
- assertNotNull(repo.forName("text/x-tex"));
- }
-
- /**
- * Tests MIME type determination based solely on the URL's extension.
- */
- @Test
- public void testGuessMimeTypes() throws Exception {
- assertTypeByName("application/pdf", "x.pdf");
- assertEquals("application/pdf", tika.detect(u.toExternalForm()));
- assertEquals("application/pdf", tika.detect(f.getPath()));
- assertTypeByName("text/plain", "x.txt");
- assertTypeByName("text/html", "x.htm");
- assertTypeByName("text/html", "x.html");
- assertTypeByName("application/xhtml+xml", "x.xhtml");
- assertTypeByName("application/xml", "x.xml");
- assertTypeByName("application/zip", "x.zip");
- assertTypeByName("application/vnd.oasis.opendocument.text", "x.odt");
- assertTypeByName("application/octet-stream", "x.unknown");
-
- // Test for the MS Office media types and file extensions listed in
- // http://blogs.msdn.com/vsofficedeveloper/pages/Office-2007-Open-XML-MIME-Types.aspx
- assertTypeByName("application/msword", "x.doc");
- assertTypeByName("application/msword", "x.dot");
- assertTypeByName("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "x.docx");
- assertTypeByName("application/vnd.openxmlformats-officedocument.wordprocessingml.template", "x.dotx");
- assertTypeByName("application/vnd.ms-word.document.macroenabled.12", "x.docm");
- assertTypeByName("application/vnd.ms-word.template.macroenabled.12", "x.dotm");
- assertTypeByName("application/vnd.ms-excel", "x.xls");
- assertTypeByName("application/vnd.ms-excel", "x.xlt");
- assertTypeByName("application/vnd.ms-excel", "x.xla");
- assertTypeByName("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "x.xlsx");
- assertTypeByName("application/vnd.openxmlformats-officedocument.spreadsheetml.template", "x.xltx");
- assertTypeByName("application/vnd.ms-excel.sheet.macroenabled.12", "x.xlsm");
- assertTypeByName("application/vnd.ms-excel.template.macroenabled.12", "x.xltm");
- assertTypeByName("application/vnd.ms-excel.addin.macroenabled.12", "x.xlam");
- assertTypeByName("application/vnd.ms-excel.sheet.binary.macroenabled.12", "x.xlsb");
- assertTypeByName("application/vnd.ms-powerpoint", "x.ppt");
- assertTypeByName("application/vnd.ms-powerpoint", "x.pot");
- assertTypeByName("application/vnd.ms-powerpoint", "x.pps");
- assertTypeByName("application/vnd.ms-powerpoint", "x.ppa");
- assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.presentation", "x.pptx");
- assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.template", "x.potx");
- assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.slideshow", "x.ppsx");
- assertTypeByName("application/vnd.ms-powerpoint.addin.macroenabled.12", "x.ppam");
- assertTypeByName("application/vnd.ms-powerpoint.presentation.macroenabled.12", "x.pptm");
- assertTypeByName("application/vnd.ms-powerpoint.template.macroenabled.12", "x.potm");
- assertTypeByName("application/vnd.ms-powerpoint.slideshow.macroenabled.12", "x.ppsm");
- }
-
- /**
- * Note - detecting container formats by mime magic is very very
- * iffy, as we can't be sure where things will end up.
- * People really ought to use the container aware detection...
- */
- @Test
- public void testOLE2Detection() throws Exception {
- // This one has the properties block near the start, so our mime
- // magic will spot it
- assertTypeByData("application/vnd.ms-excel", "testEXCEL.xls");
-
- // These quite legitimately don't have their properties block
- // as one of the first couple of entries
- // As such, our mime magic can't figure them out...
- assertTypeByData("application/x-tika-msoffice", "testWORD.doc");
- assertTypeByData("application/x-tika-msoffice", "testPPT.ppt");
-
-
- // By name + data:
-
- // Those we got right to start with are fine
- assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL.xls");
-
- // And the name lets us specialise the generic OLE2
- // ones to their actual type
- assertTypeByNameAndData("application/vnd.ms-powerpoint", "testPPT.ppt");
- assertTypeByNameAndData("application/msword", "testWORD.doc");
- }
-
- /**
- * Files generated by Works 7.0 Spreadsheet application use the OLE2
- * structure and resemble Excel files (they contain a "Workbook"). They are
- * not Excel though. They are distinguished from Excel files by an
- * additional top-level entry below the root of the POI filesystem.
- *
- * @throws Exception
- */
- @Test
- public void testWorksSpreadsheetDetection() throws Exception {
- assertTypeDetection("testWORKSSpreadsheet7.0.xlr",
- // with name-only, everything should be all right
- "application/x-tika-msworks-spreadsheet",
- // this is possible due to MimeTypes guessing the type
- // based on the WksSSWorkBook near the beginning of the
- // file
- "application/x-tika-msworks-spreadsheet",
- // this is right, the magic-based detection works, there is
- // no need for the name-based detection to refine it
- "application/x-tika-msworks-spreadsheet");
- }
-
- @Test
- public void testStarOfficeDetection() throws Exception {
- assertTypeDetection("testVORCalcTemplate.vor",
- "application/x-staroffice-template",
- "application/vnd.stardivision.calc",
- "application/vnd.stardivision.calc");
- assertTypeDetection("testVORDrawTemplate.vor",
- "application/x-staroffice-template",
- "application/vnd.stardivision.draw",
- "application/vnd.stardivision.draw");
- assertTypeDetection("testVORImpressTemplate.vor",
- "application/x-staroffice-template",
- "application/vnd.stardivision.impress",
- "application/vnd.stardivision.impress");
- assertTypeDetection("testVORWriterTemplate.vor",
- "application/x-staroffice-template",
- "application/vnd.stardivision.writer",
- "application/vnd.stardivision.writer");
-
- assertTypeDetection("testStarOffice-5.2-calc.sdc",
- "application/vnd.stardivision.calc",
- "application/vnd.stardivision.calc",
- "application/vnd.stardivision.calc");
- assertTypeDetection("testStarOffice-5.2-draw.sda",
- "application/vnd.stardivision.draw",
- "application/vnd.stardivision.draw",
- "application/vnd.stardivision.draw");
- assertTypeDetection("testStarOffice-5.2-impress.sdd",
- "application/vnd.stardivision.impress",
- "application/vnd.stardivision.impress",
- "application/vnd.stardivision.impress");
- assertTypeDetection("testStarOffice-5.2-writer.sdw",
- "application/vnd.stardivision.writer",
- "application/vnd.stardivision.writer",
- "application/vnd.stardivision.writer");
- }
-
- /**
- * Files generated by Works Word Processor versions 3.0 and 4.0 use the
- * OLE2 structure. They don't resemble Word though.
- *
- * @throws Exception
- */
- @Test
- public void testOldWorksWordProcessorDetection() throws Exception {
- assertTypeDetection(
- "testWORKSWordProcessor3.0.wps",
- // .wps is just like any other works extension
- "application/vnd.ms-works",
- // this is due to MatOST substring
- "application/vnd.ms-works",
- // magic-based detection works, no need to refine it
- "application/vnd.ms-works");
-
- // files in version 4.0 are no different from those in version 3.0
- assertTypeDetection(
- "testWORKSWordProcessor4.0.wps",
- "application/vnd.ms-works",
- "application/vnd.ms-works",
- "application/vnd.ms-works");
- }
-
- /**
- * Files from Excel 2 through 4 are based on the BIFF record
- * structure, but without a wrapping OLE2 structure.
- * Excel 5 and Excel 95+ work on OLE2
- */
- @Test
- public void testOldExcel() throws Exception {
- // With just a name, we'll think everything's a new Excel file
- assertTypeByName("application/vnd.ms-excel","testEXCEL_4.xls");
- assertTypeByName("application/vnd.ms-excel","testEXCEL_5.xls");
- assertTypeByName("application/vnd.ms-excel","testEXCEL_95.xls");
-
- // With data, we can work out if it's old or new style
- assertTypeByData("application/vnd.ms-excel.sheet.4","testEXCEL_4.xls");
- assertTypeByData("application/x-tika-msoffice","testEXCEL_5.xls");
- assertTypeByData("application/x-tika-msoffice","testEXCEL_95.xls");
-
- assertTypeByNameAndData("application/vnd.ms-excel.sheet.4","testEXCEL_4.xls");
- assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL_5.xls");
- assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL_95.xls");
- }
-
- /**
- * Note - detecting container formats by mime magic is very very
- * iffy, as we can't be sure where things will end up.
- * People really ought to use the container aware detection...
- */
- @Test
- public void testOoxmlDetection() throws Exception {
- // These two do luckily have [Content_Types].xml near the start,
- // so our mime magic will spot them
- assertTypeByData("application/x-tika-ooxml", "testEXCEL.xlsx");
- assertTypeByData("application/x-tika-ooxml", "testPPT.pptx");
-
- // This one quite legitimately doesn't have its [Content_Types].xml
- // file as one of the first couple of entries
- // As such, our mime magic can't figure it out...
- assertTypeByData("application/zip", "testWORD.docx");
-
- // If we give the filename as well as the data, we can
- // specialise the ooxml generic one to the correct type
- assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "testEXCEL.xlsx");
- assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.presentationml.presentation", "testPPT.pptx");
- assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "testWORD.docx");
-
- // Test a few of the less usual ones
- assertTypeByNameAndData("application/vnd.ms-excel.sheet.binary.macroenabled.12","testEXCEL.xlsb");
- assertTypeByNameAndData("application/vnd.ms-powerpoint.presentation.macroenabled.12", "testPPT.pptm");
- assertTypeByNameAndData("application/vnd.ms-powerpoint.template.macroenabled.12", "testPPT.potm");
- assertTypeByNameAndData("application/vnd.ms-powerpoint.slideshow.macroenabled.12", "testPPT.ppsm");
- }
-
- /**
- * Note - container based formats, needs container detection
- * to be properly correct
- */
- @Test
- public void testVisioDetection() throws Exception {
- // By Name, should get it right
- assertTypeByName("application/vnd.visio", "testVISIO.vsd");
- assertTypeByName("application/vnd.ms-visio.drawing.macroenabled.12", "testVISIO.vsdm");
- assertTypeByName("application/vnd.ms-visio.drawing", "testVISIO.vsdx");
- assertTypeByName("application/vnd.ms-visio.stencil.macroenabled.12", "testVISIO.vssm");
- assertTypeByName("application/vnd.ms-visio.stencil", "testVISIO.vssx");
- assertTypeByName("application/vnd.ms-visio.template.macroenabled.12", "testVISIO.vstm");
- assertTypeByName("application/vnd.ms-visio.template", "testVISIO.vstx");
-
- // By Name and Data, should get it right
- assertTypeByNameAndData("application/vnd.visio", "testVISIO.vsd");
- assertTypeByNameAndData("application/vnd.ms-visio.drawing.macroenabled.12", "testVISIO.vsdm");
- assertTypeByNameAndData("application/vnd.ms-visio.drawing", "testVISIO.vsdx");
- assertTypeByNameAndData("application/vnd.ms-visio.stencil.macroenabled.12", "testVISIO.vssm");
- assertTypeByNameAndData("application/vnd.ms-visio.stencil", "testVISIO.vssx");
- assertTypeByNameAndData("application/vnd.ms-visio.template.macroenabled.12", "testVISIO.vstm");
- assertTypeByNameAndData("application/vnd.ms-visio.template", "testVISIO.vstx");
-
- // By Data only, will get the container parent
- assertTypeByData("application/x-tika-msoffice", "testVISIO.vsd");
- assertTypeByData("application/x-tika-ooxml", "testVISIO.vsdm");
- assertTypeByData("application/x-tika-ooxml", "testVISIO.vsdx");
- assertTypeByData("application/x-tika-ooxml", "testVISIO.vssm");
- assertTypeByData("application/x-tika-ooxml", "testVISIO.vssx");
- assertTypeByData("application/x-tika-ooxml", "testVISIO.vstm");
- assertTypeByData("application/x-tika-ooxml", "testVISIO.vstx");
- }
-
- /**
- * Note - detecting container formats by mime magic is very very
- * iffy, as we can't be sure where things will end up.
- * People really ought to use the container aware detection...
- */
- @Test
- public void testIWorkDetection() throws Exception {
- // By name is easy
- assertTypeByName("application/vnd.apple.keynote", "testKeynote.key");
- assertTypeByName("application/vnd.apple.numbers", "testNumbers.numbers");
- assertTypeByName("application/vnd.apple.pages", "testPages.pages");
-
- // We can't do it by data, as we'd need to unpack
- // the zip file to check the XML
- assertTypeByData("application/zip", "testKeynote.key");
-
- assertTypeByNameAndData("application/vnd.apple.keynote", "testKeynote.key");
- assertTypeByNameAndData("application/vnd.apple.numbers", "testNumbers.numbers");
- assertTypeByNameAndData("application/vnd.apple.pages", "testPages.pages");
- }
-
- @Test
- public void testArchiveDetection() throws Exception {
- assertTypeByName("application/x-archive", "test.ar");
- assertTypeByName("application/zip", "test.zip");
- assertTypeByName("application/x-tar", "test.tar");
- assertTypeByName("application/gzip", "test.tgz"); // Sees GZIP, not the tar contents of it
- assertTypeByName("application/x-cpio", "test.cpio");
-
- // TODO Add an example .deb and .udeb, then check these
-
- // Check the mime magic patterns for them work too
- assertTypeByData("application/x-archive", "testARofText.ar");
- assertTypeByData("application/x-archive", "testARofSND.ar");
- assertTypeByData("application/zip", "test-documents.zip");
- assertTypeByData("application/x-gtar", "test-documents.tar"); // GNU TAR
- assertTypeByData("application/gzip", "test-documents.tgz"); // Sees GZIP, not the tar contents of it
- assertTypeByData("application/x-cpio", "test-documents.cpio");
-
- // For spanned zip files, the .zip file doesn't have the header, it's the other parts
- assertTypeByData("application/octet-stream", "test-documents-spanned.zip");
- assertTypeByData("application/zip", "test-documents-spanned.z01");
- }
-
- @Test
- public void testFeedsDetection() throws Exception {
- assertType("application/rss+xml", "rsstest.rss");
- assertType("application/atom+xml", "testATOM.atom");
- assertTypeByData("application/rss+xml", "rsstest.rss");
- assertTypeByName("application/rss+xml", "rsstest.rss");
- assertTypeByData("application/atom+xml", "testATOM.atom");
- assertTypeByName("application/atom+xml", "testATOM.atom");
- }
-
- @Test
- public void testFitsDetection() throws Exception {
- // FITS image created using imagemagick convert of testJPEG.jpg
- assertType("application/fits", "testFITS.fits");
- assertTypeByData("application/fits", "testFITS.fits");
- assertTypeByName("application/fits", "testFITS.fits");
- }
-
- @Test
- public void testJpegDetection() throws Exception {
- assertType("image/jpeg", "testJPEG.jpg");
- assertTypeByData("image/jpeg", "testJPEG.jpg");
- assertTypeByName("image/jpeg", "x.jpg");
- assertTypeByName("image/jpeg", "x.JPG");
- assertTypeByName("image/jpeg", "x.jpeg");
- assertTypeByName("image/jpeg", "x.JPEG");
- assertTypeByName("image/jpeg", "x.jpe");
- assertTypeByName("image/jpeg", "x.jif");
- assertTypeByName("image/jpeg", "x.jfif");
- assertTypeByName("image/jpeg", "x.jfi");
-
- assertType("image/jp2", "testJPEG.jp2");
- assertTypeByData("image/jp2", "testJPEG.jp2");
- assertTypeByName("image/jp2", "x.jp2");
- }
-
- @Test
- public void testBpgDetection() throws Exception {
- assertType("image/x-bpg", "testBPG.bpg");
- assertTypeByData("image/x-bpg", "testBPG.bpg");
- assertTypeByData("image/x-bpg", "testBPG_commented.bpg");
- assertTypeByName("image/x-bpg", "x.bpg");
- }
-
- @Test
- public void testTiffDetection() throws Exception {
- assertType("image/tiff", "testTIFF.tif");
- assertTypeByData("image/tiff", "testTIFF.tif");
- assertTypeByName("image/tiff", "x.tiff");
- assertTypeByName("image/tiff", "x.tif");
- assertTypeByName("image/tiff", "x.TIF");
- }
-
- @Test
- public void testGifDetection() throws Exception {
- assertType("image/gif", "testGIF.gif");
- assertTypeByData("image/gif", "testGIF.gif");
- assertTypeByName("image/gif", "x.gif");
- assertTypeByName("image/gif", "x.GIF");
- }
-
- @Test
- public void testPngDetection() throws Exception {
- assertType("image/png", "testPNG.png");
- assertTypeByData("image/png", "testPNG.png");
- assertTypeByName("image/png", "x.png");
- assertTypeByName("image/png", "x.PNG");
- }
-
- @Test
- public void testWEBPDetection() throws Exception {
- assertType("image/webp", "testWEBP.webp");
- assertTypeByData("image/webp", "testWEBP.webp");
- assertTypeByName("image/webp", "x.webp");
- assertTypeByName("image/webp", "x.WEBP");
- }
-
- @Test
- public void testBmpDetection() throws Exception {
- assertType("image/x-ms-bmp", "testBMP.bmp");
- assertTypeByData("image/x-ms-bmp", "testBMP.bmp");
- assertTypeByName("image/x-ms-bmp", "x.bmp");
- assertTypeByName("image/x-ms-bmp", "x.BMP");
- assertTypeByName("image/x-ms-bmp", "x.dib");
- assertTypeByName("image/x-ms-bmp", "x.DIB");
- //false positive check -- contains part of BMP signature
- assertType("text/plain", "testBMPfp.txt");
- }
-
- @Test
- public void testPnmDetection() throws Exception {
- assertType("image/x-portable-bitmap", "testPBM.pbm");
- assertType("image/x-portable-graymap", "testPGM.pgm");
- assertType("image/x-portable-pixmap", "testPPM.ppm");
- assertTypeByData("image/x-portable-bitmap", "testPBM.pbm");
- assertTypeByData("image/x-portable-graymap", "testPGM.pgm");
- assertTypeByData("image/x-portable-pixmap", "testPPM.ppm");
- assertTypeByName("image/x-portable-anymap", "x.pnm");
- assertTypeByName("image/x-portable-anymap", "x.PNM");
- assertTypeByName("image/x-portable-bitmap", "x.pbm");
- assertTypeByName("image/x-portable-bitmap", "x.PBM");
- assertTypeByName("image/x-portable-graymap", "x.pgm");
- assertTypeByName("image/x-portable-graymap", "x.PGM");
- assertTypeByName("image/x-portable-pixmap", "x.ppm");
- assertTypeByName("image/x-portable-pixmap", "x.PPM");
- }
-
- @Test
- public void testPictDetection() throws Exception {
- assertType("image/x-pict", "testPICT.pct");
- assertTypeByData("image/x-pict", "testPICT.pct");
- assertTypeByName("image/x-pict", "x.pic");
- assertTypeByName("image/x-pict", "x.PCT");
- }
-
- @Test
- public void testCgmDetection() throws Exception {
- // TODO: Need a test image file
- assertTypeByName("image/cgm", "x.cgm");
- assertTypeByName("image/cgm", "x.CGM");
- }
-
- @Test
- public void testRdfXmlDetection() throws Exception {
- assertTypeByName("application/rdf+xml", "x.rdf");
- assertTypeByName("application/rdf+xml", "x.owl");
- }
-
- @Test
- public void testSvgDetection() throws Exception {
- assertType("image/svg+xml", "testSVG.svg");
- assertTypeByData("image/svg+xml", "testSVG.svg");
- assertTypeByName("image/svg+xml", "x.svg");
- assertTypeByName("image/svg+xml", "x.SVG");
-
- // Should *.svgz be svg or gzip
- assertType("application/gzip", "testSVG.svgz");
- assertTypeByData("application/gzip", "testSVG.svgz");
- assertTypeByName("image/svg+xml", "x.svgz");
- assertTypeByName("image/svg+xml", "x.SVGZ");
- }
-
- @Test
- public void testPdfDetection() throws Exception {
- // PDF extension by name is enough
- assertTypeByName("application/pdf", "x.pdf");
- assertTypeByName("application/pdf", "x.PDF");
-
- // For normal PDFs, can get by name or data or both
- assertType("application/pdf", "testPDF.pdf");
- assertTypeByData("application/pdf", "testPDF.pdf");
-
- // PDF with a BoM works both ways too
- assertType("application/pdf", "testPDF_bom.pdf");
- assertTypeByData("application/pdf", "testPDF_bom.pdf");
- }
-
- @Test
- public void testSwfDetection() throws Exception {
- assertTypeByName("application/x-shockwave-flash", "x.swf");
- assertTypeByName("application/x-shockwave-flash", "x.SWF");
- assertTypeByName("application/x-shockwave-flash", "test1.swf");
- assertTypeByName("application/x-shockwave-flash", "test2.swf");
- assertTypeByName("application/x-shockwave-flash", "test3.swf");
- }
-
- @Test
- public void testDwgDetection() throws Exception {
- assertTypeByName("image/vnd.dwg", "x.dwg");
- assertTypeByData("image/vnd.dwg", "testDWG2004.dwg");
- assertTypeByData("image/vnd.dwg", "testDWG2007.dwg");
- assertTypeByData("image/vnd.dwg", "testDWG2010.dwg");
- }
-
- @Test
- public void testprtDetection() throws Exception {
- assertTypeByName("application/x-prt", "x.prt");
- assertTypeByData("application/x-prt", "testCADKEY.prt");
- }
-
- /**
- * Formats which are based on plain text
- */
- @Test
- public void testTextBasedFormatsDetection() throws Exception {
- assertTypeByName("text/plain", "testTXT.txt");
- assertType( "text/plain", "testTXT.txt");
-
- assertTypeByName("text/css", "testCSS.css");
- assertType( "text/css", "testCSS.css");
-
- assertTypeByName("text/csv", "testCSV.csv");
- assertType( "text/csv", "testCSV.csv");
-
- assertTypeByName("text/html", "testHTML.html");
- assertType( "text/html", "testHTML.html");
-
- assertTypeByName("application/javascript", "testJS.js");
- assertType( "application/javascript", "testJS.js");
- }
-
- @Test
- public void testJavaDetection() throws Exception {
- // TODO Classloader doesn't seem to find the .class file in test-documents
- //assertTypeDetection("AutoDetectParser.class", "application/java-vm");
-
- // OSX Native Extension
- assertTypeDetection("testJNILIB.jnilib", "application/x-java-jnilib");
- }
-
- @Test
- public void testXmlAndHtmlDetection() throws Exception {
- assertTypeByData("application/xml", "<?xml version=\"1.0\" encoding=\"UTF-8\"?><records><record/></records>"
- .getBytes(UTF_8));
- assertTypeByData("application/xml", "\uFEFF<?xml version=\"1.0\" encoding=\"UTF-16\"?><records><record/></records>"
- .getBytes(UTF_16LE));
- assertTypeByData("application/xml", "\uFEFF<?xml version=\"1.0\" encoding=\"UTF-16\"?><records><record/></records>"
- .getBytes(UTF_16BE));
- assertTypeByData("application/xml", "<!-- XML without processing instructions --><records><record/></records>"
- .getBytes(UTF_8));
- assertTypeByData("text/html", "<html><body>HTML</body></html>"
- .getBytes(UTF_8));
- assertTypeByData("text/html", "<!-- HTML comment --><html><body>HTML</body></html>"
- .getBytes(UTF_8));
- }
-
- @Test
- public void testWmfDetection() throws Exception {
- assertTypeByName("application/x-msmetafile", "x.wmf");
- assertTypeByData("application/x-msmetafile", "testWMF.wmf");
- assertTypeByName("application/x-msmetafile", "x.WMF");
-
- assertTypeByName("application/x-emf", "x.emf");
- assertTypeByData("application/x-emf","testEMF.emf");
- assertTypeByName("application/x-emf", "x.EMF");
- // TODO: Need a test wmz file
- assertTypeByName("application/x-ms-wmz", "x.wmz");
- assertTypeByName("application/x-ms-wmz", "x.WMZ");
- // TODO: Need a test emz file
- assertTypeByName("application/gzip", "x.emz");
- assertTypeByName("application/gzip", "x.EMZ");
- }
-
- @Test
- public void testPsDetection() throws Exception {
- // TODO: Need a test postscript file
- assertTypeByName("application/postscript", "x.ps");
- assertTypeByName("application/postscript", "x.PS");
- assertTypeByName("application/postscript", "x.eps");
- assertTypeByName("application/postscript", "x.epsf");
- assertTypeByName("application/postscript", "x.epsi");
- }
-
- @Test
- public void testMicrosoftMultiMediaDetection() throws Exception {
- assertTypeByName("video/x-ms-asf", "x.asf");
- assertTypeByName("video/x-ms-wmv", "x.wmv");
- assertTypeByName("audio/x-ms-wma", "x.wma");
-
- assertTypeByData("video/x-ms-asf", "testASF.asf");
- assertTypeByData("video/x-ms-wmv", "testWMV.wmv");
- assertTypeByData("audio/x-ms-wma", "testWMA.wma");
- }
-
- /**
- * All 3 DITA types are in theory handled by the same mimetype,
- * but we specialise them
- */
- @Test
- public void testDITADetection() throws Exception {
- assertTypeByName("application/dita+xml; format=topic", "test.dita");
- assertTypeByName("application/dita+xml; format=map", "test.ditamap");
- assertTypeByName("application/dita+xml; format=val", "test.ditaval");
-
- assertTypeByData("application/dita+xml; format=task", "testDITA.dita");
- assertTypeByData("application/dita+xml; format=concept", "testDITA2.dita");
- assertTypeByData("application/dita+xml; format=map", "testDITA.ditamap");
-
- assertTypeByNameAndData("application/dita+xml; format=task", "testDITA.dita");
- assertTypeByNameAndData("application/dita+xml; format=concept", "testDITA2.dita");
- assertTypeByNameAndData("application/dita+xml; format=map", "testDITA.ditamap");
-
- // These are all children of the official type
- assertEquals("application/dita+xml",
- repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA.ditamap")).toString());
- assertEquals("application/dita+xml",
- repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA.dita")).toString());
- // Concept inherits from topic
- assertEquals("application/dita+xml; format=topic",
- repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA2.dita")).toString());
- }
-
- /**
- * Checks regex-based file name patterns (TIKA-194): a pattern registered
- * with the third {@code addPattern} argument set to {@code true} must be
- * used to detect a matching file name, while the same pattern registered
- * with {@code false} for a second type must not make that second type the
- * detection result.
- *
- * @since TIKA-194
- */
- @Test
- public void testJavaRegex() throws Exception{
- MimeType testType = new MimeType(MediaType.parse("foo/bar"));
- this.repo.add(testType);
- assertNotNull(repo.forName("foo/bar"));
- String pattern = "rtg_sst_grb_0\\.5\\.\\d{8}";
- this.repo.addPattern(testType, pattern, true);
- String testFileName = "rtg_sst_grb_0.5.12345678";
- assertEquals("foo/bar", tika.detect(testFileName));
-
- MimeType testType2 = new MimeType(MediaType.parse("foo/bar2"));
- this.repo.add(testType2);
- assertNotNull(repo.forName("foo/bar2"));
- this.repo.addPattern(testType2, pattern, false);
- // Compare by value: assertNotSame only compares references, so it
- // passes for any distinct String object, even one equal to "foo/bar2".
- assertNotEquals("foo/bar2", tika.detect(testFileName));
- }
-
- @Test
- public void testRawDetection() throws Exception {
- assertTypeByName("image/x-raw-adobe", "x.dng");
- assertTypeByName("image/x-raw-adobe", "x.DNG");
- assertTypeByName("image/x-raw-hasselblad", "x.3fr");
- assertTypeByName("image/x-raw-fuji", "x.raf");
- assertTypeByName("image/x-raw-canon", "x.crw");
- assertTypeByName("image/x-raw-canon", "x.cr2");
- assertTypeByName("image/x-raw-kodak", "x.k25");
- assertTypeByName("image/x-raw-kodak", "x.kdc");
- assertTypeByName("image/x-raw-kodak", "x.dcs");
- assertTypeByName("image/x-raw-kodak", "x.drf");
- assertTypeByName("image/x-raw-minolta", "x.mrw");
- assertTypeByName("image/x-raw-nikon", "x.nef");
- assertTypeByName("image/x-raw-nikon", "x.nrw");
- assertTypeByName("image/x-raw-olympus", "x.orf");
- assertTypeByName("image/x-raw-pentax", "x.ptx");
- assertTypeByName("image/x-raw-pentax", "x.pef");
- assertTypeByName("image/x-raw-sony", "x.arw");
- assertTypeByName("image/x-raw-sony", "x.srf");
- assertTypeByName("image/x-raw-sony", "x.sr2");
- assertTypeByName("image/x-raw-sigma", "x.x3f");
- assertTypeByName("image/x-raw-epson", "x.erf");
- assertTypeByName("image/x-raw-mamiya", "x.mef");
- assertTypeByName("image/x-raw-leaf", "x.mos");
- assertTypeByName("image/x-raw-panasonic", "x.raw");
- assertTypeByName("image/x-raw-panasonic", "x.rw2");
- assertTypeByName("image/x-raw-phaseone", "x.iiq");
- assertTypeByName("image/x-raw-red", "x.r3d");
- assertTypeByName("image/x-raw-imacon", "x.fff");
- assertTypeByName("image/x-raw-logitech", "x.pxn");
- assertTypeByName("image/x-raw-casio", "x.bay");
- assertTypeByName("image/x-raw-rawzor", "x.rwz");
- }
-
- /**
- * Tests that we correctly detect the font types, by file name and by
- * leading bytes (a sample file where one exists, otherwise an inline
- * header).
- */
- @Test
- public void testFontDetection() throws Exception {
- assertTypeByName("application/x-font-adobe-metric", "x.afm");
- assertTypeByData("application/x-font-adobe-metric", "testAFM.afm");
-
- assertTypeByName("application/x-font-printer-metric", "x.pfm");
- // TODO Get a sample .pfm file
- // The third byte was written as 256-0xb1, which evaluates to 0x4f;
- // presumably the byte 0xb1 was meant, so it is now cast explicitly.
- assertTypeByData(
- "application/x-font-printer-metric",
- new byte[] {0x00, 0x01, (byte) 0xb1, 0x0a, 0x00, 0x00, 0x43, 0x6f,
- 0x70, 0x79, 0x72, 0x69, 0x67, 0x68, 0x74, 0x20}
- );
-
- assertTypeByName("application/x-font-type1", "x.pfa");
- // TODO Get a sample .pfa file
- assertTypeByData(
- "application/x-font-type1",
- new byte[] {0x25, 0x21, 0x50, 0x53, 0x2d, 0x41, 0x64, 0x6f,
- 0x62, 0x65, 0x46, 0x6f, 0x6e, 0x74, 0x2d, 0x31,
- 0x2e, 0x30, 0x20, 0x20, 0x2d, 0x2a, 0x2d, 0x20}
- );
-
- assertTypeByName("application/x-font-type1", "x.pfb");
- // TODO Get a sample .pfb file
- // (byte) 0x80 is the same value as the previous -0x80 literal
- assertTypeByData(
- "application/x-font-type1",
- new byte[] {(byte) 0x80, 0x01, 0x09, 0x05, 0x00, 0x00, 0x25, 0x21,
- 0x50, 0x53, 0x2d, 0x41, 0x64, 0x6f, 0x62, 0x65,
- 0x46, 0x6f, 0x6e, 0x74, 0x2d, 0x31, 0x2e, 0x30 }
- );
- }
-
- /**
- * Tests MimeTypes.getMimeType(URL), which examines both the byte header
- * and, if necessary, the URL's extension.
- */
- @Test
- public void testMimeDeterminationForTestDocuments() throws Exception {
- assertType("text/html", "testHTML.html");
- assertType("application/zip", "test-documents.zip");
-
- assertType("text/html", "testHTML_utf8.html");
- assertType(
- "application/vnd.oasis.opendocument.text",
- "testOpenOffice2.odt");
- assertType("application/pdf", "testPDF.pdf");
- assertType("application/rtf", "testRTF.rtf");
- assertType("text/plain", "testTXT.txt");
- assertType("application/xml", "testXML.xml");
- assertType("audio/basic", "testAU.au");
- assertType("audio/x-aiff", "testAIFF.aif");
- assertType("audio/x-wav", "testWAV.wav");
- assertType("audio/midi", "testMID.mid");
- assertType("application/x-msaccess", "testACCESS.mdb");
- assertType("application/x-font-ttf", "testTrueType3.ttf");
- }
-
-    /** 7z archives are recognised by name, by content, and by both together. */
-    @Test
-    public void test7ZipDetection() throws Exception {
-        final String sevenZip = "application/x-7z-compressed";
-        final String sample = "test-documents.7z";
-
-        assertTypeByName(sevenZip, sample);
-        assertTypeByData(sevenZip, sample);
-        assertTypeByNameAndData(sevenZip, sample);
-    }
-
-    /**
-     * Web archives are identified by their extension; from content alone the
-     * sample is only expected to be reported as a binary property list.
-     */
-    @Test
-    public void testWebArchiveDetection() throws Exception {
-        final String webArchive = "application/x-webarchive";
-        final String sample = "testWEBARCHIVE.webarchive";
-
-        assertTypeByName(webArchive, "x.webarchive");
-        assertTypeByData("application/x-bplist", sample);
-        assertTypeByNameAndData(webArchive, sample);
-    }
-
-    /**
-     * KML, and KMZ (zipped KML)
-     */
-    @Test
-    public void testKMLZDetection() throws Exception {
-        final String kml = "application/vnd.google-earth.kml+xml";
-        final String kmz = "application/vnd.google-earth.kmz";
-        final String kmlSample = "testKML.kml";
-        final String kmzSample = "testKMZ.kmz";
-
-        assertTypeByName(kml, kmlSample);
-        assertTypeByData(kml, kmlSample);
-        assertTypeByNameAndData(kml, kmlSample);
-
-        assertTypeByName(kmz, kmzSample);
-        assertTypeByNameAndData(kmz, kmzSample);
-
-        // By data only, mimetype magic only gets us to a .zip
-        // We need to use the Zip Aware detector to get the full type
-        assertTypeByData("application/zip", kmzSample);
-    }
-
-    /** InDesign and Photoshop samples are detected by name, data, and both. */
-    @Test
-    public void testCreativeSuite() throws IOException {
-        final String inDesign = "application/x-adobe-indesign";
-        final String photoshop = "image/vnd.adobe.photoshop";
-
-        assertTypeDetection("testINDD.indd", inDesign);
-        assertTypeDetection("testPSD.psd", photoshop);
-    }
-
-    /** AMR audio, including the wide-band subtype that shares its extension. */
-    @Test
-    public void testAMR() throws IOException {
-        final String amr = "audio/amr";
-        final String amrWideBand = "audio/amr-wb";
-
-        // AMR matches on name, data or both
-        assertTypeDetection("testAMR.amr", amr);
-
-        // AMR-WB subtype shares extension, so needs data to identify
-        assertTypeDetection("testAMR-WB.amr", amr, amrWideBand, amrWideBand);
-
-        // Ditto for the AMR-WB+ subtype, which we don't have a sample file of yet
-        //assertTypeDetection("testAMR-WB+.amr", "audio/amr", "audio/amr-wb+", "audio/amr-wb+");
-    }
-
-    /** Mail message samples: EMLX plus RFC 822 messages from several clients. */
-    @Test
-    public void testEmail() throws IOException {
-        final String rfc822 = "message/rfc822";
-
-        // EMLX
-        assertTypeDetection("testEMLX.emlx", "message/x-emlx");
-
-        // Groupwise
-        assertTypeDetection("testGroupWiseEml.eml", rfc822);
-
-        // Lotus
-        assertTypeDetection("testLotusEml.eml", rfc822);
-
-        // Thunderbird - doesn't currently work by name
-        assertTypeByNameAndData(rfc822, "testThunderbirdEml.eml");
-    }
-
-    /** An AxCrypt container is detected by name, by data, and by both. */
-    @Test
-    public void testAxCrypt() throws Exception {
-        // test-TXT.txt encrypted with a key of "tika"
-        final String encryptedSample = "testTXT-tika.axx";
-        assertTypeDetection(encryptedSample, "application/x-axcrypt");
-    }
-
-    /**
-     * Windows executables: extension-based types for .dll, .msi and .exe,
-     * content-based detection of a PE file, and a jar that carries a partial
-     * PE header.
-     */
-    @Test
-    public void testWindowsEXE() throws Exception {
-        final String portableExecutable = "application/x-msdownload; format=pe";
-        final String peSample = "testTinyPE.exe";
-        final String jarWithPeHeader = "testJAR_with_PEHDR.jar";
-
-        assertTypeByName("application/x-msdownload", "x.dll");
-        assertTypeByName("application/x-ms-installer", "x.msi");
-        assertTypeByName("application/x-dosexec", "x.exe");
-
-        assertTypeByData(portableExecutable, peSample);
-        assertTypeByNameAndData(portableExecutable, peSample);
-
-        // A jar file with part of a PE header, but not a full one
-        // should still be detected as a zip or jar (without/with name)
-        assertTypeByData("application/zip", jarWithPeHeader);
-        assertTypeByNameAndData("application/java-archive", jarWithPeHeader);
-    }
-
-    /**
-     * Matroska video and audio: the extension (in either case) selects the
-     * specific type, while content alone only yields the generic container type.
-     */
-    @Test
-    public void testMatroskaDetection() throws Exception {
-        final String video = "video/x-matroska";
-        final String audio = "audio/x-matroska";
-        final String sample = "testMKV.mkv";
-
-        assertType(video, sample);
-        // TODO: Need custom detector data detection, see TIKA-1180
-        assertTypeByData("application/x-matroska", sample);
-        assertTypeByNameAndData(video, sample);
-        assertTypeByName(video, "x.mkv");
-        assertTypeByName(video, "x.MKV");
-        assertTypeByName(audio, "x.mka");
-        assertTypeByName(audio, "x.MKA");
-    }
-
-    /**
-     * WebM: the extension (in either case) selects video/webm, while content
-     * alone only yields the generic Matroska container type.
-     */
-    @Test
-    public void testWebMDetection() throws Exception {
-        final String webm = "video/webm";
-        final String sample = "testWEBM.webm";
-
-        assertType(webm, sample);
-        // TODO: Need custom detector data detection, see TIKA-1180
-        assertTypeByData("application/x-matroska", sample);
-        assertTypeByNameAndData(webm, sample);
-        assertTypeByName(webm, "x.webm");
-        assertTypeByName(webm, "x.WEBM");
-    }
-
-    /**
-     * Checks content-only detection of short byte prefixes: Unicode byte
-     * order marks and plain ASCII text must be reported as text/plain, while
-     * a prefix containing the control bytes 0x0E and 0x1C must not be.
-     */
-    @Test
-    public void testGetMimeType_byteArray() throws IOException {
-        // Plain text detection
-        // UTF-16LE byte order mark
-        assertText(new byte[] { (byte) 0xFF, (byte) 0xFE });
-        // UTF-16BE byte order mark (previously the LE mark was asserted twice)
-        assertText(new byte[] { (byte) 0xFE, (byte) 0xFF });
-        // UTF-8 byte order mark
-        assertText(new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF });
-        assertText(new byte[] { 'a', 'b', 'c' });
-        assertText(new byte[] { '\t', '\r', '\n', 0x0C, 0x1B });
-        assertNotText(new byte[] { '\t', '\r', '\n', 0x0E, 0x1C });
-    }
-
-    /**
-     * Berkeley DB files are detected from content, with the storage format
-     * (btree or hash) and format version reported as media type parameters.
-     */
-    @Test
-    public void testBerkeleyDB() throws IOException {
-        final String btree = "application/x-berkeley-db; format=btree; version=";
-        final String hash = "application/x-berkeley-db; format=hash; version=";
-
-        for (int version = 2; version <= 4; version++) {
-            assertTypeByData(btree + version, "testBDB_btree_" + version + ".db");
-        }
-        // V4 and V5 share the same btree format
-        assertTypeByData(btree + 4, "testBDB_btree_5.db");
-
-        for (int version = 2; version <= 5; version++) {
-            assertTypeByData(hash + version, "testBDB_hash_" + version + ".db");
-        }
-    }
-
-    /**
-     * CBOR typically contains HTML
-     */
-    @Test
-    public void testCBOR() throws IOException {
-        final String cbor = "application/cbor";
-        final String sample = "NUTCH-1997.cbor";
-
-        assertTypeByNameAndData(cbor, sample);
-        assertTypeByData(cbor, sample);
-    }
-
-    /** Each ZLIB sample must be recognised from its content alone. */
-    @Test
-    public void testZLIB() throws IOException {
-        // ZLIB encoded versions of testTXT.txt
-        final String[] extensions = {"zlib", "zlib0", "zlib5", "zlib9"};
-        for (String extension : extensions) {
-            assertTypeByData("application/zlib", "testTXT." + extension);
-        }
-    }
-
-    /** BibTeX is detected with the file name hint and from content alone. */
-    @Test
-    public void testTextFormats() throws Exception {
-        final String bibtex = "application/x-bibtex-text-file";
-        final String sample = "testBIBTEX.bib";
-
-        assertType(bibtex, sample);
-        assertTypeByData(bibtex, sample);
-    }
-
-    /**
-     * Source code formats: C sources and headers, Java sources and
-     * properties files, and several Matlab samples.
-     */
-    @Test
-    public void testCodeFormats() throws Exception {
-        final String cSource = "text/x-csrc";
-        final String cHeader = "text/x-chdr";
-        final String matlab = "text/x-matlab";
-        final String[] matlabSamples = {
-            "testMATLAB.m", "testMATLAB_wtsgaus.m", "testMATLAB_barcast.m"};
-
-        assertType(cSource, "testC.c");
-        assertType(cHeader, "testH.h");
-        assertTypeByData(cSource, "testC.c");
-        assertTypeByData(cHeader, "testH.h");
-
-        assertTypeByName("text/x-java-source", "testJAVA.java");
-        assertType("text/x-java-properties", "testJAVAPROPS.properties");
-
-        // First with the file name hint, then from content alone.
-        for (String sample : matlabSamples) {
-            assertType(matlab, sample);
-        }
-        for (String sample : matlabSamples) {
-            assertTypeByData(matlab, sample);
-        }
-    }
-
-    /** WebVTT is detected with the file name hint and from content alone. */
-    @Test
-    public void testWebVTT() throws Exception {
-        final String vtt = "text/vtt";
-        final String sample = "testWebVTT.vtt";
-
-        assertType(vtt, sample);
-        assertTypeByData(vtt, sample);
-    }
-
- private void assertText(byte[] prefix) throws IOException {
- assertMagic("text/plain", prefix);
- }
-
- private void assertNotText(byte[] prefix) throws IOException {
- assertMagic("application/octet-stream", prefix);
- }
-
- private void assertMagic(String expected, byte[] prefix) throws IOException {
- MediaType type =
- repo.detect(new ByteArrayInputStream(prefix), new Metadata());
- assertNotNull(type);
- assertEquals(expected, type.toString());
- }
-
- private void assertType(String expected, String filename) throws Exception {
- try (InputStream stream = TestMimeTypes.class.getResourceAsStream(
- "/test-documents/" + filename)) {
- assertNotNull("Test file not found: " + filename, stream);
- Metadata metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
- assertEquals(expected, repo.detect(stream, metadata).toString());
- }
- }
-
- private void assertTypeByName(String expected, String filename)
- throws IOException {
- Metadata metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
- assertEquals(expected, repo.detect(null, metadata).toString());
- }
-
- private void assertTypeByData(String expected, String filename)
- throws IOException {
- try (InputStream stream = TestMimeTypes.class.getResourceAsStream(
- "/test-documents/" + filename)) {
- assertNotNull("Test file not found: " + filename, stream);
- Metadata metadata = new Metadata();
- assertEquals(expected, repo.detect(stream, metadata).toString());
- }
- }
-
- private void assertTypeByData(String expected, byte[] data)
- throws IOException {
- try (InputStream stream = new ByteArrayInputStream(data)) {
- Metadata metadata = new Metadata();
- assertEquals(expected, repo.detect(stream, metadata).toString());
- }
- }
-
- private void assertTypeDetection(String filename, String type)
- throws IOException {
- assertTypeDetection(filename, type, type, type);
- }
-
-    /**
-     * Asserts the detected type of a sample under three conditions, in this
-     * order: name only, data only, then name and data together.
-     *
-     * @param filename name of the sample
-     * @param byName type expected from the name alone
-     * @param byData type expected from the content alone
-     * @param byNameAndData type expected when both are available
-     */
-    private void assertTypeDetection(
-            String filename, String byName, String byData, String byNameAndData)
-            throws IOException {
-        assertTypeByName(byName, filename);
-        assertTypeByData(byData, filename);
-        assertTypeByNameAndData(byNameAndData, filename);
-    }
-
- private void assertTypeByNameAndData(String expected, String filename)
- throws IOException {
- assertEquals(expected, getTypeByNameAndData(filename).toString());
- }
-
- private MediaType getTypeByNameAndData(String filename) throws IOException {
- try (InputStream stream = TestMimeTypes.class.getResourceAsStream(
- "/test-documents/" + filename)) {
- assertNotNull("Test document not found: " + filename, stream);
- Metadata metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
- return repo.detect(stream, metadata);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
deleted file mode 100644
index 91b054e..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
+++ /dev/null
@@ -1,459 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipOutputStream;
-
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.XMPDM;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.sax.BodyContentHandler;
-import org.gagravarr.tika.FlacParser;
-import org.gagravarr.tika.OpusParser;
-import org.gagravarr.tika.VorbisParser;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class AutoDetectParserTest {
- private TikaConfig tika = TikaConfig.getDefaultConfig();
-
- // Easy to read constants for the MIME types:
- private static final String RAW = "application/octet-stream";
- private static final String EXCEL = "application/vnd.ms-excel";
- private static final String HTML = "text/html; charset=ISO-8859-1";
- private static final String PDF = "application/pdf";
- private static final String POWERPOINT = "application/vnd.ms-powerpoint";
- private static final String KEYNOTE = "application/vnd.apple.keynote";
- private static final String PAGES = "application/vnd.apple.pages";
- private static final String NUMBERS = "application/vnd.apple.numbers";
- private static final String CHM = "application/vnd.ms-htmlhelp";
- private static final String RTF = "application/rtf";
- private static final String PLAINTEXT = "text/plain; charset=ISO-8859-1";
- private static final String UTF8TEXT = "text/plain; charset=UTF-8";
- private static final String WORD = "application/msword";
- private static final String XML = "application/xml";
- private static final String RSS = "application/rss+xml";
- private static final String BMP = "image/x-ms-bmp";
- private static final String GIF = "image/gif";
- private static final String JPEG = "image/jpeg";
- private static final String PNG = "image/png";
- private static final String OGG_VORBIS = "audio/vorbis";
- private static final String OGG_OPUS = "audio/opus";
- private static final String OGG_FLAC = "audio/x-oggflac";
- private static final String FLAC_NATIVE= "audio/x-flac";
- private static final String OPENOFFICE
- = "application/vnd.oasis.opendocument.text";
-
-
-    /**
-     * This is where a single test is done: the resource named by
-     * {@code tp.resourceRealName} is parsed with the stated name and stated
-     * content type placed in the metadata, after which the resulting content
-     * type and, when one is given, the expected text fragment are checked.
-     *
-     * @param tp the parameters encapsulated in a TestParams instance
-     * @throws Exception if the resource cannot be read or parsed
-     */
-    private void assertAutoDetect(TestParams tp) throws Exception {
-        try (InputStream input = AutoDetectParserTest.class.getResourceAsStream(tp.resourceRealName)) {
-            assertNotNull("Could not open stream from specified resource: "
-                    + tp.resourceRealName, input);
-
-            final Metadata metadata = new Metadata();
-            metadata.set(Metadata.RESOURCE_NAME_KEY, tp.resourceStatedName);
-            metadata.set(Metadata.CONTENT_TYPE, tp.statedType);
-            final ContentHandler handler = new BodyContentHandler();
-            new AutoDetectParser(tika).parse(input, handler, metadata);
-
-            final String detectedType = metadata.get(Metadata.CONTENT_TYPE);
-            assertEquals("Bad content type: " + tp, tp.realType, detectedType);
-
-            if (tp.expectedContentFragment == null) {
-                return;
-            }
-            assertTrue("Expected content not found: " + tp,
-                    handler.toString().contains(tp.expectedContentFragment));
-        }
-    }
-
- /**
- * Convenience method -- its sole purpose of existence is to make the
- * call to it more readable than it would be if a TestParams instance
- * would need to be instantiated there.
- *
- * @param resourceRealName real name of resource
- * @param resourceStatedName stated name -- will a bad name fool us?
- * @param realType - the real MIME type
- * @param statedType - stated MIME type - will a wrong one fool us?
- * @param expectedContentFragment - something expected in the text
- * @throws Exception
- */
- private void assertAutoDetect(String resourceRealName,
- String resourceStatedName,
- String realType,
- String statedType,
- String expectedContentFragment)
- throws Exception {
-
- assertAutoDetect(new TestParams(resourceRealName, resourceStatedName,
- realType, statedType, expectedContentFragment));
- }
-
-    /**
-     * Checks a sample under /test-documents/ against every combination of
-     * stated resource name (correct, absent, misleading) and stated content
-     * type (correct, absent, wrong).
-     *
-     * @param resource file name of the sample, relative to /test-documents/
-     * @param type content type the parser is expected to report
-     * @param content text expected in the output, or null to skip that check
-     */
-    private void assertAutoDetect(
-            String resource, String type, String content) throws Exception {
-
-        final String path = "/test-documents/" + resource;
-
-        // TODO !!!! The disabled tests below should work!
-        // The correct MIME type should be determined regardless of the
-        // stated type (ContentType hint) and the stated URL name.
-
-        // Try different combinations of correct and incorrect arguments:
-        final String wrongMimeType = RAW;
-        final String badResource = "a.xyz";
-        final String[] statedNames = {path, null, badResource};
-        final String[] statedTypes = {type, null, wrongMimeType};
-
-        for (String statedName : statedNames) {
-            for (String statedType : statedTypes) {
-                assertAutoDetect(path, statedName, type, statedType, content);
-            }
-        }
-    }
-
-    /** Apple Keynote presentation. */
-    @Test
-    public void testKeynote() throws Exception {
-        final String expectedText = "A sample presentation";
-        assertAutoDetect("testKeynote.key", KEYNOTE, expectedText);
-    }
-
-    /** Apple Pages document. */
-    @Test
-    public void testPages() throws Exception {
-        final String expectedText = "Sample pages document";
-        assertAutoDetect("testPages.pages", PAGES, expectedText);
-    }
-
-    /** Apple Numbers spreadsheet. */
-    @Test
-    public void testNumbers() throws Exception {
-        final String expectedText = "Checking Account: 300545668";
-        assertAutoDetect("testNumbers.numbers", NUMBERS, expectedText);
-    }
-
-    /** Compiled HTML help file. */
-    @Test
-    public void testChm() throws Exception {
-        final String expectedText =
-                "If you do not specify a window type or a window name, the main window is used.";
-        assertAutoDetect("testChm.chm", CHM, expectedText);
-    }
-
-    /** EPUB e-book. */
-    @Test
-    public void testEpub() throws Exception {
-        final String epub = "application/epub+zip";
-        final String expectedText = "The previous headings were subchapters";
-        assertAutoDetect("testEPUB.epub", epub, expectedText);
-    }
-
-    /** Excel workbook. */
-    @Test
-    public void testExcel() throws Exception {
-        final String expectedText = "Sample Excel Worksheet";
-        assertAutoDetect("testEXCEL.xls", EXCEL, expectedText);
-    }
-
-    /** HTML page. */
-    @Test
-    public void testHTML() throws Exception {
-        final String expectedText = "Test Indexation Html";
-        assertAutoDetect("testHTML.html", HTML, expectedText);
-    }
-
-    /** OpenDocument text document. */
-    @Test
-    public void testOpenOffice() throws Exception {
-        final String expectedText = "This is a sample Open Office document";
-        assertAutoDetect("testOpenOffice2.odt", OPENOFFICE, expectedText);
-    }
-
-    /** PDF document. */
-    @Test
-    public void testPDF() throws Exception {
-        final String expectedText = "Content Analysis Toolkit";
-        assertAutoDetect("testPDF.pdf", PDF, expectedText);
-    }
-
-    /** PowerPoint presentation. */
-    @Test
-    public void testPowerpoint() throws Exception {
-        final String expectedText = "Sample Powerpoint Slide";
-        assertAutoDetect("testPPT.ppt", POWERPOINT, expectedText);
-    }
-
-    /** RDF/XML document; the expected text is empty, so only the type is effectively checked. */
-    @Test
-    public void testRdfXml() throws Exception {
-        final String rdf = "application/rdf+xml";
-        assertAutoDetect("testRDF.rdf", rdf, "");
-    }
-
-    /** Rich Text Format document. */
-    @Test
-    public void testRTF() throws Exception {
-        final String expectedText = "indexation Word";
-        assertAutoDetect("testRTF.rtf", RTF, expectedText);
-    }
-
-    /** Plain text, expected to be reported with the ISO-8859-1 charset. */
-    @Test
-    public void testText() throws Exception {
-        final String expectedText = "indexation de Txt";
-        assertAutoDetect("testTXT.txt", PLAINTEXT, expectedText);
-    }
-
-    /** Plain text with non-ASCII characters, expected to be reported with the UTF-8 charset. */
-    @Test
-    public void testTextNonASCIIUTF8() throws Exception {
-        final String expectedText = "The quick brown fox jumps over the lazy dog";
-        assertAutoDetect("testTXTNonASCIIUTF8.txt", UTF8TEXT, expectedText);
-    }
-
-    /** Word document. */
-    @Test
-    public void testWord() throws Exception {
-        final String expectedText = "Sample Word Document";
-        assertAutoDetect("testWORD.doc", WORD, expectedText);
-    }
-
-    /** Generic XML document. */
-    @Test
-    public void testXML() throws Exception {
-        final String expectedText = "Lius";
-        assertAutoDetect("testXML.xml", XML, expectedText);
-    }
-
-    /**
-     * RSS feed, checked once with a stated name of "feed" and the correct
-     * stated content type.
-     */
-    @Test
-    public void testRss() throws Exception {
-        final String resource = "/test-documents/rsstest.rss";
-        final String statedName = "feed";
-        final String statedType = "application/rss+xml";
-        final String expectedText = "Sample RSS File for Junit test";
-        assertAutoDetect(resource, statedName, RSS, statedType, expectedText);
-    }
-
- @Test
- public void testImages() throws Exception {
- assertAutoDetect("testBMP.bmp", BMP, null);
- assertAutoDetect("testGIF.gif", GIF, null);
- assertAutoDetect("testJPEG.jpg", JPEG, null);
- assertAutoDetect("testPNG.png", PNG, null);
- }
-
-    /**
-     * Make sure that zip bomb attacks are prevented: parsing the sample
-     * archive must end with a TikaException.
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-216">TIKA-216</a>
-     */
-    @Test
-    public void testZipBombPrevention() throws Exception {
-        try (InputStream tgz = AutoDetectParserTest.class.getResourceAsStream(
-                "/test-documents/TIKA-216.tgz")) {
-            // Report a missing sample explicitly instead of handing a null
-            // stream to the parser.
-            assertNotNull("Test file not found: TIKA-216.tgz", tgz);
-            Metadata metadata = new Metadata();
-            // -1 is passed as the handler's write limit argument
-            ContentHandler handler = new BodyContentHandler(-1);
-            new AutoDetectParser(tika).parse(tgz, handler, metadata);
-            fail("Zip bomb was not detected");
-        } catch (TikaException e) {
-            // expected
-        }
-    }
-
-    /**
-     * Make sure XML parse errors don't trigger ZIP bomb detection: a zip
-     * holding ten empty (and therefore invalid) XML entries must parse
-     * without an exception.
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-1322">TIKA-1322</a>
-     */
-    @Test
-    public void testNoBombDetectedForInvalidXml() throws Exception {
-        // create zip with ten empty / invalid XML files, 1.xml .. 10.xml
-        ByteArrayOutputStream baos = new ByteArrayOutputStream();
-        try (ZipOutputStream zos = new ZipOutputStream(baos)) {
-            for (int i = 1; i <= 10; i++) {
-                zos.putNextEntry(new ZipEntry(i + ".xml"));
-                zos.closeEntry();
-            }
-            zos.finish();
-        }
-        // The archive bytes are complete only after the zip stream is closed.
-        try (InputStream zip = new ByteArrayInputStream(baos.toByteArray())) {
-            new AutoDetectParser(tika).parse(zip, new BodyContentHandler(-1),
-                    new Metadata());
-        }
-    }
-
-    /**
-     * Test to ensure that the Ogg Audio parsers (Vorbis, Opus, Flac etc)
-     * have been correctly included, and are available
-     */
-    @SuppressWarnings("deprecation")
-    @Test
-    public void testOggFlacAudio() throws Exception {
-        // The four sample files should all carry similar test data;
-        // expectedTypes[i] is the content type expected for sampleFiles[i].
-        final String[] sampleFiles = {
-            "testVORBIS.ogg", "testFLAC.flac", "testFLAC.oga", "testOPUS.opus"};
-        final MediaType[] expectedTypes = {
-            MediaType.parse(OGG_VORBIS), MediaType.parse(FLAC_NATIVE),
-            MediaType.parse(OGG_FLAC), MediaType.parse(OGG_OPUS)};
-
-        // Check we can load the parsers, and they claim to do the right things.
-        // Only a non-null set of supported types is required here.
-        VorbisParser vorbis = new VorbisParser();
-        assertNotNull("Parser not found for " + expectedTypes[0],
-                vorbis.getSupportedTypes(new ParseContext()));
-
-        FlacParser flac = new FlacParser();
-        assertNotNull("Parser not found for " + expectedTypes[1],
-                flac.getSupportedTypes(new ParseContext()));
-        assertNotNull("Parser not found for " + expectedTypes[2],
-                flac.getSupportedTypes(new ParseContext()));
-
-        OpusParser opus = new OpusParser();
-        assertNotNull("Parser not found for " + expectedTypes[3],
-                opus.getSupportedTypes(new ParseContext()));
-
-        // Check the configured composite parser has an entry for every type
-        CompositeParser composite = (CompositeParser)tika.getParser();
-        for (MediaType expectedType : expectedTypes) {
-            assertNotNull("Parser not found for " + expectedType,
-                    composite.getParsers().get(expectedType));
-        }
-
-        // Have each file parsed, and check
-        for (int i = 0; i < sampleFiles.length; i++) {
-            final String file = sampleFiles[i];
-            final String resource = "/test-documents/" + file;
-            try (InputStream input = AutoDetectParserTest.class.getResourceAsStream(resource)) {
-                assertNotNull("Could not find test file " + file, input);
-
-                final Metadata metadata = new Metadata();
-                final ContentHandler handler = new BodyContentHandler();
-                new AutoDetectParser(tika).parse(input, handler, metadata);
-
-                assertEquals("Incorrect content type for " + file,
-                        expectedTypes[i].toString(), metadata.get(Metadata.CONTENT_TYPE));
-
-                // Check some of the common metadata
-                // Old style metadata
-                assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
-                assertEquals("Test Title", metadata.get(Metadata.TITLE));
-                // New style metadata
-                assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
-                assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
-
-                // Check some of the XMPDM metadata; the album is not checked
-                // for the Opus sample
-                final boolean isOpus = file.endsWith(".opus");
-                if (!isOpus) {
-                    assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
-                }
-                assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
-                assertEquals("Stereo", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE));
-                assertEquals("44100", metadata.get(XMPDM.AUDIO_SAMPLE_RATE));
-
-                // Check some of the text
-                final String text = handler.toString();
-                assertTrue(text.contains("Test Title"));
-                assertTrue(text.contains("Test Artist"));
-            }
-        }
-    }
-
-    /**
-     * Test case for TIKA-514. Provide constructor for AutoDetectParser that has explicit
-     * list of supported parsers. The test detector always reports the test
-     * parser's type, so the test parser must run and leave its marker in the
-     * metadata.
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-514">TIKA-514</a>
-     */
-    @Test
-    public void testSpecificParserList() throws Exception {
-        final Detector detector = new MyDetector();
-        final AutoDetectParser parser = new AutoDetectParser(detector, new MyParser());
-
-        final byte[] content = "test".getBytes(UTF_8);
-        final InputStream is = new ByteArrayInputStream(content);
-        final Metadata metadata = new Metadata();
-        parser.parse(is, new BodyContentHandler(), metadata, new ParseContext());
-
-        final String marker = metadata.get("MyParser");
-        assertEquals("value", marker);
-    }
-
- private static final MediaType MY_MEDIA_TYPE = new MediaType("application", "x-myparser");
-
-    /**
-     * A test detector which always returns the type supported
-     * by the test parser, whatever the stream and metadata contain.
-     */
-    @SuppressWarnings("serial")
-    private static class MyDetector implements Detector {
-        /**
-         * @param input ignored
-         * @param metadata ignored
-         * @return always {@code MY_MEDIA_TYPE}
-         */
-        @Override
-        public MediaType detect(InputStream input, Metadata metadata) throws IOException {
-            return MY_MEDIA_TYPE;
-        }
-    }
-
-    /**
-     * A test parser which supports only {@code MY_MEDIA_TYPE} and records
-     * that it ran by adding a "MyParser" entry to the metadata.
-     */
-    @SuppressWarnings("serial")
-    private static class MyParser extends AbstractParser {
-        /** @return a fresh set holding only {@code MY_MEDIA_TYPE} */
-        @Override
-        public Set<MediaType> getSupportedTypes(ParseContext context) {
-            Set<MediaType> supportedTypes = new HashSet<>();
-            supportedTypes.add(MY_MEDIA_TYPE);
-            return supportedTypes;
-        }
-
-        /** Reads nothing from the stream; only marks the metadata. */
-        @Override
-        public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) {
-            metadata.add("MyParser", "value");
-        }
-
-    }
-
-    /**
-     * Minimal class to encapsulate all parameters -- the main reason for
-     * its existence is to aid in debugging via its toString() method.
-     *
-     * Fields are assigned once in the constructor and read directly;
-     * getters and setters intentionally not provided.
-     */
-    private static class TestParams {
-
-        /** Name of the resource that is actually opened and parsed. */
-        public final String resourceRealName;
-        /** Resource name placed in the metadata; may be null. */
-        public final String resourceStatedName;
-        /** Content type the parser is expected to report. */
-        public final String realType;
-        /** Content type placed in the metadata before parsing; may be null. */
-        public final String statedType;
-        /** Text expected in the parser output; null skips the content check. */
-        public final String expectedContentFragment;
-
-
-        private TestParams(String resourceRealName,
-                           String resourceStatedName,
-                           String realType,
-                           String statedType,
-                           String expectedContentFragment) {
-            this.resourceRealName = resourceRealName;
-            this.resourceStatedName = resourceStatedName;
-            this.realType = realType;
-            this.statedType = statedType;
-            this.expectedContentFragment = expectedContentFragment;
-        }
-
-
-        /**
-         * Produces a string like the following:
-         *
-         * <pre>
-         * Test parameters:
-         * resourceRealName = /test-documents/testEXCEL.xls
-         * resourceStatedName = null
-         * realType = application/vnd.ms-excel
-         * statedType = null
-         * expectedContentFragment = Sample Excel Worksheet
-         * </pre>
-         */
-        @Override
-        public String toString() {
-            return "Test parameters:\n"
-                + " resourceRealName = " + resourceRealName + "\n"
-                + " resourceStatedName = " + resourceStatedName + "\n"
-                + " realType = " + realType + "\n"
-                + " statedType = " + statedType + "\n"
-                + " expectedContentFragment = " + expectedContentFragment + "\n";
-        }
-    }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
deleted file mode 100644
index 68edfc2..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-
-import java.io.InputStream;
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.utils.CommonsDigester;
-import org.junit.Test;
-
-
-public class DigestingParserTest extends TikaTest {
-
- private final static String P = TikaCoreProperties.TIKA_META_PREFIX+
- "digest"+Metadata.NAMESPACE_PREFIX_DELIMITER;
-
- private final int UNLIMITED = 1000000;//well, not really, but longer than input file
- private final Parser p = new AutoDetectParser();
-
-    /**
-     * Digests the sample document once per algorithm and compares each
-     * result with a known value, then requests several algorithms through a
-     * comma separated list and checks that exactly those were recorded.
-     */
-    @Test
-    public void testBasic() throws Exception {
-        final String sample = "test_recursive_embedded.docx";
-        Map<CommonsDigester.DigestAlgorithm, String> expected =
-                new HashMap<CommonsDigester.DigestAlgorithm, String>();
-
-        expected.put(CommonsDigester.DigestAlgorithm.MD2, "d768c8e27b0b52c6eaabfaa7122d1d4f");
-        expected.put(CommonsDigester.DigestAlgorithm.MD5, "59f626e09a8c16ab6dbc2800c685f772");
-        expected.put(CommonsDigester.DigestAlgorithm.SHA1, "7a1f001d163ac90d8ea54c050faf5a38079788a6");
-        expected.put(CommonsDigester.DigestAlgorithm.SHA256,
-                "c4b7fab030a8b6a9d6691f6699ac8e6f"
-                + "82bc53764a0f1430d134ae3b70c32654");
-        expected.put(CommonsDigester.DigestAlgorithm.SHA384,
-                "ebe368b9326fef44408290724d187553"
-                + "8b8a6923fdf251ddab72c6e4b5d54160"
-                + "9db917ba4260d1767995a844d8d654df");
-        expected.put(CommonsDigester.DigestAlgorithm.SHA512,
-                "ee46d973ee1852c018580c242955974d"
-                + "da4c21f36b54d7acd06fcf68e974663b"
-                + "fed1d256875be58d22beacf178154cc3"
-                + "a1178cb73443deaa53aa0840324708bb");
-
-        //test each one
-        for (CommonsDigester.DigestAlgorithm algo : CommonsDigester.DigestAlgorithm.values()) {
-            Metadata single = new Metadata();
-            getXML(sample, new DigestingParser(p, new CommonsDigester(UNLIMITED, algo)), single);
-            assertEquals(algo.toString(), expected.get(algo), single.get(P + algo.toString()));
-        }
-
-        //test comma separated
-        CommonsDigester.DigestAlgorithm[] requested = CommonsDigester.parse("md5,sha256,sha384,sha512");
-        Metadata m = new Metadata();
-        getXML(sample, new DigestingParser(p, new CommonsDigester(UNLIMITED, requested)), m);
-
-        CommonsDigester.DigestAlgorithm[] present = {
-            CommonsDigester.DigestAlgorithm.MD5,
-            CommonsDigester.DigestAlgorithm.SHA256,
-            CommonsDigester.DigestAlgorithm.SHA384,
-            CommonsDigester.DigestAlgorithm.SHA512};
-        for (CommonsDigester.DigestAlgorithm algo : present) {
-            assertEquals(algo.toString(), expected.get(algo), m.get(P + algo.toString()));
-        }
-
-        // Algorithms that were not requested must leave no entry behind.
-        assertNull(m.get(P + CommonsDigester.DigestAlgorithm.MD2.toString()));
-        assertNull(m.get(P + CommonsDigester.DigestAlgorithm.SHA1.toString()));
-    }
-
-    /**
-     * Checks that the digest recorded for the complete sample equals the
-     * digest recorded for a stream that holds only its first {@code limit}
-     * bytes, when the digester is constructed with that same limit.
-     */
-    @Test
-    public void testLimitedRead() throws Exception {
-        CommonsDigester.DigestAlgorithm algo = CommonsDigester.DigestAlgorithm.MD5;
-        int limit = 100;
-        byte[] bytes = new byte[limit];
-        try (InputStream is = getResourceAsStream("/test-documents/test_recursive_embedded.docx")) {
-            // A single read() may return fewer bytes than requested, so keep
-            // reading until the buffer is full or the stream ends.
-            int filled = 0;
-            while (filled < limit) {
-                int count = is.read(bytes, filled, limit - filled);
-                if (count < 0) {
-                    break;
-                }
-                filled += count;
-            }
-            assertEquals("Sample is shorter than the digest limit", limit, filled);
-        }
-
-        Metadata m = new Metadata();
-        try {
-            getXML(TikaInputStream.get(bytes),
-                    new DigestingParser(p, new CommonsDigester(limit, algo)), m);
-        } catch (TikaException e) {
-            //thrown because this is just a file fragment
-            assertContains("Unexpected RuntimeException from org.apache.tika.parser.microsoft.ooxml.OOXMLParser",
-                    e.getMessage());
-        }
-        String expectedMD5 = m.get(P+"MD5");
-        // Without this check a missing digest would only surface as a
-        // confusing mismatch in the comparison below.
-        assertTrue("No MD5 digest recorded for the truncated input", expectedMD5 != null);
-
-        m = new Metadata();
-        getXML("test_recursive_embedded.docx",
-                new DigestingParser(p, new CommonsDigester(limit, algo)), m);
-        assertEquals(expectedMD5, m.get(P+"MD5"));
-    }
-
-    /**
-     * Digests the sample with a digester constructed with a limit of 100 and
-     * compares the recorded MD5 with a known value.
-     */
-    @Test
-    public void testReset() throws Exception {
-        final String expectedMD5 = "1643c2cef21e36720c54f4f6cb3349d0";
-        final CommonsDigester digester =
-                new CommonsDigester(100, CommonsDigester.DigestAlgorithm.MD5);
-
-        Metadata m = new Metadata();
-        getXML("test_recursive_embedded.docx", new DigestingParser(p, digester), m);
-
-        final String recorded = m.get(P+"MD5");
-        assertEquals(expectedMD5, recorded);
-    }
-
-    /**
-     * Constructing or using a digester with a limit of -1 is expected to
-     * raise an IllegalArgumentException.
-     */
-    @Test
-    public void testNegativeMaxMarkLength() throws Exception {
-        Metadata m = new Metadata();
-        boolean rejected;
-        try {
-            getXML("test_recursive_embedded.docx",
-                    new DigestingParser(p, new CommonsDigester(-1, CommonsDigester.DigestAlgorithm.MD5)), m);
-            rejected = false;
-        } catch (IllegalArgumentException e) {
-            rejected = true;
-        }
-        assertTrue("Exception not thrown", rejected);
-    }
-
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
deleted file mode 100644
index 2fcd1c3..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser;
-
-import java.io.ByteArrayInputStream;
-import java.io.InputStream;
-import java.io.Reader;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.junit.Test;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-
-public class ParsingReaderTest {
-
-    /**
-     * Plain text must come back character by character with a trailing newline, and the
-     * source stream must report end of input once the reader has been closed.
-     */
-    @Test
-    public void testPlainText() throws Exception {
-        String text = "test content";
-        InputStream source = new ByteArrayInputStream(text.getBytes(UTF_8));
-        Reader parsed = new ParsingReader(source, "test.txt");
-
-        assertReads(text + "\n", parsed);
-        assertEquals(-1, parsed.read());
-
-        parsed.close();
-        assertEquals(-1, source.read());
-    }
-
-    /**
-     * XML markup must not show up in the output: the reader is expected to return only the
-     * text content with the whitespace around it, followed by a trailing newline.
-     */
-    @Test
-    public void testXML() throws Exception {
-        String markup = "<p>test <span>content</span></p>";
-        InputStream source = new ByteArrayInputStream(markup.getBytes(UTF_8));
-        Reader parsed = new ParsingReader(source, "test.xml");
-
-        assertReads(" test  content\n", parsed);
-        assertEquals(-1, parsed.read());
-
-        parsed.close();
-        assertEquals(-1, source.read());
-    }
-
-    /**
-     * Test case for TIKA-203
-     *
-     * <p>The document title must be readable from the metadata before the first character
-     * is read, and the extracted text must still be delivered afterwards.
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-203">TIKA-203</a>
-     */
-    @Test
-    public void testMetadata() throws Exception {
-        Metadata extracted = new Metadata();
-        InputStream workbook =
-                ParsingReaderTest.class.getResourceAsStream("/test-documents/testEXCEL.xls");
-        try (Reader parsed = new ParsingReader(
-                new AutoDetectParser(), workbook, extracted, new ParseContext())) {
-            // Metadata should already be available here, i.e. before anything
-            // has been read from the reader
-            assertEquals("Simple Excel document", extracted.get(TikaCoreProperties.TITLE));
-            // Check that the internal buffering isn't broken
-            assertReads("Feuil1", parsed);
-        }
-    }
-
-    /**
-     * Asserts that successive {@code read()} calls on the reader return exactly the
-     * characters of the expected string, in order. Each character is compared with the
-     * raw {@code int} result, so an early end of input ({@code -1}) fails the assertion.
-     *
-     * @param expected the characters the reader must produce next
-     * @param reader the reader to consume from
-     * @throws Exception if reading from the reader fails
-     */
-    private static void assertReads(String expected, Reader reader) throws Exception {
-        for (char next : expected.toCharArray()) {
-            assertEquals(next, reader.read());
-        }
-    }
-
-}