Posted to commits@tika.apache.org by ta...@apache.org on 2020/12/02 17:00:27 UTC

[tika] branch main updated: TIKA-3241 -- fix git add problems, replace some test documents that were modified in the move, update poms

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new a43784b  TIKA-3241 -- fix git add problems, replace some test documents that were modified in the move, update poms
a43784b is described below

commit a43784b19f6b0955478dded71521b0491d21c90b
Author: tallison <ta...@apache.org>
AuthorDate: Wed Dec 2 11:58:44 2020 -0500

    TIKA-3241 -- fix git add problems, replace some test documents that were modified in the move, update poms
---
 .../tika-parser-apple-module/pom.xml               |     2 +-
 .../tika/parser/apple/AppleSingleFileParser.java   |   192 +
 .../apache/tika/parser/apple/BPListDetector.java   |   138 +
 .../tika/parser/iwork/IWorkPackageParser.java      |   220 +
 .../tika/parser/iwork/KeynoteContentHandler.java   |   176 +
 .../tika/parser/iwork/NumbersContentHandler.java   |   232 +
 .../services/org.apache.tika.detect.Detector       |    16 +
 .../services/org.apache.tika.parser.Parser         |    22 +
 .../tika/parser/iwork/AutoPageNumberUtilsTest.java |    79 +
 .../apache/tika/parser/iwork/IWorkParserTest.java  |   392 +
 .../test-documents/testAppleSingleFile.pdf         |   Bin 0 -> 1893 bytes
 .../resources/test-documents/testKeynote2013.key   |   Bin 0 -> 274397 bytes
 .../test-documents/testMasterSlideTable.key        |   Bin 0 -> 220184 bytes
 .../resources/test-documents/testNumbers.numbers   |   Bin 0 -> 134571 bytes
 .../test-documents/testNumbers2013.numbers         |   Bin 0 -> 179147 bytes
 .../resources/test-documents/testPages2013.pages   |   Bin 0 -> 237567 bytes
 .../test-documents/testPagesComments.pages         |   Bin 0 -> 154546 bytes
 .../testPagesHeadersFootersFootnotes.pages         |   Bin 0 -> 177328 bytes
 .../test-documents/testPagesPwdProtected.pages     |   Bin 0 -> 33166 bytes
 .../test-documents/testWEBARCHIVE.webarchive       |   646 +
 .../tika-parser-audiovideo-module/pom.xml          |     3 +-
 .../org/apache/tika/parser/audio/MidiParser.java   |   122 +
 .../tika/parser/mp3/CompositeTagHandler.java       |   142 +
 .../java/org/apache/tika/parser/mp3/ID3Tags.java   |   254 +
 .../org/apache/tika/parser/mp3/ID3v1Handler.java   |   183 +
 .../org/apache/tika/parser/mp3/ID3v22Handler.java  |   159 +
 .../org/apache/tika/parser/mp3/ID3v23Handler.java  |   138 +
 .../org/apache/tika/parser/mp3/ID3v2Frame.java     |   430 +
 .../org/apache/tika/parser/mp3/LyricsHandler.java  |   156 +
 .../java/org/apache/tika/parser/mp3/Mp3Parser.java |   256 +
 .../services/org.apache.tika.parser.Parser         |    22 +
 .../apache/tika/parser/audio/AudioParserTest.java  |    75 +
 .../apache/tika/parser/audio/MidiParserTest.java   |    42 +
 .../org/apache/tika/parser/mp3/Mp3ParserTest.java  |   368 +
 .../org/apache/tika/parser/mp3/MpegStreamTest.java |   166 +
 .../src/test/resources/test-documents/test2.mp3    |   Bin 0 -> 2668637 bytes
 .../src/test/resources/test-documents/testAIFF.aif |   Bin 0 -> 3894 bytes
 .../src/test/resources/test-documents/testAU.au    |   Bin 0 -> 3868 bytes
 .../src/test/resources/test-documents/testFLV.flv  |   Bin 0 -> 90580 bytes
 .../test/resources/test-documents/testMP3i18n.mp3  |   Bin 0 -> 40832 bytes
 .../resources/test-documents/testMP3id3v1_v2.mp3   |   Bin 0 -> 40960 bytes
 .../test/resources/test-documents/testMP3id3v2.mp3 |   Bin 0 -> 39577 bytes
 .../test/resources/test-documents/testMP3noid3.mp3 |   Bin 0 -> 39288 bytes
 .../resources/test-documents/testMP3truncated.mp3  |   Bin 0 -> 65536 bytes
 .../resources/test-documents/testMP4_truncated.m4a |   Bin 0 -> 74 bytes
 .../pom.xml                                        |    15 +-
 .../java/org/apache/tika/parser/prt/PRTParser.java |   275 +
 .../test/resources/test-documents/testDWG2010.dwg  |   Bin 0 -> 59562 bytes
 .../tika-parser-code-module/pom.xml                |     3 +-
 .../org/apache/tika/parser/asm/ClassParser.java    |    54 +
 .../java/org/apache/tika/parser/mat/MatParser.java |   146 +
 .../services/org.apache.tika.parser.Parser         |    21 +
 .../test-documents/AutoDetectParser.class          |   Bin 0 -> 3794 bytes
 .../breidamerkurjokull_radar_profiles_2009.mat     |   Bin 27611304 -> 14748772 bytes
 .../test-documents/test-columnar.sas7bdat          |   Bin 0 -> 131072 bytes
 .../src/test/resources/test-documents/testC.c      |     6 +
 .../test/resources/test-documents/testJS_HTML.js   |    91 +
 .../resources/test-documents/testLinux-mips-32be   |   Bin 0 -> 8125 bytes
 .../resources/test-documents/testLinux-mips-32le   |   Bin 0 -> 38051 bytes
 .../resources/test-documents/testLinux-ppc-32be    |   Bin 0 -> 248480 bytes
 .../test/resources/test-documents/testLinux-x86-32 |   Bin 0 -> 7175 bytes
 .../src/test/resources/test-documents/testMATLAB.m |     4 +
 .../resources/test-documents/testMATLAB_barcast.m  |   383 +
 .../resources/test-documents/testMATLAB_wtsgaus.m  |    52 +
 .../src/test/resources/test-documents/testSAS.sas  |    33 +
 .../test/resources/test-documents/testSAS.sas7bdat |   Bin 0 -> 17408 bytes
 .../pom.xml                                        |    20 +-
 .../org/apache/tika/parser/crypto/Pkcs7Parser.java |    90 +
 .../test/resources/test-documents/Test1.txt.tsd    |   Bin 0 -> 4967 bytes
 .../test/resources/test-documents/Test2.txt.tsd    |   Bin 0 -> 4969 bytes
 .../src/test/resources/test-documents/testCERT.der |   Bin
 .../src/test/resources/test-documents/testCERT.pem |     0
 .../test/resources/test-documents/testDSAKEY.der   |   Bin
 .../test/resources/test-documents/testDSAKEY.pem   |     0
 .../resources/test-documents/testDSAPARAMS.pem     |     0
 .../test/resources/test-documents/testDetached.p7s |   Bin 0 -> 2941 bytes
 .../test/resources/test-documents/testECKEY.der    |   Bin
 .../test/resources/test-documents/testECKEY.pem    |     0
 .../test/resources/test-documents/testECPARAMS.pem |     0
 .../test/resources/test-documents/testRSAKEY.der   |   Bin
 .../test/resources/test-documents/testRSAKEY.pem   |     0
 .../test-documents/testTSD_broken_pdf.tsd          |   Bin 0 -> 91985 bytes
 .../pom.xml                                        |    26 +-
 .../tika/parser/digestutils/CommonsDigester.java   |   186 +
 .../pom.xml                                        |    12 +-
 .../apache/tika/parser/font/TrueTypeParser.java    |   117 +
 .../services/org.apache.tika.parser.Parser         |    18 +
 .../tika-parser-html-module/pom.xml                |     3 +-
 .../tika/parser/html/BoilerpipeContentHandler.java |   363 +
 .../org/apache/tika/parser/html/DataURIScheme.java |    77 +
 .../apache/tika/parser/html/DataURISchemeUtil.java |   103 +
 .../tika/parser/html/HtmlEncodingDetector.java     |   188 +
 .../org/apache/tika/parser/html/HtmlHandler.java   |   462 +
 .../org/apache/tika/parser/html/HtmlMapper.java    |    69 +
 .../org/apache/tika/parser/html/HtmlParser.java    |   247 +
 .../html/charsetdetector/CharsetAliases.java       |   145 +
 .../charsetdetector/CharsetDetectionResult.java    |    62 +
 .../parser/html/charsetdetector/PreScanner.java    |   270 +
 .../charsets/ReplacementCharset.java               |    65 +
 .../org.apache.tika.detect.EncodingDetector        |    15 +
 .../html/StandardCharsets_unsupported_by_IANA.txt  |   139 +
 .../tika/parser/html/DataURISchemeParserTest.java  |    77 +
 .../html/StandardHtmlEncodingDetectorTest.java     |   378 +
 .../org/apache/tika/parser/html/tika-config.xml    |    30 +
 .../resources/test-documents/big-preamble.html     |   827 ++
 .../test-documents/boilerplate-whitespace.html     |    27 +
 .../test/resources/test-documents/testHTML.html    |    28 +
 .../test-documents/testHTMLBadScript.html          |     9 +
 .../test-documents/testHTMLGoodScript.html         |     9 +
 ...ing_3.html => testHTMLNoisyMetaEncoding_1.html} |    10 +-
 ...ing_3.html => testHTMLNoisyMetaEncoding_2.html} |    10 +-
 .../testHTMLNoisyMetaEncoding_3.html               |     8 +-
 ...ing_3.html => testHTMLNoisyMetaEncoding_4.html} |     8 +-
 .../test-documents/testHTML_charset_utf16le.html   |   Bin 0 -> 380 bytes
 .../test-documents/testHTML_charset_utf8.html      |     4 +-
 .../testHTML_embedded_data_uri_js.html             |    11 +
 .../test-documents/testHTML_embedded_img.html      |   352 +
 .../resources/test-documents/testHTML_utf8.html    |    25 +
 .../src/test/resources/test-documents/tika434.html |   914 ++
 .../pom.xml                                        |    59 +-
 .../org/apache/tika/parser/image/HeifParser.java   |    62 +
 .../org/apache/tika/parser/image/ICNSParser.java   |   128 +
 .../tika/parser/image/ImageMetadataExtractor.java  |   627 +
 .../org/apache/tika/parser/image/PSDParser.java    |   259 +
 .../apache/tika/parser/image/ImageParserTest.java  |   174 +
 .../apache/tika/parser/image/JpegParserTest.java   |   286 +
 .../apache/tika/parser/image/PSDParserTest.java    |    73 +
 .../src/test/resources/test-documents/testBPG.bpg  |   Bin 0 -> 1824 bytes
 .../testBPG_commented_xnviewmp026.bpg              |   Bin 0 -> 12374 bytes
 .../src/test/resources/test-documents/testGIF.gif  |   Bin 0 -> 8495 bytes
 .../test/resources/test-documents/testHEIF.heic    |   Bin 0 -> 13706 bytes
 .../resources/test-documents/testICNS_basic.icns   |   Bin 0 -> 18199 bytes
 .../test/resources/test-documents/testJBIG2.jb2    |   Bin 0 -> 346 bytes
 .../src/test/resources/test-documents/testJPEG.jp2 |   Bin 0 -> 25725 bytes
 .../src/test/resources/test-documents/testJPEG.jpg |   Bin 0 -> 7686 bytes
 .../resources/test-documents/testJPEG_EXIF.jpg     |   Bin 0 -> 16357 bytes
 .../test-documents/testJPEG_oddTagComponent.jpg    |   Bin 0 -> 8330 bytes
 .../src/test/resources/test-documents/testPNG.png  |   Bin 0 -> 17041 bytes
 .../src/test/resources/test-documents/testTIFF.tif |   Bin 0 -> 25584 bytes
 .../test/resources/test-documents/testWEBP.webp    |   Bin 0 -> 3442 bytes
 .../test-documents/testWebp_Alpha_Lossless.webp    |   Bin 0 -> 92312 bytes
 .../test-documents/testWebp_Alpha_Lossy.webp       |   Bin 0 -> 23404 bytes
 .../tika-parser-integration-tests/pom.xml          |   331 +
 .../apache/tika/config/TikaParserConfigTest.java   |   157 +
 .../tika/config/TikaTranslatorConfigTest.java      |    72 +
 .../tika/detect/TestContainerAwareDetector.java    |   572 +
 .../tika/extractor/EmbeddedDocumentUtilTest.java   |    43 +
 .../java/org/apache/tika/mime/TestMimeTypes.java   |  1336 ++
 .../tika/parser/AutoDetectReaderParserTest.java    |   102 +
 .../parser/BouncyCastleDigestingParserTest.java    |   268 +
 .../tika/parser/RecursiveParserWrapperTest.java    |   456 +
 .../apache/tika/parser/TestXMLEntityExpansion.java |   151 +
 .../parser/apple/AppleSingleFileParserTest.java    |    47 +
 .../apache/tika/parser/apple/PListParserTest.java  |    40 +
 .../apache/tika/parser/html/HtmlParserTest.java    |    66 +
 .../tika/parser/microsoft/EMFParserTest.java       |    52 +
 .../tika/parser/microsoft/ExcelParserTest.java     |    32 +
 .../parser/microsoft/ooxml/TruncatedOOXMLTest.java |    49 +
 .../tika/parser/ocr/TesseractOCRParserTest.java    |   322 +
 .../apache/tika/parser/pkg/CompressParserTest.java |    79 +
 .../org/apache/tika/parser/pkg/GzipParserTest.java |    78 +
 .../org/apache/tika/parser/pkg/TarParserTest.java  |    66 +
 .../org/apache/tika/parser/pkg/ZlibParserTest.java |    46 +
 .../tika/parser/xml/FictionBookParserTest.java     |    47 +
 .../sax/StandardsExtractingContentHandlerTest.java |    53 +
 .../apache/tika/utils/ServiceLoaderUtilsTest.java  |    57 +
 .../tika/config/TIKA-1702-translator-default.xml   |    24 +
 .../config/TIKA-1702-translator-empty-default.xml  |    22 +
 .../tika/config/TIKA-1702-translator-empty.xml     |    20 +
 .../tika/config/TIKA-1708-detector-composite.xml   |    25 +
 ...TIKA-2273-exclude-encoding-detector-default.xml |    29 +
 .../TIKA-2273-parameterize-encoding-detector.xml   |    30 +
 .../org/apache/tika/parser/TIKA-3137-include.xml   |    38 +
 .../CVLKRA-KYC_Download_File_Structure_V3.1.xlsx   |   Bin 0 -> 204480 bytes
 .../test/resources/test-documents/NUTCH-1997.cbor  |    30 +
 .../active_layer_arcss_grid_barrow_alaska_2012.dif |    61 +
 .../test-documents/mock/null_pointer_no_msg.xml}   |    18 +-
 .../resources/test-documents/mock/real_oom.xml}    |    17 +-
 .../resources/test-documents/mock/system_exit.xml} |    18 +-
 .../test-documents/mock/thread_interrupt.xml}      |    18 +-
 .../resources/test-documents/test-columnar.ods     |   Bin 0 -> 12854 bytes
 .../resources/test-documents/test-columnar.xpt     |   Bin 0 -> 4720 bytes
 .../src/test/resources/test-documents/test1.swf    |   Bin 0 -> 21054 bytes
 .../test/resources/test-documents/testAMR-WB.amr   |   Bin 0 -> 3609 bytes
 .../src/test/resources/test-documents/testAPK.apk  |   Bin 0 -> 11740 bytes
 .../resources/test-documents/testBDB_btree_2.db    |   Bin 0 -> 8192 bytes
 .../resources/test-documents/testBDB_btree_3.db    |   Bin 0 -> 8192 bytes
 .../resources/test-documents/testBDB_btree_4.db    |   Bin 0 -> 8192 bytes
 .../resources/test-documents/testBDB_btree_5.db    |   Bin 0 -> 8192 bytes
 .../resources/test-documents/testBDB_hash_2.db     |   Bin 0 -> 12288 bytes
 .../src/test/resources/test-documents/testCERT.pem |     0
 .../src/test/resources/test-documents/testCSS.css  |    48 +
 .../test/resources/test-documents/testDITA.dita    |    34 +
 .../test/resources/test-documents/testDITA2.dita   |    33 +
 .../test/resources/test-documents/testDSAKEY.der   |   Bin
 .../test/resources/test-documents/testDSAKEY.pem   |     0
 .../src/test/resources/test-documents/testEAC3.ac3 |   Bin 0 -> 768 bytes
 .../src/test/resources/test-documents/testEAR.ear  |   Bin 0 -> 1086 bytes
 .../test/resources/test-documents/testECKEY.pem    |     0
 .../test/resources/test-documents/testECPARAMS.pem |     0
 .../src/test/resources/test-documents/testFLAC.oga |   Bin 0 -> 10820 bytes
 .../resources/test-documents/testGRAPHVIZd.dot     |     6 +
 .../resources/test-documents/testGRAPHVIZdc.dot    |     9 +
 .../src/test/resources/test-documents/testHFA.hfa  |   Bin 0 -> 1024 bytes
 .../resources/test-documents/testICalendar.ics     |    15 +
 .../test/resources/test-documents/testINDD.indd    |   Bin 0 -> 880640 bytes
 .../test-documents/testJAVAPROPS.properties        |    22 +
 .../resources/test-documents/testJavaHprofBinary   |   Bin 0 -> 88489 bytes
 .../resources/test-documents/testJavaHprofText     |  2193 ++++
 .../test-documents/testLotus123-lotusftp.wk4       |   Bin 0 -> 6168 bytes
 .../test/resources/test-documents/testLotus123.wk1 |   Bin 0 -> 24291 bytes
 .../test/resources/test-documents/testLotusEml.eml |    71 +
 .../test-documents/testMHTMLFirefox.mhtml          |   455 +
 .../src/test/resources/test-documents/testMKV.mkv  |   Bin 0 -> 82969 bytes
 .../test/resources/test-documents/testMYSQL.MYI    |   Bin 0 -> 1024 bytes
 .../test-documents/testOptionalHyphen.doc          |   Bin 0 -> 22016 bytes
 .../test-documents/testOptionalHyphen.ppt          |   Bin 0 -> 100864 bytes
 .../test-documents/testOptionalHyphen.pptx         |   Bin 0 -> 33173 bytes
 .../test-documents/testOptionalHyphen.rtf          |   158 +
 .../src/test/resources/test-documents/testPICT.pct |   Bin 0 -> 23454 bytes
 .../test-documents/testPKCS17Sig-v4.xml.p7m        |  1606 +++
 .../resources/test-documents/testPKCS17Sig.xml.p7m |  4333 +++++++
 .../test-documents/testPhoneNumberExtractor.odt    |   Bin 0 -> 15244 bytes
 .../src/test/resources/test-documents/testRDF.rdf  |    23 +
 .../resources/test-documents/testSolaris-x86-32    |   Bin 0 -> 6404 bytes
 .../test-documents/testStarOffice-6.0-calc.sxc     |   Bin 0 -> 7406 bytes
 .../test-documents/testStarOffice-6.0-writer.sxw   |   Bin 0 -> 5200 bytes
 .../test/resources/test-documents/testStataDTA.dta |   Bin 0 -> 1207 bytes
 .../test/resources/test-documents/testStataDTA.txt |    15 +
 .../resources/test-documents/testTAR_no_magic.tar  |   Bin 0 -> 156160 bytes
 .../test-documents/testTXTNonASCIIUTF8.txt         |     7 +
 .../test-documents/testThunderbirdEml.eml          |    32 +
 .../test/resources/test-documents/testVORBIS.ogg   |   Bin 0 -> 4241 bytes
 .../test-documents/testVORDrawTemplate.vor         |   Bin 0 -> 29696 bytes
 .../test-documents/testVORWriterTemplate.vor       |   Bin 0 -> 8192 bytes
 .../test/resources/test-documents/testWMV_WMV2.wmv |   Bin 0 -> 554297 bytes
 .../test/resources/test-documents/testWORKS.wps    |   Bin 0 -> 9728 bytes
 .../resources/test-documents/testWORKS2000.wps     |   Bin 0 -> 5120 bytes
 .../test-documents/testWORKSWordProcessor3.0.wps   |   Bin 0 -> 3072 bytes
 .../resources/test-documents/testWebVTT_simple.vtt |    10 +
 .../test-documents/testWindowsMediaMeta.asx        |     6 +
 .../src/test/resources/test-documents/testXDP.xdp  |     5 +
 .../test/resources/test-documents/testXFDF.xfdf    |     7 +
 .../tika-parser-jdbc-commons/pom.xml               |     3 +-
 .../tika-parser-mail-commons/pom.xml               |     3 +-
 .../apache/tika/parser/mailcommons/MailUtil.java   |   116 +
 .../tika/parser/mailcommons/MailUtilTest.java      |    56 +
 .../pom.xml                                        |    45 +-
 .../tika/parser/mail/MailContentHandler.java       |   657 +
 .../org/apache/tika/parser/mail/RFC822Parser.java  |   133 +
 .../apache/tika/parser/mail/RFC822ParserTest.java  |   603 +
 .../mail/tika-config-extract-all-alternatives.xml  |    30 +
 .../test/resources/test-documents/multiline.mbox   |     5 +
 .../src/test/resources/test-documents/quoted.mbox  |     4 +
 .../src/test/resources/test-documents/simple.mbox  |     7 +
 .../test-documents/testEmailWithPNGAtt.eml         |   354 +
 .../resources/test-documents/testGroupWiseEml.eml  |    58 +
 .../src/test/resources/test-documents/testRFC822   |    41 +
 .../resources/test-documents/testRFC822-txt-body   |    35 +
 .../resources/test-documents/testRFC822_base64     |     8 +
 .../resources/test-documents/testRFC822_date_utf8  |     8 +
 .../test-documents/testRFC822_encrypted_zip        |    61 +
 .../test-documents/testRFC822_i18nheaders          |     9 +
 .../resources/test-documents/testRFC822_oddfrom    |  2105 +++
 .../resources/test-documents/testRFC822_quoted     |    13 +
 .../tika-parser-microsoft-module/pom.xml           |     3 +-
 .../parser/microsoft/MSOwnerFileParserTest.java    |    31 +
 .../tika/parser/microsoft/OfficeParserTest.java    |    46 +
 .../tika/parser/microsoft/OldExcelParserTest.java  |   122 +
 .../parser/microsoft/SolidworksParserTest.java     |   189 +
 .../tika/parser/microsoft/VisioParserTest.java     |    51 +
 .../parser/microsoft/chm/TestChmBlockInfo.java     |   116 +
 .../parser/microsoft/chm/TestChmExtractor.java     |    76 +
 .../parser/microsoft/chm/TestChmItsfHeader.java    |   119 +
 .../tika/parser/microsoft/chm/TestChmLzxState.java |    95 +
 .../microsoft/chm/TestChmLzxcControlData.java      |   139 +
 .../microsoft/chm/TestDirectoryListingEntry.java   |    85 +
 .../tika/parser/microsoft/chm/TestPmgiHeader.java  |    44 +
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |  1771 +++
 .../parser/microsoft/ooxml/SXSLFExtractorTest.java |   632 +
 .../parser/microsoft/ooxml/xps/XPSParserTest.java  |    97 +
 .../ooxml/xwpf/ml2006/Word2006MLParserTest.java    |   202 +
 .../tika/parser/microsoft/rtf/RTFParserTest.java   |   501 +
 .../parser/microsoft/xml/XML2003ParserTest.java    |    93 +
 .../microsoft/ooxml/tika-config-sax-macros.xml     |    34 +
 .../tika/parser/microsoft/rtf/tika-config.xml      |    26 +
 .../tika-config-extract-all-alternatives-msg.xml   |    30 +
 .../tika/parser/microsoft/tika-config-macros.xml   |    32 +
 .../resources/test-documents/EmbeddedDocument.docx |   Bin 0 -> 13219 bytes
 .../resources/test-documents/EmbeddedOutlook.docx  |   Bin 0 -> 113242 bytes
 .../test/resources/test-documents/NullHeader.docx  |   Bin 0 -> 4355 bytes
 .../test/resources/test-documents/chm/IMJPCLE.CHM  |   Bin 0 -> 256718 bytes
 .../test/resources/test-documents/chm/admin.chm    |   Bin 0 -> 49749 bytes
 .../test/resources/test-documents/chm/cmak_ops.CHM |   Bin 0 -> 82895 bytes
 .../resources/test-documents/chm/wmicontrol.CHM    |   Bin 0 -> 32096 bytes
 .../test/resources/test-documents/headerPic.docx   |   Bin 0 -> 16206 bytes
 .../src/test/resources/test-documents/pictures.ppt |   Bin 0 -> 75776 bytes
 .../src/test/resources/test-documents/protect.xlsx |   Bin 0 -> 12968 bytes
 .../resources/test-documents/protectedFile.xlsx    |   Bin 0 -> 12968 bytes
 .../resources/test-documents/test-columnar.xlsb    |   Bin 0 -> 9691 bytes
 .../src/test/resources/test-documents/test.doc     |   Bin 0 -> 9216 bytes
 .../resources/test-documents/testAccess_V1997.mdb  |   Bin 0 -> 118784 bytes
 .../test-documents/testBinControlWord.rtf          |     2 +
 .../src/test/resources/test-documents/testChm.chm  |   Bin 0 -> 186259 bytes
 .../src/test/resources/test-documents/testChm3.chm |   Bin 0 -> 900481 bytes
 .../test/resources/test-documents/testComment.ppt  |   Bin 0 -> 101376 bytes
 .../test/resources/test-documents/testComment.pptx |   Bin 0 -> 34979 bytes
 .../test/resources/test-documents/testDOTM.dotm    |   Bin 0 -> 65527 bytes
 .../resources/test-documents/testDocumentLink.doc  |   Bin 0 -> 812032 bytes
 .../resources/test-documents/testEXCEL-charts.xls  |   Bin 0 -> 15360 bytes
 .../resources/test-documents/testEXCEL.strict.xlsx |   Bin 0 -> 10006 bytes
 .../test/resources/test-documents/testEXCEL_4.xls  |   Bin 0 -> 39942 bytes
 .../testEXCEL_WORKBOOK_in_capitals.xls             |   Bin 0 -> 64512 bytes
 .../test-documents/testEXCEL_big_numbers.xls       |   Bin 0 -> 26112 bytes
 .../test-documents/testEXCEL_custom_props.xlsx     |   Bin 0 -> 9230 bytes
 .../test-documents/testEXCEL_dateFormats.xlsx      |   Bin 0 -> 8766 bytes
 .../test-documents/testEXCEL_diagramData.xlsx      |   Bin 0 -> 16654 bytes
 .../test-documents/testEXCEL_embeddedPDF_mac.xlsx  |   Bin 0 -> 80578 bytes
 .../testEXCEL_embeddedPDF_windows.xls              |   Bin 0 -> 61952 bytes
 .../resources/test-documents/testEXCEL_embeded.xls |   Bin 0 -> 303104 bytes
 .../test-documents/testEXCEL_headers_footers.xls   |   Bin 0 -> 33792 bytes
 .../test-documents/testEXCEL_hyperlinks.xls        |   Bin 0 -> 29696 bytes
 .../testEXCEL_labels-govdocs-515858.xls            |   Bin 0 -> 57856 bytes
 .../testEXCEL_macro_enabled_template.xltm          |   Bin 0 -> 8619 bytes
 .../test-documents/testEXCEL_poi-61034.xlsx        |   Bin 0 -> 32774 bytes
 .../testEXCEL_protected_passtika.xls               |   Bin 0 -> 17408 bytes
 .../testEXCEL_protected_passtika.xlsx              |   Bin 0 -> 12800 bytes
 .../testEXCEL_protected_passtika_2.xlsx            |   Bin 0 -> 15872 bytes
 .../test-documents/testEXCEL_template.xltx         |   Bin 0 -> 8589 bytes
 .../test-documents/testExcel_embeddedPDF.xlsx      |   Bin 0 -> 25602 bytes
 .../resources/test-documents/testException1.doc    |   Bin 0 -> 49152 bytes
 .../test-documents/testMSChart-govdocs-428996.xls  |   Bin 0 -> 35328 bytes
 .../test-documents/testMSChart-govdocs-428996.xlsx |   Bin 0 -> 17112 bytes
 .../src/test/resources/test-documents/testMSG.msg  |   Bin 0 -> 20480 bytes
 .../test-documents/testMSG_Appointment.msg         |   Bin 0 -> 30208 bytes
 .../test/resources/test-documents/testMSG_Post.msg |   Bin 0 -> 21504 bytes
 .../resources/test-documents/testMSG_chinese.msg   |   Bin 0 -> 48129 bytes
 .../resources/test-documents/testMSG_forwarded.msg |   Bin 0 -> 25600 bytes
 .../test/resources/test-documents/testMSOwnerFile  |   Bin 0 -> 162 bytes
 .../test/resources/test-documents/testOneNote.one  |   Bin 0 -> 30288 bytes
 .../test/resources/test-documents/testOneNote1.one |   Bin 0 -> 360280 bytes
 .../test-documents/testOneNote2007OrEarlier1.one   |   Bin 0 -> 1246998 bytes
 .../test/resources/test-documents/testOneNote3.one |   Bin 0 -> 35344 bytes
 .../src/test/resources/test-documents/testPPM.ppm  |     4 +
 .../src/test/resources/test-documents/testPPT.potm |   Bin 0 -> 40102 bytes
 .../src/test/resources/test-documents/testPPT.ppsm |   Bin 0 -> 36545 bytes
 .../src/test/resources/test-documents/testPPT.ppsx |   Bin 0 -> 36521 bytes
 .../src/test/resources/test-documents/testPPT.thmx |   Bin 0 -> 42485 bytes
 .../testPPTX_overlappingRelations.pptx             |   Bin 0 -> 38135 bytes
 .../resources/test-documents/testPPT_2imgs.ppt     |   Bin 0 -> 124928 bytes
 .../resources/test-documents/testPPT_2imgs.pptx    |   Bin 0 -> 59246 bytes
 .../test-documents/testPPT_EmbeddedPDF.ppt         |   Bin 0 -> 187392 bytes
 .../test-documents/testPPT_EmbeddedPDF.pptx        |   Bin 0 -> 108637 bytes
 .../resources/test-documents/testPPT_comment.pptx  |   Bin 0 -> 30939 bytes
 .../test-documents/testPPT_custom_props.ppt        |   Bin 0 -> 104960 bytes
 .../test-documents/testPPT_diagramData.pptx        |   Bin 0 -> 48793 bytes
 .../resources/test-documents/testPPT_embedded2.ppt |   Bin 0 -> 92160 bytes
 .../test-documents/testPPT_embeddedMP3.pptx        |   Bin 0 -> 84434 bytes
 .../testPPT_embedded_two_slides.pptx               |   Bin 0 -> 255364 bytes
 .../resources/test-documents/testPPT_embeded.ppt   |   Bin 0 -> 224768 bytes
 .../resources/test-documents/testPPT_embeded.pptx  |   Bin 0 -> 202969 bytes
 .../resources/test-documents/testPPT_groups.ppt    |   Bin 0 -> 161792 bytes
 .../resources/test-documents/testPPT_macros.ppt    |   Bin 0 -> 88064 bytes
 .../test-documents/testPPT_masterFooter.pptx       |   Bin 0 -> 35128 bytes
 .../test-documents/testPPT_masterText.pptx         |   Bin 0 -> 32270 bytes
 .../test-documents/testPPT_masterText2.ppt         |   Bin 0 -> 102912 bytes
 .../test-documents/testPPT_masterText2.pptx        |   Bin 0 -> 32291 bytes
 .../test-documents/testPPT_oleWorkbook.ppt         |   Bin 0 -> 98304 bytes
 .../test-documents/testPPT_oleWorkbook.pptx        |   Bin 0 -> 44001 bytes
 .../test-documents/testPPT_protected_passtika.pptx |   Bin 0 -> 41472 bytes
 .../resources/test-documents/testPPT_various.ppt   |   Bin 0 -> 160768 bytes
 .../resources/test-documents/testPPT_various.pptx  |   Bin 0 -> 56659 bytes
 .../resources/test-documents/testPROJECT2007.mpp   |   Bin 0 -> 147968 bytes
 .../test-documents/testPST_variousBodyTypes.pst    |   Bin 0 -> 271360 bytes
 .../resources/test-documents/testPUBLISHER.pub     |   Bin 0 -> 65536 bytes
 .../resources/test-documents/testRTFBoldItalic.rtf |   164 +
 .../resources/test-documents/testRTFBoldPlain.rtf  |    17 +
 .../test-documents/testRTFEmbeddedFiles.rtf        |  6856 ++++++++++
 .../resources/test-documents/testRTFHyperlink.rtf  |   598 +
 .../test-documents/testRTFIgnoredControlWord.rtf   |    17 +
 .../test-documents/testRTFInvalidUnicode.rtf       |    11 +
 .../test-documents/testRTFListMicrosoftWord.rtf    |   227 +
 .../resources/test-documents/testRTFTIKA_2883.rtf  |   Bin 0 -> 1526 bytes
 .../test-documents/testRTFTableCellSeparation2.rtf |     3 +
 ...stRTFUnicodeUCNControlWordCharacterDoubling.rtf |     8 +
 .../testRTFWord2010CzechCharacters.rtf             |   190 +
 .../testRTFWordPadCzechCharacters.rtf              |     5 +
 .../test/resources/test-documents/testVISIO.vsdm   |   Bin 0 -> 32360 bytes
 .../test/resources/test-documents/testVISIO.vssm   |   Bin 0 -> 32358 bytes
 .../test/resources/test-documents/testVISIO.vssx   |   Bin 0 -> 32349 bytes
 .../test/resources/test-documents/testVISIO.vstx   |   Bin 0 -> 32350 bytes
 .../test/resources/test-documents/testWINMAIL.dat  |   Bin 0 -> 66276 bytes
 .../src/test/resources/test-documents/testWMF.wmf  |   Bin 0 -> 51590 bytes
 .../resources/test-documents/testWMF_charset.wmf   |   Bin 0 -> 9316 bytes
 .../test/resources/test-documents/testWORD2003.xml |  2542 ++++
 .../resources/test-documents/testWORD_1img.docx    |   Bin 0 -> 8325 bytes
 .../resources/test-documents/testWORD_3imgs.doc    |   Bin 0 -> 36352 bytes
 .../test-documents/testWORD_boldHyperlink.docx     |   Bin 0 -> 12382 bytes
 .../resources/test-documents/testWORD_charts.docx  |   Bin 0 -> 15586 bytes
 .../testWORD_closingSmartQInHyperLink.doc          |   Bin 0 -> 26624 bytes
 .../test-documents/testWORD_custom_props.docx      |   Bin 0 -> 13942 bytes
 .../test-documents/testWORD_docSecurity.docx       |   Bin 0 -> 12861 bytes
 .../test-documents/testWORD_embedded_pdf.doc       |   Bin 0 -> 1491456 bytes
 .../test-documents/testWORD_embedded_pics.docx     |   Bin 0 -> 52399 bytes
 .../test-documents/testWORD_header_hyperlink.doc   |   Bin 0 -> 22528 bytes
 .../resources/test-documents/testWORD_macros.doc   |   Bin 0 -> 38400 bytes
 .../resources/test-documents/testWORD_macros.docm  |   Bin 0 -> 17322 bytes
 .../testWORD_missing_ooxml_bean1.docx              |   Bin 0 -> 17913 bytes
 .../test-documents/testWORD_multi_authors.doc      |   Bin 0 -> 22528 bytes
 .../test-documents/testWORD_numbered_list.doc      |   Bin 0 -> 44032 bytes
 .../testWORD_override_list_numbering.doc           |   Bin 0 -> 56320 bytes
 .../testWORD_override_list_numbering.docx          |   Bin 0 -> 15746 bytes
 .../resources/test-documents/testWORD_phonetic.doc |   Bin 0 -> 27136 bytes
 .../test-documents/testWORD_phonetic.docx          |   Bin 0 -> 12523 bytes
 .../resources/test-documents/testWORD_signed.docx  |   Bin 0 -> 18245 bytes
 .../testWORD_totalTimeOutOfRange.docx              |   Bin 0 -> 11047 bytes
 .../resources/test-documents/testWORD_various.doc  |   Bin 0 -> 17408 bytes
 .../test-documents/testWORKSSpreadsheet7.0.xlr     |   Bin 0 -> 10752 bytes
 .../test/resources/test-documents/testWordArt.pptx |   Bin 0 -> 37792 bytes
 .../test-documents/test_embedded_zip.pptx          |   Bin 0 -> 345027 bytes
 .../test-documents/test_list_override.rtf          |    21 +
 .../test-documents/test_recursive_embedded.docx    |   Bin 0 -> 27082 bytes
 .../testsolidworksDrawing2014SP0.SLDDRW            |   Bin 0 -> 201216 bytes
 .../testsolidworksPart2014SP0.SLDPRT               |   Bin 0 -> 1043456 bytes
 .../tika-parser-miscoffice-module/pom.xml          |     3 +-
 .../java/org/apache/tika/parser/dbf/DBFCell.java   |   147 +
 .../org/apache/tika/parser/dbf/DBFFileHeader.java  |   144 +
 .../apache/tika/parser/epub/EpubContentParser.java |    56 +
 .../org/apache/tika/parser/epub/EpubParser.java    |   496 +
 .../apache/tika/parser/hwp/HwpTextExtractorV5.java |   514 +
 .../apache/tika/parser/mif/MIFContentHandler.java  |   122 +
 .../org/apache/tika/parser/mif/MIFExtractor.java   |   179 +
 .../parser/odf/NSNormalizerContentHandler.java     |    99 +
 .../tika/parser/odf/OpenDocumentContentParser.java |    70 +
 .../tika/parser/odf/OpenDocumentMacroHandler.java  |    60 +
 .../tika/parser/wordperfect/QPWTextExtractor.java  |   225 +
 .../tika/parser/wordperfect/QuattroProParser.java  |    72 +
 .../tika/parser/wordperfect/WP5Charsets.java       |   203 +
 .../wordperfect/WPDocumentAreaExtractor.java       |    88 +
 .../tika/parser/wordperfect/WPInputStream.java     |   224 +
 .../parser/wordperfect/WPPrefixAreaExtractor.java  |    67 +
 .../org/apache/tika/parser/dbf/DBFParserTest.java  |   150 +
 .../apache/tika/parser/hwp/HwpV5ParserTest.java    |    83 +
 .../tika/parser/wordperfect/QuattroProTest.java    |    48 +
 .../tika/parser/wordperfect/WPInputStreamTest.java |   127 +
 .../org/apache/tika/parser/epub/tika-config.xml    |    26 +
 .../src/test/resources/test-documents/testDBF.dbf  |   Bin 0 -> 890 bytes
 .../resources/test-documents/testDBF_gb18030.dbf   |   Bin 0 -> 144 bytes
 .../test/resources/test-documents/testFooter.ods   |   Bin 0 -> 7207 bytes
 .../resources/test-documents/testFramemakerMif.mif | 12955 +++++++++++++++++++
 .../test/resources/test-documents/testHWP_3.0.hwp  |   Bin 0 -> 9287 bytes
 .../resources/test-documents/testODPMacro.fodp     |   781 ++
 .../test/resources/test-documents/testODPMacro.odp |   Bin 0 -> 14505 bytes
 .../test/resources/test-documents/testODP_NPE.odp  |   Bin 0 -> 431290 bytes
 .../resources/test-documents/testODTMacro.fodt     |   633 +
 .../test/resources/test-documents/testODTMacro.odt |   Bin 0 -> 30809 bytes
 .../resources/test-documents/testODTStyles2.odt    |   Bin 0 -> 17383 bytes
 .../resources/test-documents/testODTStyles3.odt    |   Bin 0 -> 17140 bytes
 .../resources/test-documents/testOpenOffice2.odf   |   Bin 0 -> 10977 bytes
 .../test/resources/test-documents/testQuattro.wq2  |   Bin 0 -> 7938 bytes
 .../test/resources/test-documents/testStyles.odt   |   Bin 0 -> 11663 bytes
 .../test-documents/testWordPerfect_42.doc          |   Bin 0 -> 725 bytes
 .../test-documents/testWordPerfect_5_0.wp          |   Bin 0 -> 9915 bytes
 .../test-documents/testWordPerfect_5_1.wp          |   Bin 0 -> 18267 bytes
 .../resources/test-documents/testiBooks.ibooks     |   Bin 0 -> 970636 bytes
 .../tika-parser-news-module/pom.xml                |     3 +-
 .../apache/tika/parser/iptc/IptcAnpaParser.java    |   808 ++
 .../services/org.apache.tika.parser.Parser         |    17 +
 .../test/resources/test-documents/testATOM.atom    |    27 +
 .../tika-parser-ocr-module/pom.xml                 |     3 +-
 .../apache/tika/parser/ocr/TesseractOCRConfig.java |   689 +
 .../services/org.apache.tika.parser.Parser         |    15 +
 .../tika/parser/ocr/TesseractOCRConfig.properties  |    36 +
 .../org/apache/tika/config/TIKA-2705-tesseract.xml |    33 +
 .../src/test/resources/test-documents/testOCR.docx |   Bin 0 -> 62041 bytes
 .../src/test/resources/test-documents/testOCR.jpg  |   Bin 0 -> 3408 bytes
 .../src/test/resources/test-documents/testOCR.pptx |   Bin 0 -> 78550 bytes
 .../tika-parser-pdf-module/pom.xml                 |     3 +-
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |  1001 ++
 .../org/apache/tika/parser/pdf/AccessChecker.java  |   100 +
 .../apache/tika/parser/pdf/AccessCheckerTest.java  |   137 +
 .../tika/parser/pdf/PDFPreflightParserTest.java    |    58 +
 .../org/apache/tika/parser/pdf/tika-config.xml     |    26 +
 .../tika/parser/pdf/tika-preflight-config.xml      |    25 +
 .../resources/test-documents/testAnnotations.pdf   |   Bin 0 -> 18580 bytes
 .../test-documents/testOptionalHyphen.pdf          |   Bin 0 -> 44954 bytes
 .../test-documents/testPDFFileEmbInAnnotation.pdf  |   Bin 0 -> 97211 bytes
 .../test-documents/testPDFTripleLangTitle.pdf      |   Bin 0 -> 1719 bytes
 .../test-documents/testPDFTwoTextBoxes.pdf         |   Bin 0 -> 57100 bytes
 .../resources/test-documents/testPDFVarious.pdf    |   Bin 0 -> 205491 bytes
 .../testPDF_Version.11.x.PDFA-1b.pdf               |   Bin 0 -> 23081 bytes
 .../test-documents/testPDF_Version.6.x.pdf         |   Bin 0 -> 5903 bytes
 .../test-documents/testPDF_Version.8.x.pdf         |   Bin 0 -> 5903 bytes
 .../test-documents/testPDF_bad_page_303226.pdf     |   Bin 0 -> 138027 bytes
 .../resources/test-documents/testPDF_bookmarks.pdf |   Bin 0 -> 9487 bytes
 .../test-documents/testPDF_diffTitles.pdf          |   261 +
 ...DF_no_extract_yes_accessibility_owner_empty.pdf |    87 +
 .../resources/test-documents/testPDF_protected.pdf |   Bin 0 -> 506064 bytes
 .../test-documents/testPDF_twoAuthors.pdf          |   Bin 0 -> 12628 bytes
 .../test-documents/testPopupAnnotation.pdf         |   Bin 0 -> 9081 bytes
 .../tika-parser-pkg-module/pom.xml                 |     3 +-
 .../apache/tika/parser/pkg/AbstractPkgTest.java    |    90 +
 .../apache/tika/parser/pkg/CompressParserTest.java |    72 +
 .../apache/tika/parser/pkg/PackageParserTest.java  |    81 +
 .../org/apache/tika/parser/pkg/RarParserTest.java  |   122 +
 .../org/apache/tika/parser/pkg/TarParserTest.java  |    77 +
 .../src/test/resources/test-documents/TIKA-216.tgz |   Bin 0 -> 1270 bytes
 .../resources/test-documents/full_encrypted.7z     |   Bin 0 -> 198 bytes
 .../src/test/resources/test-documents/moby.zip     |   Bin 0 -> 606033 bytes
 .../src/test/resources/test-documents/quine.gz     |   Bin 0 -> 204 bytes
 .../test-documents/test-documents-enc.rar          |   Bin 0 -> 68636 bytes
 .../test-documents/test-documents-spanned.z01      |   Bin 0 -> 65536 bytes
 .../test-documents/test-documents-spanned.zip      |   Bin 0 -> 3488 bytes
 .../resources/test-documents/test-documents.cpio   |   Bin 0 -> 116224 bytes
 .../resources/test-documents/test-zip-of-zip.zip   |   Bin 0 -> 299 bytes
 .../test/resources/test-documents/testARofText.ar  |     5 +
 .../resources/test-documents/testJAR_with_HTML.jar |   Bin 0 -> 5594 bytes
 .../src/test/resources/test-documents/testLZMA_oom |   Bin 0 -> 19 bytes
 .../src/test/resources/test-documents/testSVG.svgz |   Bin 0 -> 222 bytes
 .../resources/test-documents/testSnappy-framed.sz  |   Bin 0 -> 58586 bytes
 .../test/resources/test-documents/testZSTD.zstd    |   Bin 0 -> 143 bytes
 .../src/test/resources/test-documents/testZ_oom.Z  |     1 +
 .../testZip_with_DataDescriptor2.zip               |   Bin 0 -> 1987 bytes
 .../tika-parser-text-module/pom.xml                |     3 +-
 .../apache/tika/parser/strings/StringsConfig.java  |   187 +
 .../tika/parser/strings/StringsEncoding.java       |    45 +
 .../org/apache/tika/parser/txt/CharsetMatch.java   |   267 +
 .../apache/tika/parser/txt/CharsetRecog_sbcs.java  |  1356 ++
 .../tika/parser/txt/UniversalEncodingListener.java |   113 +
 .../services/org.apache.tika.parser.Parser         |    22 +
 .../parser/strings/Latin1StringsParserTest.java    |    69 +
 .../tika/parser/strings/StringsConfigTest.java     |    61 +
 .../resources/test-documents/english.cp500.txt     |     1 +
 .../resources/test-documents/multi-language.txt    |    38 +-
 .../src/test/resources/test-documents/resume.html  |    73 +
 .../resources/test-documents/russian.cp866.txt     |    12 +-
 .../resources/test-documents/testTXT_win-1252.txt  |     1 +
 .../resources/test-documents/testVCalendar.vcs     |    10 +
 .../test-properties/StringsConfig-full.properties  |    18 +
 .../tika-parser-xml-module/pom.xml                 |     3 +-
 .../tika/parser/xliff/XLIFF12ContentHandler.java   |   133 +
 .../org/apache/tika/parser/xliff/XLZParser.java    |   146 +
 .../org/apache/tika/parser/xml/DcXMLParser.java    |    60 +
 .../tika/parser/xml/ElementMetadataHandler.java    |   241 +
 .../apache/tika/parser/xml/FictionBookParser.java  |   114 +
 .../apache/tika/parser/xml/MetadataHandler.java    |    85 +
 .../java/org/apache/tika/parser/xml/XMLParser.java |    92 +
 .../tika/parser/xliff/XLIFF12ParserTest.java       |    54 +
 .../tika/parser/xml/FictionBookParserTest.java     |    42 +
 .../parser/xml/TextAndAttributeXMLParserTest.java  |    53 +
 .../test/resources/test-documents/testXLIFF12.xlz  |   Bin 0 -> 1004 bytes
 .../src/test/resources/test-documents/testXXE.xml  |     4 +
 .../tika-parser-xmp-commons/pom.xml                |     3 +-
 .../test-documents/testJPEG_commented.jpg          |   Bin 0 -> 13325 bytes
 .../testJPEG_commented_pspcs2mac.jpg               |   Bin 0 -> 26173 bytes
 .../pom.xml                                        |    14 +-
 .../tika/detect/zip/CompressorConstants.java       |    77 +
 .../detect/zip/DeprecatedZipContainerDetector.java |    39 +
 .../org/apache/tika/detect/zip/IPADetector.java    |   119 +
 .../org/apache/tika/detect/zip/KMZDetector.java    |    98 +
 .../apache/tika/detect/zip/StarOfficeDetector.java |   144 +
 .../tika/detect/zip/StreamingDetectContext.java    |    78 +
 .../tika/detect/zip/ZipContainerDetector.java      |    66 +
 .../services/org.apache.tika.detect.Detector       |    16 +
 .../org/apache/tika/detect/zip/ZipParserTest.java  |    45 +
 565 files changed, 71447 insertions(+), 206 deletions(-)

diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/pom.xml b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/pom.xml
index 4a8c390..1b836af 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/pom.xml
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/pom.xml
@@ -21,7 +21,7 @@
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
     <parent>
-        <artifactId>tika-parser-modules</artifactId>
+        <artifactId>tika-parsers-classic-modules</artifactId>
         <groupId>org.apache.tika</groupId>
         <version>2.0.0-SNAPSHOT</version>
     </parent>
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
new file mode 100644
index 0000000..a0b8a3f
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
@@ -0,0 +1,192 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.apple;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaMemoryLimitException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.EndianUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Parser that strips the header off of AppleSingle and AppleDouble
+ * files.
+ * <p>
+ * See <a href="http://kaiser-edv.de/documents/AppleSingle_AppleDouble.pdf">spec document</a>.
+ */
+public class AppleSingleFileParser extends AbstractParser {
+
+    private static final int MAX_FIELD_LENGTH = 1_073_741_824;
+    /**
+     * Entry types
+     */
+    private static final int DATA_FORK = 1;
+    private static final int RESOURCE_FORK = 2;
+    private static final int REAL_NAME = 3;
+    private static final int COMMENT = 4;
+    private static final int ICON_BW = 5;
+    private static final int ICON_COLOR = 6;
+    //7?!
+    private static final int FILE_DATES_INFO = 8;
+    private static final int FINDER_INFO = 9;
+    private static final int MACINTOSH_FILE_INFO = 10;
+    private static final int PRODOS_FILE_INFO = 11;
+    private static final int MSDOS_FILE_INFO = 12;
+    private static final int SHORT_NAME = 13;
+    private static final int AFP_FILE_INFO = 14;
+    private static final int DIRECTORY_ID = 15;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.application("applefile"));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+                      Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        EmbeddedDocumentExtractor ex = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+
+        short numEntries = readThroughNumEntries(stream);
+        long bytesRead = 26;
+        List<FieldInfo> fieldInfoList = getSortedFieldInfoList(stream, numEntries);
+        bytesRead += 12*numEntries;
+        Metadata embeddedMetadata = new Metadata();
+        bytesRead = processFieldEntries(stream, fieldInfoList, embeddedMetadata, bytesRead);
+        FieldInfo contentFieldInfo = getContentFieldInfo(fieldInfoList);
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        if (contentFieldInfo != null) {
+            long diff = contentFieldInfo.offset-bytesRead;
+            IOUtils.skipFully(stream, diff);
+            if (ex.shouldParseEmbedded(embeddedMetadata)) {
+                // TODO: we should probably add a readlimiting wrapper around this
+                // stream to ensure that not more than contentFieldInfo.length bytes
+                // are read
+                ex.parseEmbedded(new CloseShieldInputStream(stream),
+                        xhtml, embeddedMetadata, false);
+            }
+        }
+        xhtml.endDocument();
+
+    }
+
+    private FieldInfo getContentFieldInfo(List<FieldInfo> fieldInfoList) {
+        for (FieldInfo fieldInfo : fieldInfoList) {
+            if (fieldInfo.entryId == 1) {
+                return fieldInfo;
+            }
+        }
+        return null;
+    }
+
+    private long processFieldEntries(InputStream stream, List<FieldInfo> fieldInfoList,
+                                     Metadata embeddedMetadata, long bytesRead) throws IOException, TikaException {
+        byte[] buffer = null;
+        for (FieldInfo f : fieldInfoList) {
+            long diff = f.offset - bytesRead;
+            //just in case
+            IOUtils.skipFully(stream, diff);
+            bytesRead += diff;
+            if (f.entryId == REAL_NAME) {
+                if (f.length > MAX_FIELD_LENGTH) {
+                    throw new TikaMemoryLimitException(f.length, MAX_FIELD_LENGTH);
+                }
+                buffer = new byte[(int)f.length];
+                IOUtils.readFully(stream, buffer);
+                bytesRead += f.length;
+                String originalFileName = new String(buffer, 0, buffer.length, StandardCharsets.US_ASCII);
+                embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, originalFileName);
+            } else if (f.entryId != DATA_FORK) {
+                IOUtils.skipFully(stream, f.length);
+                bytesRead += f.length;
+            }
+        }
+        return bytesRead;
+    }
+
+
+    private List<FieldInfo> getSortedFieldInfoList(InputStream stream, short numEntries) throws IOException, TikaException {
+        //this is probably overkill.  I'd hope that these were already
+        //in order.  This ensures it.
+        List<FieldInfo> fieldInfoList = new ArrayList<>(numEntries);
+        for (int i = 0; i < numEntries; i++) {
+            //convert 32-bit unsigned ints to longs
+            fieldInfoList.add(
+                    new FieldInfo(
+                            EndianUtils.readUIntBE(stream), //entry id
+                            EndianUtils.readUIntBE(stream), //offset
+                            EndianUtils.readUIntBE(stream) //length
+                    )
+            );
+        }
+        if (fieldInfoList.size() == 0) {
+            throw new TikaException("AppleSingleFile missing field info");
+        }
+        //make absolutely sure these are in order!
+        fieldInfoList.sort(Comparator.comparingLong(fieldInfo -> fieldInfo.offset));
+        return fieldInfoList;
+    }
+
+    //read through header until you hit the number of entries
+    private short readThroughNumEntries(InputStream stream) throws TikaException, IOException {
+        //mime
+        EndianUtils.readIntBE(stream);
+        //version
+        long version = EndianUtils.readIntBE(stream);
+        if (version != 0x00020000) {
+            throw new TikaException("Version should have been 0x00020000, but was:"+version);
+        }
+        IOUtils.skipFully(stream, 16);//filler
+        return EndianUtils.readShortBE(stream);//number of entries
+    }
+
+    private class FieldInfo {
+
+        private final long entryId;
+        private final long offset;
+        private final long length;
+
+        private FieldInfo(long entryId, long offset, long length) {
+            this.entryId = entryId;
+            this.offset = offset;
+            this.length = length;
+        }
+    }
+}
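
A minimal usage sketch for the new AppleSingleFileParser, assuming stock Tika 2.x classes (BodyContentHandler, AutoDetectParser, ParseContext); the wrapper class name and the input path are placeholders, not part of this commit:

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.apple.AppleSingleFileParser;
import org.apache.tika.sax.BodyContentHandler;

public class AppleSingleExample {
    public static void main(String[] args) throws Exception {
        AppleSingleFileParser parser = new AppleSingleFileParser();
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        // register a delegate parser so the extracted data fork is actually parsed;
        // Tika's default embedded handling delegates to whatever Parser is in the context
        context.set(Parser.class, new AutoDetectParser());
        // placeholder path -- any AppleSingle/AppleDouble file
        try (InputStream stream = Files.newInputStream(Paths.get("some-file.applesingle"))) {
            parser.parse(stream, handler, metadata, context);
        }
        // text of the embedded data fork, with the AppleSingle header stripped
        System.out.println(handler.toString());
    }
}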
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/BPListDetector.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/BPListDetector.java
new file mode 100644
index 0000000..731e88e
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/BPListDetector.java
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.apple;
+
+import com.dd.plist.NSDictionary;
+import com.dd.plist.NSObject;
+import com.dd.plist.PropertyListFormatException;
+import com.dd.plist.PropertyListParser;
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.xml.sax.SAXException;
+
+import javax.xml.parsers.ParserConfigurationException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.ParseException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Detector for BPList with utility functions for PList.
+ *
+ * Without significant refactoring, this can't easily work as a true
+ * detector on plist subtypes.  Rather, for now, we require the file to be
+ * parsed and then the parser adds the subtype for xml-based plists.
+ * @since 1.25
+ */
+public class BPListDetector implements Detector {
+
+    //xml versions
+    static MediaType MEMGRAPH = MediaType.application("x-plist-memgraph");
+    static MediaType WEBARCHIVE = MediaType.application("x-plist-webarchive");
+    static MediaType PLIST = MediaType.application("x-plist");
+    static MediaType ITUNES = MediaType.application("x-plist-itunes");
+
+
+    //binary versions
+    static MediaType BMEMGRAPH = MediaType.application("x-bplist-memgraph");
+    static MediaType BWEBARCHIVE = MediaType.application("x-bplist-webarchive");
+    static MediaType BPLIST = MediaType.application("x-bplist");
+    static MediaType BITUNES = MediaType.application("x-bplist-itunes");
+
+    private static Map<MediaType, MediaType> BINARY_TO_XML = new HashMap<>();
+
+    static {
+        BINARY_TO_XML.put(BMEMGRAPH, MEMGRAPH);
+        BINARY_TO_XML.put(BWEBARCHIVE, WEBARCHIVE);
+        BINARY_TO_XML.put(BPLIST, PLIST);
+        BINARY_TO_XML.put(BITUNES, ITUNES);
+    }
+
+    /**
+     * @param input    input stream must support reset
+     * @param metadata input metadata for the document
+     * @return the detected media type, or application/octet-stream if not a binary plist
+     * @throws IOException if the stream cannot be read or the plist cannot be parsed
+     */
+    @Override
+    public MediaType detect(InputStream input, Metadata metadata) throws IOException {
+        if (input == null) {
+            return MediaType.OCTET_STREAM;
+        }
+        input.mark(8);
+        byte[] bytes = new byte[8];
+
+        try {
+            int read = IOUtils.read(input, bytes);
+            if (read < 6) {
+                return MediaType.OCTET_STREAM;
+            }
+        } catch (IOException e) {
+            return MediaType.OCTET_STREAM;
+        } finally {
+            input.reset();
+        }
+
+        int i = 0;
+        if (bytes[i++] != 'b' || bytes[i++] != 'p'
+                || bytes[i++] != 'l' || bytes[i++] != 'i'
+                || bytes[i++] != 's' || bytes[i++] != 't') {
+            return MediaType.OCTET_STREAM;
+        }
+        //TODO: extract the version with the next two bytes if they were read
+        NSObject rootObj = null;
+        try {
+            if (input instanceof TikaInputStream && ((TikaInputStream) input).hasFile()) {
+                rootObj = PropertyListParser.parse(((TikaInputStream) input).getFile());
+            } else {
+                rootObj = PropertyListParser.parse(input);
+            }
+            if (input instanceof TikaInputStream) {
+                ((TikaInputStream) input).setOpenContainer(rootObj);
+            }
+        } catch (PropertyListFormatException | ParseException | ParserConfigurationException | SAXException e) {
+            throw new IOExceptionWithCause("problem parsing root", e);
+        }
+        if (rootObj instanceof NSDictionary) {
+            return detectOnKeys(((NSDictionary) rootObj).getHashMap().keySet());
+        }
+        return BPLIST;
+    }
+
+    static MediaType detectOnKeys(Set<String> keySet) {
+        if (keySet.contains("nodes") && keySet.contains("edges")
+                && keySet.contains("graphEncodingVersion")) {
+            return BMEMGRAPH;
+        } else if (keySet.contains("WebMainResource")){ //&& keySet.contains("WebSubresources") should we require this?
+            return BWEBARCHIVE;
+        } else if (keySet.contains("Playlists") && keySet.contains("Tracks")
+                && keySet.contains("Music Folder")) {
+            return BITUNES;
+        } //if it contains $archiver and $objects, it is a bplist inside a webarchive
+        return BPLIST;
+    }
+
+    static MediaType detectXMLOnKeys(Set<String> keySet) {
+        return BINARY_TO_XML.get(detectOnKeys(keySet));
+    }
+}
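
A similar sketch for the new BPListDetector run standalone; TikaInputStream is used so the detector can parse from the backing file and stash the parsed root object via setOpenContainer. The wrapper class name and the input path are placeholders:

import java.nio.file.Paths;

import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.apple.BPListDetector;

public class BPListDetectorExample {
    public static void main(String[] args) throws Exception {
        BPListDetector detector = new BPListDetector();
        // placeholder path -- any file starting with the "bplist" magic bytes
        try (TikaInputStream stream = TikaInputStream.get(Paths.get("some.webarchive"))) {
            MediaType type = detector.detect(stream, new Metadata());
            // a top-level "WebMainResource" key yields application/x-bplist-webarchive;
            // plain binary plists fall back to application/x-bplist
            System.out.println(type);
        }
    }
}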
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
new file mode 100644
index 0000000..2ffbf56
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
@@ -0,0 +1,220 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.ZipFile;
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.detect.XmlRootExtractor;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import javax.xml.namespace.QName;
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * A parser for the IWork container files. This includes *.key, *.pages and *.numbers files.
+ * This parser delegates the relevant entries to a {@link ContentHandler} that parses the content.
+ * 
+ * Currently supported formats:
+ * <ol>
+ * <li>Keynote format version 2.x. Currently only tested with Keynote version 5.x
+ * <li>Pages format version 1.x. Currently only tested with Pages version 4.0.x
+ * <li>Numbers format version 1.x. Currently only tested with Numbers version 2.0.x
+ * </ol>
+ */
+public class IWorkPackageParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = -2160322853809682372L;
+
+    /**
+     * Which files within an iWork file contain the actual content?
+     */
+    public final static Set<String> IWORK_CONTENT_ENTRIES = Collections.unmodifiableSet(
+            new HashSet<String>(Arrays.asList("index.apxl", "index.xml", "presentation.apxl"))
+    );
+    /**
+     * All iWork files contain one of these, so we can detect based on it
+     */
+    public final static String IWORK_COMMON_ENTRY = "buildVersionHistory.plist";
+    
+    public enum IWORKDocumentType {
+       KEYNOTE("http://developer.apple.com/namespaces/keynote2", "presentation", MediaType.application("vnd.apple.keynote")),
+       NUMBERS("http://developer.apple.com/namespaces/ls", "document", MediaType.application("vnd.apple.numbers")),
+       PAGES("http://developer.apple.com/namespaces/sl", "document", MediaType.application("vnd.apple.pages")),
+       ENCRYPTED(null, null, MediaType.application("x-tika-iworks-protected"));
+       
+       private final String namespace;
+       private final String part;
+       private final MediaType type;
+       
+       IWORKDocumentType(String namespace, String part, MediaType type) {
+          this.namespace = namespace;
+          this.part = part;
+          this.type = type;
+       }
+       
+       public String getNamespace() {
+          return namespace;
+       }
+
+       public String getPart() {
+          return part;
+       }
+
+       public MediaType getType() {
+          return type;
+       }
+
+       public static IWORKDocumentType detectType(ZipArchiveEntry entry, ZipFile zip) {
+          try {
+             if (entry == null) {
+                 return null;
+             }
+
+              try (InputStream stream = zip.getInputStream(entry)) {
+                  return detectType(stream);
+              }
+          } catch (IOException e) {
+             return null;
+          }
+       }
+       
+       public static IWORKDocumentType detectType(ZipArchiveEntry entry, ZipArchiveInputStream zip) {
+          if (entry == null) {
+              return null;
+          }
+
+          return detectType(zip);
+       }
+       
+       public static IWORKDocumentType detectType(InputStream stream) {
+          QName qname = new XmlRootExtractor().extractRootElement(stream);
+          if (qname != null) {
+             String uri = qname.getNamespaceURI();
+             String local = qname.getLocalPart();
+            
+             for (IWORKDocumentType type : values()) {
+                if(type.getNamespace() != null && type.getNamespace().equals(uri) &&
+                   type.getPart().equals(local)) {
+                   return type;
+                }
+             }
+          } else {
+             // There was a problem with extracting the root type
+             // Password-protected iWork files are tricky, but we can usually
+             //  spot them because they encrypt part of the zip stream
+             try {
+                stream.read();
+             } catch(UnsupportedZipFeatureException e) {
+                // Compression field was likely encrypted
+                return ENCRYPTED;
+             } catch(Exception ignored) {
+             }
+          }
+          return null;
+       }
+    }
+
+    /**
+     * This parser handles all iWork formats.
+     */
+    private final static Set<MediaType> supportedTypes =
+         Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                MediaType.application("vnd.apple.iwork"),
+                IWORKDocumentType.KEYNOTE.getType(),
+                IWORKDocumentType.NUMBERS.getType(),
+                IWORKDocumentType.PAGES.getType()
+         )));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return supportedTypes;
+    }
+
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        ZipArchiveInputStream zip = new ZipArchiveInputStream(stream);
+        ZipArchiveEntry entry = zip.getNextZipEntry();
+
+        while (entry != null) {
+            if (!IWORK_CONTENT_ENTRIES.contains(entry.getName())) {
+                entry = zip.getNextZipEntry();
+                continue;
+            }
+
+            InputStream entryStream = new BufferedInputStream(zip, 4096);
+            entryStream.mark(4096);
+            IWORKDocumentType type = IWORKDocumentType.detectType(entryStream);
+            entryStream.reset();
+            
+            if(type != null) {
+               XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+               ContentHandler contentHandler;
+               
+               switch(type) {
+               case KEYNOTE:
+                  contentHandler = new KeynoteContentHandler(xhtml, metadata);
+                  break;
+               case NUMBERS:
+                  contentHandler = new NumbersContentHandler(xhtml, metadata);
+                  break;
+               case PAGES:
+                  contentHandler = new PagesContentHandler(xhtml, metadata);
+                  break;
+               case ENCRYPTED:
+                   // We can't do anything for the file right now
+                   contentHandler = null;
+                   break;
+               default:
+                  throw new TikaException("Unhandled iWorks file " + type);
+               }
+
+               metadata.add(Metadata.CONTENT_TYPE, type.getType().toString());
+               xhtml.startDocument();
+                if (contentHandler != null) {
+                    XMLReaderUtils.parseSAX(
+                            new CloseShieldInputStream(entryStream),
+                            new OfflineContentHandler(contentHandler),
+                            context
+                    );
+                }
+               xhtml.endDocument();
+            }
+            
+            entry = zip.getNextZipEntry();
+        }
+        // Don't close the zip InputStream (TIKA-1117).
+    }
+
+}
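A minimal, self-contained sketch of driving the parser above outside the test harness (the path is a placeholder; BodyContentHandler collects the plain text emitted through the XHTMLContentHandler):

    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Paths;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.iwork.IWorkPackageParser;
    import org.apache.tika.sax.BodyContentHandler;

    public class IWorkParseDemo {
        public static void main(String[] args) throws Exception {
            IWorkPackageParser parser = new IWorkPackageParser();
            Metadata metadata = new Metadata();
            BodyContentHandler handler = new BodyContentHandler(-1); // -1 disables the write limit
            try (InputStream stream = Files.newInputStream(Paths.get("/path/to/presentation.key"))) {
                parser.parse(stream, handler, metadata, new ParseContext());
            }
            System.out.println(metadata.get(Metadata.CONTENT_TYPE)); // e.g. application/vnd.apple.keynote
            System.out.println(handler.toString());                  // extracted text
        }
    }

Note that the parser deliberately leaves the supplied stream open (TIKA-1117), so the try-with-resources block above is what actually closes it.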
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java
new file mode 100644
index 0000000..40b3d60
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+class KeynoteContentHandler extends DefaultHandler {
+
+    public final static String PRESENTATION_WIDTH = "slides-width";
+    public final static String PRESENTATION_HEIGHT = "slides-height";
+
+    private final XHTMLContentHandler xhtml;
+    private final Metadata metadata;
+
+    private boolean inSlide = false;
+    private boolean inTheme = false;
+    private boolean inTitle = false;
+    private boolean inBody = false;
+    private String tableId;
+    private Integer numberOfColumns = null;
+    private Integer currentColumn = null;
+
+    private boolean inMetadata = false;
+    private boolean inMetaDataTitle = false;
+    private boolean inMetaDataAuthors = false;
+
+    private boolean inParsableText = false;
+
+    private int numberOfSlides = 0;
+
+    KeynoteContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
+        this.xhtml = xhtml;
+        this.metadata = metadata;
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+        metadata.set(Office.SLIDE_COUNT, String.valueOf(numberOfSlides));
+    }
+
+    @Override
+    public void startElement(
+            String uri, String localName, String qName, Attributes attributes)
+            throws SAXException {
+        if ("key:theme".equals(qName)) {
+            inTheme = true;
+        } else if ("key:slide".equals(qName)) {
+            inSlide = true;
+            numberOfSlides++;
+            xhtml.startElement("div");
+        } else if ("key:master-slide".equals(qName)) {
+            inSlide = true;
+            xhtml.startElement("div");
+        } else if ("key:title-placeholder".equals(qName) && inSlide) {
+            inTitle = true;
+            xhtml.startElement("h1");
+        } else if ("sf:sticky-note".equals(qName) && inSlide) {
+            xhtml.startElement("p");
+        } else if ("key:notes".equals(qName) && inSlide) {
+            xhtml.startElement("p");
+        } else if ("key:body-placeholder".equals(qName) && inSlide) {
+            xhtml.startElement("p");
+            inBody = true;
+        } else if ("key:size".equals(qName) && !inTheme) {
+            String width = attributes.getValue("sfa:w");
+            String height = attributes.getValue("sfa:h");
+            metadata.set(PRESENTATION_WIDTH, width);
+            metadata.set(PRESENTATION_HEIGHT, height);
+        } else if ("sf:text-body".equals(qName)) {
+            inParsableText = true;
+        } else if ("key:metadata".equals(qName)) {
+            inMetadata = true;
+        } else if (inMetadata && "key:title".equals(qName)) {
+            inMetaDataTitle = true;
+        } else if (inMetadata && "key:authors".equals(qName)) {
+            inMetaDataAuthors = true;
+        } else if (inMetaDataTitle && "key:string".equals(qName)) {
+            metadata.set(TikaCoreProperties.TITLE, attributes.getValue("sfa:string"));
+        } else if (inMetaDataAuthors && "key:string".equals(qName)) {
+            metadata.add(TikaCoreProperties.CREATOR, attributes.getValue("sfa:string"));
+        } else if (inSlide && "sf:tabular-model".equals(qName)) {
+            tableId = attributes.getValue("sfa:ID");
+            xhtml.startElement("table");
+        } else if (tableId != null && "sf:columns".equals(qName)) {
+            numberOfColumns = Integer.parseInt(attributes.getValue("sf:count"));
+            currentColumn = 0;
+        } else if (tableId != null && "sf:ct".equals(qName)) {
+            parseTableData(attributes.getValue("sfa:s"));
+        } else if (tableId != null && "sf:n".equals(qName)) {
+            parseTableData(attributes.getValue("sf:v"));
+        } else if ("sf:p".equals(qName)) {
+            xhtml.startElement("p");
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName)
+            throws SAXException {
+        if ("key:theme".equals(qName)) {
+            inTheme = false;
+        } else if ("key:slide".equals(qName)) {
+            inSlide = false;
+            xhtml.endElement("div");
+        } else if ("key:master-slide".equals(qName)) {
+            inSlide = false;
+            xhtml.endElement("div");
+        } else if ("key:title-placeholder".equals(qName) && inSlide) {
+            inTitle = false;
+            xhtml.endElement("h1");
+        } else if ("sf:sticky-note".equals(qName) && inSlide) {
+            xhtml.endElement("p");
+        } else if ("key:notes".equals(qName) && inSlide) {
+            xhtml.endElement("p");
+        } else if ("key:body-placeholder".equals(qName) && inSlide) {
+            xhtml.endElement("p");
+            inBody = false;
+        } else if ("sf:text-body".equals(qName)) {
+            inParsableText = false;
+        } else if ("key:metadata".equals(qName)) {
+            inMetadata = false;
+        } else if (inMetadata && "key:title".equals(qName)) {
+            inMetaDataTitle = false;
+        } else if (inMetadata && "key:authors".equals(qName)) {
+            inMetaDataAuthors = false;
+        } else if (inSlide && "sf:tabular-model".equals(qName)) {
+            xhtml.endElement("table");
+            tableId = null;
+            numberOfColumns = null;
+            currentColumn = null;
+        } else if ("sf:p".equals(qName)) {
+            xhtml.endElement("p");
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length)
+            throws SAXException {
+        if (inParsableText && inSlide && length != 0) {
+            xhtml.characters(ch, start, length);
+        }
+    }
+
+    private void parseTableData(String value) throws SAXException {
+      if (currentColumn == 0) {
+          xhtml.startElement("tr");
+      }
+      xhtml.element("td", value);
+
+      currentColumn++;
+      if (currentColumn.equals(numberOfColumns)) {
+          xhtml.endElement("tr");
+          currentColumn = 0;
+      }
+    }
+
+}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java
new file mode 100644
index 0000000..2ee64be
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java
@@ -0,0 +1,232 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.util.HashMap;
+import java.util.Map;
+
+class NumbersContentHandler extends DefaultHandler {
+
+    private final XHTMLContentHandler xhtml;
+    private final Metadata metadata;
+
+    private boolean inSheet = false;
+
+    private boolean inText = false;
+    private boolean parseText = false;
+
+    private boolean inMetadata = false;
+    private Property metadataKey;
+    private String metadataPropertyQName;
+
+    private boolean inTable = false;
+    private int numberOfSheets = 0;
+    private int numberOfColumns = -1;
+    private int currentColumn = 0;
+
+    private Map<String, String> menuItems = new HashMap<String, String>();
+    private String currentMenuItemId;
+
+    NumbersContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
+        this.xhtml = xhtml;
+        this.metadata = metadata;
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+        metadata.set(Office.PAGE_COUNT, String.valueOf(numberOfSheets));
+    }
+
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
+        if ("ls:workspace".equals(qName)) {
+            inSheet = true;
+            numberOfSheets++;
+            xhtml.startElement("div");
+            String sheetName = attributes.getValue("ls:workspace-name");
+            metadata.add("sheetNames", sheetName);
+        }
+
+        if ("sf:text".equals(qName)) {
+            inText = true;
+            xhtml.startElement("p");
+        }
+
+        if ("sf:p".equals(qName)) {
+            parseText = true;
+        }
+
+        if ("sf:metadata".equals(qName)) {
+            inMetadata = true;
+            return;
+        }
+
+        if (inMetadata && metadataKey == null) {
+            metadataKey = resolveMetadataKey(localName);
+            metadataPropertyQName = qName;
+        }
+
+        if (inMetadata && metadataKey != null && "sf:string".equals(qName)) {
+            metadata.add(metadataKey, attributes.getValue("sfa:string"));
+        }
+
+        if (!inSheet) {
+            return;
+        }
+
+        if ("sf:tabular-model".equals(qName)) {
+            String tableName = attributes.getValue("sf:name");
+            xhtml.startElement("div");
+            xhtml.characters(tableName);
+            xhtml.endElement("div");
+            inTable = true;
+            xhtml.startElement("table");
+            xhtml.startElement("tr");
+            currentColumn = 0;
+        }
+
+        if ("sf:menu-choices".equals(qName)) {
+            menuItems = new HashMap<String, String>();
+        }
+
+        if (inTable && "sf:grid".equals(qName)) {
+            numberOfColumns = Integer.parseInt(attributes.getValue("sf:numcols"));
+        }
+
+        if (menuItems != null && "sf:t".equals(qName)) {
+            currentMenuItemId = attributes.getValue("sfa:ID");
+        }
+
+        if (currentMenuItemId != null && "sf:ct".equals(qName)) {
+            menuItems.put(currentMenuItemId, attributes.getValue("sfa:s"));
+        }
+
+        if (inTable && "sf:ct".equals(qName)) {
+            if (currentColumn >= numberOfColumns) {
+                currentColumn = 0;
+                xhtml.endElement("tr");
+                xhtml.startElement("tr");
+            }
+
+            xhtml.element("td", attributes.getValue("sfa:s"));
+            currentColumn++;
+        }
+
+        if (inTable && ("sf:n".equals(qName) || "sf:rn".equals(qName))) {
+            if (currentColumn >= numberOfColumns) {
+                currentColumn = 0;
+                xhtml.endElement("tr");
+                xhtml.startElement("tr");
+            }
+
+            xhtml.element("td", attributes.getValue("sf:v"));
+            currentColumn++;
+        }
+
+        if (inTable && "sf:proxied-cell-ref".equals(qName)) {
+            if (currentColumn >= numberOfColumns) {
+                currentColumn = 0;
+                xhtml.endElement("tr");
+                xhtml.startElement("tr");
+            }
+
+            xhtml.element("td", menuItems.get(attributes.getValue("sfa:IDREF")));
+            currentColumn++;
+        }
+
+        if ("sf:chart-name".equals(qName)) {
+            // Extract chart name:
+            xhtml.startElement("div", "class", "chart");
+            xhtml.startElement("h1");
+            xhtml.characters(attributes.getValue("sfa:string"));
+            xhtml.endElement("h1");
+            xhtml.endElement("div");
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        if (parseText && length > 0) {
+            xhtml.characters(ch, start, length);
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+        if ("ls:workspace".equals(qName)) {
+            inSheet = false;
+            xhtml.endElement("div");
+        }
+
+        if ("sf:text".equals(qName)) {
+            inText = false;
+            xhtml.endElement("p");
+        }
+
+        if ("sf:p".equals(qName)) {
+            parseText = false;
+        }
+
+        if ("sf:metadata".equals(qName)) {
+            inMetadata = false;
+        }
+
+        if (inMetadata && qName.equals(metadataPropertyQName)) {
+            metadataPropertyQName = null;
+            metadataKey = null;
+        }
+
+        if (!inSheet) {
+            return;
+        }
+
+        if ("sf:menu-choices".equals(qName)) {
+        }
+
+        if ("sf:tabular-model".equals(qName)) {
+            inTable = false;
+            xhtml.endElement("tr");
+            xhtml.endElement("table");
+        }
+
+        if (currentMenuItemId != null && "sf:t".equals(qName)) {
+            currentMenuItemId = null;
+        }
+    }
+
+    private Property resolveMetadataKey(String localName) {
+        if ("authors".equals(localName)) {
+            return TikaCoreProperties.CREATOR;
+        }
+        if ("title".equals(localName)) {
+            return TikaCoreProperties.TITLE;
+        }
+        if ("comment".equals(localName)) {
+            return TikaCoreProperties.COMMENTS;
+        }
+        return Property.internalText(localName);
+    }
+}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
new file mode 100644
index 0000000..f3fd873
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
@@ -0,0 +1,16 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+org.apache.tika.parser.apple.BPListDetector
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
new file mode 100644
index 0000000..c922b2e
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -0,0 +1,22 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+org.apache.tika.parser.apple.AppleSingleFileParser
+org.apache.tika.parser.apple.PListParser
+org.apache.tika.parser.iwork.iwana.IWork13PackageParser
+org.apache.tika.parser.iwork.iwana.IWork18PackageParser
+org.apache.tika.parser.iwork.IWorkPackageParser
+
+
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java
new file mode 100644
index 0000000..65e7121
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.tika.parser.iwork.AutoPageNumberUtils;
+import org.junit.Test;
+
+/**
+ * Test class for the <code>AutoPageNumberUtils</code> helper class.
+ */
+public class AutoPageNumberUtilsTest {
+
+	/**
+	 * Check upper-case alpha-numeric numbers are generated based on the 
+	 * input page number.
+	 */
+    @Test
+	public void testAlphaUpper() {
+		assertEquals("A", AutoPageNumberUtils.asAlphaNumeric(1));
+		assertEquals("Z", AutoPageNumberUtils.asAlphaNumeric(26));
+		assertEquals("AA", AutoPageNumberUtils.asAlphaNumeric(27));
+		assertEquals("ZZ", AutoPageNumberUtils.asAlphaNumeric(52));
+		assertEquals("AAA", AutoPageNumberUtils.asAlphaNumeric(53));
+		assertEquals("ZZZ", AutoPageNumberUtils.asAlphaNumeric(78));
+	}
+
+	/**
+	 * Check lower-case alpha-numeric numbers are generated based on the 
+	 * input page number.
+	 */
+    @Test
+	public void testAlphaLower() {
+		assertEquals("a", AutoPageNumberUtils.asAlphaNumericLower(1));
+		assertEquals("z", AutoPageNumberUtils.asAlphaNumericLower(26));
+		assertEquals("aa", AutoPageNumberUtils.asAlphaNumericLower(27));
+		assertEquals("zz", AutoPageNumberUtils.asAlphaNumericLower(52));
+		assertEquals("aaa", AutoPageNumberUtils.asAlphaNumericLower(53));
+		assertEquals("zzz", AutoPageNumberUtils.asAlphaNumericLower(78));
+	}
+
+	/**
+	 * Check upper-case Roman numerals are generated based on the 
+	 * input page number.
+	 */
+    @Test
+	public void testRomanUpper() {
+		assertEquals("I", AutoPageNumberUtils.asRomanNumerals(1));
+		assertEquals("XXVI", AutoPageNumberUtils.asRomanNumerals(26));
+		assertEquals("XXVII", AutoPageNumberUtils.asRomanNumerals(27));
+	}
+
+	/**
+	 * Check lower-case Roman numerals are generated based on the 
+	 * input page number.
+	 */
+    @Test
+	public void testRomanLower() {
+		assertEquals("i", AutoPageNumberUtils.asRomanNumeralsLower(1));
+		assertEquals("xxvi", AutoPageNumberUtils.asRomanNumeralsLower(26));
+		assertEquals("xxvii", AutoPageNumberUtils.asRomanNumeralsLower(27));
+	}
+
+}
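The assertions above pin down the AutoPageNumber scheme: pages 1-26 map to A-Z, 27-52 to AA-ZZ and 53-78 to AAA-ZZZ, i.e. the same letter is repeated rather than carried as in spreadsheet-style bijective base-26. A small sketch that reproduces that behaviour (an illustration of the expected output, not the actual AutoPageNumberUtils implementation):

    public class AlphaPageNumberSketch {
        // Sketch of the numbering the test expects: pick the letter from the
        // 1-based page number modulo 26, then repeat it once per completed
        // pass through the alphabet.
        public static String asAlphaNumeric(int pageNumber) {
            int index = (pageNumber - 1) % 26;
            int repeat = (pageNumber - 1) / 26 + 1;
            char letter = (char) ('A' + index);
            StringBuilder sb = new StringBuilder(repeat);
            for (int i = 0; i < repeat; i++) {
                sb.append(letter);
            }
            return sb.toString();
        }

        public static void main(String[] args) {
            System.out.println(asAlphaNumeric(1));   // A
            System.out.println(asAlphaNumeric(27));  // AA
            System.out.println(asAlphaNumeric(78));  // ZZZ
        }
    }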
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
new file mode 100644
index 0000000..5c4d5d1
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
@@ -0,0 +1,392 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Before;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Tests whether the IWork parser properly parses the content and metadata of the supported formats.
+ */
+public class IWorkParserTest extends TikaTest {
+
+    private IWorkPackageParser iWorkParser;
+
+    @Before
+    public void setUp() {
+        iWorkParser = new IWorkPackageParser();
+    }
+
+    /**
+     * Check the given InputStream is not closed by the Parser (TIKA-1117).
+     *
+     * @throws Exception
+     */
+    @Test
+    public void testStreamNotClosed() throws Exception {
+        InputStream input = getResourceAsStream("/test-documents/testKeynote.key");
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        iWorkParser.parse(input, handler, metadata);
+        input.read();   // Will throw an Exception if the stream was already closed.
+    }
+
+    @Test
+    public void testParseKeynote() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = getText("testKeynote.key", iWorkParser, metadata);
+
+        // Make sure enough keys came through
+        // (Exact numbers will vary based on composites)
+        assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 6);
+        List<String> metadataKeys = Arrays.asList(metadata.names());
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Office.SLIDE_COUNT.getName()));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
+        
+        // Check the metadata values
+        assertEquals("application/vnd.apple.keynote", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("3", metadata.get(Office.SLIDE_COUNT));
+        assertEquals("1024", metadata.get(KeynoteContentHandler.PRESENTATION_WIDTH));
+        assertEquals("768", metadata.get(KeynoteContentHandler.PRESENTATION_HEIGHT));
+        assertEquals("Tika user", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Apache tika", metadata.get(TikaCoreProperties.TITLE));
+
+        assertContains("A sample presentation", content);
+        assertContains("For the Apache Tika project", content);
+        assertContains("Slide 1", content);
+        assertContains("Some random text for the sake of testability.", content);
+        assertContains("A nice comment", content);
+        assertContains("A nice note", content);
+
+        // test table data
+        assertContains("Cell one", content);
+        assertContains("Cell two", content);
+        assertContains("Cell three", content);
+        assertContains("Cell four", content);
+        assertContains("Cell 5", content);
+        assertContains("Cell six", content);
+        assertContains("7", content);
+        assertContains("Cell eight", content);
+        assertContains("5/5/1985", content);
+    }
+
+    // TIKA-910
+    @Test
+    public void testKeynoteTextBoxes() throws Exception {
+        String content = getText("testTextBoxes.key", iWorkParser);
+        assertTrue(content.replaceAll("\\s+", " ").contains("text1 text2 text3"));
+    }
+
+    // TIKA-910
+    @Test
+    public void testKeynoteBulletPoints() throws Exception {
+        String content = getText("testBulletPoints.key", iWorkParser);
+        assertTrue(content.replaceAll("\\s+", " ").contains("bullet point 1 bullet point 2 bullet point 3"));
+    }
+
+    // TIKA-923
+    @Test
+    public void testKeynoteTables() throws Exception {
+        String content = getText("testTables.key", iWorkParser);
+        content = content.replaceAll("\\s+", " ");
+        assertContains("row 1 row 2 row 3", content);
+    }
+
+    // TIKA-923
+    @Test
+    public void testKeynoteMasterSlideTable() throws Exception {
+        String content = getText("testMasterSlideTable.key", iWorkParser);
+        content = content.replaceAll("\\s+", " ");
+        assertContains("master row 1", content);
+        assertContains("master row 2", content);
+        assertContains("master row 3", content);
+    }
+
+    @Test
+    public void testParsePages() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = getText("testPages.pages", iWorkParser, metadata);
+        // Make sure enough keys came through
+        // (Exact numbers will vary based on composites)
+        assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 50);
+        List<String> metadataKeys = Arrays.asList(metadata.names());
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Office.PAGE_COUNT.getName()));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.LAST_MODIFIED.getName()));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.LANGUAGE.getName()));
+        
+        // Check the metadata values
+        assertEquals("application/vnd.apple.pages", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Tika user", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Apache tika", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("2010-05-09T21:34:38+0200", metadata.get(TikaCoreProperties.CREATED));
+        assertEquals("2010-05-09T23:50:36+0200", metadata.get(TikaCoreProperties.MODIFIED));
+        assertEquals("en", metadata.get(TikaCoreProperties.LANGUAGE));
+        assertEquals("2", metadata.get(Office.PAGE_COUNT));
+
+        // text on page 1
+        assertContains("Sample pages document", content);
+        assertContains("Some plain text to parse.", content);
+        assertContains("Cell one", content);
+        assertContains("Cell two", content);
+        assertContains("Cell three", content);
+        assertContains("Cell four", content);
+        assertContains("Cell five", content);
+        assertContains("Cell six", content);
+        assertContains("Cell seven", content);
+        assertContains("Cell eight", content);
+        assertContains("Cell nine", content);
+        assertContains("Both Pages 1.x and Keynote 2.x", content); // ...
+
+        // text on page 2
+        assertContains("A second page....", content);
+        assertContains("Extensible Markup Language", content); // ...
+    }
+
+    // TIKA-904
+    @Test
+    public void testPagesLayoutMode() throws Exception {
+        String content = getText("testPagesLayout.pages");
+        assertContains("text box 1 - here is some text", content);
+        assertContains("created in a text box in layout mode", content);
+        assertContains("text box 2 - more text!@!$@#", content);
+        assertContains("this is text inside of a green box", content);
+        assertContains("text inside of a green circle", content);
+    }
+
+    @Test
+    public void testParseNumbers() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = getText("testNumbers.numbers", iWorkParser, metadata);
+
+        // Make sure enough keys came through
+        // (Exact numbers will vary based on composites)
+        assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 8);
+        List<String> metadataKeys = Arrays.asList(metadata.names());
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Office.PAGE_COUNT.getName()));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.COMMENTS.getName()));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
+        
+        // Check the metadata values
+        assertEquals("2", metadata.get(Office.PAGE_COUNT));
+        assertEquals("Tika User", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Account checking", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("a comment", metadata.get(TikaCoreProperties.COMMENTS));
+
+        assertContains("Category", content);
+        assertContains("Home", content);
+        assertContains("-226", content);
+        assertContains("-137.5", content);
+        assertContains("Checking Account: 300545668", content);
+        assertContains("4650", content);
+        assertContains("Credit Card", content);
+        assertContains("Groceries", content);
+        assertContains("-210", content);
+        assertContains("Food", content);
+        assertContains("Try adding your own account transactions to this table.", content);
+    }
+
+    // TIKA-924
+    @Test
+    public void testParseNumbersTableNames() throws Exception {
+        String content = getText("tableNames.numbers", iWorkParser);
+        assertContains("This is the main table", content);
+    }
+        
+    @Test
+    public void testParseNumbersTableHeaders() throws Exception {
+        String content = getText("tableHeaders.numbers");
+        for(int header = 1;header <= 5;header++) {
+          assertContains("header" + header, content);
+        }
+        for(int row = 1;row <= 3;row++) {
+          assertContains("row" + row, content);
+        }
+    }
+
+    /**
+     * We don't currently support password protected Pages files, as
+     *  we don't know how the encryption works (it's not regular Zip
+     *  Encryption). See TIKA-903 for details
+     */
+    @Test
+    public void testParsePagesPasswordProtected() throws Exception {
+        // Document password is "tika", but we can't use that yet...
+        Metadata metadata = new Metadata();
+        String content = getText("testPagesPwdProtected.pages", iWorkParser, metadata);
+        assertEquals("", content);
+       
+        // Will have been identified as encrypted
+        assertEquals("application/x-tika-iworks-protected", metadata.get(Metadata.CONTENT_TYPE));
+    }
+    
+    /**
+     * Check we get headers, footers and footnotes from Pages
+     */
+    @Test
+    public void testParsePagesHeadersFootersFootnotes() throws Exception {
+        String footnote = "Footnote: Do a lot of people really use iWork?!?!";
+        String header = "THIS IS SOME HEADER TEXT";
+        String footer = "THIS IS SOME FOOTER TEXT\t1";
+        String footer2 = "THIS IS SOME FOOTER TEXT\t2";
+
+        String content = getText("testPagesHeadersFootersFootnotes.pages", iWorkParser);
+
+        // Check regular text
+        assertContains("Both Pages 1.x", content); // P1
+        assertContains("understanding the Pages document", content); // P1
+        assertContains("should be page 2", content); // P2
+       
+        // Check for headers, footers and footnotes
+        assertContains(header, content);
+        assertContains(footer, content);
+        assertContains(footer2, content);
+        assertContains(footnote, content);
+    }
+    
+    /**
+     * Check we get upper-case Roman numerals within the footer for AutoPageNumber.
+     */
+    @Test
+    public void testParsePagesHeadersFootersRomanUpper() throws Exception {
+       String header = "THIS IS SOME HEADER TEXT";
+       String footer = "THIS IS SOME FOOTER TEXT\tI";
+       String footer2 = "THIS IS SOME FOOTER TEXT\tII";
+
+       String content = getText("testPagesHeadersFootersRomanUpper.pages", iWorkParser);
+
+       // Check for headers, footers and footnotes
+       assertContains(header, content);
+       assertContains(footer, content);
+       assertContains(footer2, content);
+    }
+    
+    /**
+     * Check we get lower-case Roman numerals within the footer for AutoPageNumber.
+     */
+    @Test
+    public void testParsePagesHeadersFootersRomanLower() throws Exception {
+       String header = "THIS IS SOME HEADER TEXT";
+       String footer = "THIS IS SOME FOOTER TEXT\ti";
+       String footer2 = "THIS IS SOME FOOTER TEXT\tii";
+
+       String content = getText("testPagesHeadersFootersRomanLower.pages", iWorkParser);
+
+       // Check for headers, footers and footnotes
+       assertContains(header, content);
+       assertContains(footer, content);
+       assertContains(footer2, content);
+    }
+
+    /**
+     * Check we get upper-case alpha-numeric letters within the footer for AutoPageNumber.
+     */
+    @Test
+    public void testParsePagesHeadersAlphaUpper() throws Exception {
+        String header = "THIS IS SOME HEADER TEXT\tA";
+        String footer = "THIS IS SOME FOOTER TEXT\tA";
+        String footer2 = "THIS IS SOME FOOTER TEXT\tB";
+
+        String content = getText("testPagesHeadersFootersAlphaUpper.pages", iWorkParser);
+
+        // Check for headers, footers and footnotes
+        assertContains(header, content);
+        assertContains(footer, content);
+        assertContains(footer2, content);
+    }
+ 
+    /**
+     * Check we get lower-case alpha-numeric letters within the footer for AutoPageNumber.
+     */
+    @Test
+    public void testParsePagesHeadersAlphaLower() throws Exception {
+        String header = "THIS IS SOME HEADER TEXT";
+        String footer = "THIS IS SOME FOOTER TEXT\ta";
+        String footer2 = "THIS IS SOME FOOTER TEXT\tb";
+
+        String content = getText("testPagesHeadersFootersAlphaLower.pages", iWorkParser);
+
+        // Check for headers, footers and footnotes
+        assertContains(header, content);
+        assertContains(footer, content);
+        assertContains(footer2, content);
+    }
+    
+    /**
+     * Check we get annotations (eg comments) from Pages
+     */
+    @Test
+    public void testParsePagesAnnotations() throws Exception {
+        String commentA = "comment about the APXL file";
+        String commentB = "comment about UIMA";
+
+        String content = getText("testPagesComments.pages", iWorkParser);
+
+        // Check regular text
+        assertContains("Both Pages 1.x", content); // P1
+        assertContains("understanding the Pages document", content); // P1
+        assertContains("should be page 2", content); // P2
+       
+        // Check for comments
+        assertContains(commentA, content);
+        assertContains(commentB, content);
+    }
+    
+    // TIKA-918
+    @Test
+    public void testNumbersExtractChartNames() throws Exception {
+        String content = getText("testNumbersCharts.numbers");
+        assertContains("Expenditure by Category", content);
+        assertContains("Currency Chart name", content);
+        assertContains("Chart 2", content);
+    }
+
+    //TIKA-3020
+    @Test
+    public void testKeyNoteTableMarkup() throws Exception {
+        String expected = "<table><tr>\t<td>Cell one</td>\t<td>Cell two</td>\t<td>Cell three</td></tr>" +
+                "<tr>\t<td>Cell four</td>\t<td>Cell 5</td>\t<td>Cell six</td></tr>" +
+                "<tr>\t<td>7</td>\t<td>Cell eight</td>\t<td>5/5/1985</td></tr>" +
+                "</table>";
+        String xml = getXML("testKeynote.key", iWorkParser).xml;
+        xml = xml.replaceAll("[\r\n]", "");
+        assertContains(expected, xml);
+
+    }
+}
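The TIKA-3020 test above checks the table markup through the TikaTest.getXML() helper; outside the harness, roughly the same check can be made with a ToXMLContentHandler (the path and demo class name are placeholders):

    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Paths;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.iwork.IWorkPackageParser;
    import org.apache.tika.sax.ToXMLContentHandler;

    public class KeynoteTableMarkupDemo {
        public static void main(String[] args) throws Exception {
            IWorkPackageParser parser = new IWorkPackageParser();
            ToXMLContentHandler handler = new ToXMLContentHandler();
            try (InputStream stream = Files.newInputStream(Paths.get("/path/to/testKeynote.key"))) {
                parser.parse(stream, handler, new Metadata(), new ParseContext());
            }
            // Cells from sf:tabular-model elements surface as <table>/<tr>/<td>
            // markup in the serialized XHTML.
            System.out.println(handler.toString());
        }
    }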
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testAppleSingleFile.pdf b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testAppleSingleFile.pdf
new file mode 100644
index 0000000..a407ded
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testAppleSingleFile.pdf differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testKeynote2013.key b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testKeynote2013.key
new file mode 100644
index 0000000..d0dd416
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testKeynote2013.key differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testMasterSlideTable.key b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testMasterSlideTable.key
new file mode 100644
index 0000000..2627770
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testMasterSlideTable.key differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testNumbers.numbers b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testNumbers.numbers
new file mode 100644
index 0000000..51360e0
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testNumbers.numbers differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testNumbers2013.numbers b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testNumbers2013.numbers
new file mode 100644
index 0000000..3f9a013
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testNumbers2013.numbers differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testPages2013.pages b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testPages2013.pages
new file mode 100644
index 0000000..b82ac7a
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testPages2013.pages differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testPagesComments.pages b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testPagesComments.pages
new file mode 100644
index 0000000..d7ff81c
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testPagesComments.pages differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testPagesHeadersFootersFootnotes.pages b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testPagesHeadersFootersFootnotes.pages
new file mode 100644
index 0000000..cfecc8c
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testPagesHeadersFootersFootnotes.pages differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testPagesPwdProtected.pages b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testPagesPwdProtected.pages
new file mode 100644
index 0000000..788b516
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testPagesPwdProtected.pages differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testWEBARCHIVE.webarchive b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testWEBARCHIVE.webarchive
new file mode 100644
index 0000000..b78643a
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/resources/test-documents/testWEBARCHIVE.webarchive
@@ -0,0 +1,646 @@
+bplist00�_WebMainResource_WebSubresources�	
+
_WebResourceData_WebResourceMIMEType_WebResourceTextEncodingName_WebResourceFrameName^WebResourceURLOP�<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+ 
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+--><html xmlns="http://www.w3.org/1999/xhtml"><head>
+    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+    <title>Apache Tika - Apache Tika</title>
+    <style type="text/css" media="all">
+      @import url("./css/site.css");
+    </style>
+    <link rel="icon" type="image/png" href="./tikaNoText16.png">
+    <script type="text/javascript">
+      function selectProvider(form) {
+        provider = form.elements['searchProvider'].value;
+        if (provider == "any") {
+          if (Math.random() > 0.5) {
+            provider = "lucid";
+          } else {
+            provider = "sl";
+          }
+        }
+        if (provider == "lucid") {
+          form.action = "http://search.lucidimagination.com/p:tika";
+        } else if (provider == "sl") {
+          form.action = "http://search-lucene.com/tika";
+        }
+        days = 90;
+        date = new Date();
+        date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000));
+        expires = "; expires=" + date.toGMTString();
+        document.cookie = "searchProvider=" + provider + expires + "; path=/";
+      }
+      function initProvider() {
+        if (document.cookie.length>0) {
+          cStart=document.cookie.indexOf("searchProvider=");
+          if (cStart!=-1) {
+            cStart=cStart + "searchProvider=".length;
+            cEnd=document.cookie.indexOf(";", cStart);
+            if (cEnd==-1) {
+              cEnd=document.cookie.length;
+            }
+            provider = unescape(document.cookie.substring(cStart,cEnd));
+            document.forms['searchform'].elements['searchProvider'].value = provider;
+          }
+        }
+        document.forms['searchform'].elements['q'].focus();
+      }
+    </script>
+  </head>
+  <body onload="initProvider();">
+    <div id="body">
+      <div id="banner">
+        <a href="http://tika.apache.org" id="bannerLeft" title="Apache Tika"><img src="http://tika.apache.org/tika.png" alt="Apache Tika" width="292" height="100"></a>
+        <a href="http://www.apache.org/" id="bannerRight" title="The Apache Software Foundation"><img src="http://tika.apache.org/asf-logo.gif" alt="The Apache Software Foundation" width="387" height="100"></a>
+      </div>
+      <div id="content">
+        <!-- Licensed to the Apache Software Foundation (ASF) under one or more --><!-- contributor license agreements.  See the NOTICE file distributed with --><!-- this work for additional information regarding copyright ownership. --><!-- The ASF licenses this file to You under the Apache License, Version 2.0 --><!-- (the "License"); you may not use this file except in compliance with --><!-- the License.  You may obtain a copy of the License at --><!--  --><!-- http://www.apache.org/ [...]
+      </div>
+      <div id="sidebar">
+        <div id="navigation">
+                    <h5>Apache Tika</h5>
+            <ul>
+              
+    <li class="none">
+              <strong>Introduction</strong>
+        </li>
+              
+    <li class="none">
+                    <a href="download.html">Download</a>
+          </li>
+              
+    <li class="none">
+                    <a href="mail-lists.html">Mailing Lists</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://wiki.apache.org/tika/" class="externalLink">Tika Wiki</a>
+          </li>
+              
+    <li class="none">
+                    <a href="https://issues.apache.org/jira/browse/TIKA" class="externalLink">Issue Tracker</a>
+          </li>
+          </ul>
+              <h5>Documentation</h5>
+            <ul>
+              
+          
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="expanded">
+                    <a href="1.0/index.html">Apache Tika 1.0</a>
+                  <ul>
+                  
+    <li class="none">
+                    <a href="1.0/gettingstarted.html">Getting Started</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="1.0/formats.html">Supported Formats</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="1.0/parser.html">Parser API</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="1.0/parser_guide.html">Parser 5min Quick Start Guide</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="1.0/detection.html">Content and Language Detection</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="1.0/api/">API Documentation</a>
+          </li>
+              </ul>
+        </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="0.10/index.html">Apache Tika 0.10</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="0.9/index.html">Apache Tika 0.9</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="0.8/index.html">Apache Tika 0.8</a>
+                </li>
+          </ul>
+              <h5>The Apache Software Foundation</h5>
+            <ul>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/foundation/" class="externalLink">About</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/licenses/" class="externalLink">License</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/security/" class="externalLink">Security</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/foundation/sponsorship.html" class="externalLink">Sponsorship</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/foundation/thanks.html" class="externalLink">Thanks</a>
+          </li>
+          </ul>
+      
+          <div id="search">
+            <h5>Search with Apache Solr</h5>
+            <form action="http://search.lucidimagination.com/p:tika" method="get" id="searchform">
+              <input type="text" id="query" name="q">
+              <select name="searchProvider" id="searchProvider">
+                <option value="any">provider</option>
+                <option value="lucid">Lucid Find</option>
+                <option value="sl">Search-Lucene</option>
+              </select>
+              <input type="submit" id="submit" value="Search" name="Search" onclick="selectProvider(this.form)">
+            </form>
+          </div>
+
+          <div id="bookpromo">
+            <h5>Books about Tika</h5>
+            <p>
+              <a href="http://manning.com/mattmann/" title="Tika in Action"><img src="./mattmann_cover150.jpg" width="150" height="186"></a>
+            </p>
+          </div>
+        </div>
+      </div>
+      <div id="footer">
+        <p>
+          Copyright © 2011
+          <a href="http://www.apache.org/">The Apache Software Foundation</a>.
+          Site powered by <a href="http://maven.apache.org/">Apache Maven</a>. 
+          Search powered by
+          <a href="http://www.lucidimagination.com">Lucid Imagination</a>
+          and <a href="http://sematext.com">Sematext</a>.
+          <br>
+          Apache Tika, Tika, Apache, the Apache feather logo, and the Apache
+          Tika project logo are trademarks of The Apache Software Foundation.
+        </p>
+      </div>
+    </div>
+  
+
+</body></html>
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+body {
+    font-family: serif;
+    font-size: 13pt;
+    background-color: #eee;
+    margin: 0;
+    padding: 0;
+}
+
+#body {
+    width: 800px;
+    height: 100%;
+    margin: 20px auto;
+    left: auto;
+    right: auto;
+    background-color: white;
+    padding: 20px;
+    border: 1px solid #CCC;
+    -moz-border-radius: 15px;
+    border-radius: 15px;
+    -moz-box-shadow: 1ex 1ex 1ex #666;
+    -webkit-box-shadow: 1ex 1ex 1ex #666;
+    box-shadow: 5px 5px 5px #666;
+}
+
+#banner {
+    height: 100px;
+    padding-bottom: 1em;
+    border-bottom: 1px solid #eee;
+}
+
+#bannerLeft {
+    float: left;
+}
+
+#bannerRight {
+    float: right;
+}
+
+#content {
+    width: 600px;
+    float: left;
+    line-height: 1.3em;
+}
+
+#navigation {
+    width: 180px;
+    float: right;
+    font-size: 12px;
+}
+
+#navigation h5 {
+    font-size: 12px;
+    margin-bottom: 1ex;
+}
+
+#navigation ul {
+    margin: 0;
+    padding: 0;
+}
+
+#navigation li {
+    list-style-type: none;
+    list-style-position: inside;
+}
+
+#navigation li ul {
+    margin-left: 20px;
+}
+
+#navigation li.expanded {
+    list-style-type: disc;
+}
+
+#navigation li.collapsed {
+    list-style-type: circle;
+}
+
+#navigation strong {
+    font-weight: normal;
+}
+
+#navigation a {
+    text-decoration: none;
+}
+
+#navigation form {
+    text-align: right;
+}
+
+#query {
+    width: 100%;
+    border: 1px solid #eee;
+}
+
+#searchProvider, #submit {
+    width: 48%;
+}
+
+#bookpromo p {
+    text-align: center;
+}
+
+#footer {
+    clear: both;
+    border-top: 1px solid #eee;
+    font-size: 8pt;
+    color: gray;
+    text-align: center;
+}
+
+h1, h2, h3, h4, h5, h6 {
+    font-family: sans-serif;
+    color: #900;
+}
+
+li {
+    margin-top: 2px;
+}
+
+a:link {
+    color: #36a;
+}
+a:visited  {
+    color:#47a;
+}
+a:active, a:hover {
+    color:#69c;
+}
+a.externalLink {
+    background: url(../images/external.png) right center no-repeat;
+    padding-right: 18px;
+}
+
+img {
+    border: 0;
+}
+
+pre {
+    border: 1px solid #ccc;
+    background-color: #eee;
+    padding: 1ex;
+    overflow: auto;
+}
+
+/* From maven-theme.css */
+
+table.bodyTable th {
+  color: white;
+  background-color: #bbb;
+  text-align: left;
+  font-weight: bold;
+}
+
+table.bodyTable th, table.bodyTable td {
+  font-size: 1em;
+}
+
+table.bodyTable tr.a {
+  background-color: #ddd;
+}
+
+table.bodyTable tr.b {
+  background-color: #eee;
+}
+
+dt {
+  color: #900;
+  font-weight: bold;
+}
+dd {
+  margin-bottom: 1ex;
+}
+
+.errormark, .warningmark, .donemark, .infomark {
+  background: url(../images/icon_error_sml.gif) no-repeat;
+}
+
+.warningmark {
+  background-image: url(../images/icon_warning_sml.gif);
+}
+
+.donemark {
+  background-image: url(../images/icon_success_sml.gif);
+}
+
+.infomark {
+  background-image: url(../images/icon_info_sml.gif);
+}
+
+/* From maven-base.css */
+
+table {
+  padding:0px;
+  width: 100%;
+  margin-left: -2px;
+  margin-right: -2px;
+}
+acronym {
+  cursor: help;
+  border-bottom: 1px dotted #feb;
+}
+table.bodyTable th, table.bodyTable td {
+  padding: 2px 4px 2px 4px;
+  vertical-align: top;
+}
+
[binary webarchive payload omitted: serialized WebResourceResponse plists and image data for css/site.css (text/css), images/external.png (image/png), tika.png (image/png), asf-logo.gif (image/gif), and mattmann_cover150.jpg (image/jpeg)]
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/pom.xml b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/pom.xml
index 54667aa..765138d 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/pom.xml
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/pom.xml
@@ -21,9 +21,10 @@
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
     <parent>
-        <artifactId>tika-parser-modules</artifactId>
+        <artifactId>tika-parsers-classic-modules</artifactId>
         <groupId>org.apache.tika</groupId>
         <version>2.0.0-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
     </parent>
     <modelVersion>4.0.0</modelVersion>
 
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/audio/MidiParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/audio/MidiParser.java
new file mode 100644
index 0000000..859d3bd
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/audio/MidiParser.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.audio;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+
+import javax.sound.midi.InvalidMidiDataException;
+import javax.sound.midi.MetaMessage;
+import javax.sound.midi.MidiMessage;
+import javax.sound.midi.MidiSystem;
+import javax.sound.midi.Patch;
+import javax.sound.midi.Sequence;
+import javax.sound.midi.Track;
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class MidiParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 6343278584336189432L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+        Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                MediaType.application("x-midi"),
+                MediaType.audio("midi"))));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        metadata.set(Metadata.CONTENT_TYPE, "audio/midi");
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        // MidiSystem expects the stream to support the mark feature
+        if (! stream.markSupported()) {
+            stream = new BufferedInputStream(stream);
+        }
+        try {
+            Sequence sequence = MidiSystem.getSequence(stream);
+
+            Track[] tracks = sequence.getTracks();
+            metadata.set("tracks", String.valueOf(tracks.length));
+            // TODO: Use XMPDM.TRACKS?
+
+            Patch[] patches = sequence.getPatchList();
+            metadata.set("patches", String.valueOf(patches.length));
+
+            float type = sequence.getDivisionType();
+            if (type == Sequence.PPQ) {
+                metadata.set("divisionType", "PPQ");
+            } else if (type == Sequence.SMPTE_24) {
+                metadata.set("divisionType", "SMPTE_24");
+            } else if (type == Sequence.SMPTE_25) {
+                metadata.set("divisionType", "SMPTE_25");
+            } else if (type == Sequence.SMPTE_30) {
+                metadata.set("divisionType", "SMPTE_30");
+            } else if (type == Sequence.SMPTE_30DROP) {
+                metadata.set("divisionType", "SMPTE_30DROP");
+            } else {
+                metadata.set("divisionType", String.valueOf(type));
+            }
+
+            for (Track track : tracks) {
+                xhtml.startElement("p");
+                for (int i = 0; i < track.size(); i++) {
+                    MidiMessage message = track.get(i).getMessage();
+                    if (message instanceof MetaMessage) {
+                        MetaMessage meta = (MetaMessage) message;
+                        // Types 1-15 are reserved for text events
+                        if (meta.getType() >= 1 && meta.getType() <= 15) {
+                            // FIXME: What's the encoding?
+                            xhtml.characters(
+                                    new String(meta.getData(), ISO_8859_1));
+                        }
+                    }
+                }
+                xhtml.endElement("p");
+            }
+        } catch (InvalidMidiDataException ignore) {
+            // There is no way to know whether this exception was
+            // caused by the document being corrupted or by the format
+            // just being unsupported. So we do nothing.
+        }
+
+        xhtml.endDocument();
+    }
+
+}
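
For context, a minimal sketch of driving the new MidiParser directly; the demo class name and the file name testMID.mid are illustrative, not part of this commit:

    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Paths;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.audio.MidiParser;
    import org.apache.tika.sax.BodyContentHandler;

    public class MidiParserDemo {
        public static void main(String[] args) throws Exception {
            MidiParser parser = new MidiParser();
            Metadata metadata = new Metadata();
            // BodyContentHandler collects the text of the MIDI meta events (types 1-15)
            BodyContentHandler handler = new BodyContentHandler();
            try (InputStream stream = Files.newInputStream(Paths.get("testMID.mid"))) {
                parser.parse(stream, handler, metadata, new ParseContext());
            }
            System.out.println(metadata.get("tracks"));        // e.g. "2"
            System.out.println(metadata.get("divisionType"));  // e.g. "PPQ"
            System.out.println(handler.toString());            // extracted text events
        }
    }
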
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java
new file mode 100644
index 0000000..b7d2d75
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Takes an array of {@link ID3Tags} in preference order, and when asked for
+ * a given tag, will return it from the first {@link ID3Tags} that has it.
+ */
+public class CompositeTagHandler implements ID3Tags {
+
+    private ID3Tags[] tags;
+
+    public CompositeTagHandler(ID3Tags[] tags) {
+        this.tags = tags;
+    }
+
+    public boolean getTagsPresent() {
+        for (ID3Tags tag : tags) {
+            if (tag.getTagsPresent()) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    public String getTitle() {
+        for (ID3Tags tag : tags) {
+            if (tag.getTitle() != null) {
+                return tag.getTitle();
+            }
+        }
+        return null;
+    }
+
+    public String getArtist() {
+        for (ID3Tags tag : tags) {
+            if (tag.getArtist() != null) {
+                return tag.getArtist();
+            }
+        }
+        return null;
+    }
+
+    public String getAlbum() {
+        for (ID3Tags tag : tags) {
+            if (tag.getAlbum() != null) {
+                return tag.getAlbum();
+            }
+        }
+        return null;
+    }
+
+    public String getComposer() {
+        for (ID3Tags tag : tags) {
+            if (tag.getComposer() != null) {
+                return tag.getComposer();
+            }
+        }
+        return null;
+    }
+
+    public String getYear() {
+        for (ID3Tags tag : tags) {
+            if (tag.getYear() != null) {
+                return tag.getYear();
+            }
+        }
+        return null;
+    }
+
+    public List<ID3Comment> getComments() {
+        for (ID3Tags tag : tags) {
+            List<ID3Comment> comments = tag.getComments();
+            if (comments != null && comments.size() > 0) {
+                return comments;
+            }
+        }
+        return Collections.emptyList();
+    }
+
+    public String getGenre() {
+        for (ID3Tags tag : tags) {
+            if (tag.getGenre() != null) {
+                return tag.getGenre();
+            }
+        }
+        return null;
+    }
+
+    public String getTrackNumber() {
+        for (ID3Tags tag : tags) {
+            if (tag.getTrackNumber() != null) {
+                return tag.getTrackNumber();
+            }
+        }
+        return null;
+    }
+
+    public String getAlbumArtist() {
+        for (ID3Tags tag : tags) {
+            if (tag.getAlbumArtist() != null) {
+                return tag.getAlbumArtist();
+            }
+        }
+        return null;
+    }
+
+    public String getDisc() {
+        for (ID3Tags tag : tags) {
+            if (tag.getDisc() != null) {
+                return tag.getDisc();
+            }
+        }
+        return null;
+    }
+
+    public String getCompilation() {
+        for (ID3Tags tag : tags) {
+            if (tag.getCompilation() != null) {
+                return tag.getCompilation();
+            }
+        }
+        return null;
+    }
+}
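
A minimal sketch of the preference-order behaviour described in the class javadoc; the stub ID3Tags source below is illustrative and only fills in a couple of fields:

    import java.util.Collections;
    import java.util.List;

    import org.apache.tika.parser.mp3.CompositeTagHandler;
    import org.apache.tika.parser.mp3.ID3Tags;
    import org.apache.tika.parser.mp3.ID3Tags.ID3Comment;

    public class CompositeTagDemo {
        // Stub ID3Tags source with only a title and artist; everything else absent.
        static ID3Tags stub(final String title, final String artist) {
            return new ID3Tags() {
                public boolean getTagsPresent() { return true; }
                public String getTitle() { return title; }
                public String getArtist() { return artist; }
                public String getAlbumArtist() { return null; }
                public String getAlbum() { return null; }
                public String getComposer() { return null; }
                public String getCompilation() { return null; }
                public List<ID3Comment> getComments() { return Collections.emptyList(); }
                public String getGenre() { return null; }
                public String getYear() { return null; }
                public String getTrackNumber() { return null; }
                public String getDisc() { return null; }
            };
        }

        public static void main(String[] args) {
            ID3Tags v2 = stub("Title from ID3v2", null);
            ID3Tags v1 = stub("Title from ID3v1", "Artist from ID3v1");
            CompositeTagHandler tags = new CompositeTagHandler(new ID3Tags[] { v2, v1 });
            System.out.println(tags.getTitle());  // "Title from ID3v2" (preferred source wins)
            System.out.println(tags.getArtist()); // "Artist from ID3v1" (first non-null value)
        }
    }
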
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java
new file mode 100644
index 0000000..b8d723f
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java
@@ -0,0 +1,254 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.util.List;
+
+/**
+ * Interface that defines the common interface for ID3 tag parsers,
+ *  such as ID3v1 and ID3v2.3.
+ * Implementations should return NULL if the file lacks a given
+ *  tag, or if the tag isn't defined for the version.
+ *  
+ * Note that so far, only the ID3v1 core tags are listed here. In
+ *  future, we may wish to add more to cover the extra tags that
+ *  our ID3v2 handlers can produce.
+ */
+public interface ID3Tags {
+    /**
+     * List of predefined genres.
+     *
+     * See <a href="http://www.id3.org/id3v2-00">http://www.id3.org/id3v2-00</a>
+     */
+    String[] GENRES = new String[] {
+        /*  0 */ "Blues",
+        /*  1 */ "Classic Rock",
+        /*  2 */ "Country",
+        /*  3 */ "Dance",
+        /*  4 */ "Disco",
+        /*  5 */ "Funk",
+        /*  6 */ "Grunge",
+        /*  7 */ "Hip-Hop",
+        /*  8 */ "Jazz",
+        /*  9 */ "Metal",
+        /* 10 */ "New Age",
+        /* 11 */ "Oldies",
+        /* 12 */ "Other",
+        /* 13 */ "Pop",
+        /* 14 */ "R&B",
+        /* 15 */ "Rap",
+        /* 16 */ "Reggae",
+        /* 17 */ "Rock",
+        /* 18 */ "Techno",
+        /* 19 */ "Industrial",
+        /* 20 */ "Alternative",
+        /* 21 */ "Ska",
+        /* 22 */ "Death Metal",
+        /* 23 */ "Pranks",
+        /* 24 */ "Soundtrack",
+        /* 25 */ "Euro-Techno",
+        /* 26 */ "Ambient",
+        /* 27 */ "Trip-Hop",
+        /* 28 */ "Vocal",
+        /* 29 */ "Jazz+Funk",
+        /* 30 */ "Fusion",
+        /* 31 */ "Trance",
+        /* 32 */ "Classical",
+        /* 33 */ "Instrumental",
+        /* 34 */ "Acid",
+        /* 35 */ "House",
+        /* 36 */ "Game",
+        /* 37 */ "Sound Clip",
+        /* 38 */ "Gospel",
+        /* 39 */ "Noise",
+        /* 40 */ "AlternRock",
+        /* 41 */ "Bass",
+        /* 42 */ "Soul",
+        /* 43 */ "Punk",
+        /* 44 */ "Space",
+        /* 45 */ "Meditative",
+        /* 46 */ "Instrumental Pop",
+        /* 47 */ "Instrumental Rock",
+        /* 48 */ "Ethnic",
+        /* 49 */ "Gothic",
+        /* 50 */ "Darkwave",
+        /* 51 */ "Techno-Industrial",
+        /* 52 */ "Electronic",
+        /* 53 */ "Pop-Folk",
+        /* 54 */ "Eurodance",
+        /* 55 */ "Dream",
+        /* 56 */ "Southern Rock",
+        /* 57 */ "Comedy",
+        /* 58 */ "Cult",
+        /* 59 */ "Gangsta",
+        /* 60 */ "Top 40",
+        /* 61 */ "Christian Rap",
+        /* 62 */ "Pop/Funk",
+        /* 63 */ "Jungle",
+        /* 64 */ "Native American",
+        /* 65 */ "Cabaret",
+        /* 66 */ "New Wave",
+        /* 67 */ "Psychadelic",
+        /* 68 */ "Rave",
+        /* 69 */ "Showtunes",
+        /* 70 */ "Trailer",
+        /* 71 */ "Lo-Fi",
+        /* 72 */ "Tribal",
+        /* 73 */ "Acid Punk",
+        /* 74 */ "Acid Jazz",
+        /* 75 */ "Polka",
+        /* 76 */ "Retro",
+        /* 77 */ "Musical",
+        /* 78 */ "Rock & Roll",
+        /* 79 */ "Hard Rock",
+        /* 80 */ "Folk",
+        /* 81 */ "Folk-Rock",
+        /* 82 */ "National Folk",
+        /* 83 */ "Swing",
+        /* 84 */ "Fast Fusion",
+        /* 85 */ "Bebob",
+        /* 86 */ "Latin",
+        /* 87 */ "Revival",
+        /* 88 */ "Celtic",
+        /* 89 */ "Bluegrass",
+        /* 90 */ "Avantgarde",
+        /* 91 */ "Gothic Rock",
+        /* 92 */ "Progressive Rock",
+        /* 93 */ "Psychedelic Rock",
+        /* 94 */ "Symphonic Rock",
+        /* 95 */ "Slow Rock",
+        /* 96 */ "Big Band",
+        /* 97 */ "Chorus",
+        /* 98 */ "Easy Listening",
+        /* 99 */ "Acoustic",
+        /* 100 */ "Humour",
+        /* 101 */ "Speech",
+        /* 102 */ "Chanson",
+        /* 103 */ "Opera",
+        /* 104 */ "Chamber Music",
+        /* 105 */ "Sonata",
+        /* 106 */ "Symphony",
+        /* 107 */ "Booty Bass",
+        /* 108 */ "Primus",
+        /* 109 */ "Porn Groove",
+        /* 110 */ "Satire",
+        /* 111 */ "Slow Jam",
+        /* 112 */ "Club",
+        /* 113 */ "Tango",
+        /* 114 */ "Samba",
+        /* 115 */ "Folklore",
+        /* 116 */ "Ballad",
+        /* 117 */ "Power Ballad",
+        /* 118 */ "Rhythmic Soul",
+        /* 119 */ "Freestyle",
+        /* 120 */ "Duet",
+        /* 121 */ "Punk Rock",
+        /* 122 */ "Drum Solo",
+        /* 123 */ "A capella",
+        /* 124 */ "Euro-House",
+        /* 125 */ "Dance Hall",
+        /* sentinel */ ""
+    };
+
+    /**
+     * Does the file contain this kind of tags?
+     */
+    boolean getTagsPresent();
+
+    String getTitle();
+
+    /**
+     * The Artist for the track
+     */
+    String getArtist();
+
+    /**
+     * The Artist for the overall album / compilation of albums
+     */
+    String getAlbumArtist();
+
+    String getAlbum();
+    
+    String getComposer();
+
+    String getCompilation();
+    
+    /**
+     * Retrieves the comments, if any.
+     * Files may have more than one comment, but normally only
+     *  one per language/description pair.
+     */
+    List<ID3Comment> getComments();
+
+    String getGenre();
+
+    String getYear();
+
+    /**
+     * The number of the track within the album / recording
+     */
+    String getTrackNumber();
+
+    /**
+     * The number of the disc this belongs to, within the set
+     */
+    String getDisc();
+
+    /**
+     * Represents a comment in ID3 (especially ID3 v2), where comments
+     *  are made up of several parts
+     */
+    class ID3Comment {
+        private String language;
+        private String description;
+        private String text;
+        
+        /**
+         * Creates an ID3 v1 style comment tag
+         */
+        public ID3Comment(String id3v1Text) {
+           this.text = id3v1Text;
+        }
+        /**
+         * Creates an ID3 v2 style comment tag
+         */
+        public ID3Comment(String language, String description, String text) {
+            this.language = language;
+            this.description = description;
+            this.text = text;
+        }
+
+        /**
+         * Gets the language, if present
+         */
+        public String getLanguage() {
+           return language;
+        }
+        /**
+         * Gets the description, if present
+         */
+        public String getDescription() {
+           return description;
+        }
+        /**
+         * Gets the text, if present
+         */
+        public String getText() {
+           return text;
+        }
+    }
+}
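
A tiny sketch of how the GENRES table is indexed; the raw genre id comes either from the single ID3v1 genre byte or from a "(nn)" reference in an ID3v2 TCO/TCON frame (class name illustrative):

    import org.apache.tika.parser.mp3.ID3Tags;

    public class GenreTableDemo {
        public static void main(String[] args) {
            int genreId = 17;   // unsigned genre byte from an ID3v1 tag
            // Clamp out-of-range ids to the trailing sentinel entry,
            // mirroring what the ID3v1 handler below does.
            String genre = ID3Tags.GENRES[Math.min(genreId, ID3Tags.GENRES.length - 1)];
            System.out.println(genre); // "Rock"
        }
    }
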
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
new file mode 100644
index 0000000..2111356
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
@@ -0,0 +1,183 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+
+/**
+ * This is used to parse ID3 Version 1 Tag information from an MP3 file, 
+ * if available.
+ *
+ * @see <a href="http://www.id3.org/ID3v1">MP3 ID3 Version 1 specification</a>
+ */
+public class ID3v1Handler implements ID3Tags {
+    private String title;
+    private String artist;
+    private String album;
+    private String year;
+    private ID3Comment comment;
+    private String genre;
+    private String trackNumber;
+
+    boolean found = false;
+
+    public ID3v1Handler(InputStream stream, ContentHandler handler)
+            throws IOException, SAXException, TikaException {
+        this(LyricsHandler.getSuffix(stream, 128));
+    }
+
+    /**
+     * Creates from the last 128 bytes of a stream.
+     * @param tagData Must be the last 128 bytes 
+     */
+    protected ID3v1Handler(byte[] tagData)
+            throws IOException, SAXException, TikaException {
+        if (tagData.length == 128
+                && tagData[0] == 'T' && tagData[1] == 'A' && tagData[2] == 'G') {
+            found = true;
+
+            title = getString(tagData, 3, 33);
+            artist = getString(tagData, 33, 63);
+            album = getString(tagData, 63, 93);
+            year = getString(tagData, 93, 97);
+            
+            String commentStr = getString(tagData, 97, 127);
+            comment = new ID3Comment(commentStr);
+
+            int genreID = (int) tagData[127] & 0xff; // unsigned byte
+            genre = GENRES[Math.min(genreID, GENRES.length - 1)];
+
+            // ID3v1.1 track number addition:
+            // if the second-to-last byte of the comment field is zero and
+            // the last byte is non-zero, that last byte is the track number
+            if (tagData[125] == 0 && tagData[126] != 0) {
+                int trackNum = (int) tagData[126] & 0xff;
+                trackNumber = Integer.toString(trackNum);
+            }
+        }
+    }
+
+
+    public boolean getTagsPresent() {
+        return found;
+    }
+
+    public String getTitle() {
+        return title;
+    }
+
+    public String getArtist() {
+        return artist;
+    }
+
+    public String getAlbum() {
+        return album;
+    }
+
+    public String getYear() {
+        return year;
+    }
+
+    public List<ID3Comment> getComments() {
+       return Arrays.asList(comment);
+    }
+
+    public String getGenre() {
+        return genre;
+    }
+
+    public String getTrackNumber() {
+        return trackNumber;
+    }
+    
+    /**
+     * ID3v1 doesn't have composers,
+     *  so this always returns null.
+     */
+    public String getComposer() {
+        return null;
+    }
+
+    /**
+     * ID3v1 doesn't have album-wide artists,
+     *  so this always returns null.
+     */
+    public String getAlbumArtist() {
+        return null;
+    }
+
+    /**
+     * ID3v1 doesn't have disc numbers,
+     *  so this always returns null.
+     */
+    public String getDisc() {
+        return null;
+    }
+
+    /**
+     * ID3v1 doesn't have compilations,
+     *  so this always returns null.
+     */
+    public String getCompilation() {
+        return null;
+    }
+
+    /**
+     * Returns the identified ISO-8859-1 substring from the given byte buffer.
+     * The return value is the zero-terminated substring retrieved from
+     * between the given start and end positions in the given byte buffer.
+     * Extra whitespace (and control characters) from the beginning and the
+     * end of the substring is removed.
+     *
+     * @param buffer byte buffer
+     * @param start start index of the substring
+     * @param end end index of the substring
+     * @return the identified substring
+     * @throws TikaException if the ISO-8859-1 encoding is not available
+     */
+    private static String getString(byte[] buffer, int start, int end)
+            throws TikaException {
+        // Find the zero byte that marks the end of the string
+        int zero = start;
+        while (zero < end && buffer[zero] != 0) {
+            zero++;
+        }
+
+        // Skip trailing whitespace
+        end = zero;
+        while (start < end && buffer[end - 1] <= ' ') {
+            end--;
+        }
+
+        // Skip leading whitespace
+        while (start < end && buffer[start] <= ' ') {
+            start++;
+        }
+
+        // Return the remaining substring
+        return new String(buffer, start, end - start, ISO_8859_1);
+    }
+}
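
A minimal sketch of the fixed 128-byte ID3v1 layout the handler above decodes ("TAG" + title[30] + artist[30] + album[30] + year[4] + comment[30] + genre[1]); the synthetic tag and class name are illustrative:

    import java.io.ByteArrayInputStream;
    import java.nio.charset.StandardCharsets;

    import org.apache.tika.parser.mp3.ID3v1Handler;
    import org.xml.sax.helpers.DefaultHandler;

    public class ID3v1LayoutDemo {
        public static void main(String[] args) throws Exception {
            byte[] tag = new byte[128];
            put(tag, 0, "TAG");
            put(tag, 3, "Some Title");    // title, 30 bytes
            put(tag, 33, "Some Artist");  // artist, 30 bytes
            put(tag, 63, "Some Album");   // album, 30 bytes
            put(tag, 93, "2020");         // year, 4 bytes
            put(tag, 97, "A comment");    // comment, 30 bytes (28 in v1.1)
            tag[126] = 7;                 // ID3v1.1 track number (byte 125 stays 0)
            tag[127] = 17;                // genre id 17 -> "Rock"

            ID3v1Handler id3 = new ID3v1Handler(
                    new ByteArrayInputStream(tag), new DefaultHandler());
            System.out.println(id3.getTitle());       // "Some Title"
            System.out.println(id3.getTrackNumber()); // "7"
            System.out.println(id3.getGenre());       // "Rock"
        }

        private static void put(byte[] buf, int offset, String s) {
            byte[] b = s.getBytes(StandardCharsets.ISO_8859_1);
            System.arraycopy(b, 0, buf, offset, b.length);
        }
    }
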
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java
new file mode 100644
index 0000000..8d94c0b
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
+import org.xml.sax.SAXException;
+
+/**
+ * This is used to parse ID3 Version 2.2 Tag information from an MP3 file,
+ * if available.
+ *
+ * @see <a href="http://id3lib.sourceforge.net/id3/id3v2-00.txt">MP3 ID3 Version 2.2 specification</a>
+ */
+public class ID3v22Handler implements ID3Tags {
+    private String title;
+    private String artist;
+    private String album;
+    private String year;
+    private String composer;
+    private String genre;
+    private String trackNumber;
+    private String albumArtist;
+    private String disc;
+    private List<ID3Comment> comments = new ArrayList<ID3Comment>();
+
+    public ID3v22Handler(ID3v2Frame frame)
+            throws IOException, SAXException, TikaException {
+        RawTagIterator tags = new RawV22TagIterator(frame);
+        while (tags.hasNext()) {
+            RawTag tag = tags.next();
+            if (tag.name.equals("TT2")) {
+                title = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TP1")) {
+                artist = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TP2")) {
+                albumArtist = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TAL")) {
+                album = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TYE")) {
+                year = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TCM")) {
+                composer = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("COM")) {
+                comments.add( getComment(tag.data, 0, tag.data.length) ); 
+            } else if (tag.name.equals("TRK")) {
+                trackNumber = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TPA")) {
+                disc = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TCO")) {
+                genre = extractGenre( getTagString(tag.data, 0, tag.data.length) );
+            }
+        }
+    }
+
+    private String getTagString(byte[] data, int offset, int length) {
+        return ID3v2Frame.getTagString(data, offset, length);
+    }
+    private ID3Comment getComment(byte[] data, int offset, int length) {
+        return ID3v2Frame.getComment(data, offset, length);
+    }
+    
+    protected static String extractGenre(String rawGenre) {
+       int open = rawGenre.indexOf("(");
+       int close = rawGenre.indexOf(")");
+       if (open == -1 && close == -1) {
+          return rawGenre;
+       } else if (open > -1 && open < close) {
+           String genreStr = rawGenre.substring(0, open).trim();
+           try {
+               int genreID = Integer.parseInt(rawGenre.substring(open+1, close));
+               return ID3Tags.GENRES[genreID];
+           } catch(ArrayIndexOutOfBoundsException invalidNum) {
+              return genreStr;
+           } catch(NumberFormatException notANum) {
+              return genreStr;
+           }
+       } else {
+          return null;
+       }
+    }
+
+    public boolean getTagsPresent() {
+        return true;
+    }
+
+    public String getTitle() {
+        return title;
+    }
+
+    public String getArtist() {
+        return artist;
+    }
+
+    public String getAlbum() {
+        return album;
+    }
+
+    public String getYear() {
+        return year;
+    }
+    
+    public String getComposer() {
+        return composer;
+    }
+
+    public List<ID3Comment> getComments() {
+        return comments;
+    }
+
+    public String getGenre() {
+        return genre;
+    }
+
+    public String getTrackNumber() {
+        return trackNumber;
+    }
+
+    public String getAlbumArtist() {
+        return albumArtist;
+    }
+
+    public String getDisc() {
+        return disc;
+    }
+
+    /**
+     * ID3v22 doesn't have compilations,
+     *  so this always returns null.
+     */
+    public String getCompilation() {
+        return null;
+    }
+
+    private class RawV22TagIterator extends RawTagIterator {
+        private RawV22TagIterator(ID3v2Frame frame) {
+            frame.super(3, 3, 1, 0);
+        }
+    }
+}
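
A throwaway sketch of what extractGenre is expected to do with typical TCO values; it is dropped into the same package only because the helper is protected, and the class name is illustrative:

    package org.apache.tika.parser.mp3;

    public class ExtractGenreDemo {
        public static void main(String[] args) {
            // Free-text genres pass through unchanged.
            System.out.println(ID3v22Handler.extractGenre("Rock"));        // "Rock"
            // "(nn)" references resolve against the ID3v1 genre table.
            System.out.println(ID3v22Handler.extractGenre("(13)"));        // "Pop"
            System.out.println(ID3v22Handler.extractGenre("(31)Trance"));  // "Trance"
        }
    }
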
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java
new file mode 100644
index 0000000..4b67eda
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
+import org.xml.sax.SAXException;
+
+/**
+ * This is used to parse ID3 Version 2.3 Tag information from an MP3 file,
+ * if available.
+ *
+ * @see <a href="http://id3lib.sourceforge.net/id3/id3v2.3.0.html">MP3 ID3 Version 2.3 specification</a>
+ */
+public class ID3v23Handler implements ID3Tags {
+    private String title;
+    private String artist;
+    private String album;
+    private String year;
+    private String composer;
+    private String genre;
+    private String trackNumber;
+    private String albumArtist;
+    private String disc;
+    private String compilation;
+    private List<ID3Comment> comments = new ArrayList<ID3Comment>();
+
+    public ID3v23Handler(ID3v2Frame frame)
+            throws IOException, SAXException, TikaException {
+        RawTagIterator tags = new RawV23TagIterator(frame);
+        while (tags.hasNext()) {
+            RawTag tag = tags.next();
+            if (tag.name.equals("TIT2")) {
+                title = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TPE1")) {
+                artist = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TPE2")) {
+                albumArtist = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TALB")) {
+                album = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TYER")) {
+                year = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TCOM")) {
+                composer = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("COMM")) {
+                comments.add( getComment(tag.data, 0, tag.data.length) ); 
+            } else if (tag.name.equals("TRCK")) {
+                trackNumber = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TPOS")) {
+                disc = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TCMP")) {
+                compilation = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TCON")) {
+                genre = ID3v22Handler.extractGenre( getTagString(tag.data, 0, tag.data.length) );
+            }
+        }
+    }
+
+    private String getTagString(byte[] data, int offset, int length) {
+        return ID3v2Frame.getTagString(data, offset, length);
+    }
+    private ID3Comment getComment(byte[] data, int offset, int length) {
+       return ID3v2Frame.getComment(data, offset, length);
+    }
+
+    public boolean getTagsPresent() {
+        return true;
+    }
+
+    public String getTitle() {
+        return title;
+    }
+
+    public String getArtist() {
+        return artist;
+    }
+
+    public String getAlbum() {
+        return album;
+    }
+
+    public String getYear() {
+        return year;
+    }
+
+    public String getComposer() {
+        return composer;
+    }
+
+    public List<ID3Comment> getComments() {
+        return comments;
+    }
+
+    public String getGenre() {
+        return genre;
+    }
+
+    public String getTrackNumber() {
+        return trackNumber;
+    }
+
+    public String getAlbumArtist() {
+        return albumArtist;
+    }
+
+    public String getDisc() {
+        return disc;
+    }
+
+    public String getCompilation() {
+        return compilation;
+    }
+
+    private class RawV23TagIterator extends RawTagIterator {
+        private RawV23TagIterator(ID3v2Frame frame) {
+            frame.super(4, 4, 1, 2);
+        }
+    }
+}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
new file mode 100644
index 0000000..08dfc9d
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
@@ -0,0 +1,430 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Iterator;
+
+import org.apache.tika.parser.mp3.ID3Tags.ID3Comment;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+
+/**
+ * A frame of ID3v2 data, which is then passed to a handler to 
+ * be turned into useful data.
+ */
+public class ID3v2Frame implements MP3Frame {
+
+    private static final int MAX_RECORD_SIZE = 1_000_000;
+    private int majorVersion;
+    private int minorVersion;
+    private int flags;
+    private int length;
+    /** Excludes the header size part */
+    private byte[] extendedHeader;
+    private byte[] data;
+
+    public int getMajorVersion() {
+        return majorVersion;
+    }
+
+    public int getMinorVersion() {
+        return minorVersion;
+    }
+
+    public int getFlags() {
+        return flags;
+    }
+
+    public int getLength() {
+        return length;
+    }
+
+    public byte[] getExtendedHeader() {
+        return extendedHeader;
+    }
+
+    public byte[] getData() {
+        return data;
+    }
+
+    /**
+     * Returns the next ID3v2 Frame in
+     *  the file, or null if the next batch of data
+     *  doesn't correspond to an ID3v2 header.
+     * If no ID3v2 frame could be detected and the passed in input stream is a
+     * {@code PushbackInputStream}, the bytes read so far are pushed back so
+     * that they can be read again.
+     * ID3v2 Frames should come before all Audio ones.
+     */
+    public static MP3Frame createFrameIfPresent(InputStream inp)
+            throws IOException {
+        int h1 = inp.read();
+        int h2 = inp.read();
+        int h3 = inp.read();
+        
+        // Is it an ID3v2 Frame? 
+        if (h1 == (int)'I' && h2 == (int)'D' && h3 == (int)'3') {
+            int majorVersion = inp.read();
+            int minorVersion = inp.read();
+            if (majorVersion == -1 || minorVersion == -1) {
+                pushBack(inp, h1, h2, h3, majorVersion, minorVersion);
+                return null;
+            }
+            return new ID3v2Frame(majorVersion, minorVersion, inp);
+        }
+
+        // Not a frame header
+        pushBack(inp, h1, h2, h3);
+        return null;
+    }
+
+    /**
+     * Pushes bytes back into the stream if possible. This method is called if
+     * no ID3v2 header could be found at the current stream position.
+     * 
+     * @param inp the input stream
+     * @param bytes the bytes to be pushed back
+     * @throws IOException if an error occurs
+     */
+    private static void pushBack(InputStream inp, int... bytes)
+            throws IOException
+    {
+        if (inp instanceof PushbackInputStream)
+        {
+            byte[] buf = new byte[bytes.length];
+            for (int i = 0; i < bytes.length; i++)
+            {
+                buf[i] = (byte) bytes[i];
+            }
+            ((PushbackInputStream) inp).unread(buf);
+        }
+    }
+
+    private ID3v2Frame(int majorVersion, int minorVersion, InputStream inp)
+            throws IOException {
+        this.majorVersion = majorVersion;
+        this.minorVersion = minorVersion;
+
+        // Get the flags and the length
+        flags = inp.read();
+        length = get7BitsInt(readFully(inp, 4), 0);
+
+        // Do we have an extended header?
+        if ((flags & 0x02) == 0x02) {
+            int size = getInt(readFully(inp, 4));
+            extendedHeader = readFully(inp, size);
+        }
+
+        // Get the frame's data, or at least as much
+        //  of it as is available
+        data = readFully(inp, length, false);
+    }
+
+    protected static int getInt(byte[] data) {
+        return getInt(data, 0);
+    }
+
+    protected static int getInt(byte[] data, int offset) {
+        int b0 = data[offset+0] & 0xFF;
+        int b1 = data[offset+1] & 0xFF;
+        int b2 = data[offset+2] & 0xFF;
+        int b3 = data[offset+3] & 0xFF;
+        return (b0 << 24) + (b1 << 16) + (b2 << 8) + (b3 << 0);
+    }
+
+    protected static int getInt3(byte[] data, int offset) {
+        int b0 = data[offset+0] & 0xFF;
+        int b1 = data[offset+1] & 0xFF;
+        int b2 = data[offset+2] & 0xFF;
+        return (b0 << 16) + (b1 << 8) + (b2 << 0);
+    }
+
+    protected static int getInt2(byte[] data, int offset) {
+        int b0 = data[offset+0] & 0xFF;
+        int b1 = data[offset+1] & 0xFF;
+        return (b0 << 8) + (b1 << 0);
+    }
+
+    /**
+     * AKA a Synchsafe integer.
+     * 4 bytes hold a 28 bit number. The highest
+     *  bit in each byte is always 0 and always ignored.
+     */
+    protected static int get7BitsInt(byte[] data, int offset) {
+        int b0 = data[offset+0] & 0x7F;
+        int b1 = data[offset+1] & 0x7F;
+        int b2 = data[offset+2] & 0x7F;
+        int b3 = data[offset+3] & 0x7F;
+        return (b0 << 21) + (b1 << 14) + (b2 << 7) + (b3 << 0);
+    }
+
+    protected static byte[] readFully(InputStream inp, int length)
+            throws IOException {
+       return readFully(inp, length, true);
+    }
+    protected static byte[] readFully(InputStream inp, int length, boolean shortDataIsFatal)
+            throws IOException {
+        if (length > MAX_RECORD_SIZE) {
+            throw new IOException("Record size ("+length+
+                    " bytes) is larger than the allowed record size: "+MAX_RECORD_SIZE);
+        }
+        byte[] b = new byte[length];
+
+        int pos = 0;
+        int read;
+        while (pos < length) {
+            read = inp.read(b, pos, length-pos);
+            if (read == -1) {
+                if(shortDataIsFatal) {
+                   throw new IOException("Tried to read " + length + " bytes, but only " + pos + " bytes present");
+                } else {
+                   // Give them what we found
+                   // TODO Log the short read
+                   return b;
+                }
+            }
+            pos += read;
+        }
+
+        return b;
+    }
+    
+    protected static class TextEncoding {
+       public final boolean doubleByte;
+       public final String encoding;
+       private TextEncoding(String encoding, boolean doubleByte) {
+          this.doubleByte = doubleByte;
+          this.encoding = encoding;
+       }
+    }
+    protected static final TextEncoding[] encodings = new TextEncoding[] {
+          new TextEncoding("ISO-8859-1", false),
+          new TextEncoding("UTF-16", true), // With BOM
+          new TextEncoding("UTF-16BE", true), // Without BOM
+          new TextEncoding("UTF-8", false)
+    };
+
+    /**
+     * Returns the (possibly null padded) String at the given offset and
+     * length. The string's encoding, when flagged, is held in the first byte.
+     */
+    protected static String getTagString(byte[] data, int offset, int length) {
+        int actualLength = length;
+        if (actualLength == 0) {
+            return "";
+        }
+        if (actualLength == 1 && data[offset] == 0) {
+            return "";
+        }
+
+        // Does it have an encoding flag?
+        // Detect by the first byte holding a valid encoding id (0-3)
+        TextEncoding encoding = encodings[0];
+        byte maybeEncodingFlag = data[offset];
+        if (maybeEncodingFlag >= 0 && maybeEncodingFlag < encodings.length) {
+            offset++;
+            actualLength--;
+            encoding = encodings[maybeEncodingFlag];
+        }
+        
+        // Trim off null termination / padding (if present)
+        while (encoding.doubleByte && actualLength >= 2 && data[offset+actualLength-1] == 0 && data[offset+actualLength-2] == 0) {
+           actualLength -= 2;
+        } 
+        while (!encoding.doubleByte && actualLength >= 1 && data[offset+actualLength-1] == 0) {
+           actualLength--;
+        }
+        if (actualLength == 0) {
+           return "";
+        }
+
+        // TIKA-1024: If it's UTF-16 (with BOM) and all we
+        // have is a naked BOM then short-circuit here
+        // (return empty string), because new String(..)
+        // gives different results on different JVMs
+        if (encoding.encoding.equals("UTF-16") && actualLength == 2 &&
+            ((data[offset] == (byte) 0xff && data[offset+1] == (byte) 0xfe) ||
+             (data[offset] == (byte) 0xfe && data[offset+1] == (byte) 0xff))) {
+          return "";
+        }
+
+        try {
+            // Build the base string
+            return new String(data, offset, actualLength, encoding.encoding);
+        } catch (UnsupportedEncodingException e) {
+            throw new RuntimeException(
+                    "Core encoding " + encoding.encoding + " is not available", e);
+        }
+    }
+    /**
+     * Builds up the ID3 comment, by parsing and extracting
+     *  the comment string parts from the given data. 
+     */
+    protected static ID3Comment getComment(byte[] data, int offset, int length) {
+       // Comments must have an encoding
+       int encodingFlag = data[offset];
+       if (encodingFlag >= 0 && encodingFlag < encodings.length) {
+          // Good, valid flag
+       } else {
+          // Invalid string
+          return null;
+       }
+       
+       TextEncoding encoding = encodings[encodingFlag];
+       
+       // First is a 3 byte language
+       String lang = getString(data, offset+1, 3);
+       
+       // After that we have [Desc]\0(\0)[Text]
+       int descStart = offset+4;
+       int textStart = -1;
+       String description = null;
+       String text = null;
+       
+       // Find where the description ends
+       try {
+          for (int i=descStart; i<offset+length; i++) {
+             if (encoding.doubleByte && data[i]==0 && data[i+1] == 0) {
+                // Handle LE vs BE on low byte text
+                if (i+2 < offset+length && data[i+1] == 0 && data[i+2] == 0) {
+                   i++;
+                }
+                textStart = i+2;
+                description = new String(data, descStart, i-descStart, encoding.encoding);
+                break;
+             }
+             if (!encoding.doubleByte && data[i]==0) {
+                textStart = i+1;
+                description = new String(data, descStart, i-descStart, encoding.encoding);
+                break;
+             }
+          }
+          
+          // Did we find the end?
+          if (textStart > -1) {
+             text = new String(data, textStart, offset+length-textStart, encoding.encoding);
+          } else {
+             // Assume everything is the text
+             text = new String(data, descStart, offset+length-descStart, encoding.encoding);
+          }
+          
+          // Return
+          return new ID3Comment(lang, description, text);
+       } catch (UnsupportedEncodingException e) {
+          throw new RuntimeException(
+                  "Core encoding " + encoding.encoding + " is not available", e);
+       }
+    }
+
+    /**
+     * Returns the String at the given
+     *  offset and length. Strings are ISO-8859-1 
+     */
+    protected static String getString(byte[] data, int offset, int length) {
+        return new String(data, offset, length, ISO_8859_1);
+    }
+
+
+    /**
+     * Iterates over ID3v2 raw tags.
+     * Subclasses configure the name/size/flag lengths and the
+     *  size multiplier for their ID3v2 version.
+     */
+    protected class RawTagIterator implements Iterator<RawTag> {
+        private int nameLength;
+        private int sizeLength;
+        private int sizeMultiplier;
+        private int flagLength;
+
+        private int offset = 0;
+
+        protected RawTagIterator(
+                int nameLength, int sizeLength, int sizeMultiplier,
+                int flagLength) {
+            this.nameLength = nameLength;
+            this.sizeLength = sizeLength;
+            this.sizeMultiplier = sizeMultiplier;
+            this.flagLength = flagLength;
+        }
+
+        public boolean hasNext() {
+            // Check for padding at the end
+            return offset < data.length && data[offset] != 0;
+        }
+
+        public RawTag next() {
+            RawTag tag = new RawTag(nameLength, sizeLength, sizeMultiplier,
+                    flagLength, data, offset);
+            offset += tag.getSize();
+            return tag;
+        }
+
+        public void remove() {
+        }
+
+    }
+
+    protected static class RawTag {
+        private int headerSize;
+        protected String name;
+        protected int flag;
+        protected byte[] data;
+
+        private RawTag(
+                int nameLength, int sizeLength, int sizeMultiplier,
+                int flagLength, byte[] frameData, int offset) {
+            headerSize = nameLength + sizeLength + flagLength;
+
+            // Name, normally 3 or 4 bytes
+            name = getString(frameData, offset, nameLength);
+
+            // Size
+            int rawSize;
+            if (sizeLength == 3) {
+                rawSize = getInt3(frameData, offset+nameLength);
+            } else {
+                rawSize = getInt(frameData, offset+nameLength);
+            }
+            int size = rawSize * sizeMultiplier;
+
+            // Flag
+            if (flagLength > 0) {
+                if (flagLength == 1) {
+                    flag = (int)frameData[offset+nameLength+sizeLength];
+                } else {
+                    flag = getInt2(frameData, offset+nameLength+sizeLength);
+                }
+            }
+
+            // Now data
+            int copyFrom = offset+nameLength+sizeLength+flagLength;
+            size = Math.max(0, Math.min(size, frameData.length-copyFrom)); // TIKA-1218, prevent negative size for malformed files.
+            data = new byte[size];
+            System.arraycopy(frameData, copyFrom, data, 0, size);
+        }
+
+        protected int getSize() {
+            return headerSize + data.length;
+        }
+
+    }
+
+}
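
A minimal sketch of the byte layout ID3v2Frame consumes, decoded with the new ID3v2.3 handler; the hand-built tag and class name are illustrative, and the class sits in the same package so it can use the package-level types:

    package org.apache.tika.parser.mp3;

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.nio.charset.StandardCharsets;

    public class ID3v23FrameDemo {
        public static void main(String[] args) throws Exception {
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            // Tag header: "ID3", version 3.0, flags 0, then a 4-byte synchsafe
            // size: {0,0,0,16} -> (0<<21) + (0<<14) + (0<<7) + 16 = 16 bytes of body
            out.write(new byte[] { 'I', 'D', '3', 3, 0, 0, 0, 0, 0, 16 });
            // One ID3v2.3 frame: 4-byte id, 4-byte size, 2-byte flags,
            // then an encoding byte (0 = ISO-8859-1) and the text -- 16 bytes total
            out.write(new byte[] { 'T', 'I', 'T', '2', 0, 0, 0, 6, 0, 0 });
            out.write(0);
            out.write("Hello".getBytes(StandardCharsets.ISO_8859_1));

            ID3v2Frame frame = (ID3v2Frame) ID3v2Frame.createFrameIfPresent(
                    new ByteArrayInputStream(out.toByteArray()));
            ID3v23Handler v23 = new ID3v23Handler(frame);
            System.out.println(v23.getTitle()); // "Hello"
        }
    }
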
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java
new file mode 100644
index 0000000..12d0f2d
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * This is used to parse Lyrics3 tag information
+ *  from an MP3 file, if available.
+ * Handles lyrics tags of up to 10kb in size.
+ * Will process any ID3v1 tag data if present.
+ * Ignores extended ID3v1 data in the lyrics block
+ *
+ * @see <a href="http://www.id3.org/Lyrics3v2">Lyrics3 v2.0 specification</a>
+ */
+public class LyricsHandler {
+    boolean foundLyrics = false;
+    String lyricsText = null;
+    ID3v1Handler id3v1 = null;
+
+    public LyricsHandler(InputStream stream, ContentHandler handler)
+            throws IOException, SAXException, TikaException {
+        this(getSuffix(stream, 10240+128));
+    }
+
+    /**
+     * Looks for the Lyrics data, which will be
+     *  just before the ID3v1 data (if present),
+     *  and processes it.
+     * Also sets things up for the ID3v1
+     *  processing if required.
+     * Expects the tail of the stream (up to ~10KB plus the 128-byte ID3v1 tag).
+     */
+    protected LyricsHandler(byte[] tagData)
+            throws IOException, SAXException, TikaException {
+        if(tagData.length < 128) {
+            return;
+        }
+
+        // Is there ID3v1 data?
+        byte[] last128 = new byte[128];
+        System.arraycopy(tagData, tagData.length-128, last128, 0, 128);
+        id3v1 = new ID3v1Handler(last128);
+
+        if(tagData.length < 137) {
+            return;
+        }
+
+        // Are there lyrics? Look for the closing Lyrics tag
+        //  at the end to decide if there is any
+        int lookat = tagData.length - 9;
+        if(id3v1.found) {
+            lookat -= 128;
+        }
+        if(tagData[lookat+0] == 'L' && tagData[lookat+1] == 'Y' && 
+                tagData[lookat+2] == 'R' && tagData[lookat+3] == 'I' &&
+                tagData[lookat+4] == 'C' && tagData[lookat+5] == 'S' &&
+                tagData[lookat+6] == '2' && tagData[lookat+7] == '0' &&
+                tagData[lookat+8] == '0') {
+            foundLyrics = true;
+
+            // The length (6 bytes) comes just before LYRICS200, and is the
+            //  size including the LYRICSBEGIN but excluding the 
+            //  length+LYRICS200 at the end.
+            int length = Integer.parseInt(
+                    new String(tagData, lookat-6, 6, UTF_8)
+            );
+
+            String lyrics = new String(
+                    tagData, lookat-length+5, length-11,
+                    US_ASCII
+            );
+
+            // Tags are a 3 letter code, 5 digit length, then data
+            int pos = 0;
+            while(pos < lyrics.length()-8) {
+                String tagName = lyrics.substring(pos, pos+3);
+                int tagLen = Integer.parseInt(
+                        lyrics.substring(pos+3, pos+8)
+                );
+                int startPos = pos + 8;
+                int endPos = startPos + tagLen;
+
+                if(tagName.equals("LYR")) {
+                    lyricsText = lyrics.substring(startPos, endPos);
+                }
+
+                pos = endPos;
+            }
+        }
+    }
+
+    public boolean hasID3v1() {
+        if(id3v1 == null || id3v1.found == false) {
+            return false;
+        }
+        return true;
+    }
+    public boolean hasLyrics() {
+        return lyricsText != null && lyricsText.length() > 0;
+    }
+
+    /**
+     * Reads and returns the last <code>length</code> bytes from the
+     * given stream.
+     * @param stream input stream
+     * @param length number of bytes from the end to read and return
+     * @return the last <code>length</code> bytes of the stream (or fewer if the stream is shorter)
+     * @throws IOException if the stream could not be read from.
+     */
+    protected static byte[] getSuffix(InputStream stream, int length)
+            throws IOException {
+        byte[] buffer = new byte[2 * length];
+        int bytesInBuffer = 0;
+
+        int n = stream.read(buffer);
+        while (n != -1) {
+            bytesInBuffer += n;
+            if (bytesInBuffer == buffer.length) {
+                System.arraycopy(buffer, bytesInBuffer - length, buffer, 0, length);
+                bytesInBuffer = length;
+            }
+            n = stream.read(buffer, bytesInBuffer, buffer.length - bytesInBuffer);
+        }
+
+        if (bytesInBuffer < length) {
+            length = bytesInBuffer;
+        }
+
+        byte[] result = new byte[length];
+        System.arraycopy(buffer, bytesInBuffer - length, result, 0, length);
+        return result;
+    }
+}
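
A minimal sketch of the Lyrics3 v2 layout the handler above walks (LYRICSBEGIN, then 3-letter-id + 5-digit-length records, then the 6-digit block size and LYRICS200, followed by the ID3v1 tag); the synthetic tail is illustrative, and the class sits in the same package so it can read the package-private lyricsText field:

    package org.apache.tika.parser.mp3;

    import java.io.ByteArrayInputStream;
    import java.nio.charset.StandardCharsets;

    import org.xml.sax.helpers.DefaultHandler;

    public class Lyrics3Demo {
        public static void main(String[] args) throws Exception {
            // One "LYR" record holding "Hello world" (11 chars); the size field
            // "000030" covers LYRICSBEGIN (11) plus the record (3 + 5 + 11 = 19).
            String block = "LYRICSBEGIN" + "LYR" + "00011" + "Hello world"
                    + "000030" + "LYRICS200";

            // A minimal ID3v1 tag follows the lyrics block at the end of the file.
            byte[] tail = new byte[block.length() + 128];
            System.arraycopy(block.getBytes(StandardCharsets.US_ASCII), 0,
                    tail, 0, block.length());
            tail[block.length()] = 'T';
            tail[block.length() + 1] = 'A';
            tail[block.length() + 2] = 'G';

            LyricsHandler lyrics = new LyricsHandler(
                    new ByteArrayInputStream(tail), new DefaultHandler());
            System.out.println(lyrics.hasLyrics()); // true
            System.out.println(lyrics.hasID3v1());  // true
            System.out.println(lyrics.lyricsText);  // "Hello world"
        }
    }
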
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
new file mode 100644
index 0000000..72684e1
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
@@ -0,0 +1,256 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TailStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMPDM;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.mp3.ID3Tags.ID3Comment;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * The <code>Mp3Parser</code> is used to parse ID3 tag information (ID3v1 and
+ * ID3v2.2 through 2.4), audio frame details, and Lyrics3 data from an MP3 file, if available.
+ *
+ * @see <a href="http://www.id3.org/ID3v1">MP3 ID3 Version 1 specification</a>
+ * @see <a href="http://www.id3.org/id3v2.4.0-structure">MP3 ID3 Version 2.4 Structure Specification</a>
+ * @see <a href="http://www.id3.org/id3v2.4.0-frames">MP3 ID3 Version 2.4 Frames Specification</a>
+ */
+public class Mp3Parser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 8537074922934844370L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+        Collections.singleton(MediaType.audio("mpeg"));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        metadata.set(Metadata.CONTENT_TYPE, "audio/mpeg");
+        metadata.set(XMPDM.AUDIO_COMPRESSOR, "MP3");
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
+        // Create handlers for the various kinds of ID3 tags
+        ID3TagsAndAudio audioAndTags = getAllTagHandlers(stream, handler);
+
+        //process as much metadata as possible before
+        //writing to xhtml
+        if (audioAndTags.duration > 0) {
+            metadata.set(XMPDM.DURATION, audioAndTags.duration);
+        }
+
+        if (audioAndTags.audio != null) {
+            metadata.set("samplerate", String.valueOf(audioAndTags.audio.getSampleRate()));
+            metadata.set("channels", String.valueOf(audioAndTags.audio.getChannels()));
+            metadata.set("version", audioAndTags.audio.getVersion());
+
+            metadata.set(
+                    XMPDM.AUDIO_SAMPLE_RATE,
+                    Integer.toString(audioAndTags.audio.getSampleRate()));
+            if(audioAndTags.audio.getChannels() == 1) {
+                metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Mono");
+            } else if(audioAndTags.audio.getChannels() == 2) {
+                metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Stereo");
+            } else if(audioAndTags.audio.getChannels() == 5) {
+                metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "5.1");
+            } else if(audioAndTags.audio.getChannels() == 7) {
+                metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "7.1");
+            }
+        }
+
+        xhtml.startDocument();
+        // Process tags metadata if the file has supported tags
+        List<String> comments = new ArrayList<>();
+        if (audioAndTags.tags.length > 0) {
+            CompositeTagHandler tag = new CompositeTagHandler(audioAndTags.tags);
+
+            metadata.set(TikaCoreProperties.TITLE, tag.getTitle());
+            metadata.set(TikaCoreProperties.CREATOR, tag.getArtist());
+            metadata.set(XMPDM.ARTIST, tag.getArtist());
+            metadata.set(XMPDM.ALBUM_ARTIST, tag.getAlbumArtist());
+            metadata.set(XMPDM.COMPOSER, tag.getComposer());
+            metadata.set(XMPDM.ALBUM, tag.getAlbum());
+            metadata.set(XMPDM.COMPILATION, tag.getCompilation());
+            metadata.set(XMPDM.RELEASE_DATE, tag.getYear());
+            metadata.set(XMPDM.GENRE, tag.getGenre());
+
+            for (ID3Comment comment : tag.getComments()) {
+                StringBuilder cmt = new StringBuilder();
+                if (comment.getLanguage() != null) {
+                    cmt.append(comment.getLanguage());
+                    cmt.append(" - ");
+                }
+                if (comment.getDescription() != null) {
+                    cmt.append(comment.getDescription());
+                    if (comment.getText() != null) {
+                        cmt.append("\n");
+                    }
+                }
+                if (comment.getText() != null) {
+                    cmt.append(comment.getText());
+                }
+
+                comments.add(cmt.toString());
+                metadata.add(XMPDM.LOG_COMMENT.getName(), cmt.toString());
+            }
+
+            // ID3v1.1 Track addition
+            StringBuilder sb = new StringBuilder();
+            sb.append(tag.getAlbum());
+            if (tag.getTrackNumber() != null) {
+                sb.append(", track ").append(tag.getTrackNumber());
+                metadata.set(XMPDM.TRACK_NUMBER, tag.getTrackNumber());
+            }
+            if (tag.getDisc() != null) {
+                sb.append(", disc ").append(tag.getDisc());
+                metadata.set(XMPDM.DISC_NUMBER, tag.getDisc());
+            }
+
+            xhtml.element("h1", tag.getTitle());
+            xhtml.element("p", tag.getArtist());
+
+
+            xhtml.element("p", sb.toString());
+
+            xhtml.element("p", tag.getYear());
+            xhtml.element("p", tag.getGenre());
+        }
+        xhtml.element("p", String.valueOf(audioAndTags.duration));
+        for (String comment : comments) {
+            xhtml.element("p", comment);
+        }
+
+        if (audioAndTags.lyrics != null && audioAndTags.lyrics.hasLyrics()) {
+           xhtml.startElement("p", "class", "lyrics");
+           xhtml.characters(audioAndTags.lyrics.lyricsText);
+           xhtml.endElement("p");
+        }
+
+        xhtml.endDocument();
+    }
+
+    /**
+     * Scans the MP3 frames for ID3 tags, and creates ID3Tag Handlers
+     *  for each supported set of tags. 
+     */
+    protected static ID3TagsAndAudio getAllTagHandlers(InputStream stream, ContentHandler handler)
+           throws IOException, SAXException, TikaException {
+       ID3v24Handler v24 = null;
+       ID3v23Handler v23 = null;
+       ID3v22Handler v22 = null;
+       ID3v1Handler v1 = null;
+       LyricsHandler lyrics = null;
+       AudioFrame firstAudio = null;
+
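+       // Keep a copy of the end of the stream as we read through it:
+       // an ID3v1 tag is the final 128 bytes of the file, and any Lyrics3
+       // block sits just before it (the 10240 bytes is presumably a generous
+       // allowance for lyrics).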
+       TailStream tailStream = new TailStream(stream, 10240+128);
+       MpegStream mpegStream = new MpegStream(tailStream);
+
+       // ID3v2 tags live at the start of the file
+       // You can apparently have several different ID3 tag blocks
+       // So, keep going until we don't find any more
+       MP3Frame f;
+       while ((f = ID3v2Frame.createFrameIfPresent(mpegStream)) != null) {
+           if(f instanceof ID3v2Frame) {
+               ID3v2Frame id3F = (ID3v2Frame)f;
+               if (id3F.getMajorVersion() == 4) {
+                   v24 = new ID3v24Handler(id3F);
+               } else if(id3F.getMajorVersion() == 3) {
+                   v23 = new ID3v23Handler(id3F);
+               } else if(id3F.getMajorVersion() == 2) {
+                   v22 = new ID3v22Handler(id3F);
+               }
+           }
+       }
+
+        // Now iterate over all audio frames in the file
+        AudioFrame frame = mpegStream.nextFrame();
+        float duration = 0;
+        boolean skipped = true;
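+        // Sum the duration of every audio frame; stop as soon as a frame
+        // cannot be skipped in full (e.g. a truncated final frame), which
+        // also guards against looping forever on damaged files.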
+        while (frame != null && skipped)
+        {
+            duration += frame.getDuration();
+            if (firstAudio == null)
+            {
+                firstAudio = frame;
+            }
+            skipped = mpegStream.skipFrame();
+            if (skipped) {
+                frame = mpegStream.nextFrame();
+            }
+        }
+
+       // ID3v1 tags live at the end of the file
+       // Lyrics live just before ID3v1, at the end of the file
+       // Search for both (handlers seek to the end for us)
+       lyrics = new LyricsHandler(tailStream.getTail());
+       v1 = lyrics.id3v1;
+
+       // Go in order of preference
+       // Currently, that's newest to oldest
+       List<ID3Tags> tags = new ArrayList<>();
+
+       if(v24 != null && v24.getTagsPresent()) {
+          tags.add(v24);
+       }
+       if(v23 != null && v23.getTagsPresent()) {
+          tags.add(v23);
+       }
+       if(v22 != null && v22.getTagsPresent()) {
+          tags.add(v22);
+       }
+       if(v1 != null && v1.getTagsPresent()) {
+          tags.add(v1);
+       }
+       
+       ID3TagsAndAudio ret = new ID3TagsAndAudio();
+       ret.audio = firstAudio;
+       ret.lyrics = lyrics;
+       ret.tags = tags.toArray(new ID3Tags[0]);
+       ret.duration = duration;
+       return ret;
+    }
+
+    protected static class ID3TagsAndAudio {
+        private ID3Tags[] tags;
+        private AudioFrame audio;
+        private LyricsHandler lyrics;
+        private float duration;
+    }
+
+}
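+// Illustrative usage sketch only (not part of this class; the file name is
+// hypothetical). In normal use the parser is reached via AutoDetectParser,
+// but it can also be invoked directly:
+//
+//   Metadata metadata = new Metadata();
+//   try (InputStream is = Files.newInputStream(Paths.get("song.mp3"))) {
+//       new Mp3Parser().parse(is, new BodyContentHandler(), metadata,
+//               new ParseContext());
+//   }
+//   String title = metadata.get(TikaCoreProperties.TITLE);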
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
new file mode 100644
index 0000000..53bc7ab
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -0,0 +1,22 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+org.apache.tika.parser.audio.AudioParser
+org.apache.tika.parser.audio.MidiParser
+org.apache.tika.parser.mp3.Mp3Parser
+org.apache.tika.parser.mp4.MP4Parser
+org.apache.tika.parser.video.FLVParser
+
+
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java
new file mode 100644
index 0000000..9cfbab1
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.audio;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.tika.Tika;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Test;
+
+public class AudioParserTest {
+
+    @Test
+    public void testWAV() throws Exception {
+        String path = "/test-documents/testWAV.wav";
+        Metadata metadata = new Metadata();
+        String content = new Tika().parseToString(
+                AudioParserTest.class.getResourceAsStream(path), metadata);
+
+        assertEquals("audio/vnd.wave", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("44100.0", metadata.get("samplerate"));
+        assertEquals("2", metadata.get("channels"));
+        assertEquals("16", metadata.get("bits"));
+        assertEquals("PCM_SIGNED", metadata.get("encoding"));
+
+        assertEquals("", content);
+    }
+
+    @Test
+    public void testAIFF() throws Exception {
+        String path = "/test-documents/testAIFF.aif";
+        Metadata metadata = new Metadata();
+        String content = new Tika().parseToString(
+                AudioParserTest.class.getResourceAsStream(path), metadata);
+
+        assertEquals("audio/x-aiff", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("44100.0", metadata.get("samplerate"));
+        assertEquals("2", metadata.get("channels"));
+        assertEquals("16", metadata.get("bits"));
+        assertEquals("PCM_SIGNED", metadata.get("encoding"));
+
+        assertEquals("", content);
+    }
+
+    @Test
+    public void testAU() throws Exception {
+        String path = "/test-documents/testAU.au";
+        Metadata metadata = new Metadata();
+        String content = new Tika().parseToString(
+                AudioParserTest.class.getResourceAsStream(path), metadata);
+
+        assertEquals("audio/basic", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("44100.0", metadata.get("samplerate"));
+        assertEquals("2", metadata.get("channels"));
+        assertEquals("16", metadata.get("bits"));
+        assertEquals("PCM_SIGNED", metadata.get("encoding"));
+
+        assertEquals("", content);
+    }
+
+}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java
new file mode 100644
index 0000000..344f2d7
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.audio;
+
+import static org.junit.Assert.assertEquals;
+import static org.apache.tika.TikaTest.assertContains;
+
+import org.apache.tika.Tika;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Test;
+
+public class MidiParserTest {
+
+    @Test
+    public void testMID() throws Exception {
+        String path = "/test-documents/testMID.mid";
+        Metadata metadata = new Metadata();
+        String content = new Tika().parseToString(
+                MidiParserTest.class.getResourceAsStream(path), metadata);
+
+        assertEquals("audio/midi", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("2", metadata.get("tracks"));
+        assertEquals("0", metadata.get("patches"));
+        assertEquals("PPQ", metadata.get("divisionType"));
+
+        assertContains("Untitled", content);
+    }
+}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
new file mode 100644
index 0000000..83bbc0e
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
@@ -0,0 +1,368 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayInputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMPDM;
+import org.junit.Assume;
+import org.junit.Test;
+
+/**
+ * Test case for parsing mp3 files.
+ */
+public class Mp3ParserTest extends TikaTest {
+
+    /**
+     * Checks the duration of an MP3 file.
+     * @param metadata the metadata object
+     * @param expected the expected duration, rounded to whole seconds
+     */
+    private static void checkDuration(Metadata metadata, int expected) {
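+        // Mp3Parser reports XMPDM:duration in milliseconds, hence the /1000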
+        assertEquals("Wrong duration", expected,
+                Math.round(Float.valueOf(metadata.get(XMPDM.DURATION)) / 1000));
+    }
+
+    /**
+     * Test that with only ID3v1 tags, we get some information out   
+     */
+    @Test
+    public void testMp3ParsingID3v1() throws Exception {
+
+        Metadata metadata = new Metadata();
+        String content = getText("testMP3id3v1.mp3", metadata);
+
+        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
+
+        assertContains("Test Title", content);
+        assertContains("Test Artist", content);
+        assertContains("Test Album", content);
+        assertContains("2008", content);
+        assertContains("Test Comment", content);
+        assertContains("Rock", content);
+        
+        assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
+        assertEquals("44100", metadata.get("samplerate"));
+        assertEquals("1", metadata.get("channels"));
+        checkDuration(metadata, 2);
+    }
+
+    /**
+     * Test that with only ID3v2 tags, we get the full
+     *  set of information out.
+     */
+    @Test
+    public void testMp3ParsingID3v2() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = getText("testMP3id3v2.mp3", metadata);
+
+        // Check core properties
+        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
+
+        // Check the textual contents
+        assertContains("Test Title", content);
+        assertContains("Test Artist", content);
+        assertContains("Test Album", content);
+        assertContains("2008", content);
+        assertContains("Test Comment", content);
+        assertContains("Rock", content);
+        assertContains(", track 1", content);
+        assertContains(", disc 1", content);
+        
+        // Check un-typed audio properties
+        assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
+        assertEquals("44100", metadata.get("samplerate"));
+        assertEquals("1", metadata.get("channels"));
+        
+        // Check XMPDM-typed audio properties
+        assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
+        assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
+        assertEquals("Test Album Artist", metadata.get(XMPDM.ALBUM_ARTIST));
+        assertEquals(null, metadata.get(XMPDM.COMPOSER));
+        assertEquals("2008", metadata.get(XMPDM.RELEASE_DATE));
+        assertEquals("Rock", metadata.get(XMPDM.GENRE));
+        assertEquals("XXX - ID3v1 Comment\nTest Comment", metadata.get(XMPDM.LOG_COMMENT.getName()));
+        assertEquals("1", metadata.get(XMPDM.TRACK_NUMBER));
+        assertEquals("1/1", metadata.get(XMPDM.DISC_NUMBER));
+        assertEquals("1", metadata.get(XMPDM.COMPILATION));
+        
+        assertEquals("44100", metadata.get(XMPDM.AUDIO_SAMPLE_RATE));
+        assertEquals("Mono", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE));
+        assertEquals("MP3", metadata.get(XMPDM.AUDIO_COMPRESSOR));
+        checkDuration(metadata, 2);
+    }
+
+    /**
+     * Test that metadata is added before xhtml content
+     * is written...so that more metadata shows up in the xhtml
+     */
+    @Test
+    public void testAddingToMetadataBeforeWriting() throws Exception {
+        String content = getXML("testMP3id3v1.mp3").xml;
+        assertContains("<meta name=\"xmpDM:audioSampleRate\" content=\"44100\"",
+                content);
+        assertContains("<meta name=\"xmpDM:duration\" content=\"2455",
+                content);
+        assertContains("meta name=\"xmpDM:audioChannelType\" content=\"Mono\"", content);
+    }
+    /**
+     * Test that with both id3v2 and id3v1, we prefer the
+     *  details from id3v2
+     */
+    @Test
+    public void testMp3ParsingID3v1v2() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = getText("testMP3id3v1_v2.mp3", metadata);
+
+        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
+
+        assertContains("Test Title", content);
+        assertContains("Test Artist", content);
+        assertContains("Test Album", content);
+        assertContains("2008", content);
+        assertContains("Test Comment", content);
+        assertContains("Rock", content);
+        
+        assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
+        assertEquals("44100", metadata.get("samplerate"));
+        assertEquals("1", metadata.get("channels"));
+        checkDuration(metadata, 2);
+    }
+
+    /**
+     * Test that with only ID3v2 tags, of version 2.4, we get the full
+     *  set of information out.
+     */
+    @Test
+    public void testMp3ParsingID3v24() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = getText("testMP3id3v24.mp3", metadata);
+
+        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
+
+        assertContains("Test Title", content);
+        assertContains("Test Artist", content);
+        assertContains("Test Album", content);
+        assertContains("2008", content);
+        assertContains("Test Comment", content);
+        assertContains("Rock", content);
+        assertContains(", disc 1", content);
+        
+        assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
+        assertEquals("44100", metadata.get("samplerate"));
+        assertEquals("1", metadata.get("channels"));
+        checkDuration(metadata, 2);
+
+        // Check XMPDM-typed audio properties
+        assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
+        assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
+        assertEquals("Test Album Artist", metadata.get(XMPDM.ALBUM_ARTIST));
+        assertEquals(null, metadata.get(XMPDM.COMPOSER));
+        assertEquals("2008", metadata.get(XMPDM.RELEASE_DATE));
+        assertEquals("Rock", metadata.get(XMPDM.GENRE));
+        assertEquals("1", metadata.get(XMPDM.COMPILATION));
+        
+        assertEquals(null, metadata.get(XMPDM.TRACK_NUMBER));
+        assertEquals("1", metadata.get(XMPDM.DISC_NUMBER));
+    }
+    
+    /**
+     * Tests that a file with characters not in the ISO 8859-1
+     *  range is correctly handled
+     */
+    @Test
+    public void testMp3ParsingID3i18n() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = getText("testMP3i18n.mp3", metadata);
+
+       assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+       assertEquals("Une chason en Fran\u00e7ais", metadata.get(TikaCoreProperties.TITLE));
+       assertEquals("Test Artist \u2468\u2460", metadata.get(TikaCoreProperties.CREATOR));
+       assertEquals("Test Artist \u2468\u2460", metadata.get(XMPDM.ARTIST));
+       assertEquals("Test Album \u2460\u2468", metadata.get(XMPDM.ALBUM));
+
+       assertEquals(
+             "Eng - Comment Desc\nThis is a \u1357\u2468\u2460 Comment", 
+             metadata.get(XMPDM.LOG_COMMENT)
+       );
+       
+       assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
+       assertEquals("44100", metadata.get("samplerate"));
+       assertEquals("1", metadata.get("channels"));
+       checkDuration(metadata, 2);
+   }
+    /**
+     * Tests that a file with the last frame slightly
+     * truncated does not cause an EOF and does
+     * not lead to an infinite loop.
+     */
+    @Test
+    public void testMp3ParsingID3i18nTruncated() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = getText("testMP3i18n_truncated.mp3", metadata);
+
+        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Une chason en Fran\u00e7ais", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Test Artist \u2468\u2460", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Test Artist \u2468\u2460", metadata.get(XMPDM.ARTIST));
+        assertEquals("Test Album \u2460\u2468", metadata.get(XMPDM.ALBUM));
+
+        assertEquals(
+                "Eng - Comment Desc\nThis is a \u1357\u2468\u2460 Comment",
+                metadata.get(XMPDM.LOG_COMMENT)
+        );
+
+        assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
+        assertEquals("44100", metadata.get("samplerate"));
+        assertEquals("1", metadata.get("channels"));
+        checkDuration(metadata, 2);
+    }
+    
+    /**
+     * Tests that a file with both lyrics and
+     *  ID3v2 tags gets both extracted correctly
+     */
+    @Test
+    public void testMp3ParsingLyrics() throws Exception {
+
+        // Note - our test file has a lyrics tag, but lacks any
+        //  lyrics in the tags, so we can't test that bit
+        // TODO Find a better sample file
+        Metadata metadata = new Metadata();
+        String content = getText("testMP3lyrics.mp3", metadata);
+
+        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
+
+        assertContains("Test Title", content);
+        assertContains("Test Artist", content);
+        assertContains("Test Album", content);
+        assertContains("2008", content);
+        assertContains("Test Comment", content);
+        assertContains("Rock", content);
+        
+        assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
+        assertEquals("44100", metadata.get("samplerate"));
+        assertEquals("2", metadata.get("channels"));
+        checkDuration(metadata, 1);
+    }
+    
+    @Test
+    public void testID3v2Frame() throws Exception {
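+       // A minimal 10-byte ID3v2 header: the literal "ID3" (0x49 0x44 0x33),
+       // major version 3, revision 1, no flags, and a four-byte size of zero,
+       // i.e. an empty tag with no frames.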
+       byte[] empty = new byte[] {
+             0x49, 0x44, 0x33, 3, 1, 0,
+             0, 0, 0, 0
+       };
+       
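+       // getInt() simply concatenates the bytes as a big-endian integer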
+       assertEquals(11, ID3v2Frame.getInt(new byte[] {0,0,0,0x0b}));
+       assertEquals(257, ID3v2Frame.getInt(new byte[] {0,0,1,1}));
+       
+       ID3v2Frame f = (ID3v2Frame)
+            ID3v2Frame.createFrameIfPresent(new ByteArrayInputStream(empty));
+       assertEquals(3, f.getMajorVersion());
+       assertEquals(1, f.getMinorVersion());
+       assertEquals(0, f.getFlags());
+       assertEquals(0, f.getLength());
+       assertEquals(0, f.getData().length);
+       
+       assertEquals("", ID3v2Frame.getTagString(f.getData(), 0, 0));
+       assertEquals("", ID3v2Frame.getTagString(new byte[] {0,0,0,0}, 0, 3));
+       assertEquals("A", ID3v2Frame.getTagString(new byte[] {(byte)'A',0,0,0}, 0, 3));
+    }
+
+    @Test
+    public void testTIKA1589_noId3ReturnsDurationCorrectly() throws Exception {
+        assertEquals("2455.510986328125",
+                getXML("testMP3noid3.mp3").metadata.get(XMPDM.DURATION));
+    }
+    
+    /**
+     * This test will do nothing, unless you've downloaded the
+     *  mp3 file from TIKA-424 - the file cannot be
+     *  distributed with Tika.
+     * This test will check for the complicated set of ID3v2.4
+     *  tags.
+     */
+    @Test
+    public void testTIKA424() throws Exception {
+        Assume.assumeTrue(Mp3ParserTest.class.getResourceAsStream(
+                "/test-documents/test2.mp3") != null);
+
+        Metadata metadata = new Metadata();
+        String content = getText("test2.mp3", metadata);
+
+        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Plus loin vers l'ouest", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Merzhin", metadata.get(TikaCoreProperties.CREATOR));
+
+        assertContains("Plus loin vers l'ouest", content);
+       
+        assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
+        assertEquals("44100", metadata.get("samplerate"));
+        assertEquals("2", metadata.get("channels"));
+    }
+    
+    /**
+     * This tests that we can handle, without errors (though perhaps not
+     *  with all of the content), a file with a very large ID3 frame that
+     *  has been truncated before the end of the ID3 tags.
+     * In this case, it is a file with JPEG data in the ID3 frame, which
+     *  is truncated before the end of the JPEG portion of the frame.
+     */
+    @Test
+    public void testTIKA474() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = getText("testMP3truncated.mp3", metadata);
+
+        // Check we could get the headers from the start
+        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Girl you have no faith in medicine", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("The White Stripes", metadata.get(TikaCoreProperties.CREATOR));
+
+        assertContains("Girl you have no faith in medicine", content);
+        assertContains("The White Stripes", content);
+        assertContains("Elephant", content);
+        assertContains("2003", content);
+       
+        // File lacks any audio frames, so we can't know these
+        assertEquals(null, metadata.get("version"));
+        assertEquals(null, metadata.get("samplerate"));
+        assertEquals(null, metadata.get("channels"));
+    }
+
+    // TIKA-1024
+    @Test
+    public void testNakedUTF16BOM() throws Exception {
+        Metadata metadata = getXML("testNakedUTF16BOM.mp3").metadata;
+        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("", metadata.get(XMPDM.GENRE));
+    }
+}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp3/MpegStreamTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp3/MpegStreamTest.java
new file mode 100644
index 0000000..622dcf7
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp3/MpegStreamTest.java
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+
+import org.junit.After;
+import org.junit.Test;
+
+/**
+ * Test class for {@code MpegStream}.
+ */
+public class MpegStreamTest
+{
+    /** The stream to be tested. */
+    private MpegStream stream;
+
+    @After
+    public void tearDown() throws Exception
+    {
+        if (stream != null)
+        {
+            stream.close();
+        }
+    }
+
+    /**
+     * Tests whether the default test header can be found in a stream.
+     * 
+     * @param bos the stream
+     * @throws IOException if an error occurs
+     */
+    private void checkDefaultHeader(ByteArrayOutputStream bos)
+            throws IOException
+    {
+        ByteArrayInputStream in = new ByteArrayInputStream(bos.toByteArray());
+        stream = new MpegStream(in);
+        AudioFrame header = stream.nextFrame();
+        assertNotNull("No header found", header);
+        assertEquals("Wrong MPEG version", AudioFrame.MPEG_V2,
+                header.getVersionCode());
+        assertEquals("Wrong layer", AudioFrame.LAYER_3, header.getLayer());
+        assertEquals("Wrong bit rate", 80000, header.getBitRate());
+        assertEquals("Wrong sample rate", 24000, header.getSampleRate());
+    }
+
+    /**
+     * Writes the given byte the given number of times into an output stream.
+     * 
+     * @param out the output stream
+     * @param value the value to write
+     * @param count the number of bytes to write
+     * @throws IOException if an error occurs
+     */
+    private static void writeBytes(OutputStream out, int value, int count)
+            throws IOException
+    {
+        for (int i = 0; i < count; i++)
+        {
+            out.write(value);
+        }
+    }
+
+    /**
+     * Writes a frame header in the given output stream.
+     * 
+     * @param out the output stream
+     * @param b2 byte 2 of the header
+     * @param b3 byte 3 of the header
+     * @param b4 byte 4 of the header
+     * @throws IOException if an error occurs
+     */
+    private static void writeFrame(OutputStream out, int b2, int b3, int b4)
+            throws IOException
+    {
+        out.write(0xFF);
+        out.write(b2);
+        out.write(b3);
+        out.write(b4);
+    }
+
+    /**
+     * Tests whether an audio frame header can be found somewhere in a stream.
+     */
+    @Test
+    public void testSearchNextFrame() throws IOException
+    {
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        writeBytes(bos, 0xFF, 32);
+        writeBytes(bos, 0, 16);
+        writeBytes(bos, 0xFF, 8);
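+        // The last 0xFF above plus the three bytes below form a valid header:
+        // 0xF3 = MPEG-2, Layer III; 0x96 = bitrate index 9 (80 kbit/s) and
+        // sample-rate index 1 (24000 Hz), matching checkDefaultHeader()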
+        bos.write(0xF3);
+        bos.write(0x96);
+        bos.write(0);
+        checkDefaultHeader(bos);
+    }
+
+    /**
+     * Tests whether invalid frame headers are detected and skipped.
+     */
+    @Test
+    public void testSearchNextFrameInvalid() throws IOException
+    {
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
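+        // Each header below is deliberately broken in one field (version,
+        // layer, bitrate or sample-rate index); only the final one is valid.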
+        writeFrame(bos, 0xEB, 0x96, 0);
+        writeFrame(bos, 0xF9, 0x96, 0);
+        writeFrame(bos, 0xF3, 0, 0);
+        writeFrame(bos, 0xF3, 0xF0, 0);
+        writeFrame(bos, 0xF3, 0x7C, 0);
+        writeFrame(bos, 0xF3, 0x96, 0);
+        checkDefaultHeader(bos);
+    }
+
+    /**
+     * Tests a search for another frame which is interrupted because the stream
+     * ends.
+     */
+    @Test
+    public void testSearchNextFrameEOS() throws IOException
+    {
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        bos.write(0xFF);
+        bos.write(0xFF);
+        bos.write(0xF3);
+        bos.write(0x96);
+        ByteArrayInputStream in = new ByteArrayInputStream(bos.toByteArray());
+        stream = new MpegStream(in);
+        assertNull("Got a frame", stream.nextFrame());
+    }
+
+    /**
+     * Tries to skip a frame if no current header is available.
+     */
+    @Test
+    public void testSkipNoCurrentHeader() throws IOException
+    {
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        bos.write("This is a test".getBytes(UTF_8));
+        ByteArrayInputStream in = new ByteArrayInputStream(bos.toByteArray());
+        stream = new MpegStream(in);
+        assertFalse("Wrong result", stream.skipFrame());
+    }
+}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/test2.mp3 b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/test2.mp3
new file mode 100644
index 0000000..698cbaf
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/test2.mp3 differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testAIFF.aif b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testAIFF.aif
new file mode 100644
index 0000000..97eac1d
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testAIFF.aif differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testAU.au b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testAU.au
new file mode 100644
index 0000000..20d1bd2
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testAU.au differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testFLV.flv b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testFLV.flv
new file mode 100644
index 0000000..d35e9bb
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testFLV.flv differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testMP3i18n.mp3 b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testMP3i18n.mp3
new file mode 100644
index 0000000..0f25370
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testMP3i18n.mp3 differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testMP3id3v1_v2.mp3 b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testMP3id3v1_v2.mp3
new file mode 100644
index 0000000..b78a1a3
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testMP3id3v1_v2.mp3 differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testMP3id3v2.mp3 b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testMP3id3v2.mp3
new file mode 100644
index 0000000..ac96bec
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testMP3id3v2.mp3 differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testMP3noid3.mp3 b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testMP3noid3.mp3
new file mode 100644
index 0000000..f087903
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testMP3noid3.mp3 differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testMP3truncated.mp3 b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testMP3truncated.mp3
new file mode 100644
index 0000000..d8ab515
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testMP3truncated.mp3 differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testMP4_truncated.m4a b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testMP4_truncated.m4a
new file mode 100644
index 0000000..31fdef4
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testMP4_truncated.m4a differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/pom.xml b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-cad-module/pom.xml
similarity index 85%
copy from tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/pom.xml
copy to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-cad-module/pom.xml
index 54667aa..0f79305 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/pom.xml
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-cad-module/pom.xml
@@ -21,21 +21,22 @@
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
     <parent>
-        <artifactId>tika-parser-modules</artifactId>
+        <artifactId>tika-parsers-classic-modules</artifactId>
         <groupId>org.apache.tika</groupId>
         <version>2.0.0-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
     </parent>
     <modelVersion>4.0.0</modelVersion>
 
-    <artifactId>tika-parser-audiovideo-module</artifactId>
+    <artifactId>tika-parser-cad-module</artifactId>
+
 
     <dependencies>
         <dependency>
-            <groupId>org.tallison</groupId>
-            <artifactId>isoparser</artifactId>
-            <version>${isoparser.version}</version>
+            <groupId>org.apache.poi</groupId>
+            <artifactId>poi</artifactId>
+            <version>${poi.version}</version>
         </dependency>
-
     </dependencies>
     <build>
         <plugins>
@@ -45,7 +46,7 @@
                 <configuration>
                     <archive>
                         <manifestEntries>
-                            <Automatic-Module-Name>org.apache.tika.parser.audiovideo</Automatic-Module-Name>
+                            <Automatic-Module-Name>org.apache.tika.parser.cad</Automatic-Module-Name>
                         </manifestEntries>
                     </archive>
                 </configuration>
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
new file mode 100644
index 0000000..a7c8bd0
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
@@ -0,0 +1,275 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.prt;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.poi.util.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.EndianUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+
+/**
+ * A basic text extracting parser for the CADKey PRT (CAD Drawing)
+ *  format. It outputs text from note entries.
+ */
+
+public class PRTParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 4659638314375035178L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("x-prt"));
+    public static final String PRT_MIME_TYPE = "application/x-prt";
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+       return SUPPORTED_TYPES;
+    }
+
+    /**
+     * How long do we allow a text run to claim to be, before we
+     * decide we're confused and it's not really text after all?
+     */
+    private static final int MAX_SANE_TEXT_LENGTH = 0x0800;
+    
+    /*
+     * Text types:
+     *   00 00 00 00 f0 [3b]f sz sz TEXT     *view name*
+     *   00 00 00 00 f0 3f 00 00 00 00 00 00 00 00 sz sz TEXT  *view name*
+     *   (anything)  e0 3f sz sz TEXT    *view name*
+     *   3x 33 33 33 33 33 e3 3f 0x 00 00 0x 00 00 0x 0x 1f sz sz TEXT    *note entries* 
+     *   
+     *  Note - all text is null terminated
+     */
+      
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, 
+          ParseContext context) throws IOException, SAXException, TikaException {
+       
+       XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+       Last5 l5 = new Last5();
+       int read;
+       
+       // Try to get the creation date, which is YYYYMMDDhhmm
+       byte[] header = new byte[30];
+       IOUtils.readFully(stream, header);
+       byte[] date = new byte[12];
+       IOUtils.readFully(stream, date);
+       
+       String dateStr = new String(date, US_ASCII);
+       if(dateStr.startsWith("19") || dateStr.startsWith("20")) {
+          String formattedDate = dateStr.substring(0, 4) + "-" + dateStr.substring(4,6) +
+             "-" + dateStr.substring(6,8) + "T" + dateStr.substring(8,10) + ":" +
+             dateStr.substring(10, 12) + ":00";
+          metadata.set(TikaCoreProperties.CREATED, formattedDate);
+          // TODO Should this value also be recorded as the modified date?
+       }
+       metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE);
+       
+       // The description, if set, is the next up-to-500 bytes
+       byte[] desc = new byte[500];
+       IOUtils.readFully(stream, desc);
+       String description = extractText(desc, true);
+       if(description.length() > 0) {
+          metadata.set(TikaCoreProperties.DESCRIPTION, description);
+       }
+       
+       // Now look for text
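+       // Scan byte-by-byte for the 0xe0/0xe3/0xf0 + 0x3f/0xbf marker pairs
+       // described in the table above; Last5 tracks the preceding bytes so
+       // note text (a run of 0x33s) can be told apart from view names
+       // (a run of 0x00s).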
+       while( (read = stream.read()) > -1) {
+          if(read == 0xe0 || read == 0xe3 || read == 0xf0) {
+             int nread = stream.read();
+             if(nread == 0x3f || nread == 0xbf) {
+                // Looks promising, check back for a suitable value
+                if(read == 0xe3 && nread == 0x3f) {
+                   if(l5.is33()) {
+                      // Bingo, note text
+                      handleNoteText(stream, xhtml);
+                   }
+                } else if(l5.is00()) {
+                   // Likely view name
+                   handleViewName(read, nread, stream, xhtml, l5);
+                }
+             }
+          } else {
+             l5.record(read);
+          }
+       }
+    }
+    
+    private void handleNoteText(InputStream stream, XHTMLContentHandler xhtml) 
+    throws IOException, SAXException, TikaException {
+       // Ensure we have the right padding text
+       int read;
+       for(int i=0; i<10; i++) {
+          read = stream.read();
+          if(read >= 0 && read <= 0x0f) {
+             // Promising
+          } else {
+             // Wrong, false detection
+             return;
+          }
+       }
+       read = stream.read();
+       if(read != 0x1f) {
+          // Wrong, false detection
+          return;
+       }
+       
+       int length = EndianUtils.readUShortLE(stream);
+       if(length <= MAX_SANE_TEXT_LENGTH) {
+          // Length sanity check passed
+          handleText(length, stream, xhtml);
+       }
+    }
+    
+    private void handleViewName(int typeA, int typeB, InputStream stream, 
+          XHTMLContentHandler xhtml, Last5 l5) 
+    throws IOException, SAXException, TikaException {
+       // Is it 8 byte zero padded?
+       int maybeLength = EndianUtils.readUShortLE(stream);
+       if(maybeLength == 0) {
+          // Check the next 6 bytes too
+          for(int i=0; i<6; i++) {
+             int read = stream.read();
+             if(read >= 0 && read <= 0x0f) {
+                // Promising
+             } else {
+                // Wrong, false detection
+                return;
+             }
+          }
+          
+          byte[] b2 = new byte[2];
+          IOUtils.readFully(stream, b2);
+          int length = EndianUtils.getUShortLE(b2);
+          if(length > 1 && length <= MAX_SANE_TEXT_LENGTH) {
+             // Length sanity check passed
+             handleText(length, stream, xhtml);
+          } else {
+             // Was probably something else
+             l5.record(b2[0]);
+             l5.record(b2[1]);
+          }
+       } else if(maybeLength > 0 && maybeLength < MAX_SANE_TEXT_LENGTH) {
+          // Looks like it's straight into the text
+          handleText(maybeLength, stream, xhtml);
+       }
+    }
+    
+    private void handleText(int length, InputStream stream, XHTMLContentHandler xhtml) 
+    throws IOException, SAXException, TikaException {
+       byte[] str = new byte[length];
+       IOUtils.readFully(stream, str);
+       if(str[length-1] != 0) {
+          // Not properly null terminated, must be wrong
+          return;
+       }
+       
+       String text = extractText(str, false);
+       
+       xhtml.startElement("p");
+       xhtml.characters(text);
+       xhtml.endElement("p");
+    }
+    
+    /**
+     * Does our best to turn the bytes into text
+     */
+    private String extractText(byte[] data, boolean trim) throws TikaException {
+       // The text is always stored null terminated, but sometimes
+       //  may have extra null padding too
+       int length = data.length - 1;
+       if(trim) {
+          for(int i=0; i<data.length; i++) {
+             if(data[i] == 0) {
+                length = i;
+                break;
+             }
+          }
+       }
+       
+       // We believe that the text is basically stored as CP437
+       // That said, there are a few characters slightly wrong for that...
+       String text;
+       try {
+          text = new String(data, 0, length, "cp437");
+       } catch(UnsupportedEncodingException e) {
+          throw new TikaException("JVM Broken, core codepage CP437 missing!");
+       }
+       
+       // Fix up the known character issues
+       text = text.replace("\u03C6","\u00D8");
+
+       // All done, as best as we can!
+       return text;
+    }
+    
+    /**
+     * Provides a view on the previous 5 bytes
+     */
+    private static class Last5 {
+       byte[] data = new byte[5];
+       int pos = 0;
+       
+       private void record(int b) {
+          data[pos] = (byte)b;
+          pos++;
+          if(pos >= data.length) {
+             pos = 0;
+          }
+       }
+       
+       private byte[] get() {
+          byte[] ret = new byte[5];
+          for(int i=0; i<ret.length; i++) {
+             int p = pos - i;
+             if(p < 0) { p += ret.length; }
+             ret[i] = data[p];
+          }
+          return ret;
+       }
+       
+       private boolean is33() {
+          byte[] last5 = get();
+          for(byte b : last5) {
+             if(b != 0x33) return false;
+          }
+          return true;
+       }
+       
+       private boolean is00() {
+          byte[] last5 = get();
+          for(byte b : last5) {
+             if(b != 0x00) return false;
+          }
+          return true;
+       }
+    }
+}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-cad-module/src/test/resources/test-documents/testDWG2010.dwg b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-cad-module/src/test/resources/test-documents/testDWG2010.dwg
new file mode 100644
index 0000000..ee17cb1
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-cad-module/src/test/resources/test-documents/testDWG2010.dwg differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/pom.xml b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/pom.xml
index 71e105c..a2cbcbe 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/pom.xml
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/pom.xml
@@ -21,9 +21,10 @@
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
     <parent>
-        <artifactId>tika-parser-modules</artifactId>
+        <artifactId>tika-parsers-classic-modules</artifactId>
         <groupId>org.apache.tika</groupId>
         <version>2.0.0-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
     </parent>
     <modelVersion>4.0.0</modelVersion>
 
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java
new file mode 100644
index 0000000..481046f
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.asm;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for Java .class files.
+ */
+public class ClassParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = -3531388963354454357L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+        Collections.singleton(MediaType.application("java-vm"));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        new XHTMLClassVisitor(handler, metadata).parse(stream);
+    }
+
+}
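
A minimal usage sketch for the ClassParser added above. It is illustrative rather than part of this module: it assumes only the parse contract shown in the patch plus the BodyContentHandler from tika-core, and the input file name simply mirrors the test resource added further down.

    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Paths;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.asm.ClassParser;
    import org.apache.tika.sax.BodyContentHandler;

    public class ClassParserExample {
        public static void main(String[] args) throws Exception {
            ClassParser parser = new ClassParser();
            Metadata metadata = new Metadata();
            BodyContentHandler handler = new BodyContentHandler();
            // "AutoDetectParser.class" is a local compiled class file; any .class file works.
            try (InputStream stream = Files.newInputStream(Paths.get("AutoDetectParser.class"))) {
                parser.parse(stream, handler, metadata, new ParseContext());
            }
            // XHTMLClassVisitor writes a textual outline of the class into the handler.
            System.out.println(handler.toString());
        }
    }
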
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/mat/MatParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/mat/MatParser.java
new file mode 100644
index 0000000..2db8bef
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/mat/MatParser.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mat;
+
+//JDK imports
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Map;
+import java.util.Set;
+
+import com.jmatio.io.MatFileHeader;
+import com.jmatio.io.MatFileReader;
+import com.jmatio.types.MLArray;
+import com.jmatio.types.MLStructure;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+//JMatIO imports
+
+
+public class MatParser extends AbstractParser {
+
+    static {
+        //make sure that this is set to false
+        MatFileReader.setAllowObjectDeserialization(false);
+    }
+
+    public static final String MATLAB_MIME_TYPE =
+            "application/x-matlab-data";
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.application("x-matlab-data"));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context){
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        //Set MIME type as Matlab
+        metadata.set(Metadata.CONTENT_TYPE, MATLAB_MIME_TYPE);
+        TemporaryResources tmp =
+                TikaInputStream.isTikaInputStream(stream) ? null :
+                        new TemporaryResources();
+        try {
+            // Use TIS so we can spool a temp file for parsing.
+            TikaInputStream tis = TikaInputStream.get(stream, tmp);
+
+            //Extract information from header file
+            MatFileReader mfr = new MatFileReader(tis.getFile()); //input .mat file
+
+            MatFileHeader hdr = mfr.getMatFileHeader(); //.mat header information
+
+            // Example header: "MATLAB 5.0 MAT-file, Platform: MACI64, Created on: Sun Mar  2 23:41:57 2014"
+            String[] parts = hdr.getDescription().split(","); // Break header information into its parts
+
+            if (parts[2].contains("Created")) {
+                int lastIndex1 = parts[2].lastIndexOf("Created on:");
+                String dateCreated = parts[2].substring(lastIndex1 + "Created on:".length()).trim();
+                metadata.set("createdOn", dateCreated);
+            }
+
+            if (parts[1].contains("Platform")) {
+                int lastIndex2 = parts[1].lastIndexOf("Platform:");
+                String platform = parts[1].substring(lastIndex2 + "Platform:".length()).trim();
+                metadata.set("platform" , platform);
+            }
+
+            if (parts[0].contains("MATLAB")) {
+                metadata.set("fileType", parts[0]);
+            }
+
+            // Get endian indicator from header file
+            String endianBytes = new String(hdr.getEndianIndicator(), UTF_8); // Retrieve endian bytes and convert to string
+            String endianCode = String.valueOf(endianBytes.toCharArray()); // Convert bytes to characters to string
+            metadata.set("endian", endianCode);
+
+            //Text output	
+            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+            xhtml.startDocument();
+            xhtml.newline();
+            //Loop through each variable
+            for (Map.Entry<String, MLArray> entry : mfr.getContent().entrySet()) {
+                String varName = entry.getKey();
+                MLArray varData = entry.getValue();
+
+                xhtml.element("p", varName + ":" + String.valueOf(varData));
+
+                // If the variable is a structure, extract variable info from structure
+                if (varData.isStruct()){
+                    MLStructure mlStructure = (MLStructure) mfr.getMLArray(varName);
+                    xhtml.startElement("ul");
+                    xhtml.newline();
+                    for (MLArray element : mlStructure.getAllFields()){
+                        xhtml.startElement("li");
+                        xhtml.characters(String.valueOf(element));
+
+                        // If there is an embedded structure, extract variable info.
+                        if (element.isStruct()){
+                            xhtml.startElement("ul");
+                            // Should this actually be a recursive call?
+                            xhtml.element("li", element.contentToString());
+                            xhtml.endElement("ul");
+                        }
+
+                        xhtml.endElement("li");
+                    }
+                    xhtml.endElement("ul");
+                }
+            }
+            xhtml.endDocument();
+        } catch (IOException e) {
+            throw new TikaException("Error parsing Matlab file with MatParser", e);
+        } finally {
+            if (tmp != null) {
+                tmp.dispose();
+            }
+        }
+    }
+}
\ No newline at end of file
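
A usage sketch for the MatParser above. It is illustrative only: the test-document name is taken from the resources changed below, and the printed header values are the ones quoted in the example-header comment inside the parser, not guaranteed output.

    import java.nio.file.Paths;

    import org.apache.tika.io.TikaInputStream;
    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.mat.MatParser;
    import org.apache.tika.sax.BodyContentHandler;

    public class MatParserExample {
        public static void main(String[] args) throws Exception {
            MatParser parser = new MatParser();
            Metadata metadata = new Metadata();
            BodyContentHandler handler = new BodyContentHandler();
            try (TikaInputStream stream = TikaInputStream.get(
                    Paths.get("breidamerkurjokull_radar_profiles_2009.mat"))) {
                parser.parse(stream, handler, metadata, new ParseContext());
            }
            // Header-derived metadata written by the parser (values are illustrative)
            System.out.println(metadata.get("fileType"));   // e.g. "MATLAB 5.0 MAT-file"
            System.out.println(metadata.get("platform"));   // e.g. "MACI64"
            System.out.println(metadata.get("createdOn"));  // e.g. "Sun Mar  2 23:41:57 2014"
            // Variable names and values as extracted text
            System.out.println(handler.toString());
        }
    }
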
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
new file mode 100644
index 0000000..d15be7a
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -0,0 +1,21 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+org.apache.tika.parser.asm.ClassParser
+org.apache.tika.parser.code.SourceCodeParser
+org.apache.tika.parser.executable.ExecutableParser
+org.apache.tika.parser.mat.MatParser
+org.apache.tika.parser.sas.SAS7BDATParser
+
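
The service file above registers this module's parsers for discovery through the standard Java service-provider mechanism. In practice Tika's DefaultParser composes them via its own service-loading utilities; the plain-JDK sketch below only illustrates how the listed entries resolve to parser instances on the classpath.

    import java.util.ServiceLoader;

    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.Parser;

    public class ListRegisteredParsers {
        public static void main(String[] args) {
            ParseContext context = new ParseContext();
            // Each non-comment line in META-INF/services/org.apache.tika.parser.Parser names an
            // implementation class; ServiceLoader instantiates whatever is on the classpath.
            for (Parser parser : ServiceLoader.load(Parser.class)) {
                System.out.println(parser.getClass().getName()
                        + " -> " + parser.getSupportedTypes(context));
            }
        }
    }
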
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/AutoDetectParser.class b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/AutoDetectParser.class
new file mode 100644
index 0000000..ad124bc
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/AutoDetectParser.class differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/breidamerkurjokull_radar_profiles_2009.mat b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/breidamerkurjokull_radar_profiles_2009.mat
index 57884df..7e82b14 100644
Binary files a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/breidamerkurjokull_radar_profiles_2009.mat and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/breidamerkurjokull_radar_profiles_2009.mat differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/test-columnar.sas7bdat b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/test-columnar.sas7bdat
new file mode 100644
index 0000000..f6cab63
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/test-columnar.sas7bdat differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testC.c b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testC.c
new file mode 100644
index 0000000..ebc72c8
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testC.c
@@ -0,0 +1,6 @@
+#include <stdio.h>
+
+int main ()
+{
+  printf ("Apache Tika!\n");
+}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testJS_HTML.js b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testJS_HTML.js
new file mode 100644
index 0000000..a362198
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testJS_HTML.js
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+var places = new Array();
+
+places[0] = {
+   'name': 'Oxford', lat: 51.75222, lng: -1.25596,
+   'id': 'map_1',
+}
+places[1] = {
+   'name': 'Oxford', lat: 41.43399, lng: -73.11678,
+   'id': 'map_2',
+}
+places[2] = {
+   'name': 'Oxford', lat: -43.3, lng: 172.18333,
+   'id': 'map_3',
+}
+places[3] = {
+   'name': 'Oxford', lat: 33.619, lng: -83.86741,
+   'id': 'map_4',
+}
+places[4] = {
+   'name': 'Oxford', lat: 44.13174, lng: -70.49311,
+   'id': 'map_5',
+}
+places[5] = {
+   'name': 'Oxford', lat: 39.78539, lng: -75.97883,
+   'id': 'map_6',
+}
+places[6] = {
+   'name': 'Oxford', lat: 40.51976, lng: -87.24779,
+   'id': 'map_7',
+}
+places[7] = {
+   'name': 'Oxford', lat: 45.73345, lng: -63.86542,
+   'id': 'map_8',
+}
+places[8] = {
+   'name': 'Oxford', lat: 42.44202, lng: -75.59769,
+   'id': 'map_9',
+}
+places[9] = {
+   'name': 'Oxford', lat: 40.80315, lng: -74.98962,
+   'id': 'map_10',
+}
+
+function drawMaps() {
+   if (GBrowserIsCompatible()) {
+      for(var i in places) {
+         var p = places[i];
+         var div = document.getElementById(p['id']);
+
+         div.style.display = "block";
+         div.parentNode.style.marginBottom = "35px";
+
+         var map = new GMap2(div);
+         map.setCenter(new GLatLng(p['lat'], p['lng']), 8);
+
+         var m = new GMarker( 
+            new GLatLng(p['lat'], p['lng']),
+            {title: p['name']}
+         );
+         map.addOverlay(m);
+      }
+   } else {
+      document.write("<!doctype><html><body><h1>Unsupported Browser</h1></body></html>");
+   }
+}
+
+var t;
+$(document).ready(function(){
+      t = setTimeout(function() {
+         clearTimeout(t);
+         drawMaps();
+      }, 15*1000);
+});
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testLinux-mips-32be b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testLinux-mips-32be
new file mode 100755
index 0000000..3c67dce
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testLinux-mips-32be differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testLinux-mips-32le b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testLinux-mips-32le
new file mode 100755
index 0000000..445e6a7
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testLinux-mips-32le differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testLinux-ppc-32be b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testLinux-ppc-32be
new file mode 100755
index 0000000..2b4e6fe
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testLinux-ppc-32be differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testLinux-x86-32 b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testLinux-x86-32
new file mode 100755
index 0000000..784e4be
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testLinux-x86-32 differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testMATLAB.m b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testMATLAB.m
index e69de29..60871f7 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testMATLAB.m
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testMATLAB.m
@@ -0,0 +1,4 @@
+function helloworld
+fprintf('Hello, World!\n')
+disp('Hello, World!');
+end
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testMATLAB_barcast.m b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testMATLAB_barcast.m
new file mode 100644
index 0000000..5cad76d
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testMATLAB_barcast.m
@@ -0,0 +1,383 @@
+%% CONTROL CODE FOR FULLY BAYESIAN SPATIO-TEMPORAL TEMPERATURE RECONSTRUCTION
+%EVERYTHING IS MODULAR TO ALLOW FOR EASY DEBUGGING AND ADAPTATION
+% _vNewModel_Oct08: change the formalism to reflect new model (Beta_1 now
+% normal). Allows for multiple proxies
+clear all; close all;
+%SET MATLAB'S CURRENT DIRECTORY TO HERE. 
+% set the priors and the inital values for the MCMC sampler
+Prior_pars_vNewModel
+Initial_par_vals_vNewModel
+%% Set the seed of the random number generators
+randn('state', sum((1000+600)*clock))
+rand('state', sum((1000+800)*clock))
+
+%% load the data
+cd TestData
+load BARCAST_INPUT_vNewMeth1
+%break it apart
+Locs=BARCAST_INPUT.Master_Locs;
+N_Locs=length(Locs(:,1)); %Number of locations:
+timeline=[BARCAST_INPUT.Data_timeline(1)-1, BARCAST_INPUT.Data_timeline];
+N_Times=length(timeline)-1; %Number of DATA times
+loc_areas=BARCAST_INPUT.Areas;
+Inds_GridLocs_Central=BARCAST_INPUT.Inds_Central;
+
+%get the number of proxy types:
+N_PT=length(fieldnames(BARCAST_INPUT))-5;
+
+%stack the three data matrices, one on top of the other
+%the first N_Locs ROWS are the Inst, the next N_Locs ROWS the first proxy
+%type, the next the third. . . . .. Each column a year. The first
+%corresponds to the SECOND entry in timeline. 
+Data_ALL=BARCAST_INPUT.Inst_Data;
+for kk=1:1:N_PT
+    tp=eval(['BARCAST_INPUT.Prox_Data', num2str(kk)]);
+    Data_ALL=[Data_ALL; tp];
+end
+
+% % % % All_locs_wInd=BARCAST_INPUT.All_locs_wInd;
+% % % % lon_lat_area=BARCAST_INPUT.lon_lat_area;
+% % % % DATA_Mat=BARCAST_INPUT.DATA_Mat;
+% % % % DATA_Mat_locs=BARCAST_INPUT.DATA_Mat_locs;
+% % % % Inds_GridLocs_Central=BARCAST_INPUT.Inds_GridLocs_Central;
+% % % % timeline=BARCAST_INPUT.timeline;
+% % % % clear BARCAST_INPUT
+
+%Priors and MH jumping parameters, from Prior_pars_vNewModel
+load PRIORS_vNewMeth1
+load MHpars_vNewMeth1
+%Initial values from Initial_par_vals_vNewModel
+load INITIAL_VALS_vNewMeth1
+
+%The Order of THE SCALAR parameters WILL ALWAYS thus:
+%1 = alpha, the AR(1) coefficient
+%2 = mu, the constant par in the linear mean of the AR(1) process
+%3 = sigma2, the partial sill in the spatial covariance matrix
+%4 = phi, the range parameter in the spatial covariance matrix
+%5 = tau2_I, the Inst measurement error
+%6 = tau2_P, the measurement error, first PROX type
+%7 = Beta_1, the scaling par in the  first P observation equation
+%8 = Beta_0, the additive par in the first P observation equation
+%and, if there is second proxy type
+%9  = tau2_P_2, the measurement error, second PROX type
+%10 = Beta_1, the scaling par in the  second P observation equation
+%11 = Beta_0, the additive par in the second P observation equation
+%and, if there is third proxy type . . . . 
+
+%A NOTE ON GAMMA NOTATION. WE USE THE NOTATION OF Gelman et al, "Bayesian
+%Data Analysis", WHERE GAMMA PARAMETERS ALPHA, BETA)==(SHAPE, INVERSE SCALE). 
+%THE RANDRAW.M CODE USES (A,B)==(SHAPE, SCALE), AND THE CALL IS RANDRAW('GAMMA', [M,B,A], SAMPLESIZE), 
+%WHERE M IS THE LOCATION (NOT NEEDED). SO IN THE NOTATION OF GELMAN ET AT, THE CALL IS
+%RANDRAW('GAMMA', [0,1/BETA,ALPHA], SAMPLESIZE). 
+%For example,
+%RANDRAW('GAMMA', [0,1/PRIORS.sigma2(2),PRIORS.sigma2(1)], 1), AND ETC. 
+
+%switch back tot he main directory
+cd ..
+%% SET a few parameters
+%Number of iterations of the complete sampler
+Sampler_Its=2000;
+
+%Number of times to update only the temperature array before beginning to
+%update the other parameters
+pre_Sampler_Its=500; 
+
+
+%% Areal weights vector for averaging the temperatures at each year
+%note that some of the elments of the temeprature are given 0 weight -
+%outside the prediction bounds. This is based on an input of the area of
+%each gridbox
+SpaceWeight=loc_areas/sum(loc_areas);
+%and for the central region/region of interest
+Areas_Central=zeros(1,N_Locs);
+Areas_Central(Inds_GridLocs_Central)=loc_areas(Inds_GridLocs_Central);
+SpaceWeight_Central=Areas_Central/sum(Areas_Central);
+
+%(In some applications, the goal might be to estimate the block average
+%over a subset of the locations in the reconstruction. For example, the
+%goal might be to reconstruct temperatures in Maine, but proxy records from
+%NH are incldued in the analysis, as they help to constrain temperatures in
+%Maine. SO some of the weights are, in this case, set to zero). 
+
+
+%% CALCULATE FIXED QUANTITIES (DO NOT DEPEND ON UNKOWN PARAMETERS)
+
+%The matrix of distances between every possible pair of points, (I,P,R)
+All_DistMat=EarthDistances(Locs);
+
+%The H(t) selection matrix. 
+%Basically, H(t) tells us which Inst and prox
+%locations have measurements for a given year. So: define H(t) for each
+%year as an indicator vector, and thus HH a matrix such that each column is
+%the indicator vector for that year. In other words, this is the complete
+%indicator matrix for the presence of data::
+%1=YES Measurement;
+%0=NO  Measurement
+%Simply a ZERO wherever there is a NaN in Data_ALL, and a ONE whereever
+%this is a value
+HH_SelectMat=ones(size(Data_ALL))-isnan(Data_ALL);
+
+%The total number of Inst/Prox Observations are needed for several
+%conditional posteriors, and can be calculated from the HH_SelectMat:
+M_InstProx=NaN(1+N_PT,1);
+%vectot: first the total number of inst obsm then the total number of each
+%prox type, in order.
+%Inst:
+M_InstProx(1)=sum(sum(HH_SelectMat(1:1:N_Locs, :)));
+%Prox:
+for kk=1:1:N_PT
+    M_InstProx(kk+1)=sum(sum(HH_SelectMat(kk*N_Locs+1:1:(kk+1)*N_Locs, :)));
+end
+
+%% Set the initial values of the Field matrix and Current Parameter Vector
+% These will be updated and then saved at each iteration of the sampler.
+% They are initially filled with the values from INITIAL_VALS.
+% Paramter/field values at each step of the gibbs sampler are taken from
+% these objects, and new draws override the current entries. This ensures
+% that each step of the Gibbs sampler is ALWAYS using the most recent set of ALL 
+% parameters, without having to deal with +/-1 indices.
+
+%Array of the estimated true temperature values, set to the initial values:  
+Temperature_MCMC_Sampler=INITIAL_VALS.Temperature;
+%Order: All I, P with locs common to I, Rest of the P, R.
+%In other words, ordered the same as InstProx_locs, then with Rand_locs
+%added on
+%note that
+%[Inst_locs; Prox_locs] = InstProx_locs([Inst_inds,Prox_inds],:)
+%SO: Temperature_MCMC_Sampler([Inst_inds,Prox_inds], KK) extracts the
+%elements that can be compared to the corresponding time of DATA_Mat
+
+% Current values of the scalar parameters
+INITIAL_SCALAR_VALS=rmfield(INITIAL_VALS, 'Temperature');
+CURRENT_PARS=cell2mat(struct2cell(INITIAL_SCALAR_VALS));
+
+% OR LOAD TRUE VALUES - FOR TESTING
+% load TestData\Pars_TRUE
+% CURRENT_PARS=Pars_TRUE';
+% 
+% load TestData\TrueTemps_v1
+% Temperature_MCMC_Sampler=Temperature_Matrix;
+
+%% DEFINE EMPTY MATRICES that will be filled with the sampler
+%DEFINE the empty parameter matrix:
+N_Pars=length(CURRENT_PARS);
+Paramters_MCMC_Samples=NaN(N_Pars, Sampler_Its);
+%The empty matrix of the samples of the blockaverage timeseries:
+BlockAve_MCMC_Samples=NaN(N_Times+1, pre_Sampler_Its+Sampler_Its);
+%and the central/target portion
+BlockAve_Central_MCMC_Samples=NaN(N_Times+1, pre_Sampler_Its+Sampler_Its);
+%NOTE the initial values of the parameters, field, and block averages will
+%NOT be saved. So the first item in all matrices/arrays are the results
+%after the first iteration of the sampler
+
+%IN this case, as the amount of data is small, we are able to deal
+%with the whole array of space time draws. In applications with larger
+%data, this is not possible (memory overflow). 
+Temperature_ARRAY=NaN(N_Locs, N_Times+1, pre_Sampler_Its+Sampler_Its);
+
+
+%% CALCULATE PARAMETER DEPENDENT QUANTITIES
+%that are used several times in the sampler
+%
+%The idea: calculate the quantities with the initial parameter values, then
+%update as soon as possible, leaving the variablle name the same
+%
+%calculate the initial spatial correlation matrix, and its inverse
+%these are needed several times.
+%AS SOON as phi is updated, this is updated, ensuring that the
+%correlation matrix and its inverse are always up to date, regardless of
+%the order of the sampling below.
+CURRENT_spatial_corr_mat=exp(-CURRENT_PARS(4)*All_DistMat);
+CURRENT_inv_spatial_corr_mat=inv(CURRENT_spatial_corr_mat);
+
+%% To speed up the code 
+%1. Find the UNIQUE missing data patterns, number them.
+%2. Index each year by the missing data pattern.
+%3. For each missing data pattern, calculate the inverse and square root of
+%the conditional posterior covariance of a T_k, and stack them
+%4. Rewrite the T_k_Updater to simply call these matrices. 
+%This reduces the number of matrix inversions for each FULL iteration of
+%the sampler to the number of UNIQUE data patterns, and reduces the number
+%for the pre iterations to 2. 
+
+U_Patterns=unique(HH_SelectMat', 'rows');
+%create an index vector that gices, for each year, the number of the
+%corresponding pattern in U_Patterns
+%Basically - HH_SelectMat can be represented by U_Patterns and this index vector:
+Pattern_by_Year=NaN(N_Times,1);
+for kk=1:1:length(U_Patterns(:,1));
+    dummy=ismember(HH_SelectMat', U_Patterns(kk,:), 'rows');
+    Pattern_by_Year(find(dummy==1))=kk;
+end
+
+%Input the CURRENT_PARS vector and etc into Covariance_Patterns, which returns two 3d
+%arrays: the covariance amtrix for each missing data patter (for
+%the mean calculation) and the squre root of the covariance matrix (to make
+%the draw). 
+[CURRENT_COV_ARRAY, CURRENT_SQRT_COV_ARRAY]=Covariance_Patterns(U_Patterns, CURRENT_PARS, CURRENT_inv_spatial_corr_mat, N_Locs, N_PT);
+
+
+
+%% In an attempt to speed convergence of the variance paramters
+% we will uptate only the true temperature array for a number of
+% iterations, and then add the updating of the other parameters. This is to
+% prevent the model from requiring large variances to fit the observations
+% to the data.
+%timertimer=NaN;
+for samples=1:1:pre_Sampler_Its
+    tic;
+    %% SAMPLE T(0): True temperature the year before the first measurement.
+    Temperature_MCMC_Sampler(:,1)=T_0_Updater_vNM(PRIORS.T_0, Temperature_MCMC_Sampler(:,2), CURRENT_PARS, CURRENT_inv_spatial_corr_mat);
+    
+    %% SAMPLE T(1), . . ., T(last-1). Recall that the T matrix starts at time=0, while the W matrix starts at time=1
+    for Tm=2:1:N_Times
+        Temperature_MCMC_Sampler(:,Tm)=T_k_Updater_vFAST(Temperature_MCMC_Sampler(:, Tm-1), Temperature_MCMC_Sampler(:,Tm+1), Data_ALL(:,Tm-1), CURRENT_PARS, U_Patterns(Pattern_by_Year(Tm-1),:),CURRENT_COV_ARRAY(:,:,Pattern_by_Year(Tm-1)),CURRENT_SQRT_COV_ARRAY(:,:,Pattern_by_Year(Tm-1)),CURRENT_inv_spatial_corr_mat, N_Locs, N_PT);
+    end
+    %This is a SLOW step, because it is actually N_Times-1 steps. . . 
+
+    %% SAMPLE T(last)
+    Temperature_MCMC_Sampler(:,N_Times+1)=T_last_Updater_vNM(Temperature_MCMC_Sampler(:, N_Times), Data_ALL(:,N_Times), HH_SelectMat(:, N_Times), CURRENT_PARS, CURRENT_inv_spatial_corr_mat, N_Locs, N_PT);
+           
+    %% Fill in the next iteration of the BlockAve_MCMC_Samples matrix: 
+	BlockAve_MCMC_Samples(:,samples)=(SpaceWeight*Temperature_MCMC_Sampler)';
+    BlockAve_Central_MCMC_Samples(:, samples)=(SpaceWeight_Central*Temperature_MCMC_Sampler)';
+    %Fill in the next slice of the space-time field draw array
+    Temperature_ARRAY(:,:,samples)=Temperature_MCMC_Sampler;
+    
+    %save the current draw of the space-time temp matrix
+    %save(['TestData\FieldDraws\Temp_MCMC_vNM_Test_PreStep' num2str(samples)],'Temperature_MCMC_Sampler');
+
+    timertimer=toc;
+	disp(['Working on pre-MCMC iteration ', num2str(samples), ' of ', num2str(pre_Sampler_Its), '. Last iteration took ', num2str(timertimer), ' seconds.'])
+
+end
+
+timertimer=NaN;
+%% RUN THE SAMPLER
+for samples=1:1:Sampler_Its
+    
+    tic
+    %% SAMPLE T(0): True temperature the year before the first measurement.
+    Temperature_MCMC_Sampler(:,1)=T_0_Updater_vNM(PRIORS.T_0, Temperature_MCMC_Sampler(:,2), CURRENT_PARS, CURRENT_inv_spatial_corr_mat);
+    
+    %% SAMPLE T(1), . . ., T(last-1). Recall that the T matrix starts at time=0, while the W matrix starts at time=1
+    for Tm=2:1:N_Times
+        Temperature_MCMC_Sampler(:,Tm)=T_k_Updater_vFAST(Temperature_MCMC_Sampler(:, Tm-1), Temperature_MCMC_Sampler(:,Tm+1), Data_ALL(:,Tm-1), CURRENT_PARS, U_Patterns(Pattern_by_Year(Tm-1),:),CURRENT_COV_ARRAY(:,:,Pattern_by_Year(Tm-1)),CURRENT_SQRT_COV_ARRAY(:,:,Pattern_by_Year(Tm-1)),CURRENT_inv_spatial_corr_mat, N_Locs, N_PT);
+    end
+    %This is a SLOW step, because it is actually N_Times-1 steps. . . 
+
+    %% SAMPLE T(last)
+    Temperature_MCMC_Sampler(:,N_Times+1)=T_last_Updater_vNM(Temperature_MCMC_Sampler(:, N_Times), Data_ALL(:,N_Times), HH_SelectMat(:, N_Times), CURRENT_PARS, CURRENT_inv_spatial_corr_mat, N_Locs, N_PT);
+    
+    %% SAMPLE AR(1) coefficient    
+    New_Alpha=Alpha_Updater_vNM(PRIORS.alpha, Temperature_MCMC_Sampler, CURRENT_PARS, CURRENT_inv_spatial_corr_mat);
+    CURRENT_PARS(1)=New_Alpha;
+    clear New_Alpha
+
+    %% SAMPLE AR(1) mean constant parameter, mu:
+    New_mu=Mu_Updater_vNM(PRIORS.mu, Temperature_MCMC_Sampler, CURRENT_PARS, CURRENT_inv_spatial_corr_mat);
+    CURRENT_PARS(2)=New_mu;
+    clear New_AR_mean_mu
+    
+    %% SAMPLE Partial Sill of the spatial covaraince martrix
+    New_sigma2=Sigma2_Updater_vNM(PRIORS.sigma2, Temperature_MCMC_Sampler, CURRENT_PARS, CURRENT_inv_spatial_corr_mat);
+    %ARTIFICIALLY put a cieling at, say, 5.
+    %CHECK a posterior that, one the algorithm has converged, ALL draws are
+    %lower than this. 
+    CURRENT_PARS(3)=min(5, New_sigma2);
+    clear New_sigma2
+	
+    %% SAMPLE Range Parameter of the spatial covaraince martrix (METROPOLIS)
+    % This also updates the spatial corelation matrix and its inverse
+    [New_phi, New_scm, New_iscm]=Phi_Updater_vNM(PRIORS.phi, Temperature_MCMC_Sampler, CURRENT_PARS, CURRENT_spatial_corr_mat, CURRENT_inv_spatial_corr_mat, All_DistMat, MHpars.log_phi);
+    CURRENT_PARS(4)=New_phi;
+    CURRENT_spatial_corr_mat=New_scm;
+    CURRENT_inv_spatial_corr_mat=New_iscm;
+    clear New_phi New_iscm New_scm
+    
+    %% SAMPLE Instrumental measurement error
+    New_tau2_I=tau2_I_Updater_vNM(PRIORS.tau2_I, Temperature_MCMC_Sampler, Data_ALL, N_Locs, M_InstProx(1));
+    %ARTIFICIALLY put a cieling at, say, 5.
+    %CHECK a posterior that, one the algorithm has converged, ALL draws are
+    %lower than this. 
+    CURRENT_PARS(5)=min(5, New_tau2_I);
+    clear New_tau2_I
+    
+    
+    
+    %% NEED TO LOOP THE SAMPLING OF THESE THREE PARAMETERS
+    for Pnum=1:1:N_PT
+        %curtail the CURRENT_PARS vector to only include the pars for one
+        %proxy type at a time:
+        CURRENT_PARS_Brief=[CURRENT_PARS(1:1:5); CURRENT_PARS([6:1:8]+(Pnum-1)*3)];
+        %Similarily exract each type of proxy data:
+        Prox_Data_Brief=eval(['BARCAST_INPUT.Prox_Data', num2str(Pnum)]);
+
+        %% SAMPLE Proxy measurement error
+        New_tau2_P=tau2_P_Updater_vNM(eval(['PRIORS.tau2_P_', num2str(Pnum)]), Temperature_MCMC_Sampler, Prox_Data_Brief, CURRENT_PARS_Brief, M_InstProx(Pnum+1));
+        %ARTIFICIALLY put a cieling at, say, 50.
+        %CHECK a posterior that, one the algorithm has converged, ALL draws are
+        %lower than this.
+        CURRENT_PARS_Brief(6)=min(10, New_tau2_P);
+        clear New_tau2_P
+
+        %% SAMPLE Scaling constant in the proxy observation equation
+        New_beta_1=Beta_1_Updater_vNM(eval(['PRIORS.Beta_1_', num2str(Pnum)]), Temperature_MCMC_Sampler, Prox_Data_Brief, CURRENT_PARS_Brief);
+        CURRENT_PARS_Brief(7)=New_beta_1;
+        clear New_beta_1
+
+        %% SAMPLE Additive constant in the proxy observation equation
+        New_Beta_0=Beta_0_Updater_vNM(eval(['PRIORS.Beta_0_', num2str(Pnum)]), Temperature_MCMC_Sampler, Prox_Data_Brief, CURRENT_PARS_Brief, M_InstProx(Pnum+1));
+        CURRENT_PARS_Brief(8)=New_Beta_0;
+        clear New_Beta_0
+
+        CURRENT_PARS([6:1:8]+(Pnum-1)*3)=CURRENT_PARS_Brief(6:1:8);
+
+    end
+    
+    %% UPDATE the covariance arrays used in the T_k_Updater step
+    [CURRENT_COV_ARRAY, CURRENT_SQRT_COV_ARRAY]=Covariance_Patterns(U_Patterns, CURRENT_PARS, CURRENT_inv_spatial_corr_mat, N_Locs, N_PT);
+
+    
+    %% UPDATE THE VARIOUS MATRICES, SAVE CURRENT TEMPERTAURE MATRIX
+    %update the Paramters_MCMC_Samples matrix:
+    Paramters_MCMC_Samples(:, samples)=CURRENT_PARS;
+    %CURRENT_PARS is not cleared: it is, after all, the current parameter
+    %vector. 
+    
+    %Fill in the next iteration of the BlockAve_MCMC_Samples matrix:
+    BlockAve_MCMC_Samples(:, pre_Sampler_Its+samples)=(SpaceWeight*Temperature_MCMC_Sampler)';
+    BlockAve_Central_MCMC_Samples(:, pre_Sampler_Its+samples)=(SpaceWeight_Central*Temperature_MCMC_Sampler)';
+
+    %add the new draw of the space-time temp matrix
+    Temperature_ARRAY(:,:,pre_Sampler_Its+samples)=Temperature_MCMC_Sampler;
+    %save the current draw of the space-time temp matrix
+    %save(['TestData\FieldDraws\Temp_MCMC_vNM_Test_Step' num2str(samples)],'Temperature_MCMC_Sampler');    
+    
+    %SAVE the matrix of parameter vector draws and the matrix of block
+    %average vectors. (This way, even if the code is stopped prematurely,
+    %we get something)
+    %cd TestData
+    %cd FieldDraws
+    %save TestData\FieldDraws\Paramters_MCMC_Samples_vNM Paramters_MCMC_Samples 
+    %save TestData\FieldDraws\Temperature_ARRAY_vNM Temperature_ARRAY
+    %save TestData\FieldDraws\BlockAve_MCMC_Samples_vNM BlockAve_MCMC_Samples
+    %save TestData\FieldDraws\BlockAve_Central_MCMC_Samples_vNM BlockAve_Central_MCMC_Samples
+    %and back
+    %cd ..
+    %cd ..
+    timertimer=toc;
+    disp(['Finished MCMC iteration ', num2str(samples), ' of ', num2str(Sampler_Its), '. Last iteration took ', num2str(timertimer), ' seconds.'])
+end
+
+%% SAVE the matrix of parameter vector draws and the matrix of block
+%average vectors. 
+cd TestData
+cd FieldDraws
+    save Paramters_MCMC_Samples_vNM Paramters_MCMC_Samples 
+    save Temperature_ARRAY_vNM Temperature_ARRAY
+    save BlockAve_MCMC_Samples_vNM BlockAve_MCMC_Samples
+    save BlockAve_Central_MCMC_Samples_vNM BlockAve_Central_MCMC_Samples
+%and back
+cd ..
+cd ..
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testMATLAB_wtsgaus.m b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testMATLAB_wtsgaus.m
new file mode 100644
index 0000000..f8943f2
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testMATLAB_wtsgaus.m
@@ -0,0 +1,52 @@
+function b=wtsgaus(p,N)
+% wtsgaus: weights for gaussian filter with specified frequency response 
+% b=wtsgaus(p,N);
+% Last revised 2003-3-14
+%
+% Weights for gaussian filter with specified frequency response 
+% Specify te wavelength for the 0.50 respons, and the length of series, get
+% the coefficients, or weights
+%
+%*** INPUT
+%
+% p (1 x 1)i  period (years) at which filter is to have amp frequency response of 0.5
+% N (1 x 1)i  length of the time series (number of observations)
+%
+%*** OUTPUT
+%
+% b (1 x n)r  computed weights
+% 
+%
+%*** REFERENCES
+% 
+% WMO 1966, p. 47
+%
+%*** UW FUNCTIONS CALLED -- NONE
+%*** TOOLBOXES NEEDED -- stats
+%
+%*** NOTES
+%
+% Amplitude of frequency response drops to 0.50 at a wavelength of 
+% about 6 standard deviations of the appropriate guassian curve
+%
+% N is used as an input to restict the possible filter size (number of weights) to no larger than the sample length
+
+if p>N; 
+    error(['Desired 50% period ' num2str(p) ' is greater than  the sample length ' int2str(N)]);
+end;
+
+
+% Check that period of 50% response at least 5 yr
+if p<5;
+   error('Period of 50% response must be at least 5 yr');
+end;
+
+sigma=p/6;  % Gaussian curve should have this standard deviation
+
+x=-N:N;
+b=normpdf(x/sigma,0,1);
+bmax=max(b);
+bkeep = b>=0.05*bmax; % keep weights at least 5% as big as central weight
+b=b(bkeep);
+b=b/sum(b); % force weights to sum to one
+
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testSAS.sas b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testSAS.sas
index e69de29..836dec6 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testSAS.sas
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testSAS.sas
@@ -0,0 +1,33 @@
+data testing;
+begin=0;
+end=10;
+msg="This is row %x of %y";
+do i = begin to end by 1;
+drop msg begin end i;
+recnum=i;
+label=tranwrd(tranwrd(msg,"%x",i),"%y",end);
+output;
+end;
+run;
+
+libname out          '/home/tika/testing/sas';
+libname outxpt XPORT '/home/tika/testing/sas/testing.xpt';
+libname outv6 v6     '/home/tika/testing/sas';
+libname outxml xmlv2 '/home/tika/testing/sas';
+
+data out.testing;
+set testing;
+run;
+data outv6.testv6;
+set testing;
+run;
+data outxml.testxml;
+set testing;
+run;
+proc copy in=out out=outxpt;
+select testing;
+run;
+
+
+proc print data=testing;
+run;
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testSAS.sas7bdat b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testSAS.sas7bdat
new file mode 100644
index 0000000..b36dc7e
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/resources/test-documents/testSAS.sas7bdat differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/pom.xml b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/pom.xml
similarity index 79%
copy from tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/pom.xml
copy to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/pom.xml
index 4a8c390..d1e81ba 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/pom.xml
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/pom.xml
@@ -21,26 +21,26 @@
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
     <parent>
-        <artifactId>tika-parser-modules</artifactId>
+        <artifactId>tika-parsers-classic-modules</artifactId>
         <groupId>org.apache.tika</groupId>
         <version>2.0.0-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
     </parent>
     <modelVersion>4.0.0</modelVersion>
 
-    <artifactId>tika-parser-apple-module</artifactId>
+    <artifactId>tika-parser-crypto-module</artifactId>
 
     <dependencies>
         <dependency>
-            <groupId>${project.groupId}</groupId>
-            <artifactId>tika-parser-zip-commons</artifactId>
-            <version>${project.version}</version>
+            <groupId>org.bouncycastle</groupId>
+            <artifactId>bcmail-jdk15on</artifactId>
+            <version>${bouncycastle.version}</version>
         </dependency>
         <dependency>
-            <groupId>com.googlecode.plist</groupId>
-            <artifactId>dd-plist</artifactId>
-            <version>${ddplist.version}</version>
+            <groupId>org.bouncycastle</groupId>
+            <artifactId>bcprov-jdk15on</artifactId>
+            <version>${bouncycastle.version}</version>
         </dependency>
-
     </dependencies>
     <build>
         <plugins>
@@ -50,7 +50,7 @@
                 <configuration>
                     <archive>
                         <manifestEntries>
-                            <Automatic-Module-Name>org.apache.tika.parser.apple</Automatic-Module-Name>
+                            <Automatic-Module-Name>org.apache.tika.parser.crypto</Automatic-Module-Name>
                         </manifestEntries>
                     </archive>
                 </configuration>
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/Pkcs7Parser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/Pkcs7Parser.java
new file mode 100644
index 0000000..372c015
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/Pkcs7Parser.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.crypto;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.EmptyParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.bouncycastle.cms.CMSException;
+import org.bouncycastle.cms.CMSSignedDataParser;
+import org.bouncycastle.cms.CMSTypedStream;
+import org.bouncycastle.operator.DigestCalculatorProvider;
+import org.bouncycastle.operator.OperatorCreationException;
+import org.bouncycastle.operator.jcajce.JcaDigestCalculatorProviderBuilder;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Basic parser for PKCS7 data.
+ */
+public class Pkcs7Parser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = -7310531559075115044L;
+
+    private static final MediaType PKCS7_MIME =
+            MediaType.application("pkcs7-mime");
+
+    private static final MediaType PKCS7_SIGNATURE =
+            MediaType.application("pkcs7-signature");
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            MediaType.set(PKCS7_MIME, PKCS7_SIGNATURE);
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        try {
+            DigestCalculatorProvider digestCalculatorProvider =
+                    new JcaDigestCalculatorProviderBuilder().setProvider("BC").build();
+            CMSSignedDataParser parser =
+                    new CMSSignedDataParser(digestCalculatorProvider, new CloseShieldInputStream(stream));
+            try {
+                CMSTypedStream content = parser.getSignedContent();
+                if (content == null) {
+                    throw new TikaException("cannot parse detached pkcs7 signature (no signed data to parse)");
+                }
+                try (InputStream input = content.getContentStream()) {
+                    Parser delegate =
+                            context.get(Parser.class, EmptyParser.INSTANCE);
+                    delegate.parse(input, handler, metadata, context);
+                }
+            } finally {
+                parser.close();
+            }
+        } catch (OperatorCreationException e) {
+            throw new TikaException("Unable to create DigestCalculatorProvider", e);
+        } catch (CMSException e) {
+            throw new TikaException("Unable to parse pkcs7 signed data", e);
+        }
+    }
+
+}
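
A usage sketch for the Pkcs7Parser above. The parser unwraps the signed content and hands it to whichever Parser is registered in the ParseContext, falling back to EmptyParser, so the sketch wires in an AutoDetectParser as the delegate; the input file name is hypothetical.

    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Paths;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.AutoDetectParser;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.Parser;
    import org.apache.tika.parser.crypto.Pkcs7Parser;
    import org.apache.tika.sax.BodyContentHandler;

    public class Pkcs7ParserExample {
        public static void main(String[] args) throws Exception {
            Pkcs7Parser parser = new Pkcs7Parser();
            ParseContext context = new ParseContext();
            // Delegate for the unwrapped signed content; without it, EmptyParser is used
            // and nothing is extracted from the payload.
            context.set(Parser.class, new AutoDetectParser());
            BodyContentHandler handler = new BodyContentHandler();
            Metadata metadata = new Metadata();
            try (InputStream stream = Files.newInputStream(Paths.get("signed-document.p7m"))) {
                parser.parse(stream, handler, metadata, context);
            }
            System.out.println(handler.toString());
        }
    }
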
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/Test1.txt.tsd b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/Test1.txt.tsd
new file mode 100644
index 0000000..3a2febc
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/Test1.txt.tsd differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/Test2.txt.tsd b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/Test2.txt.tsd
new file mode 100644
index 0000000..826a0c0
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/Test2.txt.tsd differ
diff --git a/tika-parsers/src/test/resources/test-documents/testCERT.der b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/testCERT.der
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testCERT.der
rename to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/testCERT.der
diff --git a/tika-parsers/src/test/resources/test-documents/testCERT.pem b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/testCERT.pem
similarity index 100%
copy from tika-parsers/src/test/resources/test-documents/testCERT.pem
copy to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/testCERT.pem
diff --git a/tika-parsers/src/test/resources/test-documents/testDSAKEY.der b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/testDSAKEY.der
similarity index 100%
copy from tika-parsers/src/test/resources/test-documents/testDSAKEY.der
copy to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/testDSAKEY.der
diff --git a/tika-parsers/src/test/resources/test-documents/testDSAKEY.pem b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/testDSAKEY.pem
similarity index 100%
copy from tika-parsers/src/test/resources/test-documents/testDSAKEY.pem
copy to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/testDSAKEY.pem
diff --git a/tika-parsers/src/test/resources/test-documents/testDSAPARAMS.pem b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/testDSAPARAMS.pem
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testDSAPARAMS.pem
rename to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/testDSAPARAMS.pem
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/testDetached.p7s b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/testDetached.p7s
new file mode 100644
index 0000000..f4f8be9
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/testDetached.p7s differ
diff --git a/tika-parsers/src/test/resources/test-documents/testECKEY.der b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/testECKEY.der
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testECKEY.der
rename to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/testECKEY.der
diff --git a/tika-parsers/src/test/resources/test-documents/testECKEY.pem b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/testECKEY.pem
similarity index 100%
copy from tika-parsers/src/test/resources/test-documents/testECKEY.pem
copy to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/testECKEY.pem
diff --git a/tika-parsers/src/test/resources/test-documents/testECPARAMS.pem b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/testECPARAMS.pem
similarity index 100%
copy from tika-parsers/src/test/resources/test-documents/testECPARAMS.pem
copy to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/testECPARAMS.pem
diff --git a/tika-parsers/src/test/resources/test-documents/testRSAKEY.der b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/testRSAKEY.der
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testRSAKEY.der
rename to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/testRSAKEY.der
diff --git a/tika-parsers/src/test/resources/test-documents/testRSAKEY.pem b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/testRSAKEY.pem
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testRSAKEY.pem
rename to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/testRSAKEY.pem
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/testTSD_broken_pdf.tsd b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/testTSD_broken_pdf.tsd
new file mode 100644
index 0000000..9fe32a1
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/test/resources/test-documents/testTSD_broken_pdf.tsd differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-mail-commons/pom.xml b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-digest-commons/pom.xml
similarity index 66%
copy from tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-mail-commons/pom.xml
copy to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-digest-commons/pom.xml
index 62e4e5d..2194744 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-mail-commons/pom.xml
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-digest-commons/pom.xml
@@ -21,26 +21,32 @@
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
     <parent>
-        <artifactId>tika-parser-modules</artifactId>
+        <artifactId>tika-parsers-classic-modules</artifactId>
         <groupId>org.apache.tika</groupId>
         <version>2.0.0-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
     </parent>
-
     <modelVersion>4.0.0</modelVersion>
 
-    <artifactId>tika-parser-mail-commons</artifactId>
-
+    <artifactId>tika-parser-digest-commons</artifactId>
 
     <dependencies>
         <dependency>
-            <groupId>org.apache.james</groupId>
-            <artifactId>apache-mime4j-core</artifactId>
-            <version>${mime4j.version}</version>
+            <groupId>commons-codec</groupId>
+            <artifactId>commons-codec</artifactId>
+            <version>${commons.codec.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.bouncycastle</groupId>
+            <artifactId>bcmail-jdk15on</artifactId>
+            <version>${bouncycastle.version}</version>
         </dependency>
         <dependency>
-            <groupId>org.apache.james</groupId>
-            <artifactId>apache-mime4j-dom</artifactId>
-            <version>${mime4j.version}</version>
+            <groupId>org.bouncycastle</groupId>
+            <artifactId>bcprov-jdk15on</artifactId>
+            <version>${bouncycastle.version}</version>
         </dependency>
+
     </dependencies>
+
 </project>
\ No newline at end of file
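
The pom above pulls in the commons-codec and BouncyCastle dependencies used by the digest utilities added next. As a rough sketch of how the CommonsDigester introduced below might be used on its own, assuming the "X-TIKA:digest:" metadata key prefix implied by its getMetadataKey() and the algorithm-string syntax described in its javadoc:

    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Paths;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.digestutils.CommonsDigester;

    public class CommonsDigesterExample {
        public static void main(String[] args) throws Exception {
            // 1 MB mark limit; beyond that the stream is reset and spooled to disk before digesting.
            // "md5,sha256:32" requests an MD5 hex digest plus a base-32-encoded SHA-256 digest.
            CommonsDigester digester = new CommonsDigester(1_000_000, "md5,sha256:32");
            Metadata metadata = new Metadata();
            // "some-file.bin" is a hypothetical input; any readable file works.
            try (InputStream stream = Files.newInputStream(Paths.get("some-file.bin"))) {
                digester.digest(stream, metadata, new ParseContext());
            }
            // Digest values land under X-TIKA:digest:* metadata keys.
            for (String name : metadata.names()) {
                if (name.startsWith("X-TIKA:digest:")) {
                    System.out.println(name + " = " + metadata.get(name));
                }
            }
        }
    }
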
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java
new file mode 100644
index 0000000..c32e0ae
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java
@@ -0,0 +1,186 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.digestutils;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
+import org.apache.commons.codec.binary.Base32;
+import org.apache.commons.codec.binary.Hex;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.DigestingParser;
+import org.apache.tika.parser.digest.CompositeDigester;
+import org.apache.tika.parser.digest.InputStreamDigester;
+
+/**
+ * Implementation of {@link org.apache.tika.parser.DigestingParser.Digester}
+ * that runs an {@link InputStreamDigester} for each requested algorithm and uses
+ * Apache Commons Codec to encode the digest values (hexadecimal or base 32).
+ * <p>
+ * This digester tries to use the regular mark/reset protocol on the InputStream.
+ * However, it wraps the stream in an internal BoundedInputStream; if the stream
+ * cannot be fully read within the mark limit, the digester resets the stream,
+ * spools it to disk (via TikaInputStream) and then digests the file.
+ */
+public class CommonsDigester extends CompositeDigester {
+
+    public enum DigestAlgorithm {
+        //those currently available in commons.digest
+        MD2("MD2"),
+        MD5("MD5"),
+        SHA1("SHA-1"),
+        SHA256("SHA-256"),
+        SHA384("SHA-384"),
+        SHA512("SHA-512");
+
+        private final String javaName;
+
+        DigestAlgorithm(String javaName) {
+            this.javaName = javaName;
+        }
+
+        String getJavaName() {
+            return javaName;
+        }
+        String getMetadataKey() {
+            return TikaCoreProperties.TIKA_META_PREFIX +
+                    "digest" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + this.toString();
+        }
+    }
+
+    /**
+     * Builds a digester from a comma-separated string of the algorithms to run, e.g. "md5,sha1".
+     * If you want base 32 encoding instead of hexadecimal, add ":32" to the algorithm, e.g. "md5,sha1:32".
+     * <p/>
+     * Throws an IllegalArgumentException if an algorithm isn't supported.
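+     * <p/>
+     * A minimal usage sketch (the mark-limit value is illustrative only); the resulting
+     * digester is typically handed to a DigestingParser that wraps another parser:
+     * <pre>{@code
+     * CommonsDigester digester = new CommonsDigester(1000000, "sha256,md5:32");
+     * }</pre>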
+     * @param markLimit limit for mark/reset; after this limit is hit, the stream is reset and spooled to disk
+     * @param algorithmString comma-separated algorithms to run, each optionally suffixed with ":16" (hex) or ":32" (base 32)
+     */
+    public CommonsDigester(int markLimit, String algorithmString) {
+        super(buildDigesters(markLimit, algorithmString));
+    }
+
+    /**
+     *
+     * @param markLimit limit for mark/reset; after this limit is hit, the
+     *                  stream is reset and spooled to disk
+     * @param algorithms algorithms to run
+     * @deprecated use {@link #CommonsDigester(int, String)}
+     */
+    public CommonsDigester(int markLimit, DigestAlgorithm... algorithms) {
+        super(buildDigesters(markLimit, algorithms));
+    }
+
+    private static DigestingParser.Digester[] buildDigesters(int markLimit, DigestAlgorithm[] algorithms) {
+        DigestingParser.Digester[] digesters = new DigestingParser.Digester[algorithms.length];
+        int i = 0;
+        for (DigestAlgorithm algorithm : algorithms) {
+            digesters[i++] = new InputStreamDigester(markLimit, algorithm.getJavaName(), algorithm.name(),
+                    new HexEncoder());
+        }
+        return digesters;
+    }
+
+    /**
+     * This returns digest algorithms only.  It does not understand the encoding
+     * syntax, e.g. "MD5:32" (base 32 encoding of MD5).  To parse
+     * those, see {@link #CommonsDigester(int, String)}.
+     *
+     * @deprecated use the {@link #CommonsDigester(int, String)} constructor instead
+     * @param s comma-delimited (no space) list of algorithms to use, e.g. "md5,sha256"
+     * @return the parsed digest algorithms
+     *
+     */
+    @Deprecated
+    public static DigestAlgorithm[] parse(String s) {
+        assert (s != null);
+
+        List<DigestAlgorithm> ret = new ArrayList<>();
+        for (String algoString : s.split(",")) {
+            ret.add(getDigestAlgorithm(algoString));
+        }
+        return ret.toArray(new DigestAlgorithm[0]);
+    }
+
+    private static DigestAlgorithm getDigestAlgorithm(String algoString) {
+        String uc = algoString.toUpperCase(Locale.ROOT);
+        if (uc.equals(DigestAlgorithm.MD2.toString())) {
+            return DigestAlgorithm.MD2;
+        } else if (uc.equals(DigestAlgorithm.MD5.toString())) {
+            return DigestAlgorithm.MD5;
+        } else if (uc.equals(DigestAlgorithm.SHA1.toString())) {
+            return DigestAlgorithm.SHA1;
+        } else if (uc.equals(DigestAlgorithm.SHA256.toString())) {
+            return DigestAlgorithm.SHA256;
+        } else if (uc.equals(DigestAlgorithm.SHA384.toString())) {
+            return DigestAlgorithm.SHA384;
+        } else if (uc.equals(DigestAlgorithm.SHA512.toString())) {
+            return DigestAlgorithm.SHA512;
+        } else {
+            StringBuilder sb = new StringBuilder();
+            int i = 0;
+            for (DigestAlgorithm algo : DigestAlgorithm.values()) {
+                if (i++ > 0) {
+                    sb.append(", ");
+                }
+                sb.append(algo.toString());
+            }
+            throw new IllegalArgumentException("Couldn't match " + algoString + " with any of: " + sb.toString());
+        }
+    }
+
+    private static DigestingParser.Digester[] buildDigesters(int markLimit, String digesterDef) {
+        String[] digests = digesterDef.split(",");
+        DigestingParser.Digester[] digesters = new DigestingParser.Digester[digests.length];
+        int i = 0;
+        for (String digest : digests) {
+            String[] parts = digest.split(":");
+            DigestingParser.Encoder encoder = null;
+            if (parts.length > 1) {
+                if (parts[1].equals("16")) {
+                    encoder = new HexEncoder();
+                } else if (parts[1].equals("32")) {
+                    encoder = new Base32Encoder();
+                } else {
+                    throw new IllegalArgumentException("Value must be '16' or '32'");
+                }
+            } else {
+                encoder = new HexEncoder();
+            }
+            DigestAlgorithm digestAlgorithm = getDigestAlgorithm(parts[0]);
+            digesters[i++] = new InputStreamDigester(markLimit, digestAlgorithm.getJavaName(),
+                    digestAlgorithm.name(), encoder);
+        }
+        return digesters;
+    }
+
+
+    private static class HexEncoder implements DigestingParser.Encoder {
+        @Override
+        public String encode(byte[] bytes) {
+            return Hex.encodeHexString(bytes);
+        }
+    }
+
+    private static class Base32Encoder implements DigestingParser.Encoder {
+        @Override
+        public String encode(byte[] bytes) {
+            return new Base32().encodeToString(bytes);
+        }
+    }
+}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-xmp-commons/pom.xml b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-font-module/pom.xml
similarity index 87%
copy from tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-xmp-commons/pom.xml
copy to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-font-module/pom.xml
index 07442b6..cbda75b 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-xmp-commons/pom.xml
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-font-module/pom.xml
@@ -21,22 +21,24 @@
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
     <parent>
-        <artifactId>tika-parser-modules</artifactId>
+        <artifactId>tika-parsers-classic-modules</artifactId>
         <groupId>org.apache.tika</groupId>
         <version>2.0.0-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
     </parent>
     <modelVersion>4.0.0</modelVersion>
 
-    <artifactId>tika-parser-xmp-commons</artifactId>
+    <artifactId>tika-parser-font-module</artifactId>
 
 
     <dependencies>
         <dependency>
             <groupId>org.apache.pdfbox</groupId>
-            <artifactId>jempbox</artifactId>
-            <version>${jempbox.version}</version>
+            <artifactId>fontbox</artifactId>
+            <version>${pdfbox.version}</version>
         </dependency>
     </dependencies>
+
     <build>
         <plugins>
             <plugin>
@@ -45,7 +47,7 @@
                 <configuration>
                     <archive>
                         <manifestEntries>
-                            <Automatic-Module-Name>org.apache.tika.parser.xmp</Automatic-Module-Name>
+                            <Automatic-Module-Name>org.apache.tika.parser.font</Automatic-Module-Name>
                         </manifestEntries>
                     </archive>
                 </configuration>
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-font-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-font-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java
new file mode 100644
index 0000000..647ef65
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-font-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.font;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.fontbox.ttf.NameRecord;
+import org.apache.fontbox.ttf.NamingTable;
+import org.apache.fontbox.ttf.TTFParser;
+import org.apache.fontbox.ttf.TrueTypeFont;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for TrueType font files (TTF).
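+ * <p>
+ * A minimal usage sketch (the file name and the surrounding imports are illustrative only):
+ * <pre>{@code
+ * Metadata metadata = new Metadata();
+ * try (TikaInputStream tis = TikaInputStream.get(Paths.get("font.ttf"))) {
+ *     new TrueTypeParser().parse(tis, new DefaultHandler(), metadata, new ParseContext());
+ * }
+ * String family = metadata.get(AdobeFontMetricParser.MET_FONT_FAMILY_NAME);
+ * }</pre>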
+ */
+public class TrueTypeParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 44788554612243032L;
+
+    private static final MediaType TYPE =
+        MediaType.application("x-font-ttf");
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+        Collections.singleton(TYPE);
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        TikaInputStream tis = TikaInputStream.cast(stream);
+
+        // Ask FontBox to parse the file for us
+        TrueTypeFont font = null;
+        try {
+            TTFParser parser = new TTFParser();
+            if (tis != null && tis.hasFile()) {
+                font = parser.parse(tis.getFile());
+            } else {
+                font = parser.parse(stream);
+            }
+
+            // Report the details of the font
+            metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+            metadata.set(TikaCoreProperties.CREATED,
+                    font.getHeader().getCreated());
+            metadata.set(TikaCoreProperties.MODIFIED,
+                    font.getHeader().getModified());
+            metadata.set(AdobeFontMetricParser.MET_DOC_VERSION,
+                    Float.toString(font.getHeader().getVersion()));
+
+            // Pull out the naming info
+            NamingTable fontNaming = font.getNaming();
+            for (NameRecord nr : fontNaming.getNameRecords()) {
+                if (nr.getNameId() == NameRecord.NAME_FONT_FAMILY_NAME) {
+                    metadata.set(AdobeFontMetricParser.MET_FONT_FAMILY_NAME, nr.getString());
+                }
+                if (nr.getNameId() == NameRecord.NAME_FONT_SUB_FAMILY_NAME) {
+                    metadata.set(AdobeFontMetricParser.MET_FONT_SUB_FAMILY_NAME, nr.getString());
+                }
+                if (nr.getNameId() == NameRecord.NAME_FULL_FONT_NAME) {
+                    metadata.set(AdobeFontMetricParser.MET_FONT_NAME, nr.getString());
+                    metadata.set(TikaCoreProperties.TITLE, nr.getString());
+                }
+                if (nr.getNameId() == NameRecord.NAME_POSTSCRIPT_NAME) {
+                    metadata.set(AdobeFontMetricParser.MET_PS_NAME, nr.getString());
+                }
+                if (nr.getNameId() == NameRecord.NAME_COPYRIGHT) {
+                    metadata.set("Copyright", nr.getString());
+                }
+                if (nr.getNameId() == NameRecord.NAME_TRADEMARK) {
+                    metadata.set("Trademark", nr.getString());
+                }
+            }
+        } finally {
+            if (font != null) {
+                font.close();
+            }
+        }
+
+        // For now, we only output metadata, no textual contents
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.endDocument();
+    }
+
+}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-font-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-font-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
new file mode 100644
index 0000000..200afad
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-font-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -0,0 +1,18 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+org.apache.tika.parser.font.AdobeFontMetricParser
+org.apache.tika.parser.font.TrueTypeParser
+
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/pom.xml b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/pom.xml
index 06fc83c..fa647a3 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/pom.xml
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/pom.xml
@@ -21,9 +21,10 @@
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
     <parent>
-        <artifactId>tika-parser-modules</artifactId>
+        <artifactId>tika-parsers-classic-modules</artifactId>
         <groupId>org.apache.tika</groupId>
         <version>2.0.0-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
     </parent>
     <modelVersion>4.0.0</modelVersion>
 
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
new file mode 100644
index 0000000..19a29b9
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
@@ -0,0 +1,363 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.extractors.ArticleExtractor;
+import de.l3s.boilerpipe.extractors.DefaultExtractor;
+import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Uses the <a href="http://code.google.com/p/boilerpipe/">boilerpipe</a>
+ * library to automatically extract the main content from a web page.
+ * <p/>
+ * Use this as a {@link ContentHandler} object passed to
+ * {@link HtmlParser#parse(java.io.InputStream, ContentHandler, Metadata, org.apache.tika.parser.ParseContext)}
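+ * <p/>
+ * A minimal usage sketch ({@code stream} is assumed to be an InputStream over the HTML source):
+ * <pre>{@code
+ * StringWriter writer = new StringWriter();
+ * BoilerpipeContentHandler handler = new BoilerpipeContentHandler(writer);
+ * new HtmlParser().parse(stream, handler, new Metadata(), new ParseContext());
+ * String mainContent = writer.toString();
+ * }</pre>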
+ */
+public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler {
+    private static Set<Character> ALLOWABLE_CHARS;
+
+    static {
+        ALLOWABLE_CHARS = new HashSet<>();
+        ALLOWABLE_CHARS.add(' ');
+        ALLOWABLE_CHARS.add('\n');
+        ALLOWABLE_CHARS.add('\r');
+    }
+
+    /**
+     * The newline character that gets inserted after block elements.
+     */
+    private static final char[] NL = new char[]{'\n'};
+    private ContentHandler delegate;
+    private BoilerpipeExtractor extractor;
+    private boolean includeMarkup;
+    private boolean inHeader;
+    private boolean inFooter;
+    private int headerCharOffset;
+    private List<RecordedElement> elements;
+    private TextDocument td;
+    /**
+     * Creates a new boilerpipe-based content extractor, using the
+     * {@link DefaultExtractor} extraction rules and "delegate" as the content handler.
+     *
+     * @param delegate The {@link ContentHandler} object
+     */
+    public BoilerpipeContentHandler(ContentHandler delegate) {
+        this(delegate, DefaultExtractor.INSTANCE);
+    }
+
+    /**
+     * Creates a content handler that writes XHTML body character events to
+     * the given writer.
+     *
+     * @param writer writer
+     */
+    public BoilerpipeContentHandler(Writer writer) {
+        this(new WriteOutContentHandler(writer));
+    }
+
+    /**
+     * Creates a new boilerpipe-based content extractor, using the given
+     * extraction rules. The extracted main content will be passed to the
+     * given {@code delegate} content handler.
+     *
+     * @param delegate  The {@link ContentHandler} object
+     * @param extractor Extraction rules to use, e.g. {@link ArticleExtractor}
+     */
+    public BoilerpipeContentHandler(ContentHandler delegate, BoilerpipeExtractor extractor) {
+        this.td = null;
+        this.delegate = delegate;
+        this.extractor = extractor;
+    }
+
+    public boolean isIncludeMarkup() {
+        return includeMarkup;
+    }
+
+    public void setIncludeMarkup(boolean includeMarkup) {
+        this.includeMarkup = includeMarkup;
+    }
+
+    /**
+     * Retrieves the built TextDocument
+     *
+     * @return TextDocument
+     */
+    public TextDocument getTextDocument() {
+        return td;
+    }
+
+    @Override
+    public void startDocument() throws SAXException {
+        super.startDocument();
+
+        delegate.startDocument();
+
+        inHeader = true;
+        inFooter = false;
+        headerCharOffset = 0;
+
+        if (includeMarkup) {
+            elements = new ArrayList<>();
+        }
+    }
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri) throws SAXException {
+        super.startPrefixMapping(prefix, uri);
+        delegate.startPrefixMapping(prefix, uri);
+    }
+
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+        super.startElement(uri, localName, qName, atts);
+
+        if (inHeader) {
+            delegate.startElement(uri, localName, qName, atts);
+        } else if (inFooter) {
+            // Do nothing
+        } else if (includeMarkup) {
+            elements.add(new RecordedElement(uri, localName, qName, atts));
+        } else {
+            // This happens for the <body> element, if we're not doing markup.
+            delegate.startElement(uri, localName, qName, atts);
+        }
+    }
+
+    @Override
+    public void characters(char[] chars, int offset, int length) throws SAXException {
+        super.characters(chars, offset, length);
+
+        if (inHeader) {
+            delegate.characters(chars, offset, length);
+            headerCharOffset++;
+        } else if (inFooter) {
+            // Do nothing
+        } else if (includeMarkup) {
+            RecordedElement element = elements.get(elements.size() - 1);
+
+            char[] characters = new char[length];
+            System.arraycopy(chars, offset, characters, 0, length);
+            element.getCharacters().add(characters);
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+        super.endElement(uri, localName, qName);
+
+        if (inHeader) {
+            delegate.endElement(uri, localName, qName);
+            inHeader = !localName.equals("head");
+        } else if (inFooter) {
+            // Do nothing
+        } else if (localName.equals("body")) {
+            inFooter = true;
+        } else if (includeMarkup) {
+            // Add the end element, and the continuation from the previous element
+            elements.add(new RecordedElement(uri, localName, qName));
+            elements.add(new RecordedElement());
+        }
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+        super.endDocument();
+
+        td = toTextDocument();
+        try {
+            extractor.process(td);
+        } catch (BoilerpipeProcessingException e) {
+            throw new SAXException(e);
+        }
+
+        Attributes emptyAttrs = new AttributesImpl();
+
+        // At this point we have all the information we need to either emit N paragraphs
+        // of plain text (if not including markup), or we have to replay our recorded elements
+        // and only emit character runs that passed the boilerpipe filters.
+        if (includeMarkup) {
+            BitSet validCharacterRuns = new BitSet();
+            for (TextBlock block : td.getTextBlocks()) {
+                if (block.isContent()) {
+                    BitSet bs = block.getContainedTextElements();
+                    if (bs != null) {
+                        validCharacterRuns.or(bs);
+                    }
+                }
+            }
+
+            // Now have bits set for all valid character runs. Replay our recorded elements,
+            // but only emit character runs flagged as valid.
+            int curCharsIndex = headerCharOffset;
+
+            for (RecordedElement element : elements) {
+                switch (element.getElementType()) {
+                    case START:
+                        delegate.startElement(element.getUri(), element.getLocalName(), element.getQName(), element.getAttrs());
+                        // Fall through
+
+                    case CONTINUE:
+                        // Now emit characters that are valid. Note that boilerpipe pre-increments the character index, so
+                        // we have to follow suit.
+                        for (int i = 0; i < element.getCharacters().size(); i++) {
+                            char[] chars = element.getCharacters().get(i);
+                            curCharsIndex++;
+                            boolean isValidCharacterRun = validCharacterRuns.get(curCharsIndex);
+
+                            // https://issues.apache.org/jira/projects/TIKA/issues/TIKA-2683
+                            // Allow exempted characters to be written
+                            if (isValidCharacterRun ||
+                                    (chars.length == 1 && ALLOWABLE_CHARS.contains(chars[0]))) {
+                                delegate.characters(chars, 0, chars.length);
+                            }
+
+                            // https://issues.apache.org/jira/browse/TIKA-961
+                            if (isValidCharacterRun && i == element.getCharacters().size() - 1
+                                    && !Character.isWhitespace(chars[chars.length - 1])) {
+                                // Only add whitespace for certain elements
+                                if (XHTMLContentHandler.ENDLINE.contains(element.getLocalName())) {
+                                    delegate.ignorableWhitespace(NL, 0, NL.length);
+                                }
+                            }
+                        }
+                        break;
+
+                    case END:
+                        delegate.endElement(element.getUri(), element.getLocalName(), element.getQName());
+                        break;
+
+                    default:
+                        throw new RuntimeException("Unhandled element type: " + element.getElementType());
+                }
+
+
+            }
+        } else {
+            for (TextBlock block : td.getTextBlocks()) {
+                if (block.isContent()) {
+                    delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", emptyAttrs);
+                    char[] chars = block.getText().toCharArray();
+                    delegate.characters(chars, 0, chars.length);
+                    delegate.endElement(XHTMLContentHandler.XHTML, "p", "p");
+                    delegate.ignorableWhitespace(NL, 0, NL.length);
+                }
+            }
+        }
+
+        delegate.endElement(XHTMLContentHandler.XHTML, "body", "body");
+        delegate.endElement(XHTMLContentHandler.XHTML, "html", "html");
+
+        // We defer ending any prefix mapping until here, which is why we don't pass this
+        // through to the delegate in an overridden method.
+        delegate.endPrefixMapping("");
+
+        delegate.endDocument();
+    }
+
+    private static class RecordedElement {
+        private String uri;
+        private String localName;
+        private String qName;
+        private Attributes attrs;
+        private List<char[]> characters;
+        private ElementType elementType;
+        public RecordedElement(String uri, String localName, String qName, Attributes attrs) {
+            this(uri, localName, qName, attrs, ElementType.START);
+        }
+
+        public RecordedElement(String uri, String localName, String qName) {
+            this(uri, localName, qName, null, ElementType.END);
+        }
+
+        public RecordedElement() {
+            this(null, null, null, null, ElementType.CONTINUE);
+        }
+
+        protected RecordedElement(String uri, String localName, String qName, Attributes attrs, RecordedElement.ElementType elementType) {
+            this.uri = uri;
+            this.localName = localName;
+            this.qName = qName;
+            this.attrs = attrs;
+            this.elementType = elementType;
+            this.characters = new ArrayList<char[]>();
+        }
+
+        @Override
+        public String toString() {
+            return String.format(Locale.ROOT, "<%s> of type %s", localName, elementType);
+        }
+
+        public String getUri() {
+            return uri;
+        }
+
+        public String getLocalName() {
+            return localName;
+        }
+
+        public String getQName() {
+            return qName;
+        }
+
+        public Attributes getAttrs() {
+            return attrs;
+        }
+
+        public List<char[]> getCharacters() {
+            return characters;
+        }
+
+        public RecordedElement.ElementType getElementType() {
+            return elementType;
+        }
+
+        public enum ElementType {
+            START,
+            END,
+            CONTINUE
+        }
+    }
+}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/DataURIScheme.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/DataURIScheme.java
new file mode 100644
index 0000000..fa9ba85
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/DataURIScheme.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.html;
+
+import org.apache.tika.mime.MediaType;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Objects;
+
+public class DataURIScheme {
+
+
+    private final String rawMediaTypeString;
+    private final boolean isBase64;
+    private final byte[] data;
+
+    DataURIScheme(String mediaTypeString, boolean isBase64, byte[] data) {
+        this.rawMediaTypeString = mediaTypeString;
+        this.isBase64 = isBase64;
+        this.data = data;
+    }
+
+    public InputStream getInputStream() {
+        return new ByteArrayInputStream(data);
+    }
+
+    /**
+     *
+     * @return parsed media type or <code>null</code> if parse fails or if media type string was
+     * not specified
+     */
+    public MediaType getMediaType() {
+        if (rawMediaTypeString != null) {
+            return MediaType.parse(rawMediaTypeString);
+        }
+        return null;
+    }
+
+    public boolean isBase64() {
+        return isBase64;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if (!(o instanceof DataURIScheme)) return false;
+        DataURIScheme that = (DataURIScheme) o;
+        return isBase64() == that.isBase64() &&
+                Objects.equals(rawMediaTypeString, that.rawMediaTypeString) &&
+                Arrays.equals(data, that.data);
+    }
+
+    @Override
+    public int hashCode() {
+
+        int result = Objects.hash(rawMediaTypeString, isBase64());
+        result = 31 * result + Arrays.hashCode(data);
+        return result;
+    }
+}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/DataURISchemeUtil.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/DataURISchemeUtil.java
new file mode 100644
index 0000000..6193e8e
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/DataURISchemeUtil.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.html;
+
+import org.apache.commons.codec.binary.Base64;
+import org.apache.tika.mime.MediaType;
+
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Not thread safe.  Create a separate util for each thread.
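+ * <p>
+ * A minimal usage sketch (the data URI is illustrative only; {@code parse} throws
+ * DataURISchemeParseException if the string is not a data: URI):
+ * <pre>{@code
+ * DataURISchemeUtil util = new DataURISchemeUtil();
+ * DataURIScheme scheme = util.parse("data:text/plain;base64,SGVsbG8=");
+ * try (InputStream is = scheme.getInputStream()) {
+ *     // consume the decoded bytes ("Hello")
+ * }
+ * }</pre>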
+ */
+public class DataURISchemeUtil {
+
+    public static String UNSPECIFIED_MEDIA_TYPE = "text/plain;charset=US-ASCII";
+
+    private static Pattern PARSE_PATTERN = Pattern.compile("(?s)data:([^,]*?)(base64)?,(.*)$");
+    private static Pattern EXTRACT_PATTERN =
+            Pattern.compile("(?s)data:([^,]*?)(base64)?,([^\"\']*)[\"\']");
+    private final Matcher parseMatcher = PARSE_PATTERN.matcher("");
+    private final Matcher extractMatcher = EXTRACT_PATTERN.matcher("");
+    Base64 base64 = new Base64();
+
+    public DataURIScheme parse(String string) throws DataURISchemeParseException {
+        parseMatcher.reset(string);
+        if (parseMatcher.find()) {
+            return build(parseMatcher.group(1), parseMatcher.group(2), parseMatcher.group(3));
+        }
+        throw new DataURISchemeParseException("Couldn't find expected pattern");
+    }
+
+    private DataURIScheme build(String mediaTypeString, String isBase64, String dataString) {
+        byte[] data = null;
+        //strip out back slashes as you might have in css
+        dataString = (dataString != null) ?
+                dataString.replaceAll("\\\\", " ") : dataString;
+
+        if (dataString == null || dataString.length() == 0) {
+            data = new byte[0];
+        } else if (isBase64 != null) {
+            data = base64.decode(dataString);
+        } else {
+            //TODO: handle encodings
+            MediaType mediaType = MediaType.parse(mediaTypeString);
+            Charset charset = StandardCharsets.UTF_8;
+            if (mediaType.hasParameters()) {
+                String charsetName = mediaType.getParameters().get("charset");
+                if (charsetName != null && Charset.isSupported(charsetName)) {
+                    try {
+                        charset = Charset.forName(charsetName);
+                    } catch (IllegalCharsetNameException e) {
+                        //swallow and default to UTF-8
+                    }
+                }
+            }
+            data = dataString.getBytes(charset);
+        }
+        return new DataURIScheme(mediaTypeString, (isBase64 != null), data);
+    }
+
+    /**
+     * Extracts DataURISchemes from free text, as in javascript.
+     *
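+     * <p>
+     * A minimal sketch, reusing the {@code util} instance from the class-level example:
+     * <pre>{@code
+     * List<DataURIScheme> schemes = util.extract(
+     *         "background: url('data:image/gif;base64,R0lGODdh')");
+     * }</pre>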
+     * @param string free text (e.g. javascript or css) to scan for data URIs
+     * @return list of extracted DataURISchemes
+     */
+    public List<DataURIScheme> extract(String string) {
+        extractMatcher.reset(string);
+        List<DataURIScheme> list = null;
+        while (extractMatcher.find()) {
+            DataURIScheme dataURIScheme = build(extractMatcher.group(1),
+                    extractMatcher.group(2), extractMatcher.group(3));
+            if (list == null) {
+                list = new ArrayList<>();
+            }
+            list.add(dataURIScheme);
+        }
+        return (list == null) ? Collections.EMPTY_LIST : list;
+    }
+
+}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
new file mode 100644
index 0000000..c86ba7e
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
@@ -0,0 +1,188 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.StandardCharsets;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.utils.CharsetUtils;
+
+/**
+ * Character encoding detector for determining the character encoding of a
+ * HTML document based on the potential charset parameter found in a
+ * Content-Type http-equiv meta tag somewhere near the beginning. Especially
+ * useful for determining the type among multiple closely related encodings
+ * (ISO-8859-*) for which other types of encoding detection are unreliable.
+ *
+ * @since Apache Tika 1.2
+ */
+public class HtmlEncodingDetector implements EncodingDetector {
+
+    /**
+     * HTML can include non-iana supported charsets that Java
+     * recognizes, e.g. "unicode".  This can lead to incorrect detection/mojibake.
+     * Ignore charsets in html meta-headers that are not supported by IANA.
+     * See: TIKA-2592
+     */
+    private static Set<String> CHARSETS_UNSUPPORTED_BY_IANA;
+    static {
+        Set<String> unsupported = new HashSet<>();
+        try (BufferedReader reader =
+                     new BufferedReader(
+                             new InputStreamReader(
+                                     HtmlEncodingDetector.class
+                                        .getResourceAsStream("StandardCharsets_unsupported_by_IANA.txt"),
+                                     StandardCharsets.UTF_8))) {
+            String line = reader.readLine();
+            while (line != null) {
+                if (line.startsWith("#")) {
+                    line = reader.readLine();
+                    continue;
+                }
+                line = line.trim();
+                if (line.length() > 0) {
+                    unsupported.add(line.toLowerCase(Locale.US));
+                }
+                line = reader.readLine();
+            }
+        } catch (IOException e) {
+            throw new IllegalArgumentException("couldn't find StandardCharsets_unsupported_by_IANA.txt on the class path");
+        }
+        CHARSETS_UNSUPPORTED_BY_IANA = Collections.unmodifiableSet(unsupported);
+    }
+    // TIKA-357 - use bigger buffer for meta tag sniffing (was 4K)
+    private static final int DEFAULT_MARK_LIMIT = 8192;
+
+
+    private static final Pattern HTTP_META_PATTERN = Pattern.compile(
+            "(?is)<\\s*meta(?:/|\\s+)([^<>]+)"
+    );
+
+    //this should match both the older:
+    //<meta http-equiv="content-type" content="text/html; charset=xyz"/>
+    //and 
+    //html5 <meta charset="xyz">
+    //See http://webdesign.about.com/od/metatags/qt/meta-charset.htm
+    //for the noisiness that one might encounter in charset attrs.
+    //Chose to go with strict ([-_:\\.a-z0-9]+) to match encodings
+    //following http://docs.oracle.com/javase/7/docs/api/java/nio/charset/Charset.html
+    //For a more general "not" matcher, try:
+    //("(?is)charset\\s*=\\s*['\\\"]?\\s*([^<>\\s'\\\";]+)")
+    private static final Pattern FLEXIBLE_CHARSET_ATTR_PATTERN = Pattern.compile(
+            ("(?is)\\bcharset\\s*=\\s*(?:['\\\"]\\s*)?([-_:\\.a-z0-9]+)")
+    );
+
+    private static final Charset ASCII = Charset.forName("US-ASCII");
+
+    @Field
+    private int markLimit = DEFAULT_MARK_LIMIT;
+
+    public Charset detect(InputStream input, Metadata metadata)
+            throws IOException {
+        if (input == null) {
+            return null;
+        }
+
+        // Read enough of the text stream to capture possible meta tags
+        input.mark(markLimit);
+        byte[] buffer = new byte[markLimit];
+        int n = 0;
+        int m = input.read(buffer);
+        while (m != -1 && n < buffer.length) {
+            n += m;
+            m = input.read(buffer, n, buffer.length - n);
+        }
+        input.reset();
+
+        // Interpret the head as ASCII and try to spot a meta tag with
+        // a possible character encoding hint
+
+        String head = ASCII.decode(ByteBuffer.wrap(buffer, 0, n)).toString();
+        //strip out comments
+        String headNoComments = head.replaceAll("<!--.*?(-->|$)", " ");
+        //try to find the encoding in head without comments
+        Charset charset = findCharset(headNoComments);
+        //if nothing is found, back off to find any encoding
+        if (charset == null) {
+            return findCharset(head);
+        }
+        return charset;
+
+    }
+
+    //returns null if no charset was found
+    private Charset findCharset(String s) {
+
+        Matcher equiv = HTTP_META_PATTERN.matcher(s);
+        Matcher charsetMatcher = FLEXIBLE_CHARSET_ATTR_PATTERN.matcher("");
+        //iterate through meta tags
+        while (equiv.find()) {
+            String attrs = equiv.group(1);
+            charsetMatcher.reset(attrs);
+            //iterate through charset= and return the first match
+            //that is valid
+            while (charsetMatcher.find()) {
+                String candCharset = charsetMatcher.group(1);
+                if (CHARSETS_UNSUPPORTED_BY_IANA.contains(candCharset.toLowerCase(Locale.US))) {
+                    continue;
+                }
+                if ("x-user-defined".equalsIgnoreCase(candCharset)) {
+                    candCharset = "windows-1252";
+                }
+
+                if (CharsetUtils.isSupported(candCharset)) {
+                    try {
+                        return CharsetUtils.forName(candCharset);
+                    } catch (IllegalArgumentException e) {
+                        //ignore
+                    }
+                }
+            }
+        }
+        return null;
+    }
+
+    /**
+     * How far into the stream to read for charset detection.
+     * Default is 8192.
+     *
+     * @param markLimit maximum number of bytes to read from the stream when looking for a charset
+     */
+    @Field
+    public void setMarkLimit(int markLimit) {
+        this.markLimit = markLimit;
+    }
+
+    public int getMarkLimit() {
+        return markLimit;
+    }
+}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
new file mode 100644
index 0000000..38283a8
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
@@ -0,0 +1,462 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.metadata.HTML;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.TextContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+class HtmlHandler extends TextContentHandler {
+
+    // List of attributes that need to be resolved.
+    private static final Set<String> URI_ATTRIBUTES =
+            new HashSet<String>(Arrays.asList("src", "href", "longdesc", "cite"));
+    private static final Pattern ICBM =
+            Pattern.compile("\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*");
+    private static final Attributes EMPTY_ATTS = new AttributesImpl();
+    private final HtmlMapper mapper;
+    private final XHTMLContentHandler xhtml;
+    private final Metadata metadata;
+    private final ParseContext context;
+    private final boolean extractScripts;
+    private final StringBuilder title = new StringBuilder();
+    private final DataURISchemeUtil dataURISchemeUtil = new DataURISchemeUtil();
+    private int bodyLevel = 0;
+    private int discardLevel = 0;
+    private int titleLevel = 0;
+    private int scriptLevel = 0;
+    private Attributes scriptAtts = EMPTY_ATTS;//attributes from outermost script element
+    private final StringBuilder script = new StringBuilder();
+
+    private boolean isTitleSetToMetadata = false;
+
+    private HtmlHandler(
+            HtmlMapper mapper, XHTMLContentHandler xhtml, Metadata metadata,
+            ParseContext context, boolean extractScripts) {
+        super(xhtml);
+        this.mapper = mapper;
+        this.xhtml = xhtml;
+        this.metadata = metadata;
+        this.context = context;
+        this.extractScripts = extractScripts;
+        // Try to determine the default base URL, if one has not been given
+        if (metadata.get(Metadata.CONTENT_LOCATION) == null) {
+            String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
+            if (name != null) {
+                name = name.trim();
+                try {
+                    new URL(name); // test URL format
+                    metadata.set(Metadata.CONTENT_LOCATION, name);
+                } catch (MalformedURLException e) {
+                    // The resource name is not a valid URL, ignore it
+                }
+            }
+        }
+    }
+
+    public HtmlHandler(
+            HtmlMapper mapper, ContentHandler handler, Metadata metadata, ParseContext context,
+            boolean extractScripts) {
+        this(mapper, new XHTMLContentHandler(handler, metadata), metadata, context, extractScripts);
+    }
+
+    /**
+     * @deprecated use {@link HtmlHandler#HtmlHandler(HtmlMapper, ContentHandler, Metadata, ParseContext, boolean)}
+     * @param mapper
+     * @param handler
+     * @param metadata
+     */
+    @Deprecated
+    public HtmlHandler(
+            HtmlMapper mapper, ContentHandler handler, Metadata metadata) {
+        this(mapper, new XHTMLContentHandler(handler, metadata), metadata, new ParseContext(), false);
+    }
+
+
+    @Override
+    public void startElement(
+            String uri, String local, String name, Attributes atts)
+            throws SAXException {
+
+        if ("HTML".equals(name) && atts.getValue("lang") != null) {
+            metadata.set(Metadata.CONTENT_LANGUAGE, atts.getValue("lang"));
+        }
+        if ("SCRIPT".equals(name)) {
+            scriptLevel++;
+        }
+        if ("TITLE".equals(name) || titleLevel > 0) {
+            titleLevel++;
+        }
+        if ("BODY".equals(name) || ("FRAMESET".equals(name)) || bodyLevel > 0) {
+            bodyLevel++;
+        }
+        if (mapper.isDiscardElement(name) || discardLevel > 0) {
+            discardLevel++;
+        }
+
+        if (bodyLevel == 0 && discardLevel == 0) {
+            if ("META".equals(name) && atts.getValue("content") != null) {
+                // TIKA-478: For cases where we have either a name or
+                // "http-equiv", assume that XHTMLContentHandler will emit
+                // these in the <head>, thus passing them through safely.
+                if (atts.getValue("http-equiv") != null) {
+                    addHtmlMetadata(
+                            atts.getValue("http-equiv"),
+                            atts.getValue("content"));
+                } else if (atts.getValue("name") != null) {
+                    // Record the meta tag in the metadata
+                    addHtmlMetadata(
+                            atts.getValue("name"),
+                            atts.getValue("content"));
+                } else if (atts.getValue("property") != null) {
+                    // TIKA-983: Handle <meta property="og:xxx" content="yyy" /> tags
+                    metadata.add(
+                            atts.getValue("property"),
+                            atts.getValue("content"));
+                }
+            } else if ("BASE".equals(name) && atts.getValue("href") != null) {
+                startElementWithSafeAttributes("base", atts);
+                xhtml.endElement("base");
+                metadata.set(
+                        Metadata.CONTENT_LOCATION,
+                        resolve(atts.getValue("href")));
+            } else if ("LINK".equals(name)) {
+                startElementWithSafeAttributes("link", atts);
+                xhtml.endElement("link");
+            } else if ("SCRIPT".equals(name)) {
+                scriptAtts = atts;
+            }
+        }
+
+        if (bodyLevel > 0 && discardLevel == 0) {
+            String safe = mapper.mapSafeElement(name);
+            if (safe != null) {
+                startElementWithSafeAttributes(safe, atts);
+            }
+        }
+
+        title.setLength(0);
+        String value = atts.getValue("src");
+        if (value != null && value.startsWith("data:")) {
+            //don't extract data if we're in a script
+            //and the user doesn't want to extract scripts
+            if (scriptLevel == 0 || extractScripts) {
+                handleDataURIScheme(value);
+            }
+        }
+    }
+
+    /**
+     * Adds a metadata setting from the HTML <head/> to the Tika metadata
+     * object. The name and value are normalized where possible.
+     */
+    private void addHtmlMetadata(String name, String value) {
+        if (name == null || value == null) {
+            // ignore
+        } else if (name.equalsIgnoreCase("ICBM")) {
+            Matcher m = ICBM.matcher(value);
+            if (m.matches()) {
+                metadata.set("ICBM", m.group(1) + ", " + m.group(2));
+                metadata.set(Metadata.LATITUDE, m.group(1));
+                metadata.set(Metadata.LONGITUDE, m.group(2));
+            } else {
+                metadata.set("ICBM", value);
+            }
+        } else if (name.equalsIgnoreCase(Metadata.CONTENT_TYPE)) {
+            //don't overwrite Metadata.CONTENT_TYPE!
+            MediaType type = MediaType.parse(value);
+            if (type != null) {
+                metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, type.toString());
+            } else {
+                metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, value);
+            }
+        } else {
+            metadata.add(name, value);
+        }
+    }
+
+    private void startElementWithSafeAttributes(String name, Attributes atts) throws SAXException {
+        if (atts.getLength() == 0) {
+            xhtml.startElement(name);
+            return;
+        }
+
+        boolean isObject = name.equals("object");
+        String codebase = null;
+        if (isObject) {
+            codebase = atts.getValue("", "codebase");
+            if (codebase != null) {
+                codebase = resolve(codebase);
+            } else {
+                codebase = metadata.get(Metadata.CONTENT_LOCATION);
+            }
+        }
+
+        AttributesImpl newAttributes = new AttributesImpl(atts);
+        for (int att = 0; att < newAttributes.getLength(); att++) {
+            String attrName = newAttributes.getLocalName(att);
+            String normAttrName = mapper.mapSafeAttribute(name, attrName);
+            if (normAttrName == null) {
+                newAttributes.removeAttribute(att);
+                att--;
+            } else {
+                // We have a remapped attribute name, so set it as it might have changed.
+                newAttributes.setLocalName(att, normAttrName);
+
+                // And resolve relative links. Eventually this should be pushed
+                // into the HtmlMapper code.
+                if (URI_ATTRIBUTES.contains(normAttrName)) {
+                    //if this is a src="data: " element,
+                    //we've handled that as an embedded file, don't include the full thing
+                    //here
+                    if (normAttrName.equals("src")) {
+                        String v = newAttributes.getValue(att);
+                        if (v.startsWith("data:")) {
+                            newAttributes.setValue(att, "data:");
+                        }
+                    }
+                    newAttributes.setValue(att, resolve(newAttributes.getValue(att)));
+                } else if (isObject && "codebase".equals(normAttrName)) {
+                    newAttributes.setValue(att, codebase);
+                } else if (isObject
+                        && ("data".equals(normAttrName)
+                        || "classid".equals(normAttrName))) {
+                    newAttributes.setValue(
+                            att,
+                            resolve(codebase, newAttributes.getValue(att)));
+                }
+            }
+        }
+
+        if ("img".equals(name) && newAttributes.getValue("", "alt") == null) {
+            newAttributes.addAttribute("", "alt", "alt", "CDATA", "");
+        }
+
+        xhtml.startElement(name, newAttributes);
+    }
+
+    @Override
+    public void endElement(
+            String uri, String local, String name) throws SAXException {
+        if ("SCRIPT".equals(name)) {
+            scriptLevel--;
+            if (scriptLevel == 0) {
+                if (scriptAtts.getLength() > 0) {
+                    startElementWithSafeAttributes("script", scriptAtts);
+                    xhtml.endElement("script");
+                }
+                scriptAtts = EMPTY_ATTS;
+                if (extractScripts) {
+                    writeScript();
+                }
+            }
+        }
+
+        if (bodyLevel > 0 && discardLevel == 0) {
+            String safe = mapper.mapSafeElement(name);
+            if (safe != null) {
+                xhtml.endElement(safe);
+            } else if (XHTMLContentHandler.ENDLINE.contains(
+                    name.toLowerCase(Locale.ENGLISH))) {
+                // TIKA-343: Replace closing block tags (and <br/>) with a
+                // newline unless the HtmlMapper above has already mapped
+                // them to something else
+                xhtml.newline();
+            }
+        }
+
+        if (titleLevel > 0) {
+            titleLevel--;
+            if (titleLevel == 0 && !isTitleSetToMetadata) {
+                metadata.set(TikaCoreProperties.TITLE, title.toString().trim());
+                isTitleSetToMetadata = true;
+            }
+        }
+        if (bodyLevel > 0) {
+            bodyLevel--;
+        }
+        if (discardLevel > 0) {
+            discardLevel--;
+        }
+    }
+
+    private void handleDataURIScheme(String string) throws SAXException {
+        DataURIScheme dataURIScheme = null;
+        try {
+            dataURIScheme = dataURISchemeUtil.parse(string);
+        } catch (DataURISchemeParseException e) {
+            //swallow
+            return;
+        }
+
+        //do anything with attrs?
+        Metadata m = new Metadata();
+        m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
+        if (dataURIScheme.getMediaType() != null) {
+            m.set(Metadata.CONTENT_TYPE, dataURIScheme.getMediaType().toString());
+        }
+        EmbeddedDocumentExtractor embeddedDocumentExtractor =
+                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+        if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
+            try (InputStream stream = dataURIScheme.getInputStream()) {
+                embeddedDocumentExtractor.parseEmbedded(
+                        stream, xhtml, m, false
+                );
+            } catch (IOException e) {
+                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
+            }
+        }
+    }
+
+    private void writeScript() throws SAXException {
+        //don't write an attached macro if there is no content
+        //we may want to revisit this behavior
+        if (script.toString().trim().length() == 0) {
+            return;
+        }
+        //do anything with attrs?
+        Metadata m = new Metadata();
+        m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+        String src = scriptAtts.getValue("src");
+        if (src != null) {
+            m.set(HTML.SCRIPT_SOURCE, src);
+        }
+
+        EmbeddedDocumentExtractor embeddedDocumentExtractor =
+                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+        //try to scrape dataURISchemes from javascript
+        List<DataURIScheme> dataURISchemes = dataURISchemeUtil.extract(script.toString());
+        for (DataURIScheme dataURIScheme : dataURISchemes) {
+            Metadata dataUriMetadata = new Metadata();
+            dataUriMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                    TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
+            dataUriMetadata.set(Metadata.CONTENT_TYPE,
+                    dataURIScheme.getMediaType().toString());
+            if (embeddedDocumentExtractor.shouldParseEmbedded(dataUriMetadata)) {
+                try (InputStream dataURISchemeInputStream = dataURIScheme.getInputStream()) {
+                    embeddedDocumentExtractor.parseEmbedded(dataURISchemeInputStream,
+                            xhtml, dataUriMetadata, false);
+                } catch (IOException e) {
+                    //swallow
+                }
+            }
+        }
+
+        try (InputStream stream = new ByteArrayInputStream(
+                script.toString().getBytes(StandardCharsets.UTF_8))) {
+            embeddedDocumentExtractor.parseEmbedded(
+                    stream, xhtml, m, false
+            );
+        } catch (IOException e) {
+            //shouldn't ever happen
+        } finally {
+            script.setLength(0);
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length)
+            throws SAXException {
+        if (scriptLevel > 0 && extractScripts) {
+            script.append(ch, start, length);
+        }
+        if (titleLevel > 0 && bodyLevel == 0) {
+            title.append(ch, start, length);
+        }
+        if (bodyLevel > 0 && discardLevel == 0) {
+            super.characters(ch, start, length);
+        }
+
+    }
+
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length)
+            throws SAXException {
+        if (bodyLevel > 0 && discardLevel == 0) {
+            super.ignorableWhitespace(ch, start, length);
+        }
+    }
+
+    private String resolve(String url) {
+        return resolve(metadata.get(Metadata.CONTENT_LOCATION), url);
+    }
+
+    private String resolve(String base, String url) {
+        url = url.trim();
+
+        // Return the URL as-is if no base URL is available or if the URL
+        // matches a common non-hierarchical or pseudo URI prefix
+        String lower = url.toLowerCase(Locale.ENGLISH);
+        if (base == null
+                || lower.startsWith("urn:")
+                || lower.startsWith("mailto:")
+                || lower.startsWith("tel:")
+                || lower.startsWith("data:")
+                || lower.startsWith("javascript:")
+                || lower.startsWith("about:")) {
+            return url;
+        }
+
+        try {
+            URL baseURL = new URL(base.trim());
+
+            // We need to handle one special case, where the relativeUrl is
+            // just a query string (like "?pid=1"), and the baseUrl doesn't
+            // end with a '/'. In that case, the URL class removes the last
+            // portion of the path, which we don't want.
+            String path = baseURL.getPath();
+            if (url.startsWith("?") && path.length() > 0 && !path.endsWith("/")) {
+                return new URL(
+                        baseURL.getProtocol(),
+                        baseURL.getHost(), baseURL.getPort(),
+                        baseURL.getPath() + url).toExternalForm();
+            } else {
+                return new URL(baseURL, url).toExternalForm();
+            }
+        } catch (MalformedURLException e) {
+            // Unknown or broken format; just return the URL as received.
+            return url;
+        }
+    }
+
+}
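
The handler above routes inline data: URIs and extracted script bodies to whichever EmbeddedDocumentExtractor is registered in the ParseContext. A minimal sketch of capturing those resources follows; it is not part of this commit, the class name CapturingExtractor is illustrative, and it assumes the standard org.apache.tika.extractor.EmbeddedDocumentExtractor interface plus commons-io (already used by this module) on the classpath.

    import java.io.IOException;
    import java.io.InputStream;
    import java.util.ArrayList;
    import java.util.List;

    import org.apache.commons.io.IOUtils;
    import org.apache.tika.extractor.EmbeddedDocumentExtractor;
    import org.apache.tika.metadata.Metadata;
    import org.xml.sax.ContentHandler;

    /** Collects every embedded resource (inline data: URI, script body) the handler offers. */
    public class CapturingExtractor implements EmbeddedDocumentExtractor {

        private final List<byte[]> captured = new ArrayList<>();

        @Override
        public boolean shouldParseEmbedded(Metadata metadata) {
            return true; // accept everything handleDataURIScheme()/writeScript() offer
        }

        @Override
        public void parseEmbedded(InputStream stream, ContentHandler handler,
                                  Metadata metadata, boolean outputHtml) throws IOException {
            captured.add(IOUtils.toByteArray(stream)); // keep the raw bytes for later inspection
        }

        public List<byte[]> getCaptured() {
            return captured;
        }
    }

    // Registered before parsing, so EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context)
    // returns this instance:
    //     ParseContext context = new ParseContext();
    //     context.set(EmbeddedDocumentExtractor.class, new CapturingExtractor());
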
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
new file mode 100644
index 0000000..1ca7434
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+/**
+ * HTML mapper used to make incoming HTML documents easier to handle by
+ * Tika clients. The {@link HtmlParser} looks up an optional HTML mapper from
+ * the parse context and uses it to map parsed HTML to "safe" XHTML. A client
+ * that wants to customize this mapping can place a custom HtmlMapper instance
+ * into the parse context.
+ *
+ * @since Apache Tika 0.6
+ */
+public interface HtmlMapper {
+
+    /**
+     * Maps "safe" HTML element names to semantic XHTML equivalents. If the
+     * given element is unknown or deemed unsafe for inclusion in the parse
+     * output, then this method returns <code>null</code> and the element
+     * will be ignored but the content inside it is still processed. See
+     * the {@link #isDiscardElement(String)} method for a way to discard
+     * the entire contents of an element.
+     *
+     * @param name HTML element name (upper case)
+     * @return XHTML element name (lower case), or
+     * <code>null</code> if the element is unsafe
+     */
+    String mapSafeElement(String name);
+
+    /**
+     * Checks whether all content within the given HTML element should be
+     * discarded instead of including it in the parse output.
+     *
+     * @param name HTML element name (upper case)
+     * @return <code>true</code> if content inside the named element
+     * should be ignored, <code>false</code> otherwise
+     */
+    boolean isDiscardElement(String name);
+
+
+    /**
+     * Maps "safe" HTML attribute names to semantic XHTML equivalents. If the
+     * given attribute is unknown or deemed unsafe for inclusion in the parse
+     * output, then this method returns <code>null</code> and the attribute
+     * will be ignored. This method assumes that the element name
+     * is valid and normalised.
+     *
+     * @param elementName   HTML element name (lower case)
+     * @param attributeName HTML attribute name (lower case)
+     * @return XHTML attribute name (lower case), or
+     * <code>null</code> if the element is unsafe
+     */
+    String mapSafeAttribute(String elementName, String attributeName);
+
+}
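
As the Javadoc above says, a client customizes the HTML-to-XHTML mapping by placing its own HtmlMapper into the ParseContext. A minimal sketch, not part of this commit: an anonymous mapper that defers to DefaultHtmlMapper.INSTANCE (referenced elsewhere in this diff) and, purely as an illustrative customization, always keeps div elements. The usual org.apache.tika.parser imports are assumed.

    static ParseContext contextWithDivsKept() {
        ParseContext context = new ParseContext();
        context.set(HtmlMapper.class, new HtmlMapper() {
            @Override
            public String mapSafeElement(String name) {
                // illustrative: keep <div>, otherwise defer to the default mapping
                return "DIV".equals(name) ? "div" : DefaultHtmlMapper.INSTANCE.mapSafeElement(name);
            }

            @Override
            public boolean isDiscardElement(String name) {
                return DefaultHtmlMapper.INSTANCE.isDiscardElement(name);
            }

            @Override
            public String mapSafeAttribute(String elementName, String attributeName) {
                return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName, attributeName);
            }
        });
        return context;
        // HtmlParser picks this up via context.get(HtmlMapper.class, new HtmlParserMapper()).
    }
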
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java
new file mode 100644
index 0000000..adf591a
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java
@@ -0,0 +1,247 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.config.Field;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractEncodingDetectorParser;
+import org.apache.tika.parser.ParseContext;
+import org.ccil.cowan.tagsoup.HTMLSchema;
+import org.ccil.cowan.tagsoup.Schema;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * HTML parser. Uses TagSoup to turn the input document to HTML SAX events,
+ * and post-processes the events to produce XHTML and metadata expected by
+ * Tika clients.
+ */
+public class HtmlParser extends AbstractEncodingDetectorParser {
+
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = 7895315240498733128L;
+
+    private static final Logger LOG = LoggerFactory.getLogger(HtmlParser.class);
+
+    private static final MediaType XHTML = MediaType.application("xhtml+xml");
+    private static final MediaType WAP_XHTML = MediaType.application("vnd.wap.xhtml+xml");
+    private static final MediaType X_ASP = MediaType.application("x-asp");
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
+                    MediaType.text("html"),
+                    XHTML,
+                    WAP_XHTML,
+                    X_ASP)));
+
+    /**
+     * HTML schema singleton used to amortise the heavy instantiation time.
+     */
+    private static final Schema HTML_SCHEMA = new HTMLSchema();
+
+    @Field
+    private boolean extractScripts = false;
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public HtmlParser() {
+        super();
+    }
+
+    public HtmlParser(EncodingDetector encodingDetector) {
+        super(encodingDetector);
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        TemporaryResources tmp = null;
+        try {
+            if (!TikaInputStream.isTikaInputStream(stream)) {
+                tmp = new TemporaryResources();
+                stream = TikaInputStream.get(stream, tmp);
+            }
+            //AutoDetectReader can throw exceptions during
+            //initialization.  If we just created a
+            //TemporaryResources, we need to make sure to close it.
+            parseImpl(stream, handler, metadata, context);
+        } finally {
+            if (tmp != null) {
+                tmp.close();
+            }
+        }
+
+    }
+
+
+    private void parseImpl(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
+        // Automatically detect the character encoding
+        try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream),
+                metadata, getEncodingDetector(context))) {
+            Charset charset = reader.getCharset();
+            String previous = metadata.get(Metadata.CONTENT_TYPE);
+            MediaType contentType = null;
+            if (previous == null || previous.startsWith("text/html")) {
+                contentType = new MediaType(MediaType.TEXT_HTML, charset);
+            } else if (previous.startsWith("application/xhtml+xml")) {
+                contentType = new MediaType(XHTML, charset);
+            } else if (previous.startsWith("application/vnd.wap.xhtml+xml")) {
+                contentType = new MediaType(WAP_XHTML, charset);
+            } else if (previous.startsWith("application/x-asp")) {
+                contentType = new MediaType(X_ASP, charset);
+            }
+            if (contentType != null) {
+                metadata.set(Metadata.CONTENT_TYPE, contentType.toString());
+            }
+            // deprecated, see TIKA-431
+            metadata.set(Metadata.CONTENT_ENCODING, charset.name());
+
+            // Get the HTML mapper from the parse context
+            HtmlMapper mapper =
+                    context.get(HtmlMapper.class, new HtmlParserMapper());
+
+            // Parse the HTML document
+            org.ccil.cowan.tagsoup.Parser parser =
+                    new org.ccil.cowan.tagsoup.Parser();
+
+            // Use schema from context or default
+            Schema schema = context.get(Schema.class, HTML_SCHEMA);
+
+            // TIKA-528: Reuse the shared schema to avoid heavy instantiation
+            parser.setProperty(
+                    org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
+            // TIKA-599: Shared schema is thread-safe only if bogons are ignored
+            parser.setFeature(
+                    org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
+
+            parser.setContentHandler(new XHTMLDowngradeHandler(
+                    new HtmlHandler(mapper, handler, metadata, context, extractScripts)));
+
+            parser.parse(reader.asInputSource());
+        }
+    }
+
+    /**
+     * Maps "safe" HTML element names to semantic XHTML equivalents. If the
+     * given element is unknown or deemed unsafe for inclusion in the parse
+     * output, then this method returns <code>null</code> and the element
+     * will be ignored but the content inside it is still processed. See
+     * the {@link #isDiscardElement(String)} method for a way to discard
+     * the entire contents of an element.
+     * <p/>
+     * Subclasses can override this method to customize the default mapping.
+     *
+     * @param name HTML element name (upper case)
+     * @return XHTML element name (lower case), or
+     * <code>null</code> if the element is unsafe
+     * @since Apache Tika 0.5
+     * @deprecated Use the {@link HtmlMapper} mechanism to customize
+     * the HTML mapping. This method will be removed in a future release.
+     */
+    protected String mapSafeElement(String name) {
+        return DefaultHtmlMapper.INSTANCE.mapSafeElement(name);
+    }
+
+    /**
+     * Checks whether all content within the given HTML element should be
+     * discarded instead of including it in the parse output. Subclasses
+     * can override this method to customize the set of discarded elements.
+     *
+     * @param name HTML element name (upper case)
+     * @return <code>true</code> if content inside the named element
+     * should be ignored, <code>false</code> otherwise
+     * @since Apache Tika 0.5
+     * @deprecated Use the {@link HtmlMapper} mechanism to customize
+     * the HTML mapping. This method will be removed in a future release.
+     */
+    protected boolean isDiscardElement(String name) {
+        return DefaultHtmlMapper.INSTANCE.isDiscardElement(name);
+    }
+
+    /**
+     * @deprecated Use the {@link HtmlMapper} mechanism to customize
+     * the HTML mapping. This method will be removed in a future release.
+     */
+    public String mapSafeAttribute(String elementName, String attributeName) {
+        return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName, attributeName);
+    }
+
+    /**
+     * Adapter class that maintains backwards compatibility with the
+     * protected HtmlParser methods. Making HtmlParser implement HtmlMapper
+     * directly would require those methods to be public, which would break
+     * backwards compatibility with subclasses.
+     *
+     * @deprecated Use the {@link HtmlMapper} mechanism to customize
+     * the HTML mapping. This class will be removed in a future release.
+     */
+    private class HtmlParserMapper implements HtmlMapper {
+        public String mapSafeElement(String name) {
+            return HtmlParser.this.mapSafeElement(name);
+        }
+
+        public boolean isDiscardElement(String name) {
+            return HtmlParser.this.isDiscardElement(name);
+        }
+
+        public String mapSafeAttribute(String elementName, String attributeName) {
+            return HtmlParser.this.mapSafeAttribute(elementName, attributeName);
+        }
+    }
+
+    /**
+     * Whether or not to extract the contents of script elements.
+     * Default is <code>false</code>.
+     *
+     * @param extractScripts whether to emit script contents as embedded (MACRO) documents
+     */
+    @Field
+    public void setExtractScripts(boolean extractScripts) {
+        this.extractScripts = extractScripts;
+    }
+
+    public boolean getExtractScripts() {
+        return extractScripts;
+    }
+
+}
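
A minimal end-to-end sketch of the parser above, not part of this commit. The file name page.html is hypothetical, the usual Tika imports (Metadata, ParseContext, BodyContentHandler) are assumed, and the snippet belongs in a method that declares IOException, SAXException and TikaException.

    HtmlParser parser = new HtmlParser();
    parser.setExtractScripts(true); // also emit <script> bodies as MACRO embedded documents

    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();
    BodyContentHandler handler = new BodyContentHandler(-1); // -1 disables the write limit

    try (InputStream stream = Files.newInputStream(Paths.get("page.html"))) {
        parser.parse(stream, handler, metadata, context);
    }

    System.out.println(metadata.get(Metadata.CONTENT_TYPE)); // e.g. text/html; charset=...
    System.out.println(handler.toString());                  // extracted plain text
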
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java
new file mode 100644
index 0000000..4d4c7c2
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html.charsetdetector;
+
+
+import org.apache.tika.parser.html.charsetdetector.charsets.ReplacementCharset;
+import org.apache.tika.parser.html.charsetdetector.charsets.XUserDefinedCharset;
+
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.StandardCharsets;
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+
+/**
+ * Utility class that associates standard (WHATWG) charset labels with Java charset implementations.
+ * See: https://encoding.spec.whatwg.org/#ref-for-iso-8859-8-i
+ */
+final class CharsetAliases {
+
+    private static final Map<String, Charset> charsetsByLabel = new HashMap<>();
+
+    private CharsetAliases() {
+    }
+
+    /**
+     * @param label a charset name
+     * @return the corresponding java charset, if there is one. Otherwise, null
+     */
+    static Charset getCharsetByLabel(String label) {
+        if (label == null) return null;
+        synchronized (charsetsByLabel) {
+            // Lazy initialization
+            if (charsetsByLabel.isEmpty()) addAll();
+        }
+        label = label.trim().toLowerCase(Locale.US);
+        return charsetsByLabel.get(label);
+    }
+
+    private static void addAll() {
+        addCharset(charset("Big5"), "big5", "big5-hkscs", "cn-big5", "csbig5", "x-x-big5");
+        addCharset(charset("EUC-JP"), "cseucpkdfmtjapanese", "euc-jp", "x-euc-jp");
+        addCharset(charset("EUC-KR"), "cseuckr", "csksc56011987", "euc-kr", "iso-ir-149", "korean",
+                "ks_c_5601-1987", "ks_c_5601-1989", "ksc5601", "ksc_5601", "windows-949");
+        addCharset(charset("GBK"), "chinese", "csgb2312", "csiso58gb231280", "gb2312", "gb_2312",
+                "gb_2312-80", "gbk", "iso-ir-58", "x-gbk");
+        addCharset(charset("IBM866"), "866", "cp866", "csibm866", "ibm866");
+        addCharset(charset("ISO-2022-JP"), "csiso2022jp", "iso-2022-jp");
+        addCharset(charset("ISO-8859-10", "ISO-8859-4"), "csisolatin6", "iso-8859-10", "iso-ir-157",
+                "iso8859-10", "iso885910", "l6", "latin6");
+        addCharset(charset("ISO-8859-13"), "iso-8859-13", "iso8859-13", "iso885913");
+        addCharset(charset("ISO-8859-14", "ISO-8859-1"), "iso-8859-14", "iso8859-14", "iso885914");
+        addCharset(charset("ISO-8859-15"), "csisolatin9", "iso-8859-15", "iso8859-15", "iso885915",
+                "iso_8859-15", "l9");
+        addCharset(charset("ISO-8859-16", "ISO-8859-1"), "iso-8859-16");
+        addCharset(charset("ISO-8859-2"), "csisolatin2", "iso-8859-2", "iso-ir-101", "iso8859-2",
+                "iso88592", "iso_8859-2", "iso_8859-2:1987", "l2", "latin2");
+        addCharset(charset("ISO-8859-3"), "csisolatin3", "iso-8859-3", "iso-ir-109", "iso8859-3",
+                "iso88593", "iso_8859-3", "iso_8859-3:1988", "l3", "latin3");
+        addCharset(charset("ISO-8859-4"), "csisolatin4", "iso-8859-4", "iso-ir-110", "iso8859-4",
+                "iso88594", "iso_8859-4", "iso_8859-4:1988", "l4", "latin4");
+        addCharset(charset("ISO-8859-5"), "csisolatincyrillic", "cyrillic", "iso-8859-5", "iso-ir-144",
+                "iso8859-5", "iso88595", "iso_8859-5", "iso_8859-5:1988");
+        addCharset(charset("ISO-8859-6"), "arabic", "asmo-708", "csiso88596e", "csiso88596i",
+                "csisolatinarabic", "ecma-114", "iso-8859-6", "iso-8859-6-e", "iso-8859-6-i", "iso-ir-127",
+                "iso8859-6", "iso88596", "iso_8859-6", "iso_8859-6:1987");
+        addCharset(charset("ISO-8859-7"), "csisolatingreek", "ecma-118", "elot_928", "greek", "greek8",
+                "iso-8859-7", "iso-ir-126", "iso8859-7", "iso88597", "iso_8859-7", "iso_8859-7:1987", "sun_eu_greek");
+        // ISO-8859-8 actually should have an influence on the layout direction
+        // (text should be decoded in the visual order). However, this is not implemented in Tika.
+        addCharset(charset("ISO-8859-8"), "csiso88598e", "csisolatinhebrew", "hebrew", "iso-8859-8",
+                "iso-8859-8-e", "iso-ir-138", "iso8859-8", "iso88598", "iso_8859-8", "iso_8859-8:1988", "visual");
+        addCharset(charset("ISO-8859-8-I", "ISO-8859-8"), "csiso88598i", "iso-8859-8-i", "logical");
+        addCharset(charset("KOI8-R"), "cskoi8r", "koi", "koi8", "koi8-r", "koi8_r");
+        addCharset(charset("KOI8-U"), "koi8-ru", "koi8-u");
+        addCharset(charset("Shift_JIS"), "csshiftjis", "ms932", "ms_kanji", "shift-jis", "shift_jis",
+                "sjis", "windows-31j", "x-sjis");
+        addCharset(charset("UTF-16BE"), "utf-16be");
+        addCharset(charset("UTF-16LE"), "utf-16", "utf-16le");
+        addCharset(charset("UTF-8"), "unicode-1-1-utf-8", "utf-8", "utf8");
+        addCharset(charset("gb18030"), "gb18030");
+        addCharset(charset("windows-1250"), "cp1250", "windows-1250", "x-cp1250");
+        addCharset(charset("windows-1251"), "cp1251", "windows-1251", "x-cp1251");
+        addCharset(charset("windows-1252"), "ansi_x3.4-1968", "ascii", "cp1252", "cp819", "csisolatin1",
+                "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", "iso88591", "iso_8859-1", "iso_8859-1:1987",
+                "l1", "latin1", "us-ascii", "windows-1252", "x-cp1252");
+        addCharset(charset("windows-1253"), "cp1253", "windows-1253", "x-cp1253");
+        addCharset(charset("windows-1254"), "cp1254", "csisolatin5", "iso-8859-9", "iso-ir-148",
+                "iso8859-9", "iso88599", "iso_8859-9", "iso_8859-9:1989", "l5", "latin5", "windows-1254", "x-cp1254");
+        addCharset(charset("windows-1255"), "cp1255", "windows-1255", "x-cp1255");
+        addCharset(charset("windows-1256"), "cp1256", "windows-1256", "x-cp1256");
+        addCharset(charset("windows-1257"), "cp1257", "windows-1257", "x-cp1257");
+        addCharset(charset("windows-1258"), "cp1258", "windows-1258", "x-cp1258");
+        addCharset(charset("windows-874"), "dos-874", "iso-8859-11", "iso8859-11", "iso885911",
+                "tis-620", "windows-874");
+        addCharset(charset("x-MacCyrillic"), "x-mac-cyrillic", "x-mac-ukrainian");
+        addCharset(charset("x-MacRoman"), "csmacintosh", "mac", "macintosh", "x-mac-roman");
+        // The "replacement" charset is a dummy charset. It is present to mitigate wrong-charset attacks
+        addCharset(new ReplacementCharset(), "csiso2022kr", "hz-gb-2312", "iso-2022-cn", "iso-2022-cn-ext",
+                "iso-2022-kr", "replacement");
+        // The x-user-defined charset is not present in java
+        addCharset(new XUserDefinedCharset(), "x-user-defined");
+    }
+
+    /**
+     * @param names jvm charset names
+     * @return the first of the given charsets that exists in the current JVM,
+     * or ISO_8859_1 if none exists
+     */
+    private static Charset charset(String... names) {
+        for (String name : names) {
+            try {
+                return Charset.forName(name);
+            } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {/* pass */}
+        }
+        // ISO-8859-1 is the only extended single-byte charset guaranteed to be present on every Java platform
+        return StandardCharsets.ISO_8859_1;
+    }
+
+    /**
+     * @param charset name of the charset in the JVM
+     * @param names   standard W3C charset names
+     */
+    private static void addCharset(Charset charset, String... names) {
+        for (String name : names) {
+            charsetsByLabel.put(name, charset);
+        }
+    }
+}
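
For illustration (not part of this commit), the effect of the lookup table above, callable only from within this package since the class is package-private; the labels shown are taken from the table itself.

    Charset cs1 = CharsetAliases.getCharsetByLabel("latin1");       // -> windows-1252 (per the table above)
    Charset cs2 = CharsetAliases.getCharsetByLabel("  UTF-8  ");    // labels are trimmed and lower-cased
    Charset cs3 = CharsetAliases.getCharsetByLabel("x-unknown");    // -> null, no such label
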
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetDetectionResult.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetDetectionResult.java
new file mode 100644
index 0000000..0ba3637
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetDetectionResult.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html.charsetdetector;
+
+import java.nio.charset.Charset;
+
+import static java.nio.charset.StandardCharsets.UTF_16BE;
+import static java.nio.charset.StandardCharsets.UTF_16LE;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+
+/**
+ * A detection may either not find a charset, find an invalid charset, or find a valid charset
+ */
+class CharsetDetectionResult {
+    private boolean found = false;
+    private Charset charset = null;
+
+    private CharsetDetectionResult() { /* default result: not found */}
+
+    static CharsetDetectionResult notFound() {
+        return new CharsetDetectionResult();
+    }
+
+    public boolean isFound() {
+        return found;
+    }
+
+    public void find(String charsetName) {
+        this.found = true;
+        charsetName = charsetName.trim();
+        if ("x-user-defined".equals(charsetName)) charsetName = "windows-1252";
+        this.charset = CharsetAliases.getCharsetByLabel(charsetName);
+        // The specification states: If charset is a UTF-16 encoding, then set charset to UTF-8.
+        if (UTF_16LE.equals(charset) || UTF_16BE.equals(charset)) charset = UTF_8;
+    }
+
+    public Charset getCharset() {
+        // The result may be null even when found is true: a charset was specified,
+        // but it is invalid or unknown.
+        return charset;
+    }
+
+    public void setCharset(Charset charset) {
+        this.found = true;
+        this.charset = charset;
+    }
+}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/PreScanner.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/PreScanner.java
new file mode 100644
index 0000000..a00aeb1
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/PreScanner.java
@@ -0,0 +1,270 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html.charsetdetector;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.AbstractMap;
+import java.util.BitSet;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * A scanner meant to detect charset meta tags in a byte stream.
+ * See: https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
+ */
+class PreScanner {
+
+    private static final Pattern CHARSET_PATTERN = Pattern.compile("charset\\s*=\\s*([\"']?)([^\"'\\s;]+)\\1");
+    private static final byte[] COMMENT_START = {(byte) '<', (byte) '!', (byte) '-', (byte) '-'};
+    private static final byte[] COMMENT_END = {(byte) '-', (byte) '-', (byte) '>'};
+    private static final byte[] META_TAG_START = {(byte) '<', (byte) 'm', (byte) 'e', (byte) 't', (byte) 'a'};
+    private static final byte SLASH = (byte) '/';
+    private static final byte EQUAL = (byte) '=';
+    private static final byte TAG_START = (byte) '<';
+    private static final byte TAG_END = (byte) '>';
+    private static final BitSet QUOTE = bitSet('"', '\'');
+
+    private static final BitSet WHITESPACE = bitSet(0x09, 0x0A, 0x0C, 0x0D, 0x20);
+    private static final BitSet SPACE_OR_TAG_END = bitSet(WHITESPACE, TAG_END);
+    private static final BitSet SPACE_OR_SLASH = bitSet(WHITESPACE, SLASH);
+    private static final BitSet SPECIAL_TAGS = bitSet('!', '/', '?');
+
+    private static final byte[] UTF8_BOM = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};
+    private static final byte[] UTF16_BE_BOM = {(byte) 0xFE, (byte) 0xFF};
+    private static final byte[] UTF16_LE_BOM = {(byte) 0xFF, (byte) 0xFE};
+    private static final byte LOWER_A = (byte) 'a';
+    private static final byte LOWER_Z = (byte) 'z';
+    private static final byte UPPER_A = (byte) 'A';
+    private static final byte UPPER_Z = (byte) 'Z';
+    private BufferedInputStream stream;
+    private CharsetDetectionResult detectedCharset = CharsetDetectionResult.notFound();
+
+    PreScanner(InputStream inputStream) {
+        this.stream = new BufferedInputStream(inputStream);
+    }
+
+    private static BitSet bitSet(int... bs) {
+        BitSet bitSet = new BitSet(0xFF);
+        for (int b : bs) bitSet.set(b);
+        return bitSet;
+    }
+
+    private static BitSet bitSet(BitSet base, int... bs) {
+        BitSet bitSet = (BitSet) base.clone();
+        for (int b : bs) bitSet.set(b);
+        return bitSet;
+    }
+
+    static String getEncodingFromMeta(String attributeValue) {
+        Matcher matcher = CHARSET_PATTERN.matcher(attributeValue);
+        if (!matcher.find()) return null;
+        return matcher.group(2);
+    }
+
+    private static boolean contains(BitSet bitSet, byte b) {
+        return bitSet.get(b & 0xFF);
+    }
+
+    Charset scan() {
+        while (processAtLeastOneByte()) {
+            if (detectedCharset.isFound()) {
+                return detectedCharset.getCharset();
+            }
+        }
+        return null;
+    }
+
+    Charset detectBOM() {
+        try {
+            if (expect(UTF8_BOM)) return StandardCharsets.UTF_8;
+            else if (expect(UTF16_BE_BOM)) return StandardCharsets.UTF_16BE;
+            else if (expect(UTF16_LE_BOM)) return StandardCharsets.UTF_16LE;
+        } catch (IOException e) { /* stream could not be read, also return null */ }
+        return null;
+    }
+
+    private boolean processAtLeastOneByte() {
+        try {
+            return processComment() ||
+                    processMeta() ||
+                    processTag() ||
+                    processSpecialTag() ||
+                    processAny();
+        } catch (IOException e) {
+            return false;
+        }
+    }
+
+    private boolean processAny() throws IOException {
+        int read = stream.read();
+        return read != -1;
+    }
+
+    private boolean processTag() throws IOException {
+        stream.mark(3);
+        if (read() == TAG_START) {
+            int read = stream.read();
+            if (read == SLASH) read = stream.read();
+            if ((LOWER_A <= read && read <= LOWER_Z) ||
+                    (UPPER_A <= read && read <= UPPER_Z)) {
+                do stream.mark(1);
+                while (!contains(SPACE_OR_TAG_END, read()));
+                stream.reset();
+                while (getAttribute() != null) {/* ignore the attribute*/}
+                return true;
+            }
+        }
+        stream.reset();
+        return false;
+    }
+
+    private boolean processSpecialTag() throws IOException {
+        stream.mark(2);
+        if (read() == TAG_START && contains(SPECIAL_TAGS, read())) {
+            skipUntil(TAG_END);
+            return true;
+        }
+        stream.reset();
+        return false;
+    }
+
+    private boolean processMeta() throws IOException {
+        stream.mark(6); // len("<meta ") == 6
+        if (readCaseInsensitive(META_TAG_START) && contains(SPACE_OR_SLASH, read())) {
+            MetaProcessor metaProcessor = new MetaProcessor();
+            for (Map.Entry<String, String> attribute = getAttribute(); attribute != null; attribute = getAttribute()) {
+                metaProcessor.processAttribute(attribute);
+            }
+            metaProcessor.updateDetectedCharset(detectedCharset);
+            return true;
+        }
+        stream.reset();
+        return false;
+    }
+
+    /**
+     * Read an attribute from the stream
+     *
+     * @return the attribute as a Map.Entry, where the key is the attribute's name and
+     * the value is the attribute's value. If there is no attribute, return null
+     */
+    private Map.Entry<String, String> getAttribute() throws IOException {
+        String name = getAttributeName();
+        if (name == null) return null;
+
+        if (!expect(EQUAL)) return new AbstractMap.SimpleEntry<>(name, "");
+        skipAll(WHITESPACE);
+
+        String value = getAttributeValue();
+        return new AbstractMap.SimpleEntry<>(name, value);
+    }
+
+    private String getAttributeName() throws IOException {
+        skipAll(SPACE_OR_SLASH);
+        if (expect(TAG_END)) return null;
+        StringBuilder name = new StringBuilder();
+        while (!(peek() == EQUAL && name.length() > 0) &&
+                !(peek() == TAG_END || peek() == SLASH) &&
+                !skipAll(WHITESPACE)) {
+            name.append((char) getLowerCaseChar());
+        }
+        return name.toString();
+    }
+
+    private String getAttributeValue() throws IOException {
+        StringBuilder value = new StringBuilder();
+        stream.mark(1);
+        byte quote = read();
+        if (contains(QUOTE, quote)) {
+            for (byte b = getLowerCaseChar(); b != quote; b = getLowerCaseChar()) {
+                value.append((char) b);
+            }
+        } else {
+            stream.reset();
+            for (byte b = getLowerCaseChar(); !contains(SPACE_OR_TAG_END, b); b = getLowerCaseChar()) {
+                value.append((char) b);
+                stream.mark(1);
+            }
+            stream.reset(); // unread the space or tag end
+        }
+        return value.toString();
+    }
+
+    private boolean skipAll(BitSet bitSet) throws IOException {
+        boolean skipped = false;
+        stream.mark(1);
+        for (byte read = read(); contains(bitSet, read); read = read()) {
+            skipped = true;
+            stream.mark(1);
+        }
+        stream.reset();
+        return skipped;
+    }
+
+    private byte getLowerCaseChar() throws IOException {
+        byte nextPoint = read();
+        if (nextPoint >= 'A' && nextPoint <= 'Z') nextPoint += 0x20; // lowercase
+        return nextPoint;
+    }
+
+    private boolean processComment() throws IOException {
+        if (!expect(COMMENT_START)) return false;
+        if (!expect(TAG_END)) skipUntil(COMMENT_END);
+        return true;
+    }
+
+    private boolean expect(byte... expected) throws IOException {
+        stream.mark(expected.length);
+        for (byte b : expected) {
+            byte read = read();
+            if (read != b) {
+                stream.reset();
+                return false;
+            }
+        }
+        return true;
+    }
+
+    private void skipUntil(byte... expected) throws IOException {
+        while (!expect(expected)) {
+            if (stream.read() == -1) return;
+        }
+    }
+
+    private boolean readCaseInsensitive(byte... bs) throws IOException {
+        for (byte b : bs) if (getLowerCaseChar() != b) return false;
+        return true;
+    }
+
+    private byte read() throws IOException {
+        int r = stream.read();
+        if (r == -1) throw new IOException();
+        return (byte) r;
+    }
+
+    private byte peek() throws IOException {
+        stream.mark(1);
+        byte b = read();
+        stream.reset();
+        return b;
+    }
+}
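
A sketch of the prescan in action, not part of this commit. Because the class is package-private it is only callable from within the charsetdetector package; java.io and java.nio imports are assumed, and the exact meta handling depends on the MetaProcessor class elsewhere in this commit.

    byte[] html = "<!doctype html><head><meta charset='windows-1252'></head>"
            .getBytes(StandardCharsets.US_ASCII);
    PreScanner preScanner = new PreScanner(new ByteArrayInputStream(html));

    Charset fromBom = preScanner.detectBOM();  // null here: no byte-order mark at the start
    Charset declared = preScanner.scan();      // should resolve to windows-1252 via the <meta> tag
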
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/charsets/ReplacementCharset.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/charsets/ReplacementCharset.java
new file mode 100644
index 0000000..32b96cf
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/charsets/ReplacementCharset.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html.charsetdetector.charsets;
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CoderResult;
+
+/**
+ * An implementation of the standard "replacement" charset defined by the WHATWG Encoding Standard.
+ * See: https://encoding.spec.whatwg.org/#replacement
+ */
+public class ReplacementCharset extends Charset {
+
+    public ReplacementCharset() {
+        super("replacement", null);
+    }
+
+    @Override
+    public boolean contains(Charset cs) {
+        return cs.equals(this);
+    }
+
+    public CharsetDecoder newDecoder() {
+        return new CharsetDecoder(this, Float.MIN_VALUE, 1) {
+            private boolean replacementErrorReturned = false;
+
+            @Override
+            protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
+                if (in.hasRemaining() && !replacementErrorReturned) {
+                    replacementErrorReturned = true;
+                    return CoderResult.malformedForLength(in.remaining());
+                }
+                in.position(in.limit());
+                return CoderResult.UNDERFLOW;
+            }
+
+            @Override
+            protected void implReset() {
+                replacementErrorReturned = false;
+            }
+        };
+    }
+
+    public CharsetEncoder newEncoder() {
+        throw new UnsupportedOperationException("This charset does not support encoding");
+    }
+}
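
To make the decoder's contract concrete, a sketch (not part of this commit, assuming java.nio.charset imports and a method that declares CharacterCodingException): any non-empty input should decode to a single replacement character, because decodeLoop() reports one malformed error spanning the whole input and then signals underflow.

    CharsetDecoder decoder = new ReplacementCharset().newDecoder()
            .onMalformedInput(CodingErrorAction.REPLACE)
            .onUnmappableCharacter(CodingErrorAction.REPLACE);

    CharBuffer decoded = decoder.decode(ByteBuffer.wrap(new byte[]{0x48, 0x65, 0x6C, 0x6C, 0x6F}));
    System.out.println(decoded.length()); // 1: a lone U+FFFD, regardless of the input bytes
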
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
new file mode 100644
index 0000000..2d7b38c
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
@@ -0,0 +1,15 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+org.apache.tika.parser.html.HtmlEncodingDetector
\ No newline at end of file
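
The service file above is what makes the detector discoverable without explicit wiring. A sketch of the lookup, not part of this commit; it assumes this module's jar is on the classpath and the usual imports (java.util.ServiceLoader, org.apache.tika.detect.EncodingDetector).

    ServiceLoader<EncodingDetector> detectors = ServiceLoader.load(EncodingDetector.class);
    for (EncodingDetector detector : detectors) {
        System.out.println(detector.getClass().getName());
        // expected to include org.apache.tika.parser.html.HtmlEncodingDetector
    }
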
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/resources/org/apache/tika/parser/html/StandardCharsets_unsupported_by_IANA.txt b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/resources/org/apache/tika/parser/html/StandardCharsets_unsupported_by_IANA.txt
new file mode 100644
index 0000000..956c6e0
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/resources/org/apache/tika/parser/html/StandardCharsets_unsupported_by_IANA.txt
@@ -0,0 +1,139 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+646
+737
+775
+813
+819
+858
+874
+8859_1
+8859_13
+8859_15
+8859_2
+8859_4
+8859_5
+8859_7
+8859_9
+912
+914
+915
+920
+923
+ansi-1251
+ascii
+ascii7
+cesu8
+cp1250
+cp1251
+cp1252
+cp1253
+cp1254
+cp1257
+cp5346
+cp5347
+cp5348
+cp5349
+cp5350
+cp5353
+cp737
+cp813
+cp858
+cp874
+cp912
+cp914
+cp915
+cp920
+cp923
+csibm862
+csisolatin0
+csisolatin9
+cspcp855
+default
+ibm-437
+ibm-737
+ibm-775
+ibm-813
+ibm-819
+ibm-850
+ibm-852
+ibm-855
+ibm-857
+ibm-862
+ibm-866
+ibm-874
+ibm-912
+ibm-914
+ibm-915
+ibm-920
+ibm-923
+ibm737
+ibm813
+ibm874
+ibm912
+ibm914
+ibm915
+ibm920
+ibm923
+iso8859-1
+iso8859-13
+iso8859-15
+iso8859-2
+iso8859-4
+iso8859-5
+iso8859-7
+iso8859-9
+iso8859_1
+iso8859_13
+iso8859_15
+iso8859_15_fdis
+iso8859_2
+iso8859_4
+iso8859_5
+iso8859_7
+iso8859_9
+iso_8859-13
+iso_8859_1
+koi8
+koi8_r
+koi8_u
+l9
+latin0
+latin9
+sun_eu_greek
+unicode
+unicode-1-1-utf-8
+unicodebig
+unicodebigunmarked
+unicodelittle
+unicodelittleunmarked
+utf-32be-bom
+utf-32le-bom
+utf16
+utf32
+utf8
+utf_16
+utf_16be
+utf_16le
+utf_32
+utf_32be
+utf_32be_bom
+utf_32le
+utf_32le_bom
+windows-437
+x-utf-16be
+x-utf-16le
+x-utf-32be
+x-utf-32le
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/DataURISchemeParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/DataURISchemeParserTest.java
new file mode 100644
index 0000000..adc7f53
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/DataURISchemeParserTest.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.html;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.TikaTest;
+import org.apache.tika.mime.MediaType;
+import org.junit.Test;
+
+import java.io.ByteArrayOutputStream;
+import java.nio.charset.StandardCharsets;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+public class DataURISchemeParserTest extends TikaTest {
+    DataURISchemeUtil dataURISchemeUtil = new DataURISchemeUtil();
+
+    @Test
+    public void testEmpty() throws Exception {
+        DataURIScheme dataURIScheme = dataURISchemeUtil.parse("data:,");
+        assertFalse(dataURIScheme.isBase64());
+        assertNull(dataURIScheme.getMediaType());
+        assertEquals(-1, dataURIScheme.getInputStream().read());
+    }
+
+    @Test
+    public void testNewlines() throws Exception {
+        String data = "data:image/png;base64,R0lG\nODdh";
+        DataURIScheme dataURIScheme = dataURISchemeUtil.parse(data);
+        assertTrue(dataURIScheme.isBase64());
+        assertEquals(MediaType.image("png"), dataURIScheme.getMediaType());
+
+        String expected = "data:image/png;base64,R0lGODdh";
+        assertEquals(dataURISchemeUtil.parse(expected), dataURISchemeUtil.parse(data));
+
+    }
+
+    @Test
+    public void testBackslashNewlines() throws Exception {
+        //like you'd have in a css fragment
+        String data = "data:image/png;base64,R0lG\\\nODdh";
+        DataURIScheme dataURIScheme = dataURISchemeUtil.parse(data);
+        assertTrue(dataURIScheme.isBase64());
+        assertEquals(MediaType.image("png"), dataURIScheme.getMediaType());
+
+        String expected = "data:image/png;base64,R0lGODdh";
+        assertEquals(dataURISchemeUtil.parse(expected), dataURISchemeUtil.parse(data));
+    }
+
+    @Test
+    public void testUTF8() throws Exception {
+        String utf8 = "\u0628\u0631\u0646\u0633\u062A\u0648\u0646";
+        String data = "data:text/plain;charset=UTF-8;page=21,the%20data:"+utf8;
+        DataURIScheme dataURIScheme = dataURISchemeUtil.parse(data);
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        IOUtils.copy(dataURIScheme.getInputStream(), bos);
+        assertContains(utf8, new String(bos.toByteArray(), StandardCharsets.UTF_8));
+    }
+}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java
new file mode 100644
index 0000000..38d351f
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java
@@ -0,0 +1,378 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.html;
+
+
+import org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.html.charsetdetector.charsets.ReplacementCharset;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.*;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+
+public class StandardHtmlEncodingDetectorTest {
+    private Metadata metadata = new Metadata();
+
+    @Before
+    public void setUp() {
+        this.metadata = new Metadata();
+    }
+
+    @Test
+    public void basic() throws IOException {
+        assertWindows1252("<meta charset=WINDOWS-1252>");
+    }
+
+    @Test
+    public void quoted() throws IOException {
+        assertWindows1252("<meta charset='WINDOWS-1252'>");
+    }
+
+    @Test
+    public void duplicateMeta() throws IOException {
+        assertWindows1252("<meta charset='WINDOWS-1252'>" +
+                "<meta charset='UTF-8'>");
+    }
+
+    @Test
+    public void duplicateAttribute() throws IOException {
+        assertWindows1252("<meta charset='WINDOWS-1252' charset='UTF-8'>");
+    }
+
+    @Test
+    public void invalidThenValid() throws IOException {
+        assertCharset("<meta charset=blah>" +
+                "<meta charset=WINDOWS-1252>", null);
+    }
+
+    @Test
+    public void spacesInAttributes() throws IOException {
+        assertWindows1252("<meta charset\u000C=  \t  WINDOWS-1252>");
+    }
+
+    @Test
+    public void httpEquiv() throws IOException {
+        assertWindows1252("<meta " +
+                "http-equiv='content-type' " +
+                "content='text/html; charset=\"WINDOWS-1252\"'>"); // quotes around the charset are allowed
+        assertWindows1252("<meta " +
+                "content=' charset  =  WINDOWS-1252' " + // The charset may be anywhere in the content attribute
+                "http-equiv='content-type' >");
+    }
+
+    @Test
+    public void emptyAttributeEnd() throws IOException {
+        assertWindows1252("<meta charset=WINDOWS-1252 a>");
+    }
+
+    @Test
+    public void httpEquivDuplicateCharset() throws IOException {
+        assertWindows1252("<meta " +
+                "http-equiv='content-type' " +
+                "content='charset=WINDOWS-1252;" + // The detection should stop after the semicolon
+                "charset=UTF-8'>");
+    }
+
+    @Test
+    public void htmlFragment() throws IOException {
+        assertWindows1252("<!doctype html><html class=nojs><head><meta charset='WINDOWS-1252'>");
+    }
+
+    @Test
+    public void veryBadHtml() throws IOException {
+        // check that the parser is not confused by garbage before the declaration
+        assertWindows1252("<< l \" == / '=x\n >" +
+                "<!--> " +
+                "< <x'/ <=> " +
+                "<meta/>" +
+                "<meta>" +
+                "<a x/>" +
+                "<meta charset='WINDOWS-1252'>");
+    }
+
+    @Test
+    public void specialTag() throws IOException {
+        // special tags cannot have arguments, any '>' ends them
+        assertWindows1252("<? x='><meta charset='WINDOWS-1252'>");
+    }
+
... 65965 lines suppressed ...