You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/03/18 13:47:23 UTC
[tika] branch main updated: TIKA-3324 -- code cleanup for checkstyle in tika-parsers-classic
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new d93ba62 TIKA-3324 -- code cleanup for checkstyle in tika-parsers-classic
d93ba62 is described below
commit d93ba627d1cb55d335640870795b4206a2f8e750
Author: tallison <ta...@apache.org>
AuthorDate: Thu Mar 18 09:46:18 2021 -0400
TIKA-3324 -- code cleanup for checkstyle in tika-parsers-classic
---
tika-parent/checkstyle.xml | 7 +
tika-parsers/tika-parsers-classic/pom.xml | 101 +-
.../apache/tika/detect/apple/BPListDetector.java | 66 +-
.../apache/tika/detect/apple/IWorkDetector.java | 18 +-
.../tika/parser/apple/AppleSingleFileParser.java | 60 +-
.../org/apache/tika/parser/apple/PListParser.java | 92 +-
.../tika/parser/iwork/AutoPageNumberUtils.java | 146 +-
.../tika/parser/iwork/IWorkPackageParser.java | 271 ++-
.../tika/parser/iwork/KeynoteContentHandler.java | 36 +-
.../tika/parser/iwork/NumbersContentHandler.java | 16 +-
.../tika/parser/iwork/PagesContentHandler.java | 436 ++---
.../parser/iwork/iwana/IWork13PackageParser.java | 198 +--
.../parser/iwork/iwana/IWork18PackageParser.java | 180 +-
.../apache/tika/parser/apple/PListParserTest.java | 11 +-
.../tika/parser/iwork/AutoPageNumberUtilsTest.java | 85 +-
.../apache/tika/parser/iwork/IWorkParserTest.java | 141 +-
.../tika/parser/iwork/iwana/IWork13ParserTest.java | 23 +-
.../org/apache/tika/parser/audio/AudioParser.java | 32 +-
.../org/apache/tika/parser/audio/MidiParser.java | 41 +-
.../org/apache/tika/parser/mp3/AudioFrame.java | 239 +--
.../java/org/apache/tika/parser/mp3/ID3Tags.java | 294 ++--
.../org/apache/tika/parser/mp3/ID3v1Handler.java | 103 +-
.../org/apache/tika/parser/mp3/ID3v22Handler.java | 71 +-
.../org/apache/tika/parser/mp3/ID3v23Handler.java | 31 +-
.../org/apache/tika/parser/mp3/ID3v24Handler.java | 35 +-
.../org/apache/tika/parser/mp3/ID3v2Frame.java | 418 +++--
.../org/apache/tika/parser/mp3/LyricsHandler.java | 82 +-
.../java/org/apache/tika/parser/mp3/MP3Frame.java | 2 +-
.../java/org/apache/tika/parser/mp3/Mp3Parser.java | 200 ++-
.../org/apache/tika/parser/mp3/MpegStream.java | 445 +++--
.../apache/tika/parser/mp4/ISO6709Extractor.java | 26 +-
.../java/org/apache/tika/parser/mp4/MP4Parser.java | 167 +-
.../org/apache/tika/parser/video/FLVParser.java | 81 +-
.../apache/tika/parser/audio/AudioParserTest.java | 15 +-
.../apache/tika/parser/audio/MidiParserTest.java | 9 +-
.../org/apache/tika/parser/mp3/Mp3ParserTest.java | 151 +-
.../org/apache/tika/parser/mp3/MpegStreamTest.java | 93 +-
.../org/apache/tika/parser/mp4/MP4ParserTest.java | 30 +-
.../apache/tika/parser/video/FLVParserTest.java | 7 +-
.../java/org/apache/tika/parser/dwg/DWGParser.java | 336 ++--
.../java/org/apache/tika/parser/prt/PRTParser.java | 413 ++---
.../org/apache/tika/parser/dwg/DWGParserTest.java | 99 +-
.../org/apache/tika/parser/prt/PRTParserTest.java | 135 +-
.../org/apache/tika/parser/asm/ClassParser.java | 17 +-
.../apache/tika/parser/asm/XHTMLClassVisitor.java | 45 +-
.../apache/tika/parser/code/SourceCodeParser.java | 36 +-
.../tika/parser/executable/ExecutableParser.java | 656 +++----
.../java/org/apache/tika/parser/mat/MatParser.java | 44 +-
.../org/apache/tika/parser/sas/SAS7BDATParser.java | 60 +-
.../apache/tika/parser/asm/ClassParserTest.java | 28 +-
.../tika/parser/code/SourceCodeParserTest.java | 56 +-
.../parser/executable/ExecutableParserTest.java | 32 +-
.../org/apache/tika/parser/mat/MatParserTest.java | 3 +-
.../apache/tika/parser/sas/SAS7BDATParserTest.java | 37 +-
.../org/apache/tika/parser/crypto/Pkcs7Parser.java | 41 +-
.../org/apache/tika/parser/crypto/TSDParser.java | 255 +--
.../apache/tika/parser/crypto/Pkcs7ParserTest.java | 3 +-
.../apache/tika/parser/crypto/TSDParserTest.java | 16 +-
.../parser/digestutils/BouncyCastleDigester.java | 13 +-
.../tika/parser/digestutils/CommonsDigester.java | 69 +-
.../tika/parser/font/AdobeFontMetricParser.java | 167 +-
.../apache/tika/parser/font/TrueTypeParser.java | 27 +-
.../apache/tika/parser/font/FontParsersTest.java | 28 +-
.../sax/boilerpipe/BoilerpipeContentHandler.java | 46 +-
.../org/apache/tika/parser/html/DataURIScheme.java | 13 +-
.../parser/html/DataURISchemeParseException.java | 4 +-
.../apache/tika/parser/html/DataURISchemeUtil.java | 14 +-
.../apache/tika/parser/html/DefaultHtmlMapper.java | 122 +-
.../tika/parser/html/HtmlEncodingDetector.java | 66 +-
.../org/apache/tika/parser/html/HtmlHandler.java | 104 +-
.../org/apache/tika/parser/html/HtmlParser.java | 83 +-
.../tika/parser/html/XHTMLDowngradeHandler.java | 20 +-
.../html/charsetdetector/CharsetAliases.java | 55 +-
.../charsetdetector/CharsetDetectionResult.java | 12 +-
.../parser/html/charsetdetector/MetaProcessor.java | 18 +-
.../parser/html/charsetdetector/PreScanner.java | 83 +-
.../StandardHtmlEncodingDetector.java | 28 +-
.../charsets/XUserDefinedCharset.java | 8 +-
.../tika/parser/html/DataURISchemeParserTest.java | 19 +-
.../tika/parser/html/HtmlEncodingDetectorTest.java | 60 +-
.../apache/tika/parser/html/HtmlParserTest.java | 705 ++++----
.../html/StandardHtmlEncodingDetectorTest.java | 139 +-
.../tika/parser/image/AbstractImageParser.java | 46 +-
.../org/apache/tika/parser/image/BPGParser.java | 30 +-
.../org/apache/tika/parser/image/HeifParser.java | 33 +-
.../org/apache/tika/parser/image/ICNSParser.java | 55 +-
.../org/apache/tika/parser/image/ICNSType.java | 241 ++-
.../tika/parser/image/ImageMetadataExtractor.java | 159 +-
.../org/apache/tika/parser/image/ImageParser.java | 63 +-
.../org/apache/tika/parser/image/JpegParser.java | 12 +-
.../apache/tika/parser/image/MetadataFields.java | 5 +-
.../org/apache/tika/parser/image/PSDParser.java | 43 +-
.../org/apache/tika/parser/image/TiffParser.java | 11 +-
.../org/apache/tika/parser/image/WebPParser.java | 11 +-
.../apache/tika/parser/image/HeifParserTest.java | 16 +-
.../apache/tika/parser/image/ICNSParserTest.java | 33 +-
.../parser/image/ImageMetadataExtractorTest.java | 17 +-
.../apache/tika/parser/image/ImageParserTest.java | 110 +-
.../apache/tika/parser/image/JpegParserTest.java | 76 +-
.../apache/tika/parser/image/PSDParserTest.java | 13 +-
.../apache/tika/parser/image/WebPParserTest.java | 3 +-
.../apache/tika/parser/jdbc/AbstractDBParser.java | 33 +-
.../apache/tika/parser/jdbc/JDBCTableReader.java | 79 +-
.../apache/tika/parser/mailcommons/MailUtil.java | 9 +-
.../tika/parser/mailcommons/MailUtilTest.java | 11 +-
.../tika/parser/mail/MailContentHandler.java | 207 ++-
.../org/apache/tika/parser/mail/RFC822Parser.java | 30 +-
.../org/apache/tika/parser/mbox/MboxParser.java | 36 +-
.../apache/tika/parser/mail/RFC822ParserTest.java | 175 +-
.../apache/tika/parser/mbox/MboxParserTest.java | 25 +-
.../detect/microsoft/POIFSContainerDetector.java | 87 +-
.../detect/microsoft/ooxml/OPCPackageDetector.java | 222 ++-
.../microsoft/MSEmbeddedStreamTranslator.java | 24 +-
.../tika/parser/microsoft/AbstractListManager.java | 23 +-
.../parser/microsoft/AbstractOfficeParser.java | 61 +-
.../parser/microsoft/AbstractPOIFSExtractor.java | 57 +-
.../org/apache/tika/parser/microsoft/Cell.java | 3 +-
.../tika/parser/microsoft/CellDecorator.java | 3 +-
.../apache/tika/parser/microsoft/EMFParser.java | 78 +-
.../tika/parser/microsoft/ExcelExtractor.java | 142 +-
.../tika/parser/microsoft/FormattingUtils.java | 23 +-
.../tika/parser/microsoft/HSLFExtractor.java | 124 +-
.../tika/parser/microsoft/JackcessExtractor.java | 77 +-
.../tika/parser/microsoft/JackcessParser.java | 33 +-
.../apache/tika/parser/microsoft/LinkedCell.java | 3 +-
.../apache/tika/parser/microsoft/ListManager.java | 33 +-
.../tika/parser/microsoft/MSOwnerFileParser.java | 37 +-
.../apache/tika/parser/microsoft/NumberCell.java | 3 +-
.../apache/tika/parser/microsoft/OfficeParser.java | 146 +-
.../tika/parser/microsoft/OfficeParserConfig.java | 69 +-
.../tika/parser/microsoft/OldExcelParser.java | 23 +-
.../tika/parser/microsoft/OutlookExtractor.java | 361 ++--
.../tika/parser/microsoft/SummaryExtractor.java | 68 +-
.../apache/tika/parser/microsoft/TNEFParser.java | 44 +-
.../org/apache/tika/parser/microsoft/TextCell.java | 3 +-
.../parser/microsoft/TikaExcelDataFormatter.java | 11 +-
.../parser/microsoft/TikaExcelGeneralFormat.java | 2 +-
.../apache/tika/parser/microsoft/WMFParser.java | 24 +-
.../tika/parser/microsoft/WordExtractor.java | 98 +-
.../tika/parser/microsoft/chm/ChmAccessor.java | 10 +-
.../tika/parser/microsoft/chm/ChmAssert.java | 139 +-
.../tika/parser/microsoft/chm/ChmBlockInfo.java | 103 +-
.../tika/parser/microsoft/chm/ChmCommons.java | 293 ++--
.../tika/parser/microsoft/chm/ChmConstants.java | 54 +-
.../microsoft/chm/ChmDirectoryListingSet.java | 234 ++-
.../tika/parser/microsoft/chm/ChmExtractor.java | 284 +--
.../tika/parser/microsoft/chm/ChmItsfHeader.java | 192 ++-
.../tika/parser/microsoft/chm/ChmItspHeader.java | 271 ++-
.../tika/parser/microsoft/chm/ChmLzxBlock.java | 455 +++--
.../tika/parser/microsoft/chm/ChmLzxState.java | 262 ++-
.../parser/microsoft/chm/ChmLzxcControlData.java | 147 +-
.../parser/microsoft/chm/ChmLzxcResetTable.java | 129 +-
.../tika/parser/microsoft/chm/ChmParser.java | 39 +-
.../tika/parser/microsoft/chm/ChmPmgiHeader.java | 51 +-
.../tika/parser/microsoft/chm/ChmPmglHeader.java | 62 +-
.../tika/parser/microsoft/chm/ChmSection.java | 61 +-
.../tika/parser/microsoft/chm/ChmWrapper.java | 12 +-
.../microsoft/chm/DirectoryListingEntry.java | 41 +-
.../tika/parser/microsoft/onenote/CompactID.java | 9 +-
.../tika/parser/microsoft/onenote/Error.java | 11 +-
.../parser/microsoft/onenote/ExtendedGUID.java | 11 +-
.../microsoft/onenote/FileChunkReference.java | 20 +-
.../tika/parser/microsoft/onenote/FileNode.java | 102 +-
.../microsoft/onenote/FileNodeListHeader.java | 32 +-
.../tika/parser/microsoft/onenote/FileNodePtr.java | 3 +-
.../parser/microsoft/onenote/FileNodeUnion.java | 33 +-
.../microsoft/onenote/FndStructureConstants.java | 44 +-
.../apache/tika/parser/microsoft/onenote/GUID.java | 65 +-
.../apache/tika/parser/microsoft/onenote/JCID.java | 38 +-
.../microsoft/onenote/JCIDPropertySetTypeEnum.java | 76 +-
.../onenote/ObjectDeclarationWithRefCount.java | 27 +-
.../onenote/ObjectDeclarationWithRefCountBody.java | 3 +-
.../onenote/ObjectSpaceObjectPropSet.java | 12 +-
...ctSpaceObjectStreamOfOIDsOSIDsOrContextIDs.java | 6 +-
.../onenote/OneNoteDirectFileResource.java | 9 +-
.../parser/microsoft/onenote/OneNoteDocument.java | 13 +-
.../parser/microsoft/onenote/OneNoteHeader.java | 18 +-
.../onenote/OneNoteLegacyDumpStrings.java | 47 +-
.../parser/microsoft/onenote/OneNoteParser.java | 149 +-
.../microsoft/onenote/OneNotePropertyEnum.java | 208 +--
.../microsoft/onenote/OneNotePropertyId.java | 7 +-
.../tika/parser/microsoft/onenote/OneNotePtr.java | 515 +++---
.../microsoft/onenote/OneNoteTreeWalker.java | 215 ++-
.../onenote/OneNoteTreeWalkerOptions.java | 14 +-
.../parser/microsoft/onenote/PropertyIDType.java | 7 +-
.../tika/parser/microsoft/onenote/PropertySet.java | 37 +-
.../parser/microsoft/onenote/PropertyValue.java | 20 +-
.../tika/parser/microsoft/onenote/Revision.java | 23 +-
.../microsoft/onenote/RootObjectReference.java | 3 +-
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 168 +-
.../parser/microsoft/ooxml/MetadataExtractor.java | 79 +-
.../parser/microsoft/ooxml/OOXMLExtractor.java | 10 +-
.../microsoft/ooxml/OOXMLExtractorFactory.java | 81 +-
.../tika/parser/microsoft/ooxml/OOXMLParser.java | 60 +-
.../microsoft/ooxml/OOXMLTikaBodyPartHandler.java | 72 +-
.../ooxml/OOXMLWordAndPowerPointTextHandler.java | 110 +-
.../ooxml/POIXMLTextExtractorDecorator.java | 8 +-
.../microsoft/ooxml/ParagraphProperties.java | 18 +-
.../tika/parser/microsoft/ooxml/RunProperties.java | 9 +-
.../ooxml/SXSLFPowerPointExtractorDecorator.java | 258 ++-
.../ooxml/SXWPFWordExtractorDecorator.java | 93 +-
.../ooxml/XSLFPowerPointExtractorDecorator.java | 95 +-
.../ooxml/XSSFBExcelExtractorDecorator.java | 48 +-
.../ooxml/XSSFExcelExtractorDecorator.java | 181 +-
.../parser/microsoft/ooxml/XWPFListManager.java | 21 +-
.../ooxml/XWPFWordExtractorDecorator.java | 124 +-
.../microsoft/ooxml/xps/XPSExtractorDecorator.java | 117 +-
.../microsoft/ooxml/xps/XPSPageContentHandler.java | 98 +-
.../microsoft/ooxml/xps/XPSTextExtractor.java | 7 +-
.../xslf/XSLFEventBasedPowerPointExtractor.java | 16 +-
.../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 63 +-
.../microsoft/ooxml/xwpf/XWPFStylesShim.java | 28 +-
.../ooxml/xwpf/ml2006/AbstractPartHandler.java | 11 +-
.../ooxml/xwpf/ml2006/BinaryDataHandler.java | 12 +-
.../ooxml/xwpf/ml2006/CorePropertiesHandler.java | 17 +-
.../xwpf/ml2006/ExtendedPropertiesHandler.java | 3 +-
.../microsoft/ooxml/xwpf/ml2006/PartHandler.java | 7 +-
.../ooxml/xwpf/ml2006/RelationshipsHandler.java | 5 +-
.../ooxml/xwpf/ml2006/RelationshipsManager.java | 3 +-
.../ooxml/xwpf/ml2006/Word2006MLDocHandler.java | 40 +-
.../ooxml/xwpf/ml2006/Word2006MLParser.java | 21 +-
.../ml2006/WordAndPowerPointTextPartHandler.java | 17 +-
.../parser/microsoft/pst/OutlookPSTParser.java | 62 +-
.../parser/microsoft/rtf/RTFEmbObjHandler.java | 40 +-
.../parser/microsoft/rtf/RTFObjDataParser.java | 62 +-
.../tika/parser/microsoft/rtf/RTFParser.java | 53 +-
.../tika/parser/microsoft/rtf/TextExtractor.java | 93 +-
.../microsoft/xml/AbstractXML2003Parser.java | 41 +-
.../parser/microsoft/xml/HyperlinkHandler.java | 23 +-
.../parser/microsoft/xml/SpreadsheetMLParser.java | 50 +-
.../tika/parser/microsoft/xml/WordMLParser.java | 75 +-
.../AbstractPOIContainerExtractionTest.java | 16 +-
.../tika/parser/microsoft/EMFParserTest.java | 13 +-
.../tika/parser/microsoft/ExcelParserTest.java | 69 +-
.../tika/parser/microsoft/JackcessParserTest.java | 40 +-
.../parser/microsoft/MSOwnerFileParserTest.java | 7 +-
.../tika/parser/microsoft/OfficeParserTest.java | 4 +-
.../tika/parser/microsoft/OldExcelParserTest.java | 13 +-
.../tika/parser/microsoft/OutlookParserTest.java | 83 +-
.../microsoft/POIContainerExtractionTest.java | 19 +-
.../parser/microsoft/PowerPointParserTest.java | 47 +-
.../tika/parser/microsoft/ProjectParserTest.java | 27 +-
.../tika/parser/microsoft/PublisherParserTest.java | 13 +-
.../parser/microsoft/SolidworksParserTest.java | 46 +-
.../tika/parser/microsoft/TNEFParserTest.java | 9 +-
.../tika/parser/microsoft/VisioParserTest.java | 13 +-
.../tika/parser/microsoft/WMFParserTest.java | 9 +-
.../tika/parser/microsoft/WordParserTest.java | 123 +-
.../parser/microsoft/WriteProtectedParserTest.java | 9 +-
.../parser/microsoft/chm/TestChmBlockInfo.java | 50 +-
.../parser/microsoft/chm/TestChmExtraction.java | 161 +-
.../parser/microsoft/chm/TestChmExtractor.java | 16 +-
.../parser/microsoft/chm/TestChmItsfHeader.java | 40 +-
.../parser/microsoft/chm/TestChmItspHeader.java | 60 +-
.../tika/parser/microsoft/chm/TestChmLzxState.java | 37 +-
.../microsoft/chm/TestChmLzxcControlData.java | 54 +-
.../microsoft/chm/TestChmLzxcResetTable.java | 59 +-
.../microsoft/chm/TestDirectoryListingEntry.java | 9 +-
.../tika/parser/microsoft/chm/TestParameters.java | 34 +-
.../tika/parser/microsoft/chm/TestPmglHeader.java | 24 +-
.../microsoft/onenote/OneNoteParserTest.java | 83 +-
.../ooxml/OOXMLContainerExtractionTest.java | 24 +-
.../parser/microsoft/ooxml/OOXMLParserTest.java | 274 ++-
.../parser/microsoft/ooxml/SXSLFExtractorTest.java | 122 +-
.../parser/microsoft/ooxml/SXWPFExtractorTest.java | 97 +-
.../parser/microsoft/ooxml/TruncatedOOXMLTest.java | 41 +-
.../parser/microsoft/ooxml/xps/XPSParserTest.java | 30 +-
.../ooxml/xwpf/ml2006/Word2006MLParserTest.java | 29 +-
.../parser/microsoft/pst/OutlookPSTParserTest.java | 84 +-
.../tika/parser/microsoft/rtf/RTFParserTest.java | 91 +-
.../parser/microsoft/xml/XML2003ParserTest.java | 35 +-
.../apache/tika/detect/ole/MiscOLEDetector.java | 69 +-
.../java/org/apache/tika/parser/dbf/DBFCell.java | 30 +-
.../apache/tika/parser/dbf/DBFColumnHeader.java | 68 +-
.../org/apache/tika/parser/dbf/DBFFileHeader.java | 46 +-
.../java/org/apache/tika/parser/dbf/DBFParser.java | 34 +-
.../java/org/apache/tika/parser/dbf/DBFReader.java | 167 +-
.../java/org/apache/tika/parser/dbf/DBFRow.java | 16 +-
.../apache/tika/parser/dif/DIFContentHandler.java | 241 ++-
.../java/org/apache/tika/parser/dif/DIFParser.java | 85 +-
.../apache/tika/parser/epub/EpubContentParser.java | 29 +-
.../org/apache/tika/parser/epub/EpubParser.java | 124 +-
.../apache/tika/parser/hwp/HwpStreamReader.java | 2 +-
.../apache/tika/parser/hwp/HwpTextExtractorV5.java | 111 +-
.../org/apache/tika/parser/hwp/HwpV5Parser.java | 16 +-
.../apache/tika/parser/mif/MIFContentHandler.java | 17 +-
.../org/apache/tika/parser/mif/MIFExtractor.java | 34 +-
.../java/org/apache/tika/parser/mif/MIFParser.java | 42 +-
.../parser/odf/FlatOpenDocumentMacroHandler.java | 43 +-
.../tika/parser/odf/FlatOpenDocumentParser.java | 101 +-
.../parser/odf/NSNormalizerContentHandler.java | 29 +-
.../tika/parser/odf/OpenDocumentBodyHandler.java | 311 ++--
.../tika/parser/odf/OpenDocumentContentParser.java | 40 +-
.../tika/parser/odf/OpenDocumentMacroHandler.java | 16 +-
.../tika/parser/odf/OpenDocumentMetaParser.java | 101 +-
.../apache/tika/parser/odf/OpenDocumentParser.java | 63 +-
.../tika/parser/wordperfect/QPWTextExtractor.java | 251 +--
.../tika/parser/wordperfect/QuattroProParser.java | 34 +-
.../tika/parser/wordperfect/WP5Charsets.java | 289 ++--
.../wordperfect/WP5DocumentAreaExtractor.java | 66 +-
.../tika/parser/wordperfect/WP6Charsets.java | 750 ++++----
.../wordperfect/WP6DocumentAreaExtractor.java | 58 +-
.../wordperfect/WPDocumentAreaExtractor.java | 23 +-
.../tika/parser/wordperfect/WPInputStream.java | 25 +-
.../tika/parser/wordperfect/WPPrefixArea.java | 37 +-
.../parser/wordperfect/WPPrefixAreaExtractor.java | 10 +-
.../tika/parser/wordperfect/WordPerfectParser.java | 78 +-
.../org/apache/tika/parser/dbf/DBFParserTest.java | 36 +-
.../org/apache/tika/parser/dif/DIFParserTest.java | 25 +-
.../apache/tika/parser/epub/EpubParserTest.java | 30 +-
.../apache/tika/parser/hwp/HwpV5ParserTest.java | 17 +-
.../tika/parser/ibooks/iBooksParserTest.java | 18 +-
.../org/apache/tika/parser/mif/MIFParserTest.java | 9 +-
.../org/apache/tika/parser/odf/ODFParserTest.java | 207 ++-
.../tika/parser/wordperfect/QuattroProTest.java | 12 +-
.../tika/parser/wordperfect/WPInputStreamTest.java | 14 +-
.../tika/parser/wordperfect/WordPerfectTest.java | 20 +-
.../org/apache/tika/parser/feed/FeedParser.java | 76 +-
.../apache/tika/parser/iptc/IptcAnpaParser.java | 1404 +++++++--------
.../apache/tika/parser/feed/FeedParserTest.java | 23 +-
.../apache/tika/parser/ocr/ImagePreprocessor.java | 67 +-
.../apache/tika/parser/ocr/TesseractOCRConfig.java | 197 ++-
.../apache/tika/parser/ocr/TesseractOCRParser.java | 352 ++--
.../apache/tika/parser/ocr/tess4j/ImageDeskew.java | 10 +-
.../apache/tika/parser/ocr/tess4j/ImageUtil.java | 17 +-
.../tika/parser/ocr/TesseractOCRConfigTest.java | 149 +-
.../tika/parser/ocr/TesseractOCRParserTest.java | 101 +-
.../resources/test-configs/TIKA-2705-tesseract.xml | 26 +-
.../tika-config-tesseract-arbitrary.xml | 22 +-
.../test-configs/tika-config-tesseract-full.xml | 38 +-
.../tika-config-tesseract-load-langs.xml | 20 +-
.../test-configs/tika-config-tesseract-partial.xml | 32 +-
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 417 ++---
.../org/apache/tika/parser/pdf/AccessChecker.java | 18 +-
.../tika/parser/pdf/ImageGraphicsEngine.java | 290 ++--
.../java/org/apache/tika/parser/pdf/OCR2XHTML.java | 20 +-
.../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 97 +-
.../tika/parser/pdf/PDFEncodedStringDecoder.java | 6 +-
.../tika/parser/pdf/PDFMarkedContent2XHTML.java | 207 +--
.../java/org/apache/tika/parser/pdf/PDFParser.java | 133 +-
.../apache/tika/parser/pdf/PDFParserConfig.java | 360 ++--
.../apache/tika/parser/pdf/PDFPreflightParser.java | 82 +-
.../tika/parser/pdf/PDMetadataExtractor.java | 54 +-
.../org/apache/tika/parser/pdf/XFAExtractor.java | 75 +-
.../apache/tika/parser/pdf/AccessCheckerTest.java | 6 +-
.../parser/pdf/PDFMarkedContent2XHTMLTest.java | 21 +-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 315 ++--
.../tika/parser/pdf/PDFPreflightParserTest.java | 18 +-
.../apache/tika/parser/pkg/CompressorParser.java | 54 +-
.../org/apache/tika/parser/pkg/PackageParser.java | 223 ++-
.../java/org/apache/tika/parser/pkg/RarParser.java | 30 +-
.../apache/tika/parser/pkg/AbstractPkgTest.java | 94 +-
.../org/apache/tika/parser/pkg/ArParserTest.java | 11 +-
.../apache/tika/parser/pkg/Bzip2ParserTest.java | 37 +-
.../apache/tika/parser/pkg/CompressParserTest.java | 39 +-
.../tika/parser/pkg/CompressorParserTest.java | 21 +-
.../org/apache/tika/parser/pkg/GzipParserTest.java | 29 +-
.../apache/tika/parser/pkg/PackageParserTest.java | 19 +-
.../org/apache/tika/parser/pkg/RarParserTest.java | 99 +-
.../apache/tika/parser/pkg/Seven7ParserTest.java | 69 +-
.../org/apache/tika/parser/pkg/TarParserTest.java | 67 +-
.../org/apache/tika/parser/pkg/ZipParserTest.java | 98 +-
.../org/apache/tika/parser/pkg/ZlibParserTest.java | 33 +-
.../src/test/resources/test-documents/testSVG.svg | 8 +-
.../java/org/apache/tika/parser/csv/CSVParams.java | 4 +-
.../java/org/apache/tika/parser/csv/CSVResult.java | 17 +-
.../org/apache/tika/parser/csv/CSVSniffer.java | 84 +-
.../apache/tika/parser/csv/TextAndCSVParser.java | 169 +-
.../tika/parser/strings/Latin1StringsParser.java | 145 +-
.../apache/tika/parser/strings/StringsConfig.java | 163 +-
.../tika/parser/strings/StringsEncoding.java | 62 +-
.../apache/tika/parser/strings/StringsParser.java | 495 +++---
.../apache/tika/parser/txt/CharsetDetector.java | 46 +-
.../org/apache/tika/parser/txt/CharsetMatch.java | 10 +-
.../apache/tika/parser/txt/CharsetRecog_2022.java | 20 +-
.../apache/tika/parser/txt/CharsetRecog_UTF8.java | 8 +-
.../tika/parser/txt/CharsetRecog_Unicode.java | 2 +-
.../apache/tika/parser/txt/CharsetRecog_mbcs.java | 113 +-
.../apache/tika/parser/txt/CharsetRecog_sbcs.java | 1801 ++++++++++++--------
.../apache/tika/parser/txt/CharsetRecognizer.java | 2 +-
.../tika/parser/txt/Icu4jEncodingDetector.java | 17 +-
.../java/org/apache/tika/parser/txt/TXTParser.java | 26 +-
.../tika/parser/txt/UniversalEncodingDetector.java | 16 +-
.../tika/parser/txt/UniversalEncodingListener.java | 21 +-
.../org/apache/tika/parser/csv/CSVSnifferTest.java | 51 +-
.../tika/parser/csv/TextAndCSVParserTest.java | 112 +-
.../parser/strings/Latin1StringsParserTest.java | 28 +-
.../tika/parser/strings/StringsConfigTest.java | 111 +-
.../tika/parser/strings/StringsParserTest.java | 89 +-
.../tika/parser/txt/CharsetDetectorTest.java | 21 +-
.../org/apache/tika/parser/txt/TXTParserTest.java | 126 +-
.../test-configs/tika-config-strings-full.xml | 18 +-
.../test-configs/tika-config-strings-partial.xml | 16 +-
.../src/test/resources/test-documents/resume.html | 140 +-
.../tika/parser/xliff/XLIFF12ContentHandler.java | 15 +-
.../apache/tika/parser/xliff/XLIFF12Parser.java | 27 +-
.../org/apache/tika/parser/xliff/XLZParser.java | 43 +-
.../tika/parser/xml/AbstractMetadataHandler.java | 46 +-
.../xml/AttributeDependantMetadataHandler.java | 34 +-
.../tika/parser/xml/AttributeMetadataHandler.java | 28 +-
.../org/apache/tika/parser/xml/DcXMLParser.java | 22 +-
.../tika/parser/xml/ElementMetadataHandler.java | 69 +-
.../apache/tika/parser/xml/FictionBookParser.java | 33 +-
.../apache/tika/parser/xml/MetadataHandler.java | 33 +-
.../tika/parser/xml/TextAndAttributeXMLParser.java | 6 +-
.../java/org/apache/tika/parser/xml/XMLParser.java | 39 +-
.../org/apache/tika/parser/xml/XMLProfiler.java | 99 +-
.../tika/parser/xliff/XLIFF12ParserTest.java | 5 +-
.../apache/tika/parser/xliff/XLZParserTest.java | 18 +-
.../apache/tika/parser/xml/DcXMLParserTest.java | 27 +-
.../EmptyAndDuplicateElementsXMLParserTest.java | 56 +-
.../tika/parser/xml/FictionBookParserTest.java | 10 +-
.../parser/xml/TextAndAttributeXMLParserTest.java | 21 +-
.../src/test/resources/test-documents/testXML.xml | 30 +-
.../src/test/resources/test-documents/testXML2.xml | 10 +-
.../src/test/resources/test-documents/testXML3.xml | 38 +-
.../apache/tika/parser/xmp/JempboxExtractor.java | 91 +-
.../apache/tika/parser/xmp/XMPPacketScanner.java | 4 +-
.../tika/parser/xmp/JempboxExtractorTest.java | 31 +-
.../src/test/resources/test-documents/testXMP.xmp | 342 ++--
.../tika/detect/zip/CompressorConstants.java | 3 +-
.../detect/zip/DefaultZipContainerDetector.java | 145 +-
.../DeprecatedStreamingZipContainerDetector.java | 37 +-
.../detect/zip/DeprecatedZipContainerDetector.java | 3 -
.../org/apache/tika/detect/zip/IPADetector.java | 21 +-
.../org/apache/tika/detect/zip/JarDetector.java | 14 +-
.../org/apache/tika/detect/zip/KMZDetector.java | 22 +-
.../tika/detect/zip/OpenDocumentDetector.java | 20 +-
.../apache/tika/detect/zip/PackageConstants.java | 1 +
.../apache/tika/detect/zip/StarOfficeDetector.java | 78 +-
.../tika/detect/zip/StreamingDetectContext.java | 16 +-
.../detect/zip/StreamingZipContainerDetector.java | 13 +-
.../tika/detect/zip/ZipContainerDetector.java | 16 +-
.../tika/detect/zip/ZipContainerDetectorBase.java | 47 +-
.../org/apache/tika/zip/utils/ZipSalvager.java | 13 +-
.../org/apache/tika/detect/zip/ZipParserTest.java | 14 +-
.../org/apache/tika/parser/internal/Activator.java | 22 +-
.../apache/tika/config/TikaDetectorConfigTest.java | 87 +-
.../tika/config/TikaEncodingDetectorTest.java | 82 +-
.../apache/tika/config/TikaParserConfigTest.java | 69 +-
.../tika/config/TikaTranslatorConfigTest.java | 21 +-
.../tika/detect/TestContainerAwareDetector.java | 278 ++-
.../apache/tika/detect/TestDetectorLoading.java | 15 +-
.../tika/detect/TestFileCommandDetector.java | 12 +-
.../tika/extractor/EmbeddedDocumentUtilTest.java | 3 +-
.../java/org/apache/tika/mime/MimeTypeTest.java | 12 +-
.../java/org/apache/tika/mime/MimeTypesTest.java | 4 +-
.../java/org/apache/tika/mime/TestMimeTypes.java | 733 ++++----
.../apache/tika/parser/AutoDetectParserTest.java | 357 ++--
.../tika/parser/AutoDetectReaderParserTest.java | 24 +-
.../parser/BouncyCastleDigestingParserTest.java | 125 +-
.../apache/tika/parser/DigestingParserTest.java | 120 +-
.../org/apache/tika/parser/ParsingReaderTest.java | 13 +-
.../tika/parser/RecursiveParserWrapperTest.java | 61 +-
.../org/apache/tika/parser/TabularFormatsTest.java | 252 ++-
.../java/org/apache/tika/parser/TestParsers.java | 49 +-
.../apache/tika/parser/TestXMLEntityExpansion.java | 90 +-
.../java/org/apache/tika/parser/TestXXEInXML.java | 115 +-
.../java/org/apache/tika/parser/XMLTestBase.java | 80 +-
.../parser/apple/AppleSingleFileParserTest.java | 8 +-
.../apache/tika/parser/apple/PListParserTest.java | 11 +-
.../apache/tika/parser/crypto/TSDParserTest.java | 19 +-
.../parser/fork/ForkParserIntegrationTest.java | 285 ++--
.../apache/tika/parser/html/HtmlParserTest.java | 20 +-
.../apache/tika/parser/mail/MboxParserTest.java | 16 +-
.../apache/tika/parser/mail/RFC822ParserTest.java | 42 +-
.../tika/parser/microsoft/EMFParserTest.java | 17 +-
.../tika/parser/microsoft/ExcelParserTest.java | 7 +-
.../microsoft/POIContainerExtractionTest.java | 9 +-
.../parser/microsoft/PowerPointParserTest.java | 14 +-
.../tika/parser/microsoft/XML2003ParserTest.java | 25 +-
.../parser/microsoft/ooxml/OOXMLParserTest.java | 12 +-
.../parser/microsoft/ooxml/TruncatedOOXMLTest.java | 17 +-
.../tika/parser/microsoft/rtf/RTFParserTest.java | 60 +-
.../apache/tika/parser/mock/MockParserTest.java | 70 +-
.../tika/parser/ocr/TesseractOCRParserTest.java | 54 +-
.../org/apache/tika/parser/odf/ODFParserTest.java | 54 +-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 100 +-
.../org/apache/tika/parser/pkg/ArParserTest.java | 17 +-
.../apache/tika/parser/pkg/Bzip2ParserTest.java | 11 +-
.../pkg/CompositeZipContainerDetectorTest.java | 141 +-
.../apache/tika/parser/pkg/CompressParserTest.java | 25 +-
.../tika/parser/pkg/CompressorParserTest.java | 17 +-
.../org/apache/tika/parser/pkg/GzipParserTest.java | 13 +-
.../org/apache/tika/parser/pkg/RarParserTest.java | 16 +-
.../apache/tika/parser/pkg/Seven7ParserTest.java | 64 +-
.../org/apache/tika/parser/pkg/TarParserTest.java | 11 +-
.../org/apache/tika/parser/pkg/ZipParserTest.java | 65 +-
.../org/apache/tika/parser/pkg/ZlibParserTest.java | 11 +-
.../tika/parser/xml/FictionBookParserTest.java | 14 +-
.../sax/PhoneExtractingContentHandlerTest.java | 20 +-
.../sax/StandardsExtractingContentHandlerTest.java | 47 +-
.../apache/tika/utils/ServiceLoaderUtilsTest.java | 6 +-
.../src/test/resources/log4j.properties | 1 +
.../test-documents/testJAVAPROPS.properties | 1 +
495 files changed, 20227 insertions(+), 19885 deletions(-)
diff --git a/tika-parent/checkstyle.xml b/tika-parent/checkstyle.xml
index f6aab46..93a68c4 100644
--- a/tika-parent/checkstyle.xml
+++ b/tika-parent/checkstyle.xml
@@ -23,6 +23,13 @@
-->
<module name = "Checker">
+
+ <!-- TODO: move this into tika-parsers-class-module/tika-parsers-text-module -->
+ <module name="SuppressionSingleFilter">
+ <property name="checks" value="RegexpHeader"/>
+ <property name="files" value="(CharsetMatch.java)|(CharsetDetector.java)|(CharsetRecog.*.java)"/>
+ </module>
+
<property name="charset" value="UTF-8"/>
<property name="severity" value="error"/>
diff --git a/tika-parsers/tika-parsers-classic/pom.xml b/tika-parsers/tika-parsers-classic/pom.xml
index d4581d2..9ee9358 100644
--- a/tika-parsers/tika-parsers-classic/pom.xml
+++ b/tika-parsers/tika-parsers-classic/pom.xml
@@ -17,7 +17,8 @@
specific language governing permissions and limitations
under the License.
-->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>tika-parsers</artifactId>
<groupId>org.apache.tika</groupId>
@@ -38,49 +39,81 @@
</modules>
<build>
<pluginManagement>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.rat</groupId>
+ <artifactId>apache-rat-plugin</artifactId>
+ <version>${rat.version}</version>
+ <configuration>
+ <excludes>
+ <exclude>src/test/resources/test-documents/**</exclude>
+ </excludes>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifestFile>${project.build.outputDirectory}/META-INF/MANIFEST.MF</manifestFile>
+ </archive>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <configuration>
+ <instructions>
+ <Bundle-DocURL>${project.url}</Bundle-DocURL>
+ <Import-Package>
+ org.apache.tika.*,
+ *;resolution:=optional
+ </Import-Package>
+ </instructions>
+ </configuration>
+ <executions>
+ <execution>
+ <id>bundle-manifest</id>
+ <phase>process-classes</phase>
+ <goals>
+ <goal>manifest</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </pluginManagement>
<plugins>
<plugin>
- <groupId>org.apache.rat</groupId>
- <artifactId>apache-rat-plugin</artifactId>
- <version>${rat.version}</version>
- <configuration>
- <excludes>
- <exclude>src/test/resources/test-documents/**</exclude>
- </excludes>
- </configuration>
- </plugin>
- <plugin>
<groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-jar-plugin</artifactId>
- <configuration>
- <archive>
- <manifestFile>${project.build.outputDirectory}/META-INF/MANIFEST.MF</manifestFile>
- </archive>
- </configuration>
- </plugin>
- <plugin>
- <groupId>org.apache.felix</groupId>
- <artifactId>maven-bundle-plugin</artifactId>
- <configuration>
- <instructions>
- <Bundle-DocURL>${project.url}</Bundle-DocURL>
- <Import-Package>
- org.apache.tika.*,
- *;resolution:=optional
- </Import-Package>
- </instructions>
- </configuration>
+ <artifactId>maven-checkstyle-plugin</artifactId>
+ <version>${checkstyle.plugin.version}</version>
+ <dependencies>
+ <dependency>
+ <groupId>com.puppycrawl.tools</groupId>
+ <artifactId>checkstyle</artifactId>
+ <version>8.41</version>
+ </dependency>
+ </dependencies>
<executions>
<execution>
- <id>bundle-manifest</id>
- <phase>process-classes</phase>
+ <id>validate</id>
+ <phase>validate</phase>
+ <configuration>
+ <configLocation>checkstyle.xml</configLocation>
+ <encoding>UTF-8</encoding>
+ <consoleOutput>false</consoleOutput>
+ <includeTestSourceDirectory>true</includeTestSourceDirectory>
+ <testSourceDirectories>${project.basedir}/src/test/java</testSourceDirectories>
+ <violationSeverity>error</violationSeverity>
+ <failOnViolation>true</failOnViolation>
+ </configuration>
<goals>
- <goal>manifest</goal>
+ <goal>check</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
- </pluginManagement>
</build>
</project>
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/detect/apple/BPListDetector.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/detect/apple/BPListDetector.java
index 86c7ea3..bdbd19c 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/detect/apple/BPListDetector.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/detect/apple/BPListDetector.java
@@ -16,31 +16,33 @@
*/
package org.apache.tika.detect.apple;
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.ParseException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import javax.xml.parsers.ParserConfigurationException;
+
import com.dd.plist.NSDictionary;
import com.dd.plist.NSObject;
import com.dd.plist.PropertyListFormatException;
import com.dd.plist.PropertyListParser;
import org.apache.commons.io.IOUtils;
+import org.xml.sax.SAXException;
+
import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
-import org.xml.sax.SAXException;
-
-import javax.xml.parsers.ParserConfigurationException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.text.ParseException;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
/**
* Detector for BPList with utility functions for PList.
- *
+ * <p>
* Without significant refactoring, this can't easily work as a true
* detector on plist subtypes. Rather, for now, we require the file to be
* parsed and then the parser adds the subtype for xml-based plists.
+ *
* @since 1.25
*/
public class BPListDetector implements Detector {
@@ -67,6 +69,25 @@ public class BPListDetector implements Detector {
BINARY_TO_XML.put(BITUNES, ITUNES);
}
+ public static MediaType detectOnKeys(Set<String> keySet) {
+ if (keySet.contains("nodes") && keySet.contains("edges") &&
+ keySet.contains("graphEncodingVersion")) {
+ return BMEMGRAPH;
+ } else if (keySet.contains(
+ "WebMainResource")) { //&& keySet.contains ("WebSubresources") should we require
+ // this?
+ return BWEBARCHIVE;
+ } else if (keySet.contains("Playlists") && keySet.contains("Tracks") &&
+ keySet.contains("Music Folder")) {
+ return BITUNES;
+ } //if it contains $archiver and $objects, it is a bplist inside a webarchive
+ return BPLIST;
+ }
+
+ public static MediaType detectXMLOnKeys(Set<String> keySet) {
+ return BINARY_TO_XML.get(detectOnKeys(keySet));
+ }
+
/**
* @param input input stream must support reset
* @param metadata input metadata for the document
@@ -93,9 +114,8 @@ public class BPListDetector implements Detector {
}
int i = 0;
- if (bytes[i++] != 'b' || bytes[i++] != 'p'
- || bytes[i++] != 'l' || bytes[i++] != 'i'
- || bytes[i++] != 's' || bytes[i++] != 't') {
+ if (bytes[i++] != 'b' || bytes[i++] != 'p' || bytes[i++] != 'l' || bytes[i++] != 'i' ||
+ bytes[i++] != 's' || bytes[i++] != 't') {
return MediaType.OCTET_STREAM;
}
//TODO: extract the version with the next two bytes if they were read
@@ -109,7 +129,8 @@ public class BPListDetector implements Detector {
if (input instanceof TikaInputStream) {
((TikaInputStream) input).setOpenContainer(rootObj);
}
- } catch (PropertyListFormatException | ParseException | ParserConfigurationException | SAXException e) {
+ } catch (PropertyListFormatException | ParseException |
+ ParserConfigurationException | SAXException e) {
throw new IOException("problem parsing root", e);
}
if (rootObj instanceof NSDictionary) {
@@ -117,21 +138,4 @@ public class BPListDetector implements Detector {
}
return BPLIST;
}
-
- public static MediaType detectOnKeys(Set<String> keySet) {
- if (keySet.contains("nodes") && keySet.contains("edges")
- && keySet.contains("graphEncodingVersion")) {
- return BMEMGRAPH;
- } else if (keySet.contains("WebMainResource")){ //&& keySet.contains("WebSubresources") should we require this?
- return BWEBARCHIVE;
- } else if (keySet.contains("Playlists") && keySet.contains("Tracks")
- && keySet.contains("Music Folder")) {
- return BITUNES;
- } //if it contains $archiver and $objects, it is a bplist inside a webarchive
- return BPLIST;
- }
-
- public static MediaType detectXMLOnKeys(Set<String> keySet) {
- return BINARY_TO_XML.get(detectOnKeys(keySet));
- }
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/detect/apple/IWorkDetector.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/detect/apple/IWorkDetector.java
index fb9f1bb..0560302 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/detect/apple/IWorkDetector.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/detect/apple/IWorkDetector.java
@@ -16,8 +16,14 @@
*/
package org.apache.tika.detect.apple;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.Set;
+
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipFile;
+
import org.apache.tika.detect.zip.StreamingDetectContext;
import org.apache.tika.detect.zip.ZipContainerDetector;
import org.apache.tika.io.TikaInputStream;
@@ -26,11 +32,6 @@ import org.apache.tika.parser.iwork.IWorkPackageParser;
import org.apache.tika.parser.iwork.iwana.IWork13PackageParser;
import org.apache.tika.parser.iwork.iwana.IWork18PackageParser;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.HashSet;
-import java.util.Set;
-
public class IWorkDetector implements ZipContainerDetector {
@@ -51,8 +52,8 @@ public class IWorkDetector implements ZipContainerDetector {
// the root element of the document. That is used to the identify
// the correct type of the keynote container.
for (String entryName : IWorkPackageParser.IWORK_CONTENT_ENTRIES) {
- IWorkPackageParser.IWORKDocumentType type =
- IWorkPackageParser.IWORKDocumentType.detectType(zip.getEntry(entryName), zip);
+ IWorkPackageParser.IWORKDocumentType type = IWorkPackageParser.IWORKDocumentType
+ .detectType(zip.getEntry(entryName), zip);
if (type != null) {
return type.getType();
}
@@ -79,7 +80,8 @@ public class IWorkDetector implements ZipContainerDetector {
}
@Override
- public MediaType streamingDetectUpdate(ZipArchiveEntry zae, InputStream zis, StreamingDetectContext detectContext) {
+ public MediaType streamingDetectUpdate(ZipArchiveEntry zae, InputStream zis,
+ StreamingDetectContext detectContext) {
String name = zae.getName();
EntryNames entryNames = detectContext.get(EntryNames.class);
if (entryNames == null) {
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
index a0b8a3f..9226a5d 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
@@ -16,8 +16,20 @@
*/
package org.apache.tika.parser.apple;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Set;
+
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CloseShieldInputStream;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.TikaMemoryLimitException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
@@ -29,17 +41,6 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.StandardCharsets;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.List;
-import java.util.Set;
/**
* Parser that strips the header off of AppleSingle and AppleDouble
@@ -77,30 +78,29 @@ public class AppleSingleFileParser extends AbstractParser {
}
@Override
- public void parse(InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
EmbeddedDocumentExtractor ex = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
short numEntries = readThroughNumEntries(stream);
long bytesRead = 26;
List<FieldInfo> fieldInfoList = getSortedFieldInfoList(stream, numEntries);
- bytesRead += 12*numEntries;
+ bytesRead += 12 * numEntries;
Metadata embeddedMetadata = new Metadata();
bytesRead = processFieldEntries(stream, fieldInfoList, embeddedMetadata, bytesRead);
FieldInfo contentFieldInfo = getContentFieldInfo(fieldInfoList);
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
if (contentFieldInfo != null) {
- long diff = contentFieldInfo.offset-bytesRead;
+ long diff = contentFieldInfo.offset - bytesRead;
IOUtils.skipFully(stream, diff);
if (ex.shouldParseEmbedded(embeddedMetadata)) {
// TODO: we should probably add a readlimiting wrapper around this
// stream to ensure that not more than contentFieldInfo.length bytes
// are read
- ex.parseEmbedded(new CloseShieldInputStream(stream),
- xhtml, embeddedMetadata, false);
+ ex.parseEmbedded(new CloseShieldInputStream(stream), xhtml, embeddedMetadata,
+ false);
}
}
xhtml.endDocument();
@@ -117,7 +117,8 @@ public class AppleSingleFileParser extends AbstractParser {
}
private long processFieldEntries(InputStream stream, List<FieldInfo> fieldInfoList,
- Metadata embeddedMetadata, long bytesRead) throws IOException, TikaException {
+ Metadata embeddedMetadata, long bytesRead)
+ throws IOException, TikaException {
byte[] buffer = null;
for (FieldInfo f : fieldInfoList) {
long diff = f.offset - bytesRead;
@@ -128,10 +129,11 @@ public class AppleSingleFileParser extends AbstractParser {
if (f.length > MAX_FIELD_LENGTH) {
throw new TikaMemoryLimitException(f.length, MAX_FIELD_LENGTH);
}
- buffer = new byte[(int)f.length];
+ buffer = new byte[(int) f.length];
IOUtils.readFully(stream, buffer);
bytesRead += f.length;
- String originalFileName = new String(buffer, 0, buffer.length, StandardCharsets.US_ASCII);
+ String originalFileName =
+ new String(buffer, 0, buffer.length, StandardCharsets.US_ASCII);
embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, originalFileName);
} else if (f.entryId != DATA_FORK) {
IOUtils.skipFully(stream, f.length);
@@ -142,19 +144,17 @@ public class AppleSingleFileParser extends AbstractParser {
}
- private List<FieldInfo> getSortedFieldInfoList(InputStream stream, short numEntries) throws IOException, TikaException {
+ private List<FieldInfo> getSortedFieldInfoList(InputStream stream, short numEntries)
+ throws IOException, TikaException {
//this is probably overkill. I'd hope that these were already
//in order. This ensures it.
List<FieldInfo> fieldInfoList = new ArrayList<>(numEntries);
for (int i = 0; i < numEntries; i++) {
//convert 32-bit unsigned ints to longs
- fieldInfoList.add(
- new FieldInfo(
- EndianUtils.readUIntBE(stream), //entry id
- EndianUtils.readUIntBE(stream), //offset
- EndianUtils.readUIntBE(stream) //length
- )
- );
+ fieldInfoList.add(new FieldInfo(EndianUtils.readUIntBE(stream), //entry id
+ EndianUtils.readUIntBE(stream), //offset
+ EndianUtils.readUIntBE(stream) //length
+ ));
}
if (fieldInfoList.size() == 0) {
throw new TikaException("AppleSingleFile missing field info");
@@ -171,7 +171,7 @@ public class AppleSingleFileParser extends AbstractParser {
//version
long version = EndianUtils.readIntBE(stream);
if (version != 0x00020000) {
- throw new TikaException("Version should have been 0x00020000, but was:"+version);
+ throw new TikaException("Version should have been 0x00020000, but was:" + version);
}
IOUtils.skipFully(stream, 16);//filler
return EndianUtils.readShortBE(stream);//number of entries
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/PListParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/PListParser.java
index 6ed4707..8f0537d 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/PListParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/PListParser.java
@@ -16,6 +16,19 @@
*/
package org.apache.tika.parser.apple;
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.DateFormat;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+import javax.xml.parsers.ParserConfigurationException;
+
import com.dd.plist.NSArray;
import com.dd.plist.NSData;
import com.dd.plist.NSDate;
@@ -27,6 +40,9 @@ import com.dd.plist.NSString;
import com.dd.plist.PropertyListFormatException;
import com.dd.plist.PropertyListParser;
import com.dd.plist.UID;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
import org.apache.tika.detect.apple.BPListDetector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
@@ -37,26 +53,11 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import javax.xml.parsers.ParserConfigurationException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.text.DateFormat;
-import java.text.ParseException;
-import java.text.SimpleDateFormat;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Set;
/**
* Parser for Apple's plist and bplist. This is a wrapper around
- * com.googlecode.plist:dd-plist
- *
+ * com.googlecode.plist:dd-plist
+ * <p>
* As of 1.25, Tika does not have detection for the text based plist,
* so those files will not be directed to this parser
*
@@ -76,13 +77,9 @@ public class PListParser extends AbstractParser {
private static final String UID = "uid";
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
- BPListDetector.BITUNES,
- BPListDetector.BMEMGRAPH,
- BPListDetector.BPLIST,
- BPListDetector.BWEBARCHIVE,
- BPListDetector.PLIST)));
+ private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(new HashSet<>(
+ Arrays.asList(BPListDetector.BITUNES, BPListDetector.BMEMGRAPH, BPListDetector.BPLIST,
+ BPListDetector.BWEBARCHIVE, BPListDetector.PLIST)));
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
@@ -100,7 +97,7 @@ public class PListParser extends AbstractParser {
//if this already went through the PListDetector,
//there should be an NSObject in the open container
if (stream instanceof TikaInputStream) {
- rootObj = (NSObject) ((TikaInputStream)stream).getOpenContainer();
+ rootObj = (NSObject) ((TikaInputStream) stream).getOpenContainer();
}
if (rootObj == null) {
@@ -110,14 +107,16 @@ public class PListParser extends AbstractParser {
} else {
rootObj = PropertyListParser.parse(stream);
}
- } catch (PropertyListFormatException | ParseException | ParserConfigurationException e) {
+ } catch (PropertyListFormatException | ParseException |
+ ParserConfigurationException e) {
throw new TikaException("problem parsing root", e);
}
}
String contentType = metadata.get(Metadata.CONTENT_TYPE);
if (BPListDetector.PLIST.toString().equals(contentType)) {
if (rootObj instanceof NSDictionary) {
- MediaType subtype = BPListDetector.detectXMLOnKeys(((NSDictionary) rootObj).keySet());
+ MediaType subtype =
+ BPListDetector.detectXMLOnKeys(((NSDictionary) rootObj).keySet());
metadata.set(Metadata.CONTENT_TYPE, subtype.toString());
}
}
@@ -130,13 +129,12 @@ public class PListParser extends AbstractParser {
xhtml.endDocument();
}
- private void parseObject(NSObject obj, State state)
- throws SAXException, IOException {
+ private void parseObject(NSObject obj, State state) throws SAXException, IOException {
if (obj instanceof NSDictionary) {
- parseDict((NSDictionary)obj, state);
+ parseDict((NSDictionary) obj, state);
} else if (obj instanceof NSArray) {
- NSArray nsArray = (NSArray)obj;
+ NSArray nsArray = (NSArray) obj;
state.xhtml.startElement(ARR);
for (NSObject child : nsArray.getArray()) {
parseObject(child, state);
@@ -144,7 +142,7 @@ public class PListParser extends AbstractParser {
state.xhtml.endElement(ARR);
} else if (obj instanceof NSString) {
state.xhtml.startElement(STRING);
- state.xhtml.characters(((NSString)obj).getContent());
+ state.xhtml.characters(((NSString) obj).getContent());
state.xhtml.endElement(STRING);
} else if (obj instanceof NSNumber) {
state.xhtml.startElement(NUMBER);
@@ -156,25 +154,24 @@ public class PListParser extends AbstractParser {
state.xhtml.endElement(DATA);
} else if (obj instanceof NSDate) {
state.xhtml.startElement(DATE);
- String dateString = state.dateFormat.format(((NSDate)obj).getDate());
+ String dateString = state.dateFormat.format(((NSDate) obj).getDate());
state.xhtml.characters(dateString);
state.xhtml.endElement(DATE);
} else if (obj instanceof NSSet) {
state.xhtml.startElement(SET);
- parseSet((NSSet)obj, state);
+ parseSet((NSSet) obj, state);
state.xhtml.endElement(SET);
} else if (obj instanceof UID) {
//do we want to do anything with obj.getBytes()
- state.xhtml.element(UID, ((UID)obj).getName());
+ state.xhtml.element(UID, ((UID) obj).getName());
} else {
throw new UnsupportedOperationException(
- "don't yet support this type of object: "+obj.getClass() +
- " Please open an issue on our tracker");
+ "don't yet support this type of object: " + obj.getClass() +
+ " Please open an issue on our tracker");
}
}
- private void parseSet(NSSet obj, State state)
- throws SAXException, IOException {
+ private void parseSet(NSSet obj, State state) throws SAXException, IOException {
state.xhtml.startElement(SET);
for (NSObject child : obj.allObjects()) {
parseObject(child, state);
@@ -182,8 +179,7 @@ public class PListParser extends AbstractParser {
state.xhtml.endElement(SET);
}
- private void parseDict(NSDictionary obj, State state)
- throws SAXException, IOException {
+ private void parseDict(NSDictionary obj, State state) throws SAXException, IOException {
state.xhtml.startElement(DICT);
for (Map.Entry<String, NSObject> mapEntry : obj.getHashMap().entrySet()) {
String key = mapEntry.getKey();
@@ -194,16 +190,16 @@ public class PListParser extends AbstractParser {
state.xhtml.endElement(DICT);
}
- private void handleData(NSData value, State state) throws IOException,
- SAXException {
+ private void handleData(NSData value, State state) throws IOException, SAXException {
state.xhtml.characters(value.getBase64EncodedData());
Metadata embeddedMetadata = new Metadata();
- if (! state.embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
+ if (!state.embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
return;
}
try (TikaInputStream tis = TikaInputStream.get(value.bytes())) {
- state.embeddedDocumentExtractor.parseEmbedded(tis, state.xhtml, embeddedMetadata, false);
+ state.embeddedDocumentExtractor
+ .parseEmbedded(tis, state.xhtml, embeddedMetadata, false);
}
}
@@ -213,10 +209,8 @@ public class PListParser extends AbstractParser {
final EmbeddedDocumentExtractor embeddedDocumentExtractor;
final DateFormat dateFormat;
- public State(XHTMLContentHandler xhtml,
- Metadata metadata,
- EmbeddedDocumentExtractor embeddedDocumentExtractor,
- DateFormat df) {
+ public State(XHTMLContentHandler xhtml, Metadata metadata,
+ EmbeddedDocumentExtractor embeddedDocumentExtractor, DateFormat df) {
this.xhtml = xhtml;
this.metadata = metadata;
this.embeddedDocumentExtractor = embeddedDocumentExtractor;
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java
index 4143932..589824e 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java
@@ -22,91 +22,85 @@ import java.util.Locale;
* Utility class to allow for conversion from an integer to Roman numerals
* or alpha-numeric symbols in line with Pages auto numbering formats.
*/
- class AutoPageNumberUtils {
-
- private static final String ALPHABET[] = { "A", "B", "C", "D", "E", "F", "G",
- "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T",
- "U", "V", "W", "X", "Y", "Z" };
-
- private static final int MAX = 26;
-
- public static String asAlphaNumeric(int i) {
- StringBuffer sbuff = new StringBuffer();
- int index = i % MAX;
- int ratio = i / MAX;
-
- if (index == 0) {
- ratio--;
- index = MAX;
- }
-
- for(int j = 0; j <= ratio; j++) {
- sbuff.append(ALPHABET[index - 1]); }
- return sbuff.toString();
- }
-
- public static String asAlphaNumericLower(int i) {
- return asAlphaNumeric(i).toLowerCase(Locale.ROOT);
- }
-
- /*
- * Code copied from jena.apache.org.
- * @see com.hp.hpl.jena.sparql.util.RomanNumeral
- */
+class AutoPageNumberUtils {
+
+ private static final String[] ALPHABET =
+ {"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q",
+ "R", "S", "T", "U", "V", "W", "X", "Y", "Z"};
+
+ private static final int MAX = 26;
+
+ public static String asAlphaNumeric(int i) {
+ StringBuffer sbuff = new StringBuffer();
+ int index = i % MAX;
+ int ratio = i / MAX;
+
+ if (index == 0) {
+ ratio--;
+ index = MAX;
+ }
+
+ for (int j = 0; j <= ratio; j++) {
+ sbuff.append(ALPHABET[index - 1]);
+ }
+ return sbuff.toString();
+ }
+
+ public static String asAlphaNumericLower(int i) {
+ return asAlphaNumeric(i).toLowerCase(Locale.ROOT);
+ }
+
+ /*
+ * Code copied from jena.apache.org.
+ * @see com.hp.hpl.jena.sparql.util.RomanNumeral
+ */
public static String asRomanNumerals(int i) {
- if ( i <= 0 )
- throw new NumberFormatException("Roman numerals are 1-3999 ("+i+")") ;
- if ( i > 3999 )
- throw new NumberFormatException("Roman numerals are 1-3999 ("+i+")") ;
- StringBuffer sbuff = new StringBuffer() ;
-
- i = i2r(sbuff, i, "M", 1000, "CM", 900, "D", 500, "CD", 400 ) ;
- i = i2r(sbuff, i, "C", 100, "XC", 90, "L", 50, "XL", 40 ) ;
- i = i2r(sbuff, i, "X", 10, "IX", 9, "V", 5, "IV", 4) ;
-
- while ( i >= 1 )
- {
- sbuff.append("I") ;
- i -= 1 ;
+ if (i <= 0) {
+ throw new NumberFormatException("Roman numerals are 1-3999 (" + i + ")");
}
- return sbuff.toString() ;
-
-
+ if (i > 3999) {
+ throw new NumberFormatException("Roman numerals are 1-3999 (" + i + ")");
+ }
+ StringBuffer sbuff = new StringBuffer();
+
+ i = i2r(sbuff, i, "M", 1000, "CM", 900, "D", 500, "CD", 400);
+ i = i2r(sbuff, i, "C", 100, "XC", 90, "L", 50, "XL", 40);
+ i = i2r(sbuff, i, "X", 10, "IX", 9, "V", 5, "IV", 4);
+
+ while (i >= 1) {
+ sbuff.append("I");
+ i -= 1;
+ }
+ return sbuff.toString();
+
+
+ }
+
+ public static String asRomanNumeralsLower(int i) {
+ return asRomanNumerals(i).toLowerCase(Locale.ROOT);
}
-
- public static String asRomanNumeralsLower(int i) {
- return asRomanNumerals(i).toLowerCase(Locale.ROOT);
- }
-
- private static int i2r(StringBuffer sbuff, int i,
- String tens, int iTens,
- String nines, int iNines,
- String fives, int iFives,
- String fours, int iFours)
- {
- while ( i >= iTens )
- {
- sbuff.append(tens) ;
- i -= iTens ;
+
+ private static int i2r(StringBuffer sbuff, int i, String tens, int iTens, String nines,
+ int iNines, String fives, int iFives, String fours, int iFours) {
+ while (i >= iTens) {
+ sbuff.append(tens);
+ i -= iTens;
}
-
- if ( i >= iNines )
- {
- sbuff.append(nines) ;
+
+ if (i >= iNines) {
+ sbuff.append(nines);
i -= iNines;
}
- if ( i >= iFives )
- {
- sbuff.append(fives) ;
- i -= iFives ;
+ if (i >= iFives) {
+ sbuff.append(fives);
+ i -= iFives;
}
- if ( i >= iFours )
- {
- sbuff.append(fours) ;
- i -= iFours ;
+ if (i >= iFours) {
+ sbuff.append(fours);
+ i -= iFours;
}
- return i ;
+ return i;
}
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
index 2ffbf56..66130d1 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
@@ -16,11 +16,23 @@
*/
package org.apache.tika.parser.iwork;
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import javax.xml.namespace.QName;
+
import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.commons.io.input.CloseShieldInputStream;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
import org.apache.tika.detect.XmlRootExtractor;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
@@ -30,22 +42,11 @@ import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import javax.xml.namespace.QName;
-import java.io.BufferedInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Set;
/**
* A parser for the IWork container files. This includes *.key, *.pages and *.numbers files.
* This parser delegates the relevant entries to a {@link ContentHandler} that parsers the content.
- *
+ * <p>
* Currently supported formats:
* <ol>
* <li>Keynote format version 2.x. Currently only tested with Keynote version 5.x
@@ -55,115 +56,33 @@ import java.util.Set;
*/
public class IWorkPackageParser extends AbstractParser {
- /** Serial version UID */
- private static final long serialVersionUID = -2160322853809682372L;
-
/**
* Which files within an iWork file contain the actual content?
*/
public final static Set<String> IWORK_CONTENT_ENTRIES = Collections.unmodifiableSet(
- new HashSet<String>(Arrays.asList("index.apxl", "index.xml", "presentation.apxl"))
- );
+ new HashSet<String>(Arrays.asList("index.apxl", "index.xml", "presentation.apxl")));
/**
* All iWork files contain one of these, so we can detect based on it
*/
public final static String IWORK_COMMON_ENTRY = "buildVersionHistory.plist";
-
- public enum IWORKDocumentType {
- KEYNOTE("http://developer.apple.com/namespaces/keynote2", "presentation", MediaType.application("vnd.apple.keynote")),
- NUMBERS("http://developer.apple.com/namespaces/ls", "document", MediaType.application("vnd.apple.numbers")),
- PAGES("http://developer.apple.com/namespaces/sl", "document", MediaType.application("vnd.apple.pages")),
- ENCRYPTED(null, null, MediaType.application("x-tika-iworks-protected"));
-
- private final String namespace;
- private final String part;
- private final MediaType type;
-
- IWORKDocumentType(String namespace, String part, MediaType type) {
- this.namespace = namespace;
- this.part = part;
- this.type = type;
- }
-
- public String getNamespace() {
- return namespace;
- }
-
- public String getPart() {
- return part;
- }
-
- public MediaType getType() {
- return type;
- }
-
- public static IWORKDocumentType detectType(ZipArchiveEntry entry, ZipFile zip) {
- try {
- if (entry == null) {
- return null;
- }
-
- try (InputStream stream = zip.getInputStream(entry)) {
- return detectType(stream);
- }
- } catch (IOException e) {
- return null;
- }
- }
-
- public static IWORKDocumentType detectType(ZipArchiveEntry entry, ZipArchiveInputStream zip) {
- if (entry == null) {
- return null;
- }
-
- return detectType(zip);
- }
-
- public static IWORKDocumentType detectType(InputStream stream) {
- QName qname = new XmlRootExtractor().extractRootElement(stream);
- if (qname != null) {
- String uri = qname.getNamespaceURI();
- String local = qname.getLocalPart();
-
- for (IWORKDocumentType type : values()) {
- if(type.getNamespace().equals(uri) &&
- type.getPart().equals(local)) {
- return type;
- }
- }
- } else {
- // There was a problem with extracting the root type
- // Password Protected iWorks files are funny, but we can usually
- // spot them because they encrypt part of the zip stream
- try {
- stream.read();
- } catch(UnsupportedZipFeatureException e) {
- // Compression field was likely encrypted
- return ENCRYPTED;
- } catch(Exception ignored) {
- }
- }
- return null;
- }
- }
-
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = -2160322853809682372L;
/**
* This parser handles all iWorks formats.
*/
- private final static Set<MediaType> supportedTypes =
- Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- MediaType.application("vnd.apple.iwork"),
- IWORKDocumentType.KEYNOTE.getType(),
- IWORKDocumentType.NUMBERS.getType(),
- IWORKDocumentType.PAGES.getType()
- )));
+ private final static Set<MediaType> supportedTypes = Collections.unmodifiableSet(
+ new HashSet<MediaType>(Arrays.asList(MediaType.application("vnd.apple.iwork"),
+ IWORKDocumentType.KEYNOTE.getType(), IWORKDocumentType.NUMBERS.getType(),
+ IWORKDocumentType.PAGES.getType())));
public Set<MediaType> getSupportedTypes(ParseContext context) {
return supportedTypes;
}
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
ZipArchiveInputStream zip = new ZipArchiveInputStream(stream);
ZipArchiveEntry entry = zip.getNextZipEntry();
@@ -177,44 +96,122 @@ public class IWorkPackageParser extends AbstractParser {
entryStream.mark(4096);
IWORKDocumentType type = IWORKDocumentType.detectType(entryStream);
entryStream.reset();
-
- if(type != null) {
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- ContentHandler contentHandler;
-
- switch(type) {
- case KEYNOTE:
- contentHandler = new KeynoteContentHandler(xhtml, metadata);
- break;
- case NUMBERS:
- contentHandler = new NumbersContentHandler(xhtml, metadata);
- break;
- case PAGES:
- contentHandler = new PagesContentHandler(xhtml, metadata);
- break;
- case ENCRYPTED:
- // We can't do anything for the file right now
- contentHandler = null;
- break;
- default:
- throw new TikaException("Unhandled iWorks file " + type);
- }
-
- metadata.add(Metadata.CONTENT_TYPE, type.getType().toString());
- xhtml.startDocument();
+
+ if (type != null) {
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ ContentHandler contentHandler;
+
+ switch (type) {
+ case KEYNOTE:
+ contentHandler = new KeynoteContentHandler(xhtml, metadata);
+ break;
+ case NUMBERS:
+ contentHandler = new NumbersContentHandler(xhtml, metadata);
+ break;
+ case PAGES:
+ contentHandler = new PagesContentHandler(xhtml, metadata);
+ break;
+ case ENCRYPTED:
+ // We can't do anything for the file right now
+ contentHandler = null;
+ break;
+ default:
+ throw new TikaException("Unhandled iWorks file " + type);
+ }
+
+ metadata.add(Metadata.CONTENT_TYPE, type.getType().toString());
+ xhtml.startDocument();
if (contentHandler != null) {
- XMLReaderUtils.parseSAX(
- new CloseShieldInputStream(entryStream),
- new OfflineContentHandler(contentHandler),
- context
- );
+ XMLReaderUtils.parseSAX(new CloseShieldInputStream(entryStream),
+ new OfflineContentHandler(contentHandler), context);
}
- xhtml.endDocument();
+ xhtml.endDocument();
}
-
+
entry = zip.getNextZipEntry();
}
// Don't close the zip InputStream (TIKA-1117).
}
+ public enum IWORKDocumentType {
+ KEYNOTE("http://developer.apple.com/namespaces/keynote2", "presentation",
+ MediaType.application("vnd.apple.keynote")),
+ NUMBERS("http://developer.apple.com/namespaces/ls", "document",
+ MediaType.application("vnd.apple.numbers")),
+ PAGES("http://developer.apple.com/namespaces/sl", "document",
+ MediaType.application("vnd.apple.pages")),
+ ENCRYPTED(null, null, MediaType.application("x-tika-iworks-protected"));
+
+ private final String namespace;
+ private final String part;
+ private final MediaType type;
+
+ IWORKDocumentType(String namespace, String part, MediaType type) {
+ this.namespace = namespace;
+ this.part = part;
+ this.type = type;
+ }
+
+ public static IWORKDocumentType detectType(ZipArchiveEntry entry, ZipFile zip) {
+ try {
+ if (entry == null) {
+ return null;
+ }
+
+ try (InputStream stream = zip.getInputStream(entry)) {
+ return detectType(stream);
+ }
+ } catch (IOException e) {
+ return null;
+ }
+ }
+
+ public static IWORKDocumentType detectType(ZipArchiveEntry entry,
+ ZipArchiveInputStream zip) {
+ if (entry == null) {
+ return null;
+ }
+
+ return detectType(zip);
+ }
+
+ public static IWORKDocumentType detectType(InputStream stream) {
+ QName qname = new XmlRootExtractor().extractRootElement(stream);
+ if (qname != null) {
+ String uri = qname.getNamespaceURI();
+ String local = qname.getLocalPart();
+
+ for (IWORKDocumentType type : values()) {
+ if (type.getNamespace().equals(uri) && type.getPart().equals(local)) {
+ return type;
+ }
+ }
+ } else {
+ // There was a problem with extracting the root type
+ // Password Protected iWorks files are funny, but we can usually
+ // spot them because they encrypt part of the zip stream
+ try {
+ stream.read();
+ } catch (UnsupportedZipFeatureException e) {
+ // Compression field was likely encrypted
+ return ENCRYPTED;
+ } catch (Exception ignored) {
+ }
+ }
+ return null;
+ }
+
+ public String getNamespace() {
+ return namespace;
+ }
+
+ public String getPart() {
+ return part;
+ }
+
+ public MediaType getType() {
+ return type;
+ }
+ }
+
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java
index 40b3d60..a3ad3db 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java
@@ -16,13 +16,14 @@
*/
package org.apache.tika.parser.iwork;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
class KeynoteContentHandler extends DefaultHandler {
@@ -59,8 +60,7 @@ class KeynoteContentHandler extends DefaultHandler {
}
@Override
- public void startElement(
- String uri, String localName, String qName, Attributes attributes)
+ public void startElement(String uri, String localName, String qName, Attributes attributes)
throws SAXException {
if ("key:theme".equals(qName)) {
inTheme = true;
@@ -114,8 +114,7 @@ class KeynoteContentHandler extends DefaultHandler {
}
@Override
- public void endElement(String uri, String localName, String qName)
- throws SAXException {
+ public void endElement(String uri, String localName, String qName) throws SAXException {
if ("key:theme".equals(qName)) {
inTheme = false;
} else if ("key:slide".equals(qName)) {
@@ -153,24 +152,23 @@ class KeynoteContentHandler extends DefaultHandler {
}
@Override
- public void characters(char[] ch, int start, int length)
- throws SAXException {
+ public void characters(char[] ch, int start, int length) throws SAXException {
if (inParsableText && inSlide && length != 0) {
xhtml.characters(ch, start, length);
}
}
private void parseTableData(String value) throws SAXException {
- if (currentColumn == 0) {
- xhtml.startElement("tr");
- }
- xhtml.element("td", value);
-
- currentColumn++;
- if (currentColumn.equals(numberOfColumns)) {
- xhtml.endElement("tr");
- currentColumn = 0;
- }
+ if (currentColumn == 0) {
+ xhtml.startElement("tr");
+ }
+ xhtml.element("td", value);
+
+ currentColumn++;
+ if (currentColumn.equals(numberOfColumns)) {
+ xhtml.endElement("tr");
+ currentColumn = 0;
+ }
}
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java
index 2ee64be..84ff5c9 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java
@@ -16,17 +16,18 @@
*/
package org.apache.tika.parser.iwork;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-import java.util.HashMap;
-import java.util.Map;
class NumbersContentHandler extends DefaultHandler {
@@ -61,7 +62,8 @@ class NumbersContentHandler extends DefaultHandler {
}
@Override
- public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
+ public void startElement(String uri, String localName, String qName, Attributes attributes)
+ throws SAXException {
if ("ls:workspace".equals(qName)) {
inSheet = true;
numberOfSheets++;
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
index 30d582c..61bf64c 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
@@ -16,56 +16,41 @@
*/
package org.apache.tika.parser.iwork;
-import org.apache.tika.Tika;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
class PagesContentHandler extends DefaultHandler {
+ private static String alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
private final XHTMLContentHandler xhtml;
private final Metadata metadata;
-
- /** The (interesting) part of the document we're in. Should be more structured... */
- private enum DocumentPart {
- METADATA, PARSABLE_TEXT,
- HEADERS, HEADER_ODD, HEADER_EVEN, HEADER_FIRST,
- FOOTERS, FOOTER_ODD, FOOTER_EVEN, FOOTER_FIRST,
- FOOTNOTES, ANNOTATIONS;
- }
private DocumentPart inPart = null;
private boolean ghostText;
-
- private static String alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
-
private boolean parseProperty = false;
private int pageCount = 0;
private int slPageCount = 0;
-
private HeaderFooter headers = null;
private HeaderFooter footers = null;
- private Footnotes footnotes = null;
- private Annotations annotations = null;
-
- private Map<String, List<List<String>>> tableData =
- new HashMap<String, List<List<String>>>();
+ private Footnotes footnotes = null;
+ private Annotations annotations = null;
+ private Map<String, List<List<String>>> tableData = new HashMap<String, List<List<String>>>();
private String activeTableId;
private int numberOfColumns = 0;
private List<String> activeRow = new ArrayList<String>();
-
private String metaDataLocalName;
private String metaDataQName;
-
PagesContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
this.xhtml = xhtml;
this.metadata = metadata;
@@ -81,17 +66,16 @@ class PagesContentHandler extends DefaultHandler {
}
@Override
- public void startElement(
- String uri, String localName, String qName, Attributes attributes)
+ public void startElement(String uri, String localName, String qName, Attributes attributes)
throws SAXException {
if (parseProperty) {
String value = parsePrimitiveElementValue(qName, attributes);
if (value != null) {
Object metaDataKey = resolveMetaDataKey(metaDataLocalName);
- if(metaDataKey instanceof Property) {
- metadata.set((Property)metaDataKey, value);
+ if (metaDataKey instanceof Property) {
+ metadata.set((Property) metaDataKey, value);
} else {
- metadata.add((String)metaDataKey, value);
+ metadata.add((String) metaDataKey, value);
}
}
}
@@ -99,7 +83,7 @@ class PagesContentHandler extends DefaultHandler {
if ("sl:publication-info".equals(qName)) {
inPart = DocumentPart.METADATA;
} else if ("sf:metadata".equals(qName)) {
- inPart = DocumentPart.METADATA;
+ inPart = DocumentPart.METADATA;
} else if ("sf:page-start".equals(qName) || "sl:page-group".equals(qName)) {
if (pageCount > 0) {
doFooter();
@@ -113,10 +97,10 @@ class PagesContentHandler extends DefaultHandler {
}
doHeader();
} else if ("sf:p".equals(qName)) {
- if (pageCount+slPageCount > 0) {
- inPart = DocumentPart.PARSABLE_TEXT;
- xhtml.startElement("p");
- }
+ if (pageCount + slPageCount > 0) {
+ inPart = DocumentPart.PARSABLE_TEXT;
+ xhtml.startElement("p");
+ }
} else if ("sf:attachment".equals(qName)) {
String kind = attributes.getValue("sf:kind");
if ("tabular-attachment".equals(kind)) {
@@ -130,56 +114,55 @@ class PagesContentHandler extends DefaultHandler {
headers = new HeaderFooter(qName);
inPart = DocumentPart.HEADERS;
} else if ("sf:footers".equals(qName)) {
- footers = new HeaderFooter(qName);
- inPart = DocumentPart.FOOTERS;
+ footers = new HeaderFooter(qName);
+ inPart = DocumentPart.FOOTERS;
} else if ("sf:header".equals(qName)) {
inPart = headers.identifyPart(attributes.getValue("sf:name"));
} else if ("sf:footer".equals(qName)) {
- inPart = footers.identifyPart(attributes.getValue("sf:name"));
- } else if ("sf:page-number".equals(qName)) {
- if (inPart == DocumentPart.FOOTER_ODD
- || inPart == DocumentPart.FOOTER_FIRST
- || inPart == DocumentPart.FOOTER_EVEN) {
- // We are in a footer
- footers.hasAutoPageNumber = true;
- footers.autoPageNumberFormat = attributes.getValue("sf:format");
- } else {
- headers.hasAutoPageNumber = true;
- headers.autoPageNumberFormat = attributes.getValue("sf:format");
- }
-
- xhtml.characters(Integer.toString(this.pageCount));
+ inPart = footers.identifyPart(attributes.getValue("sf:name"));
+ } else if ("sf:page-number".equals(qName)) {
+ if (inPart == DocumentPart.FOOTER_ODD || inPart == DocumentPart.FOOTER_FIRST ||
+ inPart == DocumentPart.FOOTER_EVEN) {
+ // We are in a footer
+ footers.hasAutoPageNumber = true;
+ footers.autoPageNumberFormat = attributes.getValue("sf:format");
+ } else {
+ headers.hasAutoPageNumber = true;
+ headers.autoPageNumberFormat = attributes.getValue("sf:format");
+ }
+
+ xhtml.characters(Integer.toString(this.pageCount));
} else if ("sf:footnotes".equals(qName)) {
- footnotes = new Footnotes();
- inPart = DocumentPart.FOOTNOTES;
+ footnotes = new Footnotes();
+ inPart = DocumentPart.FOOTNOTES;
} else if ("sf:footnote-mark".equals(qName)) {
- footnotes.recordMark(attributes.getValue("sf:mark"));
+ footnotes.recordMark(attributes.getValue("sf:mark"));
} else if ("sf:footnote".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
- // What about non auto-numbered?
- String footnoteMark = attributes.getValue("sf:autonumber");
- if (footnotes != null) {
- String footnoteText = footnotes.footnotes.get(footnoteMark);
- if (footnoteText != null) {
- xhtml.startElement("div", "style", "footnote");
- xhtml.characters("Footnote:" ); // As shown in Pages
- xhtml.characters(footnoteText);
- xhtml.endElement("div");
- }
- }
+ // What about non auto-numbered?
+ String footnoteMark = attributes.getValue("sf:autonumber");
+ if (footnotes != null) {
+ String footnoteText = footnotes.footnotes.get(footnoteMark);
+ if (footnoteText != null) {
+ xhtml.startElement("div", "style", "footnote");
+ xhtml.characters("Footnote:"); // As shown in Pages
+ xhtml.characters(footnoteText);
+ xhtml.endElement("div");
+ }
+ }
} else if ("sf:annotations".equals(qName)) {
- annotations = new Annotations();
- inPart = DocumentPart.ANNOTATIONS;
+ annotations = new Annotations();
+ inPart = DocumentPart.ANNOTATIONS;
} else if ("sf:annotation".equals(qName) && inPart == DocumentPart.ANNOTATIONS) {
- annotations.start(attributes.getValue("sf:target"));
+ annotations.start(attributes.getValue("sf:target"));
} else if ("sf:annotation-field".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
- xhtml.startElement("div", "style", "annotated");
-
- String annotationText = annotations.annotations.get(attributes.getValue("sfa:ID"));
- if (annotationText != null) {
- xhtml.startElement("div", "style", "annotation");
- xhtml.characters(annotationText);
- xhtml.endElement("div");
- }
+ xhtml.startElement("div", "style", "annotated");
+
+ String annotationText = annotations.annotations.get(attributes.getValue("sfa:ID"));
+ if (annotationText != null) {
+ xhtml.startElement("div", "style", "annotation");
+ xhtml.characters(annotationText);
+ xhtml.endElement("div");
+ }
} else if ("sf:ghost-text".equals(qName)) {
ghostText = true;
}
@@ -196,8 +179,7 @@ class PagesContentHandler extends DefaultHandler {
}
@Override
- public void endElement(String uri, String localName, String qName)
- throws SAXException {
+ public void endElement(String uri, String localName, String qName) throws SAXException {
if (metaDataLocalName != null && metaDataLocalName.equals(localName)) {
metaDataLocalName = null;
parseProperty = false;
@@ -207,7 +189,7 @@ class PagesContentHandler extends DefaultHandler {
inPart = null;
} else if ("sf:metadata".equals(qName)) {
inPart = null;
- } else if ("sf:p".equals(qName) && (pageCount+slPageCount) > 0) {
+ } else if ("sf:p".equals(qName) && (pageCount + slPageCount) > 0) {
inPart = null;
xhtml.endElement("p");
} else if ("sf:attachment".equals(qName)) {
@@ -224,21 +206,37 @@ class PagesContentHandler extends DefaultHandler {
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
if (length > 0) {
- if (inPart == DocumentPart.PARSABLE_TEXT) {
- if (!ghostText) {
- xhtml.characters(ch, start, length);
- }
- } else if(inPart != null) {
- String str = new String(ch, start, length);
- if (inPart == DocumentPart.HEADER_FIRST) headers.defaultFirst = str;
- if (inPart == DocumentPart.HEADER_EVEN) headers.defaultEven = str;
- if (inPart == DocumentPart.HEADER_ODD) headers.defaultOdd = str;
- if (inPart == DocumentPart.FOOTER_FIRST) footers.defaultFirst = str;
- if (inPart == DocumentPart.FOOTER_EVEN) footers.defaultEven = str;
- if (inPart == DocumentPart.FOOTER_ODD) footers.defaultOdd = str;
- if (inPart == DocumentPart.FOOTNOTES) footnotes.text(str);
- if (inPart == DocumentPart.ANNOTATIONS) annotations.text(str);
- }
+ if (inPart == DocumentPart.PARSABLE_TEXT) {
+ if (!ghostText) {
+ xhtml.characters(ch, start, length);
+ }
+ } else if (inPart != null) {
+ String str = new String(ch, start, length);
+ if (inPart == DocumentPart.HEADER_FIRST) {
+ headers.defaultFirst = str;
+ }
+ if (inPart == DocumentPart.HEADER_EVEN) {
+ headers.defaultEven = str;
+ }
+ if (inPart == DocumentPart.HEADER_ODD) {
+ headers.defaultOdd = str;
+ }
+ if (inPart == DocumentPart.FOOTER_FIRST) {
+ footers.defaultFirst = str;
+ }
+ if (inPart == DocumentPart.FOOTER_EVEN) {
+ footers.defaultEven = str;
+ }
+ if (inPart == DocumentPart.FOOTER_ODD) {
+ footers.defaultOdd = str;
+ }
+ if (inPart == DocumentPart.FOOTNOTES) {
+ footnotes.text(str);
+ }
+ if (inPart == DocumentPart.ANNOTATIONS) {
+ annotations.text(str);
+ }
+ }
}
}
@@ -309,8 +307,7 @@ class PagesContentHandler extends DefaultHandler {
* value to be extracted
* @return the value of a primitive element
*/
- private String parsePrimitiveElementValue(
- String qName, Attributes attributes) {
+ private String parsePrimitiveElementValue(String qName, Attributes attributes) {
if ("sl:string".equals(qName) || "sf:string".equals(qName)) {
return attributes.getValue("sfa:string");
} else if ("sl:number".equals(qName)) {
@@ -321,130 +318,157 @@ class PagesContentHandler extends DefaultHandler {
return null;
}
-
+
private void doHeader() throws SAXException {
- if (headers != null) {
- headers.output("header");
- }
+ if (headers != null) {
+ headers.output("header");
+ }
}
+
private void doFooter() throws SAXException {
- if (footers != null) {
- footers.output("footer");
- }
+ if (footers != null) {
+ footers.output("footer");
+ }
}
/**
- * Represents the Headers or Footers in a document
+ * The (interesting) part of the document we're in. Should be more structured...
*/
- private class HeaderFooter {
- private String type; // sf:headers or sf:footers
- private String defaultOdd;
- private String defaultEven;
- private String defaultFirst;
- private boolean hasAutoPageNumber;
- private String autoPageNumberFormat;
- // TODO Can there be custom ones?
-
- private HeaderFooter(String type) {
- this.type = type;
- }
- private DocumentPart identifyPart(String name) {
- if("SFWPDefaultOddHeaderIdentifier".equals(name))
- return DocumentPart.HEADER_ODD;
- if("SFWPDefaultEvenHeaderIdentifier".equals(name))
- return DocumentPart.HEADER_EVEN;
- if("SFWPDefaultFirstHeaderIdentifier".equals(name))
- return DocumentPart.HEADER_FIRST;
-
- if("SFWPDefaultOddFooterIdentifier".equals(name))
- return DocumentPart.FOOTER_ODD;
- if("SFWPDefaultEvenFooterIdentifier".equals(name))
- return DocumentPart.FOOTER_EVEN;
- if("SFWPDefaultFirstFooterIdentifier".equals(name))
- return DocumentPart.FOOTER_FIRST;
-
- return null;
- }
- private void output(String what) throws SAXException {
- String text = null;
- if (pageCount == 1 && defaultFirst != null) {
- text = defaultFirst;
- } else if (pageCount % 2 == 0 && defaultEven != null) {
- text = defaultEven;
- } else {
- text = defaultOdd;
- }
-
- if (text != null) {
- xhtml.startElement("div", "class", "header");
- xhtml.characters(text);
- if (hasAutoPageNumber) {
- if (autoPageNumberFormat == null) { // raw number
- xhtml.characters("\t" + pageCount);
- } else if (autoPageNumberFormat.equals("upper-roman")){
- xhtml.characters("\t" + AutoPageNumberUtils.asRomanNumerals(pageCount));
- } else if (autoPageNumberFormat.equals("lower-roman")){
- xhtml.characters("\t" + AutoPageNumberUtils.asRomanNumeralsLower(pageCount));
- } else if (autoPageNumberFormat.equals("upper-alpha")){
- xhtml.characters("\t" + AutoPageNumberUtils.asAlphaNumeric(pageCount));
- } else if (autoPageNumberFormat.equals("lower-alpha")){
- xhtml.characters("\t" + AutoPageNumberUtils.asAlphaNumericLower(pageCount));
- }
- }
- xhtml.endElement("div");
- }
- }
+ private enum DocumentPart {
+ METADATA, PARSABLE_TEXT, HEADERS, HEADER_ODD, HEADER_EVEN, HEADER_FIRST, FOOTERS,
+ FOOTER_ODD, FOOTER_EVEN, FOOTER_FIRST, FOOTNOTES, ANNOTATIONS;
}
+
/**
* Represents Footnotes in a document. The way these work
- * in the file format isn't very clean...
+ * in the file format isn't very clean...
*/
private static class Footnotes {
- /** Mark -> Text */
- Map<String,String> footnotes = new HashMap<String, String>();
- String lastSeenMark = null;
-
- /**
- * Normally happens before the text of the mark
- */
- private void recordMark(String mark) {
- lastSeenMark = mark;
- }
- private void text(String text) {
- if (lastSeenMark != null) {
- if (footnotes.containsKey(lastSeenMark)) {
- text = footnotes.get(lastSeenMark) + text;
- }
- footnotes.put(lastSeenMark, text);
- }
- }
+ /**
+ * Mark -> Text
+ */
+ Map<String, String> footnotes = new HashMap<String, String>();
+ String lastSeenMark = null;
+
+ /**
+ * Normally happens before the text of the mark
+ */
+ private void recordMark(String mark) {
+ lastSeenMark = mark;
+ }
+
+ private void text(String text) {
+ if (lastSeenMark != null) {
+ if (footnotes.containsKey(lastSeenMark)) {
+ text = footnotes.get(lastSeenMark) + text;
+ }
+ footnotes.put(lastSeenMark, text);
+ }
+ }
+ }
+
+ /**
+ * Represents the Headers or Footers in a document
+ */
+ private class HeaderFooter {
+ private String type; // sf:headers or sf:footers
+ private String defaultOdd;
+ private String defaultEven;
+ private String defaultFirst;
+ private boolean hasAutoPageNumber;
+ private String autoPageNumberFormat;
+ // TODO Can there be custom ones?
+
+ private HeaderFooter(String type) {
+ this.type = type;
+ }
+
+ private DocumentPart identifyPart(String name) {
+ if ("SFWPDefaultOddHeaderIdentifier".equals(name)) {
+ return DocumentPart.HEADER_ODD;
+ }
+ if ("SFWPDefaultEvenHeaderIdentifier".equals(name)) {
+ return DocumentPart.HEADER_EVEN;
+ }
+ if ("SFWPDefaultFirstHeaderIdentifier".equals(name)) {
+ return DocumentPart.HEADER_FIRST;
+ }
+
+ if ("SFWPDefaultOddFooterIdentifier".equals(name)) {
+ return DocumentPart.FOOTER_ODD;
+ }
+ if ("SFWPDefaultEvenFooterIdentifier".equals(name)) {
+ return DocumentPart.FOOTER_EVEN;
+ }
+ if ("SFWPDefaultFirstFooterIdentifier".equals(name)) {
+ return DocumentPart.FOOTER_FIRST;
+ }
+
+ return null;
+ }
+
+ private void output(String what) throws SAXException {
+ String text = null;
+ if (pageCount == 1 && defaultFirst != null) {
+ text = defaultFirst;
+ } else if (pageCount % 2 == 0 && defaultEven != null) {
+ text = defaultEven;
+ } else {
+ text = defaultOdd;
+ }
+
+ if (text != null) {
+ xhtml.startElement("div", "class", "header");
+ xhtml.characters(text);
+ if (hasAutoPageNumber) {
+ if (autoPageNumberFormat == null) { // raw number
+ xhtml.characters("\t" + pageCount);
+ } else if (autoPageNumberFormat.equals("upper-roman")) {
+ xhtml.characters("\t" + AutoPageNumberUtils.asRomanNumerals(pageCount));
+ } else if (autoPageNumberFormat.equals("lower-roman")) {
+ xhtml.characters(
+ "\t" + AutoPageNumberUtils.asRomanNumeralsLower(pageCount));
+ } else if (autoPageNumberFormat.equals("upper-alpha")) {
+ xhtml.characters("\t" + AutoPageNumberUtils.asAlphaNumeric(pageCount));
+ } else if (autoPageNumberFormat.equals("lower-alpha")) {
+ xhtml.characters("\t" + AutoPageNumberUtils.asAlphaNumericLower(pageCount));
+ }
+ }
+ xhtml.endElement("div");
+ }
+ }
}
+
/**
* Represents Annotations in a document. We currently
- * just grab all the sf:p text in each one
+ * just grab all the sf:p text in each one
*/
private class Annotations {
- /** ID -> Text */
- Map<String,String> annotations = new HashMap<String, String>();
- String currentID = null;
- StringBuffer currentText = null;
-
- private void start(String id) {
- currentID = id;
- currentText = new StringBuffer();
- }
- private void text(String text) {
- if (text != null && text.length() > 0 && currentText != null) {
- currentText.append(text);
- }
- }
- private void end() {
- if (currentText.length() > 0) {
- annotations.put(currentID, currentText.toString());
- currentID = null;
- currentText = null;
- }
- }
+ /**
+ * ID -> Text
+ */
+ Map<String, String> annotations = new HashMap<String, String>();
+ String currentID = null;
+ StringBuffer currentText = null;
+
+ private void start(String id) {
+ currentID = id;
+ currentText = new StringBuffer();
+ }
+
+ private void text(String text) {
+ if (text != null && text.length() > 0 && currentText != null) {
+ currentText.append(text);
+ }
+ }
+
+ private void end() {
+ if (currentText.length() > 0) {
+ annotations.put(currentID, currentText.toString());
+ currentID = null;
+ currentText = null;
+ }
+ }
}
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
index 07b91d2..2476b0a 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
@@ -28,17 +28,77 @@ import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.commons.compress.archivers.zip.ZipFile;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
public class IWork13PackageParser extends AbstractParser {
+ /**
+ * All iWork 13 files contain this, so we can detect based on it
+ */
+ public final static String IWORK13_COMMON_ENTRY = "Metadata/BuildVersionHistory.plist";
+ private final static Set<MediaType> supportedTypes = Collections.unmodifiableSet(
+ new HashSet<MediaType>(Arrays.asList(IWork13DocumentType.KEYNOTE13.getType(),
+ IWork13DocumentType.NUMBERS13.getType(),
+ IWork13DocumentType.PAGES13.getType())));
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return supportedTypes;
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
+ // Open the Zip stream
+ // Use a File if we can, and an already open zip is even better
+ ZipFile zipFile = null;
+ ZipInputStream zipStream = null;
+ if (stream instanceof TikaInputStream) {
+ TikaInputStream tis = (TikaInputStream) stream;
+ Object container = ((TikaInputStream) stream).getOpenContainer();
+ if (container instanceof ZipFile) {
+ zipFile = (ZipFile) container;
+ } else if (tis.hasFile()) {
+ zipFile = new ZipFile(tis.getFile());
+ } else {
+ zipStream = new ZipInputStream(stream);
+ }
+ } else {
+ zipStream = new ZipInputStream(stream);
+ }
+
+ // For now, just detect
+ MediaType type = null;
+ if (zipFile != null) {
+ Enumeration<? extends ZipEntry> entries = zipFile.getEntries();
+ while (entries.hasMoreElements()) {
+ ZipEntry entry = entries.nextElement();
+ if (type == null) {
+ type = IWork13DocumentType.detectIfPossible(entry);
+ }
+ }
+ } else {
+ ZipEntry entry = zipStream.getNextEntry();
+ while (entry != null) {
+ if (type == null) {
+ type = IWork13DocumentType.detectIfPossible(entry);
+ }
+ entry = zipStream.getNextEntry();
+ }
+ }
+ if (type != null) {
+ metadata.add(Metadata.CONTENT_TYPE, type.toString());
+ }
+ }
+
public enum IWork13DocumentType {
KEYNOTE13(MediaType.application("vnd.apple.keynote.13")),
NUMBERS13(MediaType.application("vnd.apple.numbers.13")),
@@ -51,109 +111,51 @@ public class IWork13PackageParser extends AbstractParser {
this.mediaType = mediaType;
}
- public MediaType getType() {
- return mediaType;
- }
-
public static MediaType detect(ZipFile zipFile) {
- MediaType type = null;
- Enumeration<? extends ZipEntry> entries = zipFile.getEntries();
- while (entries.hasMoreElements()) {
- ZipEntry entry = entries.nextElement();
- type = IWork13DocumentType.detectIfPossible(entry);
- if (type != null) return type;
- }
-
- // If we get here, we don't know what it is
- return UNKNOWN13.getType();
+ MediaType type = null;
+ Enumeration<? extends ZipEntry> entries = zipFile.getEntries();
+ while (entries.hasMoreElements()) {
+ ZipEntry entry = entries.nextElement();
+ type = IWork13DocumentType.detectIfPossible(entry);
+ if (type != null) {
+ return type;
+ }
+ }
+
+ // If we get here, we don't know what it is
+ return UNKNOWN13.getType();
}
-
+
/**
* @return Specific type if this identifies one, otherwise null
*/
public static MediaType detectIfPossible(ZipEntry entry) {
- String name = entry.getName();
- if (! name.endsWith(".iwa")) return null;
-
- // Is it a uniquely identifying filename?
- if (name.equals("Index/MasterSlide.iwa") ||
- name.startsWith("Index/MasterSlide-")) {
- return KEYNOTE13.getType();
- }
- if (name.equals("Index/Slide.iwa") ||
- name.startsWith("Index/Slide-")) {
- return KEYNOTE13.getType();
- }
-
- // Is it the main document?
- if (name.equals("Index/Document.iwa")) {
- // TODO Decode the snappy stream, and check for the Message Type
- // = 2 (TN::SheetArchive), it is a numbers file;
- // = 10000 (TP::DocumentArchive), that's a pages file
- }
-
- // Unknown
- return null;
+ String name = entry.getName();
+ if (!name.endsWith(".iwa")) {
+ return null;
+ }
+
+ // Is it a uniquely identifying filename?
+ if (name.equals("Index/MasterSlide.iwa") || name.startsWith("Index/MasterSlide-")) {
+ return KEYNOTE13.getType();
+ }
+ if (name.equals("Index/Slide.iwa") || name.startsWith("Index/Slide-")) {
+ return KEYNOTE13.getType();
+ }
+
+ // Is it the main document?
+ if (name.equals("Index/Document.iwa")) {
+ // TODO Decode the snappy stream, and check for the Message Type
+ // = 2 (TN::SheetArchive), it is a numbers file;
+ // = 10000 (TP::DocumentArchive), that's a pages file
+ }
+
+ // Unknown
+ return null;
}
- }
-
- /**
- * All iWork 13 files contain this, so we can detect based on it
- */
- public final static String IWORK13_COMMON_ENTRY = "Metadata/BuildVersionHistory.plist";
-
- private final static Set<MediaType> supportedTypes = Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- IWork13DocumentType.KEYNOTE13.getType(),
- IWork13DocumentType.NUMBERS13.getType(),
- IWork13DocumentType.PAGES13.getType()
- )));
- @Override
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return supportedTypes;
- }
-
- @Override
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
- // Open the Zip stream
- // Use a File if we can, and an already open zip is even better
- ZipFile zipFile = null;
- ZipInputStream zipStream = null;
- if (stream instanceof TikaInputStream) {
- TikaInputStream tis = (TikaInputStream) stream;
- Object container = ((TikaInputStream) stream).getOpenContainer();
- if (container instanceof ZipFile) {
- zipFile = (ZipFile) container;
- } else if (tis.hasFile()) {
- zipFile = new ZipFile(tis.getFile());
- } else {
- zipStream = new ZipInputStream(stream);
- }
- } else {
- zipStream = new ZipInputStream(stream);
- }
-
- // For now, just detect
- MediaType type = null;
- if (zipFile != null) {
- Enumeration<? extends ZipEntry> entries = zipFile.getEntries();
- while (entries.hasMoreElements()) {
- ZipEntry entry = entries.nextElement();
- if (type == null) {
- type = IWork13DocumentType.detectIfPossible(entry);
- }
- }
- } else {
- ZipEntry entry = zipStream.getNextEntry();
- while (entry != null) {
- if (type == null) {
- type = IWork13DocumentType.detectIfPossible(entry);
- }
- entry = zipStream.getNextEntry();
- }
- }
- if (type != null) {
- metadata.add(Metadata.CONTENT_TYPE, type.toString());
- }
+ public MediaType getType() {
+ return mediaType;
+ }
}
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork18PackageParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork18PackageParser.java
index 7d58fa0..a31e580 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork18PackageParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork18PackageParser.java
@@ -17,16 +17,6 @@
package org.apache.tika.parser.iwork.iwana;
-import org.apache.commons.compress.archivers.zip.ZipFile;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
@@ -37,12 +27,78 @@ import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
+import org.apache.commons.compress.archivers.zip.ZipFile;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+
/**
* For now, this parser isn't even registered. It contains
* code that will detect the newer 2018 .keynote, .numbers, .pages files.
*/
public class IWork18PackageParser extends AbstractParser {
+ private final static Set<MediaType> supportedTypes = Collections.unmodifiableSet(
+ new HashSet<MediaType>(Arrays.asList(IWork18DocumentType.KEYNOTE18.getType(),
+ IWork18DocumentType.NUMBERS18.getType(),
+ IWork18DocumentType.PAGES18.getType())));
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return supportedTypes;
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
+ // Open the Zip stream
+ // Use a File if we can, and an already open zip is even better
+ ZipFile zipFile = null;
+ ZipInputStream zipStream = null;
+ if (stream instanceof TikaInputStream) {
+ TikaInputStream tis = (TikaInputStream) stream;
+ Object container = ((TikaInputStream) stream).getOpenContainer();
+ if (container instanceof ZipFile) {
+ zipFile = (ZipFile) container;
+ } else if (tis.hasFile()) {
+ zipFile = new ZipFile(tis.getFile());
+ } else {
+ zipStream = new ZipInputStream(stream);
+ }
+ } else {
+ zipStream = new ZipInputStream(stream);
+ }
+
+ // For now, just detect
+ MediaType type = null;
+ if (zipFile != null) {
+ Enumeration<? extends ZipEntry> entries = zipFile.getEntries();
+ while (entries.hasMoreElements()) {
+ ZipEntry entry = entries.nextElement();
+ if (type == null) {
+ type = IWork18DocumentType.detectIfPossible(entry);
+ }
+ }
+ } else {
+ ZipEntry entry = zipStream.getNextEntry();
+ while (entry != null) {
+ if (type == null) {
+ type = IWork18DocumentType.detectIfPossible(entry);
+ }
+ entry = zipStream.getNextEntry();
+ }
+ }
+ if (type != null) {
+ metadata.add(Metadata.CONTENT_TYPE, type.toString());
+ }
+ }
+
public enum IWork18DocumentType {
KEYNOTE18(MediaType.application("vnd.apple.keynote.18")),
NUMBERS18(MediaType.application("vnd.apple.numbers.18")),
@@ -54,97 +110,43 @@ public class IWork18PackageParser extends AbstractParser {
this.mediaType = mediaType;
}
- public MediaType getType() {
- return mediaType;
- }
-
/**
- *
* @param zipFile
* @return mime if detected or null
*/
public static MediaType detect(ZipFile zipFile) {
- MediaType type = null;
- Enumeration<? extends ZipEntry> entries = zipFile.getEntries();
- while (entries.hasMoreElements()) {
- ZipEntry entry = entries.nextElement();
- type = IWork18DocumentType.detectIfPossible(entry);
- if (type != null) return type;
- }
-
- // If we get here, we don't know what it is
- return null;
+ MediaType type = null;
+ Enumeration<? extends ZipEntry> entries = zipFile.getEntries();
+ while (entries.hasMoreElements()) {
+ ZipEntry entry = entries.nextElement();
+ type = IWork18DocumentType.detectIfPossible(entry);
+ if (type != null) {
+ return type;
+ }
+ }
+
+ // If we get here, we don't know what it is
+ return null;
}
-
+
/**
* @return Specific type if this identifies one, otherwise null
*/
public static MediaType detectIfPossible(ZipEntry entry) {
- String name = entry.getName();
- if (name.endsWith(".numbers/Metadata/BuildVersionHistory.plist")) {
- return IWork18DocumentType.NUMBERS18.getType();
- } else if (name.endsWith(".pages/Metadata/BuildVersionHistory.plist")) {
- return IWork18DocumentType.PAGES18.getType();
- } else if (name.endsWith(".key/Metadata/BuildVersionHistory.plist")) {
+ String name = entry.getName();
+ if (name.endsWith(".numbers/Metadata/BuildVersionHistory.plist")) {
+ return IWork18DocumentType.NUMBERS18.getType();
+ } else if (name.endsWith(".pages/Metadata/BuildVersionHistory.plist")) {
+ return IWork18DocumentType.PAGES18.getType();
+ } else if (name.endsWith(".key/Metadata/BuildVersionHistory.plist")) {
return IWork18DocumentType.KEYNOTE18.getType();
- }
- // Unknown
- return null;
+ }
+ // Unknown
+ return null;
}
- }
-
- private final static Set<MediaType> supportedTypes = Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- IWork18DocumentType.KEYNOTE18.getType(),
- IWork18DocumentType.NUMBERS18.getType(),
- IWork18DocumentType.PAGES18.getType()
- )));
-
- @Override
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return supportedTypes;
- }
- @Override
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
- // Open the Zip stream
- // Use a File if we can, and an already open zip is even better
- ZipFile zipFile = null;
- ZipInputStream zipStream = null;
- if (stream instanceof TikaInputStream) {
- TikaInputStream tis = (TikaInputStream) stream;
- Object container = ((TikaInputStream) stream).getOpenContainer();
- if (container instanceof ZipFile) {
- zipFile = (ZipFile) container;
- } else if (tis.hasFile()) {
- zipFile = new ZipFile(tis.getFile());
- } else {
- zipStream = new ZipInputStream(stream);
- }
- } else {
- zipStream = new ZipInputStream(stream);
- }
-
- // For now, just detect
- MediaType type = null;
- if (zipFile != null) {
- Enumeration<? extends ZipEntry> entries = zipFile.getEntries();
- while (entries.hasMoreElements()) {
- ZipEntry entry = entries.nextElement();
- if (type == null) {
- type = IWork18DocumentType.detectIfPossible(entry);
- }
- }
- } else {
- ZipEntry entry = zipStream.getNextEntry();
- while (entry != null) {
- if (type == null) {
- type = IWork18DocumentType.detectIfPossible(entry);
- }
- entry = zipStream.getNextEntry();
- }
- }
- if (type != null) {
- metadata.add(Metadata.CONTENT_TYPE, type.toString());
- }
+ public MediaType getType() {
+ return mediaType;
+ }
}
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/apple/PListParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/apple/PListParserTest.java
index 15f7a7c..35733cb 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/apple/PListParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/apple/PListParserTest.java
@@ -16,14 +16,15 @@
*/
package org.apache.tika.parser.apple;
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.junit.Test;
+import static org.junit.Assert.assertEquals;
import java.util.List;
-import static org.junit.Assert.assertEquals;
+import org.junit.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
public class PListParserTest extends TikaTest {
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java
index 65e7121..283249c 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java
@@ -18,7 +18,6 @@ package org.apache.tika.parser.iwork;
import static org.junit.Assert.assertEquals;
-import org.apache.tika.parser.iwork.AutoPageNumberUtils;
import org.junit.Test;
/**
@@ -26,54 +25,54 @@ import org.junit.Test;
*/
public class AutoPageNumberUtilsTest {
- /**
- * Check upper-case alpha-numeric numbers are generated based on the
- * input page number.
- */
+ /**
+ * Check upper-case alpha-numeric numbers are generated based on the
+ * input page number.
+ */
@Test
- public void testAlphaUpper() {
- assertEquals("A", AutoPageNumberUtils.asAlphaNumeric(1));
- assertEquals("Z", AutoPageNumberUtils.asAlphaNumeric(26));
- assertEquals("AA", AutoPageNumberUtils.asAlphaNumeric(27));
- assertEquals("ZZ", AutoPageNumberUtils.asAlphaNumeric(52));
- assertEquals("AAA", AutoPageNumberUtils.asAlphaNumeric(53));
- assertEquals("ZZZ", AutoPageNumberUtils.asAlphaNumeric(78));
- }
+ public void testAlphaUpper() {
+ assertEquals("A", AutoPageNumberUtils.asAlphaNumeric(1));
+ assertEquals("Z", AutoPageNumberUtils.asAlphaNumeric(26));
+ assertEquals("AA", AutoPageNumberUtils.asAlphaNumeric(27));
+ assertEquals("ZZ", AutoPageNumberUtils.asAlphaNumeric(52));
+ assertEquals("AAA", AutoPageNumberUtils.asAlphaNumeric(53));
+ assertEquals("ZZZ", AutoPageNumberUtils.asAlphaNumeric(78));
+ }
- /**
- * Check lower-case alpha-numeric numbers are generated based on the
- * input page number.
- */
+ /**
+ * Check lower-case alpha-numeric numbers are generated based on the
+ * input page number.
+ */
@Test
- public void testAlphaLower() {
- assertEquals("a", AutoPageNumberUtils.asAlphaNumericLower(1));
- assertEquals("z", AutoPageNumberUtils.asAlphaNumericLower(26));
- assertEquals("aa", AutoPageNumberUtils.asAlphaNumericLower(27));
- assertEquals("zz", AutoPageNumberUtils.asAlphaNumericLower(52));
- assertEquals("aaa", AutoPageNumberUtils.asAlphaNumericLower(53));
- assertEquals("zzz", AutoPageNumberUtils.asAlphaNumericLower(78));
- }
+ public void testAlphaLower() {
+ assertEquals("a", AutoPageNumberUtils.asAlphaNumericLower(1));
+ assertEquals("z", AutoPageNumberUtils.asAlphaNumericLower(26));
+ assertEquals("aa", AutoPageNumberUtils.asAlphaNumericLower(27));
+ assertEquals("zz", AutoPageNumberUtils.asAlphaNumericLower(52));
+ assertEquals("aaa", AutoPageNumberUtils.asAlphaNumericLower(53));
+ assertEquals("zzz", AutoPageNumberUtils.asAlphaNumericLower(78));
+ }
- /**
- * Check upper-case Roman numerals numbers are generated based on the
- * input page number.
- */
+ /**
+ * Check upper-case Roman numerals numbers are generated based on the
+ * input page number.
+ */
@Test
- public void testRomanUpper() {
- assertEquals("I", AutoPageNumberUtils.asRomanNumerals(1));
- assertEquals("XXVI", AutoPageNumberUtils.asRomanNumerals(26));
- assertEquals("XXVII", AutoPageNumberUtils.asRomanNumerals(27));
- }
+ public void testRomanUpper() {
+ assertEquals("I", AutoPageNumberUtils.asRomanNumerals(1));
+ assertEquals("XXVI", AutoPageNumberUtils.asRomanNumerals(26));
+ assertEquals("XXVII", AutoPageNumberUtils.asRomanNumerals(27));
+ }
- /**
- * Check lower-case Roman numerals numbers are generated based on the
- * input page number.
- */
+ /**
+ * Check lower-case Roman numerals numbers are generated based on the
+ * input page number.
+ */
@Test
- public void testRomanLower() {
- assertEquals("i", AutoPageNumberUtils.asRomanNumeralsLower(1));
- assertEquals("xxvi", AutoPageNumberUtils.asRomanNumeralsLower(26));
- assertEquals("xxvii", AutoPageNumberUtils.asRomanNumeralsLower(27));
- }
+ public void testRomanLower() {
+ assertEquals("i", AutoPageNumberUtils.asRomanNumeralsLower(1));
+ assertEquals("xxvi", AutoPageNumberUtils.asRomanNumeralsLower(26));
+ assertEquals("xxvii", AutoPageNumberUtils.asRomanNumeralsLower(27));
+ }
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
index d45e67c..3695cca 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
@@ -16,7 +16,6 @@
*/
package org.apache.tika.parser.iwork;
-import static org.apache.tika.TikaTest.assertContains;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
@@ -24,14 +23,15 @@ import java.io.InputStream;
import java.util.Arrays;
import java.util.List;
+import org.junit.Before;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Before;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
/**
* Tests if the IWork parser parses the content and metadata properly of the supported formats.
@@ -68,12 +68,17 @@ public class IWorkParserTest extends TikaTest {
// (Exact numbers will vary based on composites)
assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 6);
List<String> metadataKeys = Arrays.asList(metadata.names());
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Office.SLIDE_COUNT.getName()));
-// assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Office.SLIDE_COUNT.getName()));
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
-
+ assertTrue("Metadata not found in " + metadataKeys,
+ metadataKeys.contains(Metadata.CONTENT_TYPE));
+ assertTrue("Metadata not found in " + metadataKeys,
+ metadataKeys.contains(Office.SLIDE_COUNT.getName()));
+// assertTrue("Metadata not found in " + metadataKeys,
+// metadataKeys.contains(Office.SLIDE_COUNT.getName()));
+ assertTrue("Metadata not found in " + metadataKeys,
+ metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
+ assertTrue("Metadata not found in " + metadataKeys,
+ metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
+
// Check the metadata values
assertEquals("application/vnd.apple.keynote", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("3", metadata.get(Office.SLIDE_COUNT));
@@ -112,7 +117,8 @@ public class IWorkParserTest extends TikaTest {
@Test
public void testKeynoteBulletPoints() throws Exception {
String content = getText("testBulletPoints.key", iWorkParser);
- assertTrue(content.replaceAll("\\s+", " ").contains("bullet point 1 bullet point 2 bullet point 3"));
+ assertTrue(content.replaceAll("\\s+", " ")
+ .contains("bullet point 1 bullet point 2 bullet point 3"));
}
// TIKA-923
@@ -141,13 +147,19 @@ public class IWorkParserTest extends TikaTest {
// (Exact numbers will vary based on composites)
assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 50);
List<String> metadataKeys = Arrays.asList(metadata.names());
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Office.PAGE_COUNT.getName()));
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.MODIFIED.getName()));
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.LANGUAGE.getName()));
-
+ assertTrue("Metadata not found in " + metadataKeys,
+ metadataKeys.contains(Metadata.CONTENT_TYPE));
+ assertTrue("Metadata not found in " + metadataKeys,
+ metadataKeys.contains(Office.PAGE_COUNT.getName()));
+ assertTrue("Metadata not found in " + metadataKeys,
+ metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
+ assertTrue("Metadata not found in " + metadataKeys,
+ metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
+ assertTrue("Metadata not found in " + metadataKeys,
+ metadataKeys.contains(TikaCoreProperties.MODIFIED.getName()));
+ assertTrue("Metadata not found in " + metadataKeys,
+ metadataKeys.contains(TikaCoreProperties.LANGUAGE.getName()));
+
// Check the metadata values
assertEquals("application/vnd.apple.pages", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("Tika user", metadata.get(TikaCoreProperties.CREATOR));
@@ -196,13 +208,19 @@ public class IWorkParserTest extends TikaTest {
// (Exact numbers will vary based on composites)
assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 8);
List<String> metadataKeys = Arrays.asList(metadata.names());
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Office.PAGE_COUNT.getName()));
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.COMMENTS.getName()));
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
- assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
-
+ assertTrue("Metadata not found in " + metadataKeys,
+ metadataKeys.contains(Metadata.CONTENT_TYPE));
+ assertTrue("Metadata not found in " + metadataKeys,
+ metadataKeys.contains(Office.PAGE_COUNT.getName()));
+ assertTrue("Metadata not found in " + metadataKeys,
+ metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
+ assertTrue("Metadata not found in " + metadataKeys,
+ metadataKeys.contains(TikaCoreProperties.COMMENTS.getName()));
+ assertTrue("Metadata not found in " + metadataKeys,
+ metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
+ assertTrue("Metadata not found in " + metadataKeys,
+ metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
+
// Check the metadata values
assertEquals("2", metadata.get(Office.PAGE_COUNT));
assertEquals("Tika User", metadata.get(TikaCoreProperties.CREATOR));
@@ -228,22 +246,22 @@ public class IWorkParserTest extends TikaTest {
String content = getText("tableNames.numbers", iWorkParser);
assertContains("This is the main table", content);
}
-
+
@Test
public void testParseNumbersTableHeaders() throws Exception {
String content = getText("tableHeaders.numbers");
- for(int header = 1;header <= 5;header++) {
- assertContains("header" + header, content);
+ for (int header = 1; header <= 5; header++) {
+ assertContains("header" + header, content);
}
- for(int row = 1;row <= 3;row++) {
- assertContains("row" + row, content);
+ for (int row = 1; row <= 3; row++) {
+ assertContains("row" + row, content);
}
}
/**
* We don't currently support password protected Pages files, as
- * we don't know how the encryption works (it's not regular Zip
- * Encryption). See TIKA-903 for details
+ * we don't know how the encryption works (it's not regular Zip
+ * Encryption). See TIKA-903 for details
*/
@Test
public void testParsePagesPasswordProtected() throws Exception {
@@ -251,11 +269,11 @@ public class IWorkParserTest extends TikaTest {
Metadata metadata = new Metadata();
String content = getText("testPagesPwdProtected.pages", iWorkParser, metadata);
assertEquals("", content);
-
+
// Will have been identified as encrypted
assertEquals("application/x-tika-iworks-protected", metadata.get(Metadata.CONTENT_TYPE));
}
-
+
/**
* Check we get headers, footers and footnotes from Pages
*/
@@ -272,46 +290,46 @@ public class IWorkParserTest extends TikaTest {
assertContains("Both Pages 1.x", content); // P1
assertContains("understanding the Pages document", content); // P1
assertContains("should be page 2", content); // P2
-
+
// Check for headers, footers and footnotes
assertContains(header, content);
assertContains(footer, content);
assertContains(footer2, content);
assertContains(footnote, content);
}
-
+
/**
* Check we get upper-case Roman numerals within the footer for AutoPageNumber.
*/
@Test
public void testParsePagesHeadersFootersRomanUpper() throws Exception {
- String header = "THIS IS SOME HEADER TEXT";
- String footer = "THIS IS SOME FOOTER TEXT\tI";
- String footer2 = "THIS IS SOME FOOTER TEXT\tII";
+ String header = "THIS IS SOME HEADER TEXT";
+ String footer = "THIS IS SOME FOOTER TEXT\tI";
+ String footer2 = "THIS IS SOME FOOTER TEXT\tII";
- String content = getText("testPagesHeadersFootersRomanUpper.pages", iWorkParser);
+ String content = getText("testPagesHeadersFootersRomanUpper.pages", iWorkParser);
- // Check for headers, footers and footnotes
- assertContains(header, content);
- assertContains(footer, content);
- assertContains(footer2, content);
+ // Check for headers, footers and footnotes
+ assertContains(header, content);
+ assertContains(footer, content);
+ assertContains(footer2, content);
}
-
+
/**
* Check we get lower-case Roman numerals within the footer for AutoPageNumber.
*/
@Test
public void testParsePagesHeadersFootersRomanLower() throws Exception {
- String header = "THIS IS SOME HEADER TEXT";
- String footer = "THIS IS SOME FOOTER TEXT\ti";
- String footer2 = "THIS IS SOME FOOTER TEXT\tii";
+ String header = "THIS IS SOME HEADER TEXT";
+ String footer = "THIS IS SOME FOOTER TEXT\ti";
+ String footer2 = "THIS IS SOME FOOTER TEXT\tii";
- String content = getText("testPagesHeadersFootersRomanLower.pages", iWorkParser);
+ String content = getText("testPagesHeadersFootersRomanLower.pages", iWorkParser);
- // Check for headers, footers and footnotes
- assertContains(header, content);
- assertContains(footer, content);
- assertContains(footer2, content);
+ // Check for headers, footers and footnotes
+ assertContains(header, content);
+ assertContains(footer, content);
+ assertContains(footer2, content);
}
/**
@@ -330,7 +348,7 @@ public class IWorkParserTest extends TikaTest {
assertContains(footer, content);
assertContains(footer2, content);
}
-
+
/**
* Check we get lower-case alpha-numeric letters within the footer for AutoPageNumber.
*/
@@ -347,7 +365,7 @@ public class IWorkParserTest extends TikaTest {
assertContains(footer, content);
assertContains(footer2, content);
}
-
+
/**
* Check we get annotations (eg comments) from Pages
*/
@@ -362,12 +380,12 @@ public class IWorkParserTest extends TikaTest {
assertContains("Both Pages 1.x", content); // P1
assertContains("understanding the Pages document", content); // P1
assertContains("should be page 2", content); // P2
-
+
// Check for comments
assertContains(commentA, content);
assertContains(commentB, content);
}
-
+
// TIKA-918
@Test
public void testNumbersExtractChartNames() throws Exception {
@@ -380,10 +398,11 @@ public class IWorkParserTest extends TikaTest {
//TIKA-3020
@Test
public void testKeyNoteTableMarkup() throws Exception {
- String expected = "<table><tr>\t<td>Cell one</td>\t<td>Cell two</td>\t<td>Cell three</td></tr>" +
- "<tr>\t<td>Cell four</td>\t<td>Cell 5</td>\t<td>Cell six</td></tr>" +
- "<tr>\t<td>7</td>\t<td>Cell eight</td>\t<td>5/5/1985</td></tr>" +
- "</table>";
+ String expected =
+ "<table><tr>\t<td>Cell one</td>\t<td>Cell two</td>\t<td>Cell three</td></tr>" +
+ "<tr>\t<td>Cell four</td>\t<td>Cell 5</td>\t<td>Cell six</td></tr>" +
+ "<tr>\t<td>7</td>\t<td>Cell eight</td>\t<td>5/5/1985</td></tr>" +
+ "</table>";
String xml = getXML("testKeynote.key", iWorkParser).xml;
xml = xml.replaceAll("[\r\n]", "");
assertContains(expected, xml);
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/iwana/IWork13ParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/iwana/IWork13ParserTest.java
index 0246fd7..93b1f28 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/iwana/IWork13ParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/iwana/IWork13ParserTest.java
@@ -20,18 +20,19 @@ import static org.junit.Assert.assertEquals;
import java.io.InputStream;
+import org.junit.Before;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Before;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
/**
* Limited testing for the iWorks 13 format parser, which
- * currently doesn't do anything more than detection....
+ * currently doesn't do anything more than detection....
*/
public class IWork13ParserTest extends TikaTest {
private IWork13PackageParser iWorkParser;
@@ -43,42 +44,42 @@ public class IWork13ParserTest extends TikaTest {
parseContext = new ParseContext();
parseContext.set(Parser.class, AUTO_DETECT_PARSER);
}
-
+
@Test
public void testParseKeynote13() throws Exception {
InputStream input = getResourceAsStream("/test-documents/testKeynote2013.key");
Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
iWorkParser.parse(input, handler, metadata, parseContext);
-
+
// Currently parsing is a no-op, so will only get the Type
assertEquals(1, metadata.size());
assertEquals("", handler.toString());
assertEquals(IWork13PackageParser.IWork13DocumentType.KEYNOTE13.getType().toString(),
- metadata.get(Metadata.CONTENT_TYPE));
+ metadata.get(Metadata.CONTENT_TYPE));
}
-
+
@Test
public void testParseNumbers13() throws Exception {
InputStream input = getResourceAsStream("/test-documents/testNumbers2013.numbers");
Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
iWorkParser.parse(input, handler, metadata, parseContext);
-
+
// Currently parsing is a no-op, and we can't get the type without
// decoding the Snappy stream
// TODO Test properly when a full Parser is added
assertEquals(0, metadata.size());
assertEquals("", handler.toString());
}
-
+
@Test
public void testParsePages13() throws Exception {
InputStream input = getResourceAsStream("/test-documents/testPages2013.pages");
Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
iWorkParser.parse(input, handler, metadata, parseContext);
-
+
// Currently parsing is a no-op, and we can't get the type without
// decoding the Snappy stream
// TODO Test properly when a full Parser is added
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/audio/AudioParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/audio/AudioParser.java
index 3acb0b6..1c3f4b9 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/audio/AudioParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/audio/AudioParser.java
@@ -25,7 +25,6 @@ import java.util.HashSet;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
-
import javax.sound.sampled.AudioFileFormat;
import javax.sound.sampled.AudioFileFormat.Type;
import javax.sound.sampled.AudioFormat;
@@ -34,6 +33,9 @@ import javax.sound.sampled.UnsupportedAudioFileException;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.ProxyInputStream;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.XMPDM;
@@ -41,29 +43,27 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
public class AudioParser extends AbstractParser {
- /** Serial version UID */
+ /**
+ * Serial version UID
+ */
private static final long serialVersionUID = -6015684081240882695L;
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- MediaType.audio("basic"),
- MediaType.audio("vnd.wave"), // Official, fixed in Tika 1.16
- MediaType.audio("x-wav"), // Older, used until Tika 1.16
- MediaType.audio("x-aiff"))));
+ private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
+ new HashSet<MediaType>(
+ Arrays.asList(MediaType.audio("basic"), MediaType.audio("vnd.wave"),
+ // Official, fixed in Tika 1.16
+ MediaType.audio("x-wav"), // Older, used until Tika 1.16
+ MediaType.audio("x-aiff"))));
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
// AudioSystem expects the stream to support the mark feature
if (!stream.markSupported()) {
stream = new BufferedInputStream(stream);
@@ -89,9 +89,7 @@ public class AudioParser extends AbstractParser {
float rate = audioFormat.getSampleRate();
if (rate != AudioSystem.NOT_SPECIFIED) {
metadata.set("samplerate", String.valueOf(rate));
- metadata.set(
- XMPDM.AUDIO_SAMPLE_RATE,
- Integer.toString((int) rate));
+ metadata.set(XMPDM.AUDIO_SAMPLE_RATE, Integer.toString((int) rate));
}
int bits = audioFormat.getSampleSizeInBits();
if (bits != AudioSystem.NOT_SPECIFIED) {
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/audio/MidiParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/audio/MidiParser.java
index 859d3bd..a7ebf7a 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/audio/MidiParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/audio/MidiParser.java
@@ -18,13 +18,6 @@ package org.apache.tika.parser.audio;
import static java.nio.charset.StandardCharsets.ISO_8859_1;
-import javax.sound.midi.InvalidMidiDataException;
-import javax.sound.midi.MetaMessage;
-import javax.sound.midi.MidiMessage;
-import javax.sound.midi.MidiSystem;
-import javax.sound.midi.Patch;
-import javax.sound.midi.Sequence;
-import javax.sound.midi.Track;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
@@ -32,6 +25,16 @@ import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
+import javax.sound.midi.InvalidMidiDataException;
+import javax.sound.midi.MetaMessage;
+import javax.sound.midi.MidiMessage;
+import javax.sound.midi.MidiSystem;
+import javax.sound.midi.Patch;
+import javax.sound.midi.Sequence;
+import javax.sound.midi.Track;
+
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
@@ -39,34 +42,31 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
public class MidiParser extends AbstractParser {
- /** Serial version UID */
+ /**
+ * Serial version UID
+ */
private static final long serialVersionUID = 6343278584336189432L;
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- MediaType.application("x-midi"),
- MediaType.audio("midi"))));
+ private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
+ new HashSet<MediaType>(
+ Arrays.asList(MediaType.application("x-midi"), MediaType.audio("midi"))));
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
metadata.set(Metadata.CONTENT_TYPE, "audio/midi");
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
// MidiSystem expects the stream to support the mark feature
- if (! stream.markSupported()) {
+ if (!stream.markSupported()) {
stream = new BufferedInputStream(stream);
}
try {
@@ -103,8 +103,7 @@ public class MidiParser extends AbstractParser {
// Types 1-15 are reserved for text events
if (meta.getType() >= 1 && meta.getType() <= 15) {
// FIXME: What's the encoding?
- xhtml.characters(
- new String(meta.getData(), ISO_8859_1));
+ xhtml.characters(new String(meta.getData(), ISO_8859_1));
}
}
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java
index 97712f1..c9a9b21 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java
@@ -19,119 +19,54 @@ package org.apache.tika.parser.mp3;
import java.io.IOException;
import java.io.InputStream;
-import org.apache.tika.exception.TikaException;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import org.apache.tika.exception.TikaException;
+
/**
* An Audio Frame in an MP3 file. These come after the ID3v2 tags in the file.
* Currently, only the header is processed, not the raw audio data.
*/
public class AudioFrame implements MP3Frame {
- /** Constant for the MPEG version 1. */
- public static final int MPEG_V1 = 3;
-
- /** Constant for the MPEG version 2. */
- public static final int MPEG_V2 = 2;
-
- /** Constant for the MPEG version 2.5. */
- public static final int MPEG_V2_5 = 0;
-
- /** Constant for audio layer 1. */
- public static final int LAYER_1 = 3;
-
- /** Constant for audio layer 2. */
- public static final int LAYER_2 = 2;
-
- /** Constant for audio layer 3. */
- public static final int LAYER_3 = 1;
-
- private final String version;
- private final int versionCode;
- private final int layer;
- private final int sampleRate;
- private final int channels;
- private final int bitRate;
- private final int length;
- private final float duration;
-
- public String getVersion() {
- return version;
- }
-
/**
- * Get the sampling rate, in Hz
+ * Constant for the MPEG version 1.
*/
- public int getSampleRate() {
- return sampleRate;
- }
-
- /**
- * Get the number of channels (1=mono, 2=stereo)
- */
- public int getChannels() {
- return channels;
- }
+ public static final int MPEG_V1 = 3;
/**
- * Get the version code.
- * @return the version code (one of the {@code MPEG} constants)
+ * Constant for the MPEG version 2.
*/
- public int getVersionCode()
- {
- return versionCode;
- }
+ public static final int MPEG_V2 = 2;
/**
- * Get the audio layer code.
- * @return the audio layer (one of the {@code LAYER} constants)
+ * Constant for the MPEG version 2.5.
*/
- public int getLayer()
- {
- return layer;
- }
+ public static final int MPEG_V2_5 = 0;
/**
- * Get the bit rate in bit per second.
- * @return the bit rate
+ * Constant for audio layer 1.
*/
- public int getBitRate()
- {
- return bitRate;
- }
+ public static final int LAYER_1 = 3;
/**
- * Returns the frame length in bytes.
- * @return the frame length
+ * Constant for audio layer 2.
*/
- public int getLength()
- {
- return length;
- }
+ public static final int LAYER_2 = 2;
/**
- * Returns the duration in milliseconds.
- * @return the duration
+ * Constant for audio layer 3.
*/
- public float getDuration()
- {
- return duration;
- }
+ public static final int LAYER_3 = 1;
- /**
- * Does this appear to be a 4 byte audio frame header?
- */
- public static boolean isAudioHeader(int h1, int h2, int h3, int h4) {
- if (h1 == -1 || h2 == -1 || h3 == -1 || h4 == -1) {
- return false;
- }
- // Check for the magic 11 bits set at the start
- // Note - doesn't do a CRC check
- if (h1 == 0xff && (h2 & 0x60) == 0x60) {
- return true;
- }
- return false;
- }
+ private final String version;
+ private final int versionCode;
+ private final int layer;
+ private final int sampleRate;
+ private final int channels;
+ private final int bitRate;
+ private final int length;
+ private final float duration;
/**
* @deprecated Use the constructor which is passed all values directly.
@@ -146,8 +81,7 @@ public class AudioFrame implements MP3Frame {
* @deprecated Use the constructor which is passed all values directly.
*/
@Deprecated
- public AudioFrame(int h1, int h2, int h3, int h4, InputStream in)
- throws IOException {
+ public AudioFrame(int h1, int h2, int h3, int h4, InputStream in) throws IOException {
if (h1 == -2 && h2 == -2 && h3 == -2 && h4 == -2) {
h1 = in.read();
h2 = in.read();
@@ -163,18 +97,18 @@ public class AudioFrame implements MP3Frame {
int rateCode = (h3 >> 2) & 0x03;
int rate;
switch (rateCode) {
- case 0:
- rate = 11025;
- break;
- case 1:
- rate = 12000;
- break;
- default:
- rate = 8000;
+ case 0:
+ rate = 11025;
+ break;
+ case 1:
+ rate = 12000;
+ break;
+ default:
+ rate = 8000;
}
if (versionCode == MPEG_V2) {
rate *= 2;
- } else if(versionCode == MPEG_V1) {
+ } else if (versionCode == MPEG_V1) {
rate *= 4;
}
sampleRate = rate;
@@ -193,20 +127,20 @@ public class AudioFrame implements MP3Frame {
throw new IllegalArgumentException("Magic Audio Frame Header not found");
}
}
-
+
/**
- *
* Creates a new instance of {@code AudioFrame} and initializes all properties.
+ *
* @param mpegVersion the code for the MPEG version
- * @param layer the code for the layer
- * @param bitRate the bit rate (in bps)
- * @param sampleRate the sample rate (in samples per second)
- * @param channels the number of channels
- * @param length the frame length (in bytes)
- * @param duration the duration of this frame (in milliseconds)
- */
- public AudioFrame(int mpegVersion, int layer, int bitRate, int sampleRate,
- int channels, int length, float duration) {
+ * @param layer the code for the layer
+ * @param bitRate the bit rate (in bps)
+ * @param sampleRate the sample rate (in samples per second)
+ * @param channels the number of channels
+ * @param length the frame length (in bytes)
+ * @param duration the duration of this frame (in milliseconds)
+ */
+ public AudioFrame(int mpegVersion, int layer, int bitRate, int sampleRate, int channels,
+ int length, float duration) {
versionCode = mpegVersion;
this.layer = layer;
this.bitRate = bitRate;
@@ -218,9 +152,25 @@ public class AudioFrame implements MP3Frame {
}
/**
+ * Does this appear to be a 4 byte audio frame header?
+ */
+ public static boolean isAudioHeader(int h1, int h2, int h3, int h4) {
+ if (h1 == -1 || h2 == -1 || h3 == -1 || h4 == -1) {
+ return false;
+ }
+ // Check for the magic 11 bits set at the start
+ // Note - doesn't do a CRC check
+ if (h1 == 0xff && (h2 & 0x60) == 0x60) {
+ return true;
+ }
+ return false;
+ }
+
+ /**
* Generates a string for the version of this audio frame.
+ *
* @param version the code for the MPEG version
- * @param layer the code for the layer
+ * @param layer the code for the layer
* @return a string for the version
*/
private static String generateVersionStr(int version, int layer) {
@@ -239,14 +189,77 @@ public class AudioFrame implements MP3Frame {
buf.append(" Version ");
if (version == MPEG_V2_5) {
buf.append("2.5");
- } else if(version == MPEG_V2) {
+ } else if (version == MPEG_V2) {
buf.append("2");
- } else if(version == MPEG_V1) {
+ } else if (version == MPEG_V1) {
buf.append("1");
} else {
buf.append("(reserved)");
}
-
+
return buf.toString();
}
+
+ public String getVersion() {
+ return version;
+ }
+
+ /**
+ * Get the sampling rate, in Hz
+ */
+ public int getSampleRate() {
+ return sampleRate;
+ }
+
+ /**
+ * Get the number of channels (1=mono, 2=stereo)
+ */
+ public int getChannels() {
+ return channels;
+ }
+
+ /**
+ * Get the version code.
+ *
+ * @return the version code (one of the {@code MPEG} constants)
+ */
+ public int getVersionCode() {
+ return versionCode;
+ }
+
+ /**
+ * Get the audio layer code.
+ *
+ * @return the audio layer (one of the {@code LAYER} constants)
+ */
+ public int getLayer() {
+ return layer;
+ }
+
+ /**
+ * Get the bit rate in bit per second.
+ *
+ * @return the bit rate
+ */
+ public int getBitRate() {
+ return bitRate;
+ }
+
+ /**
+ * Returns the frame length in bytes.
+ *
+ * @return the frame length
+ */
+ public int getLength() {
+ return length;
+ }
+
+ /**
+ * Returns the duration in milliseconds.
+ *
+ * @return the duration
+ */
+ public float getDuration() {
+ return duration;
+ }
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java
index b8d723f..9217156 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java
@@ -20,149 +20,148 @@ import java.util.List;
/**
* Interface that defines the common interface for ID3 tag parsers,
- * such as ID3v1 and ID3v2.3.
+ * such as ID3v1 and ID3v2.3.
* Implementations should return NULL if the file lacks a given
- * tag, or if the tag isn't defined for the version.
- *
+ * tag, or if the tag isn't defined for the version.
+ * <p>
* Note that so far, only the ID3v1 core tags are listed here. In
- * future, we may wish to add more to cover the extra tags that
- * our ID3v2 handlers can produce.
+ * future, we may wish to add more to cover the extra tags that
+ * our ID3v2 handlers can produce.
*/
public interface ID3Tags {
/**
* List of predefined genres.
- *
+ * <p>
* See <a href="http://www.id3.org/id3v2-00">http://www.id3.org/id3v2-00</a>
*/
- String[] GENRES = new String[] {
- /* 0 */ "Blues",
- /* 1 */ "Classic Rock",
- /* 2 */ "Country",
- /* 3 */ "Dance",
- /* 4 */ "Disco",
- /* 5 */ "Funk",
- /* 6 */ "Grunge",
- /* 7 */ "Hip-Hop",
- /* 8 */ "Jazz",
- /* 9 */ "Metal",
- /* 10 */ "New Age",
- /* 11 */ "Oldies",
- /* 12 */ "Other",
- /* 13 */ "Pop",
- /* 14 */ "R&B",
- /* 15 */ "Rap",
- /* 16 */ "Reggae",
- /* 17 */ "Rock",
- /* 18 */ "Techno",
- /* 19 */ "Industrial",
- /* 20 */ "Alternative",
- /* 21 */ "Ska",
- /* 22 */ "Death Metal",
- /* 23 */ "Pranks",
- /* 24 */ "Soundtrack",
- /* 25 */ "Euro-Techno",
- /* 26 */ "Ambient",
- /* 27 */ "Trip-Hop",
- /* 28 */ "Vocal",
- /* 29 */ "Jazz+Funk",
- /* 30 */ "Fusion",
- /* 31 */ "Trance",
- /* 32 */ "Classical",
- /* 33 */ "Instrumental",
- /* 34 */ "Acid",
- /* 35 */ "House",
- /* 36 */ "Game",
- /* 37 */ "Sound Clip",
- /* 38 */ "Gospel",
- /* 39 */ "Noise",
- /* 40 */ "AlternRock",
- /* 41 */ "Bass",
- /* 42 */ "Soul",
- /* 43 */ "Punk",
- /* 44 */ "Space",
- /* 45 */ "Meditative",
- /* 46 */ "Instrumental Pop",
- /* 47 */ "Instrumental Rock",
- /* 48 */ "Ethnic",
- /* 49 */ "Gothic",
- /* 50 */ "Darkwave",
- /* 51 */ "Techno-Industrial",
- /* 52 */ "Electronic",
- /* 53 */ "Pop-Folk",
- /* 54 */ "Eurodance",
- /* 55 */ "Dream",
- /* 56 */ "Southern Rock",
- /* 57 */ "Comedy",
- /* 58 */ "Cult",
- /* 59 */ "Gangsta",
- /* 60 */ "Top 40",
- /* 61 */ "Christian Rap",
- /* 62 */ "Pop/Funk",
- /* 63 */ "Jungle",
- /* 64 */ "Native American",
- /* 65 */ "Cabaret",
- /* 66 */ "New Wave",
- /* 67 */ "Psychadelic",
- /* 68 */ "Rave",
- /* 69 */ "Showtunes",
- /* 70 */ "Trailer",
- /* 71 */ "Lo-Fi",
- /* 72 */ "Tribal",
- /* 73 */ "Acid Punk",
- /* 74 */ "Acid Jazz",
- /* 75 */ "Polka",
- /* 76 */ "Retro",
- /* 77 */ "Musical",
- /* 78 */ "Rock & Roll",
- /* 79 */ "Hard Rock",
- /* 80 */ "Folk",
- /* 81 */ "Folk-Rock",
- /* 82 */ "National Folk",
- /* 83 */ "Swing",
- /* 84 */ "Fast Fusion",
- /* 85 */ "Bebob",
- /* 86 */ "Latin",
- /* 87 */ "Revival",
- /* 88 */ "Celtic",
- /* 89 */ "Bluegrass",
- /* 90 */ "Avantgarde",
- /* 91 */ "Gothic Rock",
- /* 92 */ "Progressive Rock",
- /* 93 */ "Psychedelic Rock",
- /* 94 */ "Symphonic Rock",
- /* 95 */ "Slow Rock",
- /* 96 */ "Big Band",
- /* 97 */ "Chorus",
- /* 98 */ "Easy Listening",
- /* 99 */ "Acoustic",
- /* 100 */ "Humour",
- /* 101 */ "Speech",
- /* 102 */ "Chanson",
- /* 103 */ "Opera",
- /* 104 */ "Chamber Music",
- /* 105 */ "Sonata",
- /* 106 */ "Symphony",
- /* 107 */ "Booty Bass",
- /* 108 */ "Primus",
- /* 109 */ "Porn Groove",
- /* 110 */ "Satire",
- /* 111 */ "Slow Jam",
- /* 112 */ "Club",
- /* 113 */ "Tango",
- /* 114 */ "Samba",
- /* 115 */ "Folklore",
- /* 116 */ "Ballad",
- /* 117 */ "Power Ballad",
- /* 118 */ "Rhythmic Soul",
- /* 119 */ "Freestyle",
- /* 120 */ "Duet",
- /* 121 */ "Punk Rock",
- /* 122 */ "Drum Solo",
- /* 123 */ "A capella",
- /* 124 */ "Euro-House",
- /* 125 */ "Dance Hall",
- /* sentinel */ ""
- };
+ String[] GENRES = new String[]{
+ /* 0 */ "Blues",
+ /* 1 */ "Classic Rock",
+ /* 2 */ "Country",
+ /* 3 */ "Dance",
+ /* 4 */ "Disco",
+ /* 5 */ "Funk",
+ /* 6 */ "Grunge",
+ /* 7 */ "Hip-Hop",
+ /* 8 */ "Jazz",
+ /* 9 */ "Metal",
+ /* 10 */ "New Age",
+ /* 11 */ "Oldies",
+ /* 12 */ "Other",
+ /* 13 */ "Pop",
+ /* 14 */ "R&B",
+ /* 15 */ "Rap",
+ /* 16 */ "Reggae",
+ /* 17 */ "Rock",
+ /* 18 */ "Techno",
+ /* 19 */ "Industrial",
+ /* 20 */ "Alternative",
+ /* 21 */ "Ska",
+ /* 22 */ "Death Metal",
+ /* 23 */ "Pranks",
+ /* 24 */ "Soundtrack",
+ /* 25 */ "Euro-Techno",
+ /* 26 */ "Ambient",
+ /* 27 */ "Trip-Hop",
+ /* 28 */ "Vocal",
+ /* 29 */ "Jazz+Funk",
+ /* 30 */ "Fusion",
+ /* 31 */ "Trance",
+ /* 32 */ "Classical",
+ /* 33 */ "Instrumental",
+ /* 34 */ "Acid",
+ /* 35 */ "House",
+ /* 36 */ "Game",
+ /* 37 */ "Sound Clip",
+ /* 38 */ "Gospel",
+ /* 39 */ "Noise",
+ /* 40 */ "AlternRock",
+ /* 41 */ "Bass",
+ /* 42 */ "Soul",
+ /* 43 */ "Punk",
+ /* 44 */ "Space",
+ /* 45 */ "Meditative",
+ /* 46 */ "Instrumental Pop",
+ /* 47 */ "Instrumental Rock",
+ /* 48 */ "Ethnic",
+ /* 49 */ "Gothic",
+ /* 50 */ "Darkwave",
+ /* 51 */ "Techno-Industrial",
+ /* 52 */ "Electronic",
+ /* 53 */ "Pop-Folk",
+ /* 54 */ "Eurodance",
+ /* 55 */ "Dream",
+ /* 56 */ "Southern Rock",
+ /* 57 */ "Comedy",
+ /* 58 */ "Cult",
+ /* 59 */ "Gangsta",
+ /* 60 */ "Top 40",
+ /* 61 */ "Christian Rap",
+ /* 62 */ "Pop/Funk",
+ /* 63 */ "Jungle",
+ /* 64 */ "Native American",
+ /* 65 */ "Cabaret",
+ /* 66 */ "New Wave",
+ /* 67 */ "Psychadelic",
+ /* 68 */ "Rave",
+ /* 69 */ "Showtunes",
+ /* 70 */ "Trailer",
+ /* 71 */ "Lo-Fi",
+ /* 72 */ "Tribal",
+ /* 73 */ "Acid Punk",
+ /* 74 */ "Acid Jazz",
+ /* 75 */ "Polka",
+ /* 76 */ "Retro",
+ /* 77 */ "Musical",
+ /* 78 */ "Rock & Roll",
+ /* 79 */ "Hard Rock",
+ /* 80 */ "Folk",
+ /* 81 */ "Folk-Rock",
+ /* 82 */ "National Folk",
+ /* 83 */ "Swing",
+ /* 84 */ "Fast Fusion",
+ /* 85 */ "Bebob",
+ /* 86 */ "Latin",
+ /* 87 */ "Revival",
+ /* 88 */ "Celtic",
+ /* 89 */ "Bluegrass",
+ /* 90 */ "Avantgarde",
+ /* 91 */ "Gothic Rock",
+ /* 92 */ "Progressive Rock",
+ /* 93 */ "Psychedelic Rock",
+ /* 94 */ "Symphonic Rock",
+ /* 95 */ "Slow Rock",
+ /* 96 */ "Big Band",
+ /* 97 */ "Chorus",
+ /* 98 */ "Easy Listening",
+ /* 99 */ "Acoustic",
+ /* 100 */ "Humour",
+ /* 101 */ "Speech",
+ /* 102 */ "Chanson",
+ /* 103 */ "Opera",
+ /* 104 */ "Chamber Music",
+ /* 105 */ "Sonata",
+ /* 106 */ "Symphony",
+ /* 107 */ "Booty Bass",
+ /* 108 */ "Primus",
+ /* 109 */ "Porn Groove",
+ /* 110 */ "Satire",
+ /* 111 */ "Slow Jam",
+ /* 112 */ "Club",
+ /* 113 */ "Tango",
+ /* 114 */ "Samba",
+ /* 115 */ "Folklore",
+ /* 116 */ "Ballad",
+ /* 117 */ "Power Ballad",
+ /* 118 */ "Rhythmic Soul",
+ /* 119 */ "Freestyle",
+ /* 120 */ "Duet",
+ /* 121 */ "Punk Rock",
+ /* 122 */ "Drum Solo",
+ /* 123 */ "A capella",
+ /* 124 */ "Euro-House",
+ /* 125 */ "Dance Hall",
+ /* sentinel */ ""};
/**
* Does the file contain this kind of tags?
@@ -182,15 +181,15 @@ public interface ID3Tags {
String getAlbumArtist();
String getAlbum();
-
+
String getComposer();
String getCompilation();
-
+
/**
* Retrieves the comments, if any.
- * Files may have more than one comment, but normally only
- * one with any language/description pair.
+ * Files may have more than one comment, but normally only
+ * one with any language/description pair.
*/
List<ID3Comment> getComments();
@@ -209,20 +208,21 @@ public interface ID3Tags {
String getDisc();
/**
- * Represents a comments in ID3 (especially ID3 v2), where are
- * made up of several parts
+ * Represents a comments in ID3 (especially ID3 v2), where are
+ * made up of several parts
*/
class ID3Comment {
private String language;
private String description;
private String text;
-
+
/**
* Creates an ID3 v1 style comment tag
*/
public ID3Comment(String id3v1Text) {
- this.text = id3v1Text;
+ this.text = id3v1Text;
}
+
/**
* Creates an ID3 v2 style comment tag
*/
@@ -236,19 +236,21 @@ public interface ID3Tags {
* Gets the language, if present
*/
public String getLanguage() {
- return language;
+ return language;
}
+
/**
* Gets the description, if present
*/
public String getDescription() {
- return description;
+ return description;
}
+
/**
* Gets the text, if present
*/
public String getText() {
- return text;
+ return text;
}
}
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
index 2111356..ca4803f 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
@@ -16,24 +16,26 @@
*/
package org.apache.tika.parser.mp3;
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.List;
-import org.apache.tika.exception.TikaException;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import org.apache.tika.exception.TikaException;
/**
- * This is used to parse ID3 Version 1 Tag information from an MP3 file,
+ * This is used to parse ID3 Version 1 Tag information from an MP3 file,
* if available.
*
* @see <a href="http://www.id3.org/ID3v1">MP3 ID3 Version 1 specification</a>
*/
public class ID3v1Handler implements ID3Tags {
+ boolean found = false;
private String title;
private String artist;
private String album;
@@ -42,8 +44,6 @@ public class ID3v1Handler implements ID3Tags {
private String genre;
private String trackNumber;
- boolean found = false;
-
public ID3v1Handler(InputStream stream, ContentHandler handler)
throws IOException, SAXException, TikaException {
this(LyricsHandler.getSuffix(stream, 128));
@@ -51,19 +51,18 @@ public class ID3v1Handler implements ID3Tags {
/**
* Creates from the last 128 bytes of a stream.
- * @param tagData Must be the last 128 bytes
+ *
+ * @param tagData Must be the last 128 bytes
*/
- protected ID3v1Handler(byte[] tagData)
- throws IOException, SAXException, TikaException {
- if (tagData.length == 128
- && tagData[0] == 'T' && tagData[1] == 'A' && tagData[2] == 'G') {
+ protected ID3v1Handler(byte[] tagData) throws IOException, SAXException, TikaException {
+ if (tagData.length == 128 && tagData[0] == 'T' && tagData[1] == 'A' && tagData[2] == 'G') {
found = true;
title = getString(tagData, 3, 33);
artist = getString(tagData, 33, 63);
album = getString(tagData, 63, 93);
year = getString(tagData, 93, 97);
-
+
String commentStr = getString(tagData, 97, 127);
comment = new ID3Comment(commentStr);
@@ -80,6 +79,40 @@ public class ID3v1Handler implements ID3Tags {
}
}
+ /**
+ * Returns the identified ISO-8859-1 substring from the given byte buffer.
+ * The return value is the zero-terminated substring retrieved from
+ * between the given start and end positions in the given byte buffer.
+ * Extra whitespace (and control characters) from the beginning and the
+ * end of the substring is removed.
+ *
+ * @param buffer byte buffer
+ * @param start start index of the substring
+ * @param end end index of the substring
+ * @return the identified substring
+ * @throws TikaException if the ISO-8859-1 encoding is not available
+ */
+ private static String getString(byte[] buffer, int start, int end) throws TikaException {
+ // Find the zero byte that marks the end of the string
+ int zero = start;
+ while (zero < end && buffer[zero] != 0) {
+ zero++;
+ }
+
+ // Skip trailing whitespace
+ end = zero;
+ while (start < end && buffer[end - 1] <= ' ') {
+ end--;
+ }
+
+ // Skip leading whitespace
+ while (start < end && buffer[start] <= ' ') {
+ start++;
+ }
+
+ // Return the remaining substring
+ return new String(buffer, start, end - start, ISO_8859_1);
+ }
public boolean getTagsPresent() {
return found;
@@ -102,7 +135,7 @@ public class ID3v1Handler implements ID3Tags {
}
public List<ID3Comment> getComments() {
- return Arrays.asList(comment);
+ return Arrays.asList(comment);
}
public String getGenre() {
@@ -112,10 +145,10 @@ public class ID3v1Handler implements ID3Tags {
public String getTrackNumber() {
return trackNumber;
}
-
+
/**
* ID3v1 doesn't have composers,
- * so returns null;
+ * so returns null;
*/
public String getComposer() {
return null;
@@ -123,7 +156,7 @@ public class ID3v1Handler implements ID3Tags {
/**
* ID3v1 doesn't have album-wide artists,
- * so returns null;
+ * so returns null;
*/
public String getAlbumArtist() {
return null;
@@ -131,7 +164,7 @@ public class ID3v1Handler implements ID3Tags {
/**
* ID3v1 doesn't have disc numbers,
- * so returns null;
+ * so returns null;
*/
public String getDisc() {
return null;
@@ -139,45 +172,9 @@ public class ID3v1Handler implements ID3Tags {
/**
* ID3v1 doesn't have compilations,
- * so returns null;
+ * so returns null;
*/
public String getCompilation() {
return null;
}
-
- /**
- * Returns the identified ISO-8859-1 substring from the given byte buffer.
- * The return value is the zero-terminated substring retrieved from
- * between the given start and end positions in the given byte buffer.
- * Extra whitespace (and control characters) from the beginning and the
- * end of the substring is removed.
- *
- * @param buffer byte buffer
- * @param start start index of the substring
- * @param end end index of the substring
- * @return the identified substring
- * @throws TikaException if the ISO-8859-1 encoding is not available
- */
- private static String getString(byte[] buffer, int start, int end)
- throws TikaException {
- // Find the zero byte that marks the end of the string
- int zero = start;
- while (zero < end && buffer[zero] != 0) {
- zero++;
- }
-
- // Skip trailing whitespace
- end = zero;
- while (start < end && buffer[end - 1] <= ' ') {
- end--;
- }
-
- // Skip leading whitespace
- while (start < end && buffer[start] <= ' ') {
- start++;
- }
-
- // Return the remaining substring
- return new String(buffer, start, end - start, ISO_8859_1);
- }
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java
index 8d94c0b..0edd185 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java
@@ -20,10 +20,11 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
+import org.xml.sax.SAXException;
+
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
-import org.xml.sax.SAXException;
/**
* This is used to parse ID3 Version 2.2 Tag information from an MP3 file,
@@ -43,61 +44,61 @@ public class ID3v22Handler implements ID3Tags {
private String disc;
private List<ID3Comment> comments = new ArrayList<ID3Comment>();
- public ID3v22Handler(ID3v2Frame frame)
- throws IOException, SAXException, TikaException {
+ public ID3v22Handler(ID3v2Frame frame) throws IOException, SAXException, TikaException {
RawTagIterator tags = new RawV22TagIterator(frame);
while (tags.hasNext()) {
RawTag tag = tags.next();
if (tag.name.equals("TT2")) {
- title = getTagString(tag.data, 0, tag.data.length);
+ title = getTagString(tag.data, 0, tag.data.length);
} else if (tag.name.equals("TP1")) {
- artist = getTagString(tag.data, 0, tag.data.length);
+ artist = getTagString(tag.data, 0, tag.data.length);
} else if (tag.name.equals("TP2")) {
- albumArtist = getTagString(tag.data, 0, tag.data.length);
+ albumArtist = getTagString(tag.data, 0, tag.data.length);
} else if (tag.name.equals("TAL")) {
- album = getTagString(tag.data, 0, tag.data.length);
+ album = getTagString(tag.data, 0, tag.data.length);
} else if (tag.name.equals("TYE")) {
- year = getTagString(tag.data, 0, tag.data.length);
+ year = getTagString(tag.data, 0, tag.data.length);
} else if (tag.name.equals("TCM")) {
- composer = getTagString(tag.data, 0, tag.data.length);
+ composer = getTagString(tag.data, 0, tag.data.length);
} else if (tag.name.equals("COM")) {
- comments.add( getComment(tag.data, 0, tag.data.length) );
+ comments.add(getComment(tag.data, 0, tag.data.length));
} else if (tag.name.equals("TRK")) {
- trackNumber = getTagString(tag.data, 0, tag.data.length);
+ trackNumber = getTagString(tag.data, 0, tag.data.length);
} else if (tag.name.equals("TPA")) {
- disc = getTagString(tag.data, 0, tag.data.length);
+ disc = getTagString(tag.data, 0, tag.data.length);
} else if (tag.name.equals("TCO")) {
- genre = extractGenre( getTagString(tag.data, 0, tag.data.length) );
+ genre = extractGenre(getTagString(tag.data, 0, tag.data.length));
+ }
+ }
+ }
+
+ protected static String extractGenre(String rawGenre) {
+ int open = rawGenre.indexOf("(");
+ int close = rawGenre.indexOf(")");
+ if (open == -1 && close == -1) {
+ return rawGenre;
+ } else if (open < close) {
+ String genreStr = rawGenre.substring(0, open).trim();
+ try {
+ int genreID = Integer.parseInt(rawGenre.substring(open + 1, close));
+ return ID3Tags.GENRES[genreID];
+ } catch (ArrayIndexOutOfBoundsException invalidNum) {
+ return genreStr;
+ } catch (NumberFormatException notANum) {
+ return genreStr;
}
+ } else {
+ return null;
}
}
private String getTagString(byte[] data, int offset, int length) {
return ID3v2Frame.getTagString(data, offset, length);
}
+
private ID3Comment getComment(byte[] data, int offset, int length) {
return ID3v2Frame.getComment(data, offset, length);
}
-
- protected static String extractGenre(String rawGenre) {
- int open = rawGenre.indexOf("(");
- int close = rawGenre.indexOf(")");
- if (open == -1 && close == -1) {
- return rawGenre;
- } else if (open < close) {
- String genreStr = rawGenre.substring(0, open).trim();
- try {
- int genreID = Integer.parseInt(rawGenre.substring(open+1, close));
- return ID3Tags.GENRES[genreID];
- } catch(ArrayIndexOutOfBoundsException invalidNum) {
- return genreStr;
- } catch(NumberFormatException notANum) {
- return genreStr;
- }
- } else {
- return null;
- }
- }
public boolean getTagsPresent() {
return true;
@@ -118,7 +119,7 @@ public class ID3v22Handler implements ID3Tags {
public String getYear() {
return year;
}
-
+
public String getComposer() {
return composer;
}
@@ -145,7 +146,7 @@ public class ID3v22Handler implements ID3Tags {
/**
* ID3v22 doesn't have compilations,
- * so returns null;
+ * so returns null;
*/
public String getCompilation() {
return null;
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java
index 4b67eda..09cf05f 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java
@@ -20,10 +20,11 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
+import org.xml.sax.SAXException;
+
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
-import org.xml.sax.SAXException;
/**
* This is used to parse ID3 Version 2.3 Tag information from an MP3 file,
@@ -44,33 +45,32 @@ public class ID3v23Handler implements ID3Tags {
private String compilation;
private List<ID3Comment> comments = new ArrayList<ID3Comment>();
- public ID3v23Handler(ID3v2Frame frame)
- throws IOException, SAXException, TikaException {
+ public ID3v23Handler(ID3v2Frame frame) throws IOException, SAXException, TikaException {
RawTagIterator tags = new RawV23TagIterator(frame);
while (tags.hasNext()) {
RawTag tag = tags.next();
if (tag.name.equals("TIT2")) {
- title = getTagString(tag.data, 0, tag.data.length);
+ title = getTagString(tag.data, 0, tag.data.length);
} else if (tag.name.equals("TPE1")) {
- artist = getTagString(tag.data, 0, tag.data.length);
+ artist = getTagString(tag.data, 0, tag.data.length);
} else if (tag.name.equals("TPE2")) {
- albumArtist = getTagString(tag.data, 0, tag.data.length);
+ albumArtist = getTagString(tag.data, 0, tag.data.length);
} else if (tag.name.equals("TALB")) {
- album = getTagString(tag.data, 0, tag.data.length);
+ album = getTagString(tag.data, 0, tag.data.length);
} else if (tag.name.equals("TYER")) {
- year = getTagString(tag.data, 0, tag.data.length);
+ year = getTagString(tag.data, 0, tag.data.length);
} else if (tag.name.equals("TCOM")) {
- composer = getTagString(tag.data, 0, tag.data.length);
+ composer = getTagString(tag.data, 0, tag.data.length);
} else if (tag.name.equals("COMM")) {
- comments.add( getComment(tag.data, 0, tag.data.length) );
+ comments.add(getComment(tag.data, 0, tag.data.length));
} else if (tag.name.equals("TRCK")) {
- trackNumber = getTagString(tag.data, 0, tag.data.length);
+ trackNumber = getTagString(tag.data, 0, tag.data.length);
} else if (tag.name.equals("TPOS")) {
- disc = getTagString(tag.data, 0, tag.data.length);
+ disc = getTagString(tag.data, 0, tag.data.length);
} else if (tag.name.equals("TCMP")) {
- compilation = getTagString(tag.data, 0, tag.data.length);
+ compilation = getTagString(tag.data, 0, tag.data.length);
} else if (tag.name.equals("TCON")) {
- genre = ID3v22Handler.extractGenre( getTagString(tag.data, 0, tag.data.length) );
+ genre = ID3v22Handler.extractGenre(getTagString(tag.data, 0, tag.data.length));
}
}
}
@@ -78,8 +78,9 @@ public class ID3v23Handler implements ID3Tags {
private String getTagString(byte[] data, int offset, int length) {
return ID3v2Frame.getTagString(data, offset, length);
}
+
private ID3Comment getComment(byte[] data, int offset, int length) {
- return ID3v2Frame.getComment(data, offset, length);
+ return ID3v2Frame.getComment(data, offset, length);
}
public boolean getTagsPresent() {
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java
index caba928..37b1266 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java
@@ -20,10 +20,11 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
+import org.xml.sax.SAXException;
+
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
-import org.xml.sax.SAXException;
/**
* This is used to parse ID3 Version 2.4 Tag information from an MP3 file,
@@ -45,37 +46,36 @@ public class ID3v24Handler implements ID3Tags {
private String compilation;
private List<ID3Comment> comments = new ArrayList<ID3Comment>();
- public ID3v24Handler(ID3v2Frame frame)
- throws IOException, SAXException, TikaException {
+ public ID3v24Handler(ID3v2Frame frame) throws IOException, SAXException, TikaException {
RawTagIterator tags = new RawV24TagIterator(frame);
while (tags.hasNext()) {
RawTag tag = tags.next();
if (tag.name.equals("TIT2")) {
- title = getTagString(tag.data, 0, tag.data.length);
+ title = getTagString(tag.data, 0, tag.data.length);
} else if (tag.name.equals("TPE1")) {
- artist = getTagString(tag.data, 0, tag.data.length);
+ artist = getTagString(tag.data, 0, tag.data.length);
} else if (tag.name.equals("TPE2")) {
- albumArtist = getTagString(tag.data, 0, tag.data.length);
+ albumArtist = getTagString(tag.data, 0, tag.data.length);
} else if (tag.name.equals("TALB")) {
- album = getTagString(tag.data, 0, tag.data.length);
+ album = getTagString(tag.data, 0, tag.data.length);
} else if (tag.name.equals("TYER")) {
- year = getTagString(tag.data, 0, tag.data.length);
+ year = getTagString(tag.data, 0, tag.data.length);
} else if (tag.name.equals("TDRC")) {
- if(year == null) {
- year = getTagString(tag.data, 0, tag.data.length);
- }
+ if (year == null) {
+ year = getTagString(tag.data, 0, tag.data.length);
+ }
} else if (tag.name.equals("TCOM")) {
- composer = getTagString(tag.data, 0, tag.data.length);
+ composer = getTagString(tag.data, 0, tag.data.length);
} else if (tag.name.equals("COMM")) {
- comments.add( getComment(tag.data, 0, tag.data.length) );
+ comments.add(getComment(tag.data, 0, tag.data.length));
} else if (tag.name.equals("TRCK")) {
- trackNumber = getTagString(tag.data, 0, tag.data.length);
+ trackNumber = getTagString(tag.data, 0, tag.data.length);
} else if (tag.name.equals("TPOS")) {
- disc = getTagString(tag.data, 0, tag.data.length);
+ disc = getTagString(tag.data, 0, tag.data.length);
} else if (tag.name.equals("TCMP")) {
- compilation = getTagString(tag.data, 0, tag.data.length);
+ compilation = getTagString(tag.data, 0, tag.data.length);
} else if (tag.name.equals("TCON")) {
- genre = ID3v22Handler.extractGenre( getTagString(tag.data, 0, tag.data.length) );
+ genre = ID3v22Handler.extractGenre(getTagString(tag.data, 0, tag.data.length));
}
}
}
@@ -83,6 +83,7 @@ public class ID3v24Handler implements ID3Tags {
private String getTagString(byte[] data, int offset, int length) {
return ID3v2Frame.getTagString(data, offset, length);
}
+
private ID3Comment getComment(byte[] data, int offset, int length) {
return ID3v2Frame.getComment(data, offset, length);
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
index 7ddceda..b471886 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
@@ -16,6 +16,8 @@
*/
package org.apache.tika.parser.mp3;
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
@@ -24,69 +26,67 @@ import java.util.Iterator;
import org.apache.tika.parser.mp3.ID3Tags.ID3Comment;
-import static java.nio.charset.StandardCharsets.ISO_8859_1;
-
/**
- * A frame of ID3v2 data, which is then passed to a handler to
+ * A frame of ID3v2 data, which is then passed to a handler to
* be turned into useful data.
*/
public class ID3v2Frame implements MP3Frame {
+ protected static final TextEncoding[] encodings =
+ new TextEncoding[]{new TextEncoding("ISO-8859-1", false),
+ new TextEncoding("UTF-16", true), // With BOM
+ new TextEncoding("UTF-16BE", true), // Without BOM
+ new TextEncoding("UTF-8", false)};
private static int MAX_RECORD_SIZE = 50_000_000;
-
private int majorVersion;
private int minorVersion;
private int flags;
private int length;
- /** Excludes the header size part */
+ /**
+ * Excludes the header size part
+ */
private byte[] extendedHeader;
private byte[] data;
- public static void setMaxRecordSize(int maxRecordSize) {
- MAX_RECORD_SIZE = maxRecordSize;
- }
-
- public int getMajorVersion() {
- return majorVersion;
- }
-
- public int getMinorVersion() {
- return minorVersion;
- }
+ private ID3v2Frame(int majorVersion, int minorVersion, InputStream inp) throws IOException {
+ this.majorVersion = majorVersion;
+ this.minorVersion = minorVersion;
- public int getFlags() {
- return flags;
- }
+ // Get the flags and the length
+ flags = inp.read();
+ length = get7BitsInt(readFully(inp, 4), 0);
- public int getLength() {
- return length;
- }
+ // Do we have an extended header?
+ if ((flags & 0x02) == 0x02) {
+ int size = getInt(readFully(inp, 4));
+ extendedHeader = readFully(inp, size);
+ }
- public byte[] getExtendedHeader() {
- return extendedHeader;
+ // Get the frame's data, or at least as much
+ // of it as we could do
+ data = readFully(inp, length, false);
}
- public byte[] getData() {
- return data;
+ public static void setMaxRecordSize(int maxRecordSize) {
+ MAX_RECORD_SIZE = maxRecordSize;
}
/**
* Returns the next ID3v2 Frame in
- * the file, or null if the next batch of data
- * doesn't correspond to either an ID3v2 header.
+ * the file, or null if the next batch of data
+ * doesn't correspond to either an ID3v2 header.
* If no ID3v2 frame could be detected and the passed in input stream is a
* {@code PushbackInputStream}, the bytes read so far are pushed back so
* that they can be read again.
* ID3v2 Frames should come before all Audio ones.
*/
- public static MP3Frame createFrameIfPresent(InputStream inp)
- throws IOException {
+ public static MP3Frame createFrameIfPresent(InputStream inp) throws IOException {
int h1 = inp.read();
int h2 = inp.read();
int h3 = inp.read();
-
- // Is it an ID3v2 Frame?
- if (h1 == (int)'I' && h2 == (int)'D' && h3 == (int)'3') {
+
+ // Is it an ID3v2 Frame?
+ if (h1 == (int) 'I' && h2 == (int) 'D' && h3 == (int) '3') {
int majorVersion = inp.read();
int minorVersion = inp.read();
if (majorVersion == -1 || minorVersion == -1) {
@@ -104,107 +104,84 @@ public class ID3v2Frame implements MP3Frame {
/**
* Pushes bytes back into the stream if possible. This method is called if
* no ID3v2 header could be found at the current stream position.
- *
- * @param inp the input stream
+ *
+ * @param inp the input stream
* @param bytes the bytes to be pushed back
* @throws IOException if an error occurs
*/
- private static void pushBack(InputStream inp, int... bytes)
- throws IOException
- {
- if (inp instanceof PushbackInputStream)
- {
+ private static void pushBack(InputStream inp, int... bytes) throws IOException {
+ if (inp instanceof PushbackInputStream) {
byte[] buf = new byte[bytes.length];
- for (int i = 0; i < bytes.length; i++)
- {
+ for (int i = 0; i < bytes.length; i++) {
buf[i] = (byte) bytes[i];
}
((PushbackInputStream) inp).unread(buf);
}
}
- private ID3v2Frame(int majorVersion, int minorVersion, InputStream inp)
- throws IOException {
- this.majorVersion = majorVersion;
- this.minorVersion = minorVersion;
-
- // Get the flags and the length
- flags = inp.read();
- length = get7BitsInt(readFully(inp, 4), 0);
-
- // Do we have an extended header?
- if ((flags & 0x02) == 0x02) {
- int size = getInt(readFully(inp, 4));
- extendedHeader = readFully(inp, size);
- }
-
- // Get the frame's data, or at least as much
- // of it as we could do
- data = readFully(inp, length, false);
- }
-
protected static int getInt(byte[] data) {
return getInt(data, 0);
}
protected static int getInt(byte[] data, int offset) {
- int b0 = data[offset+0] & 0xFF;
- int b1 = data[offset+1] & 0xFF;
- int b2 = data[offset+2] & 0xFF;
- int b3 = data[offset+3] & 0xFF;
+ int b0 = data[offset + 0] & 0xFF;
+ int b1 = data[offset + 1] & 0xFF;
+ int b2 = data[offset + 2] & 0xFF;
+ int b3 = data[offset + 3] & 0xFF;
return (b0 << 24) + (b1 << 16) + (b2 << 8) + (b3 << 0);
}
protected static int getInt3(byte[] data, int offset) {
- int b0 = data[offset+0] & 0xFF;
- int b1 = data[offset+1] & 0xFF;
- int b2 = data[offset+2] & 0xFF;
+ int b0 = data[offset + 0] & 0xFF;
+ int b1 = data[offset + 1] & 0xFF;
+ int b2 = data[offset + 2] & 0xFF;
return (b0 << 16) + (b1 << 8) + (b2 << 0);
}
protected static int getInt2(byte[] data, int offset) {
- int b0 = data[offset+0] & 0xFF;
- int b1 = data[offset+1] & 0xFF;
+ int b0 = data[offset + 0] & 0xFF;
+ int b1 = data[offset + 1] & 0xFF;
return (b0 << 8) + (b1 << 0);
}
/**
* AKA a Synchsafe integer.
* 4 bytes hold a 28 bit number. The highest
- * bit in each byte is always 0 and always ignored.
+ * bit in each byte is always 0 and always ignored.
*/
protected static int get7BitsInt(byte[] data, int offset) {
- int b0 = data[offset+0] & 0x7F;
- int b1 = data[offset+1] & 0x7F;
- int b2 = data[offset+2] & 0x7F;
- int b3 = data[offset+3] & 0x7F;
+ int b0 = data[offset + 0] & 0x7F;
+ int b1 = data[offset + 1] & 0x7F;
+ int b2 = data[offset + 2] & 0x7F;
+ int b3 = data[offset + 3] & 0x7F;
return (b0 << 21) + (b1 << 14) + (b2 << 7) + (b3 << 0);
}
- protected static byte[] readFully(InputStream inp, int length)
- throws IOException {
- return readFully(inp, length, true);
+ protected static byte[] readFully(InputStream inp, int length) throws IOException {
+ return readFully(inp, length, true);
}
protected static byte[] readFully(InputStream inp, int length, boolean shortDataIsFatal)
throws IOException {
if (MAX_RECORD_SIZE > 0 && length > MAX_RECORD_SIZE) {
- throw new IOException("Record size ("+length+
- " bytes) is larger than the allowed record size: "+MAX_RECORD_SIZE);
+ throw new IOException(
+ "Record size (" + length + " bytes) is larger than the allowed record size: " +
+ MAX_RECORD_SIZE);
}
byte[] b = new byte[length];
int pos = 0;
int read;
while (pos < length) {
- read = inp.read(b, pos, length-pos);
+ read = inp.read(b, pos, length - pos);
if (read == -1) {
- if(shortDataIsFatal) {
- throw new IOException("Tried to read " + length + " bytes, but only " + pos + " bytes present");
+ if (shortDataIsFatal) {
+ throw new IOException("Tried to read " + length + " bytes, but only " + pos +
+ " bytes present");
} else {
- // Give them what we found
- // TODO Log the short read
- return b;
+ // Give them what we found
+ // TODO Log the short read
+ return b;
}
}
pos += read;
@@ -212,25 +189,10 @@ public class ID3v2Frame implements MP3Frame {
return b;
}
-
- protected static class TextEncoding {
- public final boolean doubleByte;
- public final String encoding;
- private TextEncoding(String encoding, boolean doubleByte) {
- this.doubleByte = doubleByte;
- this.encoding = encoding;
- }
- }
- protected static final TextEncoding[] encodings = new TextEncoding[] {
- new TextEncoding("ISO-8859-1", false),
- new TextEncoding("UTF-16", true), // With BOM
- new TextEncoding("UTF-16BE", true), // Without BOM
- new TextEncoding("UTF-8", false)
- };
/**
* Returns the (possibly null padded) String at the given offset and
- * length. String encoding is held in the first byte;
+ * length. String encoding is held in the first byte;
*/
protected static String getTagString(byte[] data, int offset, int length) {
int actualLength = length;
@@ -250,16 +212,17 @@ public class ID3v2Frame implements MP3Frame {
actualLength--;
encoding = encodings[maybeEncodingFlag];
}
-
- // Trim off null termination / padding (as present)
- while (encoding.doubleByte && actualLength >= 2 && data[offset+actualLength-1] == 0 && data[offset+actualLength-2] == 0) {
- actualLength -= 2;
- }
- while (!encoding.doubleByte && actualLength >= 1 && data[offset+actualLength-1] == 0) {
- actualLength--;
+
+ // Trim off null termination / padding (as present)
+ while (encoding.doubleByte && actualLength >= 2 && data[offset + actualLength - 1] == 0 &&
+ data[offset + actualLength - 2] == 0) {
+ actualLength -= 2;
+ }
+ while (!encoding.doubleByte && actualLength >= 1 && data[offset + actualLength - 1] == 0) {
+ actualLength--;
}
if (actualLength == 0) {
- return "";
+ return "";
}
// TIKA-1024: If it's UTF-16 (with BOM) and all we
@@ -267,136 +230,130 @@ public class ID3v2Frame implements MP3Frame {
// (return empty string), because new String(..)
// gives different results on different JVMs
if (encoding.encoding.equals("UTF-16") && actualLength == 2 &&
- ((data[offset] == (byte) 0xff && data[offset+1] == (byte) 0xfe) ||
- (data[offset] == (byte) 0xfe && data[offset+1] == (byte) 0xff))) {
- return "";
+ ((data[offset] == (byte) 0xff && data[offset + 1] == (byte) 0xfe) ||
+ (data[offset] == (byte) 0xfe && data[offset + 1] == (byte) 0xff))) {
+ return "";
}
try {
// Build the base string
return new String(data, offset, actualLength, encoding.encoding);
} catch (UnsupportedEncodingException e) {
- throw new RuntimeException(
- "Core encoding " + encoding.encoding + " is not available", e);
+ throw new RuntimeException("Core encoding " + encoding.encoding + " is not available",
+ e);
}
}
+
/**
* Builds up the ID3 comment, by parsing and extracting
- * the comment string parts from the given data.
+ * the comment string parts from the given data.
*/
protected static ID3Comment getComment(byte[] data, int offset, int length) {
- // Comments must have an encoding
- int encodingFlag = data[offset];
- if (encodingFlag >= 0 && encodingFlag < encodings.length) {
- // Good, valid flag
- } else {
- // Invalid string
- return null;
- }
-
- TextEncoding encoding = encodings[encodingFlag];
-
- // First is a 3 byte language
- String lang = getString(data, offset+1, 3);
-
- // After that we have [Desc]\0(\0)[Text]
- int descStart = offset+4;
- int textStart = -1;
- String description = null;
- String text = null;
-
- // Find where the description ends
- try {
- for (int i=descStart; i<offset+length; i++) {
- if (encoding.doubleByte && data[i]==0 && data[i+1] == 0) {
- // Handle LE vs BE on low byte text
- if (i+2 < offset+length && data[i+1] == 0 && data[i+2] == 0) {
- i++;
+ // Comments must have an encoding
+ int encodingFlag = data[offset];
+ if (encodingFlag >= 0 && encodingFlag < encodings.length) {
+ // Good, valid flag
+ } else {
+ // Invalid string
+ return null;
+ }
+
+ TextEncoding encoding = encodings[encodingFlag];
+
+ // First is a 3 byte language
+ String lang = getString(data, offset + 1, 3);
+
+ // After that we have [Desc]\0(\0)[Text]
+ int descStart = offset + 4;
+ int textStart = -1;
+ String description = null;
+ String text = null;
+
+ // Find where the description ends
+ try {
+ for (int i = descStart; i < offset + length; i++) {
+ if (encoding.doubleByte && data[i] == 0 && data[i + 1] == 0) {
+ // Handle LE vs BE on low byte text
+ if (i + 2 < offset + length && data[i + 1] == 0 && data[i + 2] == 0) {
+ i++;
+ }
+ textStart = i + 2;
+ description = new String(data, descStart, i - descStart, encoding.encoding);
+ break;
}
- textStart = i+2;
- description = new String(data, descStart, i-descStart, encoding.encoding);
- break;
- }
- if (!encoding.doubleByte && data[i]==0) {
- textStart = i+1;
- description = new String(data, descStart, i-descStart, encoding.encoding);
- break;
- }
- }
-
- // Did we find the end?
- if (textStart > -1) {
- text = new String(data, textStart, offset+length-textStart, encoding.encoding);
- } else {
- // Assume everything is the text
- text = new String(data, descStart, offset+length-descStart, encoding.encoding);
- }
-
- // Return
- return new ID3Comment(lang, description, text);
- } catch (UnsupportedEncodingException e) {
- throw new RuntimeException(
- "Core encoding " + encoding.encoding + " is not available", e);
- }
+ if (!encoding.doubleByte && data[i] == 0) {
+ textStart = i + 1;
+ description = new String(data, descStart, i - descStart, encoding.encoding);
+ break;
+ }
+ }
+
+ // Did we find the end?
+ if (textStart > -1) {
+ text = new String(data, textStart, offset + length - textStart, encoding.encoding);
+ } else {
+ // Assume everything is the text
+ text = new String(data, descStart, offset + length - descStart, encoding.encoding);
+ }
+
+ // Return
+ return new ID3Comment(lang, description, text);
+ } catch (UnsupportedEncodingException e) {
+ throw new RuntimeException("Core encoding " + encoding.encoding + " is not available",
+ e);
+ }
}
/**
* Returns the String at the given
- * offset and length. Strings are ISO-8859-1
+ * offset and length. Strings are ISO-8859-1
*/
protected static String getString(byte[] data, int offset, int length) {
return new String(data, offset, length, ISO_8859_1);
}
+ public int getMajorVersion() {
+ return majorVersion;
+ }
- /**
- * Iterates over id3v2 raw tags.
- * Create an instance of this that configures the
- * various length and multipliers.
- */
- protected class RawTagIterator implements Iterator<RawTag> {
- private int nameLength;
- private int sizeLength;
- private int sizeMultiplier;
- private int flagLength;
+ public int getMinorVersion() {
+ return minorVersion;
+ }
- private int offset = 0;
+ public int getFlags() {
+ return flags;
+ }
- protected RawTagIterator(
- int nameLength, int sizeLength, int sizeMultiplier,
- int flagLength) {
- this.nameLength = nameLength;
- this.sizeLength = sizeLength;
- this.sizeMultiplier = sizeMultiplier;
- this.flagLength = flagLength;
- }
+ public int getLength() {
+ return length;
+ }
- public boolean hasNext() {
- // Check for padding at the end
- return offset < data.length && data[offset] != 0;
- }
+ public byte[] getExtendedHeader() {
+ return extendedHeader;
+ }
- public RawTag next() {
- RawTag tag = new RawTag(nameLength, sizeLength, sizeMultiplier,
- flagLength, data, offset);
- offset += tag.getSize();
- return tag;
- }
+ public byte[] getData() {
+ return data;
+ }
- public void remove() {
- }
+ protected static class TextEncoding {
+ public final boolean doubleByte;
+ public final String encoding;
+ private TextEncoding(String encoding, boolean doubleByte) {
+ this.doubleByte = doubleByte;
+ this.encoding = encoding;
+ }
}
protected static class RawTag {
- private int headerSize;
protected String name;
protected int flag;
protected byte[] data;
+ private int headerSize;
- private RawTag(
- int nameLength, int sizeLength, int sizeMultiplier,
- int flagLength, byte[] frameData, int offset) {
+ private RawTag(int nameLength, int sizeLength, int sizeMultiplier, int flagLength,
+ byte[] frameData, int offset) {
headerSize = nameLength + sizeLength + flagLength;
// Name, normally 3 or 4 bytes
@@ -405,24 +362,25 @@ public class ID3v2Frame implements MP3Frame {
// Size
int rawSize;
if (sizeLength == 3) {
- rawSize = getInt3(frameData, offset+nameLength);
+ rawSize = getInt3(frameData, offset + nameLength);
} else {
- rawSize = getInt(frameData, offset+nameLength);
+ rawSize = getInt(frameData, offset + nameLength);
}
int size = rawSize * sizeMultiplier;
// Flag
if (flagLength > 0) {
if (flagLength == 1) {
- flag = (int)frameData[offset+nameLength+sizeLength];
+ flag = (int) frameData[offset + nameLength + sizeLength];
} else {
- flag = getInt2(frameData, offset+nameLength+sizeLength);
+ flag = getInt2(frameData, offset + nameLength + sizeLength);
}
}
// Now data
- int copyFrom = offset+nameLength+sizeLength+flagLength;
- size = Math.max(0, Math.min(size, frameData.length-copyFrom)); // TIKA-1218, prevent negative size for malformed files.
+ int copyFrom = offset + nameLength + sizeLength + flagLength;
+ size = Math.max(0, Math.min(size, frameData.length -
+ copyFrom)); // TIKA-1218, prevent negative size for malformed files.
data = new byte[size];
System.arraycopy(frameData, copyFrom, data, 0, size);
}
@@ -433,4 +391,42 @@ public class ID3v2Frame implements MP3Frame {
}
+ /**
+ * Iterates over id3v2 raw tags.
+ * Create an instance of this that configures the
+ * various length and multipliers.
+ */
+ protected class RawTagIterator implements Iterator<RawTag> {
+ private int nameLength;
+ private int sizeLength;
+ private int sizeMultiplier;
+ private int flagLength;
+
+ private int offset = 0;
+
+ protected RawTagIterator(int nameLength, int sizeLength, int sizeMultiplier,
+ int flagLength) {
+ this.nameLength = nameLength;
+ this.sizeLength = sizeLength;
+ this.sizeMultiplier = sizeMultiplier;
+ this.flagLength = flagLength;
+ }
+
+ public boolean hasNext() {
+ // Check for padding at the end
+ return offset < data.length && data[offset] != 0;
+ }
+
+ public RawTag next() {
+ RawTag tag =
+ new RawTag(nameLength, sizeLength, sizeMultiplier, flagLength, data, offset);
+ offset += tag.getSize();
+ return tag;
+ }
+
+ public void remove() {
+ }
+
+ }
+
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java
index cc8f6b6..2c6e3b7 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java
@@ -16,19 +16,20 @@
*/
package org.apache.tika.parser.mp3;
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
import java.io.IOException;
import java.io.InputStream;
-import org.apache.tika.exception.TikaException;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import static java.nio.charset.StandardCharsets.US_ASCII;
-import static java.nio.charset.StandardCharsets.UTF_8;
+import org.apache.tika.exception.TikaException;
/**
* This is used to parse Lyrics3 tag information
- * from an MP3 file, if available.
+ * from an MP3 file, if available.
* Handles lyrics tags of up to 10kb in size.
* Will process any ID3v1 tag data if present.
* Ignores extended ID3v1 data in the lyrics block
@@ -42,64 +43,56 @@ public class LyricsHandler {
public LyricsHandler(InputStream stream, ContentHandler handler)
throws IOException, SAXException, TikaException {
- this(getSuffix(stream, 10240+128));
+ this(getSuffix(stream, 10240 + 128));
}
/**
* Looks for the Lyrics data, which will be
- * just before the ID3v1 data (if present),
- * and process it.
+ * just before the ID3v1 data (if present),
+ * and process it.
* Also sets things up for the ID3v1
- * processing if required.
+ * processing if required.
* Creates from the last 128 bytes of a stream.
*/
- protected LyricsHandler(byte[] tagData)
- throws IOException, SAXException, TikaException {
- if(tagData.length < 128) {
+ protected LyricsHandler(byte[] tagData) throws IOException, SAXException, TikaException {
+ if (tagData.length < 128) {
return;
}
// Is there ID3v1 data?
byte[] last128 = new byte[128];
- System.arraycopy(tagData, tagData.length-128, last128, 0, 128);
+ System.arraycopy(tagData, tagData.length - 128, last128, 0, 128);
id3v1 = new ID3v1Handler(last128);
- if(tagData.length < 137) {
+ if (tagData.length < 137) {
return;
}
// Are there lyrics? Look for the closing Lyrics tag
// at the end to decide if there is any
int lookat = tagData.length - 9;
- if(id3v1.found) {
+ if (id3v1.found) {
lookat -= 128;
}
- if(tagData[lookat+0] == 'L' && tagData[lookat+1] == 'Y' &&
- tagData[lookat+2] == 'R' && tagData[lookat+3] == 'I' &&
- tagData[lookat+4] == 'C' && tagData[lookat+5] == 'S' &&
- tagData[lookat+6] == '2' && tagData[lookat+7] == '0' &&
- tagData[lookat+8] == '0') {
+ if (tagData[lookat + 0] == 'L' && tagData[lookat + 1] == 'Y' &&
+ tagData[lookat + 2] == 'R' && tagData[lookat + 3] == 'I' &&
+ tagData[lookat + 4] == 'C' && tagData[lookat + 5] == 'S' &&
+ tagData[lookat + 6] == '2' && tagData[lookat + 7] == '0' &&
+ tagData[lookat + 8] == '0') {
foundLyrics = true;
// The length (6 bytes) comes just before LYRICS200, and is the
// size including the LYRICSBEGIN but excluding the
// length+LYRICS200 at the end.
- int length = Integer.parseInt(
- new String(tagData, lookat-6, 6, UTF_8)
- );
+ int length = Integer.parseInt(new String(tagData, lookat - 6, 6, UTF_8));
- String lyrics = new String(
- tagData, lookat-length+5, length-11,
- US_ASCII
- );
+ String lyrics = new String(tagData, lookat - length + 5, length - 11, US_ASCII);
// Tags are a 3 letter code, 5 digit length, then data
int pos = 0;
- while(pos < lyrics.length()-8) {
- String tagName = lyrics.substring(pos, pos+3);
- int tagLen = Integer.parseInt(
- lyrics.substring(pos+3, pos+8)
- );
+ while (pos < lyrics.length() - 8) {
+ String tagName = lyrics.substring(pos, pos + 3);
+ int tagLen = Integer.parseInt(lyrics.substring(pos + 3, pos + 8));
if (tagLen < 1 || tagLen > lyrics.length()) {
//something went wrong
break;
@@ -107,7 +100,7 @@ public class LyricsHandler {
int startPos = pos + 8;
int endPos = startPos + tagLen;
- if(tagName.equals("LYR")) {
+ if (tagName.equals("LYR")) {
lyricsText = lyrics.substring(startPos, endPos);
}
@@ -116,26 +109,16 @@ public class LyricsHandler {
}
}
- public boolean hasID3v1() {
- if(id3v1 == null || id3v1.found == false) {
- return false;
- }
- return true;
- }
- public boolean hasLyrics() {
- return lyricsText != null && lyricsText.length() > 0;
- }
-
/**
* Reads and returns the last <code>length</code> bytes from the
* given stream.
+ *
* @param stream input stream
* @param length number of bytes from the end to read and return
* @return the last <code>length</code> bytes read from the stream
* @throws IOException if the stream could not be read from.
*/
- protected static byte[] getSuffix(InputStream stream, int length)
- throws IOException {
+ protected static byte[] getSuffix(InputStream stream, int length) throws IOException {
byte[] buffer = new byte[2 * length];
int bytesInBuffer = 0;
@@ -157,4 +140,15 @@ public class LyricsHandler {
System.arraycopy(buffer, bytesInBuffer - length, result, 0, length);
return result;
}
+
+ public boolean hasID3v1() {
+ if (id3v1 == null || id3v1.found == false) {
+ return false;
+ }
+ return true;
+ }
+
+ public boolean hasLyrics() {
+ return lyricsText != null && lyricsText.length() > 0;
+ }
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java
index 923be8a..8ea7c40 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java
@@ -19,7 +19,7 @@ package org.apache.tika.parser.mp3;
/**
* A frame in an MP3 file, such as ID3v2 Tags or some
- * audio.
+ * audio.
*/
public interface MP3Frame {
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
index 11a7d4b..e264789 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
@@ -23,6 +23,9 @@ import java.util.Collections;
import java.util.List;
import java.util.Set;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
import org.apache.tika.config.Field;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TailStream;
@@ -34,8 +37,6 @@ import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.mp3.ID3Tags.ID3Comment;
import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
/**
* The <code>Mp3Parser</code> is used to parse ID3 Version 1 Tag information
@@ -47,21 +48,99 @@ import org.xml.sax.SAXException;
*/
public class Mp3Parser extends AbstractParser {
- /** Serial version UID */
+ /**
+ * Serial version UID
+ */
private static final long serialVersionUID = 8537074922934844370L;
private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.singleton(MediaType.audio("mpeg"));
+ Collections.singleton(MediaType.audio("mpeg"));
+
+ /**
+ * Scans the MP3 frames for ID3 tags, and creates ID3Tag Handlers
+ * for each supported set of tags.
+ */
+ protected static ID3TagsAndAudio getAllTagHandlers(InputStream stream, ContentHandler handler)
+ throws IOException, SAXException, TikaException {
+ ID3v24Handler v24 = null;
+ ID3v23Handler v23 = null;
+ ID3v22Handler v22 = null;
+ ID3v1Handler v1 = null;
+ LyricsHandler lyrics = null;
+ AudioFrame firstAudio = null;
+
+ TailStream tailStream = new TailStream(stream, 10240 + 128);
+ MpegStream mpegStream = new MpegStream(tailStream);
+
+ // ID3v2 tags live at the start of the file
+ // You can apparently have several different ID3 tag blocks
+ // So, keep going until we don't find any more
+ MP3Frame f;
+ while ((f = ID3v2Frame.createFrameIfPresent(mpegStream)) != null) {
+ if (f instanceof ID3v2Frame) {
+ ID3v2Frame id3F = (ID3v2Frame) f;
+ if (id3F.getMajorVersion() == 4) {
+ v24 = new ID3v24Handler(id3F);
+ } else if (id3F.getMajorVersion() == 3) {
+ v23 = new ID3v23Handler(id3F);
+ } else if (id3F.getMajorVersion() == 2) {
+ v22 = new ID3v22Handler(id3F);
+ }
+ }
+ }
+
+ // Now iterate over all audio frames in the file
+ AudioFrame frame = mpegStream.nextFrame();
+ float duration = 0;
+ boolean skipped = true;
+ while (frame != null && skipped) {
+ duration += frame.getDuration();
+ if (firstAudio == null) {
+ firstAudio = frame;
+ }
+ skipped = mpegStream.skipFrame();
+ if (skipped) {
+ frame = mpegStream.nextFrame();
+ }
+ }
+
+ // ID3v1 tags live at the end of the file
+ // Lyrics live just before ID3v1, at the end of the file
+ // Search for both (handlers seek to the end for us)
+ lyrics = new LyricsHandler(tailStream.getTail());
+ v1 = lyrics.id3v1;
+
+ // Go in order of preference
+ // Currently, that's newest to oldest
+ List<ID3Tags> tags = new ArrayList<ID3Tags>();
+
+ if (v24 != null && v24.getTagsPresent()) {
+ tags.add(v24);
+ }
+ if (v23 != null && v23.getTagsPresent()) {
+ tags.add(v23);
+ }
+ if (v22 != null && v22.getTagsPresent()) {
+ tags.add(v22);
+ }
+ if (v1 != null && v1.getTagsPresent()) {
+ tags.add(v1);
+ }
+
+ ID3TagsAndAudio ret = new ID3TagsAndAudio();
+ ret.audio = firstAudio;
+ ret.lyrics = lyrics;
+ ret.tags = tags.toArray(new ID3Tags[0]);
+ ret.duration = duration;
+ return ret;
+ }
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
-
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
metadata.set(Metadata.CONTENT_TYPE, "audio/mpeg");
metadata.set(XMPDM.AUDIO_COMPRESSOR, "MP3");
@@ -73,7 +152,7 @@ public class Mp3Parser extends AbstractParser {
// Before we start on the XHTML output, process and store
// as much metadata as possible
if (audioAndTags.duration > 0) {
- metadata.set(XMPDM.DURATION, audioAndTags.durationSeconds());
+ metadata.set(XMPDM.DURATION, audioAndTags.durationSeconds());
}
if (audioAndTags.audio != null) {
@@ -81,16 +160,15 @@ public class Mp3Parser extends AbstractParser {
metadata.set("channels", String.valueOf(audioAndTags.audio.getChannels()));
metadata.set("version", audioAndTags.audio.getVersion());
- metadata.set(
- XMPDM.AUDIO_SAMPLE_RATE,
+ metadata.set(XMPDM.AUDIO_SAMPLE_RATE,
Integer.toString(audioAndTags.audio.getSampleRate()));
- if(audioAndTags.audio.getChannels() == 1) {
+ if (audioAndTags.audio.getChannels() == 1) {
metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Mono");
- } else if(audioAndTags.audio.getChannels() == 2) {
+ } else if (audioAndTags.audio.getChannels() == 2) {
metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Stereo");
- } else if(audioAndTags.audio.getChannels() == 5) {
+ } else if (audioAndTags.audio.getChannels() == 5) {
metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "5.1");
- } else if(audioAndTags.audio.getChannels() == 7) {
+ } else if (audioAndTags.audio.getChannels() == 7) {
metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "7.1");
}
}
@@ -158,9 +236,9 @@ public class Mp3Parser extends AbstractParser {
}
if (audioAndTags.lyrics != null && audioAndTags.lyrics.hasLyrics()) {
- xhtml.startElement("p", "class", "lyrics");
- xhtml.characters(audioAndTags.lyrics.lyricsText);
- xhtml.endElement("p");
+ xhtml.startElement("p", "class", "lyrics");
+ xhtml.characters(audioAndTags.lyrics.lyricsText);
+ xhtml.endElement("p");
}
xhtml.endDocument();
@@ -176,94 +254,14 @@ public class Mp3Parser extends AbstractParser {
ID3v2Frame.setMaxRecordSize(maxRecordSize);
}
- /**
- * Scans the MP3 frames for ID3 tags, and creates ID3Tag Handlers
- * for each supported set of tags.
- */
- protected static ID3TagsAndAudio getAllTagHandlers(InputStream stream, ContentHandler handler)
- throws IOException, SAXException, TikaException {
- ID3v24Handler v24 = null;
- ID3v23Handler v23 = null;
- ID3v22Handler v22 = null;
- ID3v1Handler v1 = null;
- LyricsHandler lyrics = null;
- AudioFrame firstAudio = null;
-
- TailStream tailStream = new TailStream(stream, 10240+128);
- MpegStream mpegStream = new MpegStream(tailStream);
-
- // ID3v2 tags live at the start of the file
- // You can apparently have several different ID3 tag blocks
- // So, keep going until we don't find any more
- MP3Frame f;
- while ((f = ID3v2Frame.createFrameIfPresent(mpegStream)) != null) {
- if(f instanceof ID3v2Frame) {
- ID3v2Frame id3F = (ID3v2Frame)f;
- if (id3F.getMajorVersion() == 4) {
- v24 = new ID3v24Handler(id3F);
- } else if(id3F.getMajorVersion() == 3) {
- v23 = new ID3v23Handler(id3F);
- } else if(id3F.getMajorVersion() == 2) {
- v22 = new ID3v22Handler(id3F);
- }
- }
- }
-
- // Now iterate over all audio frames in the file
- AudioFrame frame = mpegStream.nextFrame();
- float duration = 0;
- boolean skipped = true;
- while (frame != null && skipped)
- {
- duration += frame.getDuration();
- if (firstAudio == null)
- {
- firstAudio = frame;
- }
- skipped = mpegStream.skipFrame();
- if (skipped) {
- frame = mpegStream.nextFrame();
- }
- }
-
- // ID3v1 tags live at the end of the file
- // Lyrics live just before ID3v1, at the end of the file
- // Search for both (handlers seek to the end for us)
- lyrics = new LyricsHandler(tailStream.getTail());
- v1 = lyrics.id3v1;
-
- // Go in order of preference
- // Currently, that's newest to oldest
- List<ID3Tags> tags = new ArrayList<ID3Tags>();
-
- if(v24 != null && v24.getTagsPresent()) {
- tags.add(v24);
- }
- if(v23 != null && v23.getTagsPresent()) {
- tags.add(v23);
- }
- if(v22 != null && v22.getTagsPresent()) {
- tags.add(v22);
- }
- if(v1 != null && v1.getTagsPresent()) {
- tags.add(v1);
- }
-
- ID3TagsAndAudio ret = new ID3TagsAndAudio();
- ret.audio = firstAudio;
- ret.lyrics = lyrics;
- ret.tags = tags.toArray(new ID3Tags[0]);
- ret.duration = duration;
- return ret;
- }
-
protected static class ID3TagsAndAudio {
private ID3Tags[] tags;
private AudioFrame audio;
private LyricsHandler lyrics;
private float duration; // Milliseconds
+
private float durationSeconds() {
- return duration / 1000;
+ return duration / 1000;
}
}
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/MpegStream.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/MpegStream.java
index 553b635..28b7a71 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/MpegStream.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/MpegStream.java
@@ -16,13 +16,12 @@
*/
package org.apache.tika.parser.mp3;
-import org.apache.commons.io.IOUtils;
-
-import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
+import org.apache.commons.io.IOUtils;
+
/**
* <p>
* A specialized stream class which can be used to extract single frames of MPEG
@@ -35,107 +34,213 @@ import java.io.PushbackInputStream;
* MPEG frames. Some meta information of frames can be queried.
* </p>
*/
-class MpegStream extends PushbackInputStream
-{
- /** Bit rate table for MPEG V1, layer 1. */
- private static final int[] BIT_RATE_MPEG1_L1 = {
- 0, 32000, 64000, 96000, 128000, 160000, 192000, 224000, 256000,
- 288000, 320000, 352000, 384000, 416000, 448000
- };
-
- /** Bit rate table for MPEG V1, layer 2. */
- private static final int[] BIT_RATE_MPEG1_L2 = {
- 0, 32000, 48000, 56000, 64000, 80000, 96000, 112000, 128000,
- 160000, 192000, 224000, 256000, 320000, 384000
- };
-
- /** Bit rate table for MPEG V1, layer 3. */
- private static final int[] BIT_RATE_MPEG1_L3 = {
- 0, 32000, 40000, 48000, 56000, 64000, 80000, 96000, 112000, 128000,
- 160000, 192000, 224000, 256000, 320000
- };
-
- /** Bit rate table for MPEG V2/V2.5, layer 1. */
- private static final int[] BIT_RATE_MPEG2_L1 = {
- 0, 32000, 48000, 56000, 64000, 80000, 96000, 112000, 128000,
- 144000, 160000, 176000, 192000, 224000, 256000
- };
-
- /** Bit rate table for MPEG V2/V2.5, layer 2 and 3. */
- private static final int[] BIT_RATE_MPEG2_L2 = {
- 0, 8000, 16000, 24000, 32000, 40000, 48000, 56000, 64000, 80000,
- 96000, 112000, 128000, 144000, 160000
- };
-
- /** Sample rate table for MPEG V1. */
- private static final int[] SAMPLE_RATE_MPEG1 = {
- 44100, 48000, 32000
- };
-
- /** Sample rate table for MPEG V2. */
- private static final int[] SAMPLE_RATE_MPEG2 = {
- 22050, 24000, 16000
- };
-
- /** Sample rate table for MPEG V2.5. */
- private static final int[] SAMPLE_RATE_MPEG2_5 = {
- 11025, 12000, 8000
- };
-
- /** Sample rate table for all MPEG versions. */
+class MpegStream extends PushbackInputStream {
+ /**
+ * Bit rate table for MPEG V1, layer 1.
+ */
+ private static final int[] BIT_RATE_MPEG1_L1 =
+ {0, 32000, 64000, 96000, 128000, 160000, 192000, 224000, 256000, 288000, 320000, 352000,
+ 384000, 416000, 448000};
+
+ /**
+ * Bit rate table for MPEG V1, layer 2.
+ */
+ private static final int[] BIT_RATE_MPEG1_L2 =
+ {0, 32000, 48000, 56000, 64000, 80000, 96000, 112000, 128000, 160000, 192000, 224000,
+ 256000, 320000, 384000};
+
+ /**
+ * Bit rate table for MPEG V1, layer 3.
+ */
+ private static final int[] BIT_RATE_MPEG1_L3 =
+ {0, 32000, 40000, 48000, 56000, 64000, 80000, 96000, 112000, 128000, 160000, 192000,
+ 224000, 256000, 320000};
+
+ /**
+ * Bit rate table for MPEG V2/V2.5, layer 1.
+ */
+ private static final int[] BIT_RATE_MPEG2_L1 =
+ {0, 32000, 48000, 56000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000,
+ 192000, 224000, 256000};
+
+ /**
+ * Bit rate table for MPEG V2/V2.5, layer 2 and 3.
+ */
+ private static final int[] BIT_RATE_MPEG2_L2 =
+ {0, 8000, 16000, 24000, 32000, 40000, 48000, 56000, 64000, 80000, 96000, 112000, 128000,
+ 144000, 160000};
+
+ /**
+ * Sample rate table for MPEG V1.
+ */
+ private static final int[] SAMPLE_RATE_MPEG1 = {44100, 48000, 32000};
+
+ /**
+ * Sample rate table for MPEG V2.
+ */
+ private static final int[] SAMPLE_RATE_MPEG2 = {22050, 24000, 16000};
+
+ /**
+ * Sample rate table for MPEG V2.5.
+ */
+ private static final int[] SAMPLE_RATE_MPEG2_5 = {11025, 12000, 8000};
+
+ /**
+ * Sample rate table for all MPEG versions.
+ */
private static final int[][] SAMPLE_RATE = createSampleRateTable();
- /** Constant for the number of samples for a layer 1 frame. */
+ /**
+ * Constant for the number of samples for a layer 1 frame.
+ */
private static final int SAMPLE_COUNT_L1 = 384;
- /** Constant for the number of samples for a layer 2 or 3 frame. */
+ /**
+ * Constant for the number of samples for a layer 2 or 3 frame.
+ */
private static final int SAMPLE_COUNT_L2 = 1152;
- /** Constant for the size of an MPEG frame header in bytes. */
+ /**
+ * Constant for the size of an MPEG frame header in bytes.
+ */
private static final int HEADER_SIZE = 4;
- /** The current MPEG header. */
+ /**
+ * The current MPEG header.
+ */
private AudioFrame currentHeader;
- /** A flag whether the end of the stream is reached. */
+ /**
+ * A flag whether the end of the stream is reached.
+ */
private boolean endOfStream;
/**
* Creates a new instance of {@code MpegStream} and initializes it with the
* underlying stream.
- *
+ *
* @param in the underlying audio stream
*/
- public MpegStream(InputStream in)
- {
+ public MpegStream(InputStream in) {
super(in, 2 * HEADER_SIZE);
}
/**
+ * Calculates the bit rate based on the given parameters.
+ *
+ * @param mpegVer the MPEG version
+ * @param layer the layer
+ * @param code the code for the bit rate
+ * @return the bit rate in bits per second
+ */
+ private static int calculateBitRate(int mpegVer, int layer, int code) {
+ int[] arr = null;
+
+ if (mpegVer == AudioFrame.MPEG_V1) {
+ switch (layer) {
+ case AudioFrame.LAYER_1:
+ arr = BIT_RATE_MPEG1_L1;
+ break;
+ case AudioFrame.LAYER_2:
+ arr = BIT_RATE_MPEG1_L2;
+ break;
+ case AudioFrame.LAYER_3:
+ arr = BIT_RATE_MPEG1_L3;
+ break;
+ }
+ } else {
+ if (layer == AudioFrame.LAYER_1) {
+ arr = BIT_RATE_MPEG2_L1;
+ } else {
+ arr = BIT_RATE_MPEG2_L2;
+ }
+ }
+ return arr[code];
+ }
+
+ /**
+ * Calculates the sample rate based on the given parameters.
+ *
+ * @param mpegVer the MPEG version
+ * @param code the code for the sample rate
+ * @return the sample rate in samples per second
+ */
+ private static int calculateSampleRate(int mpegVer, int code) {
+ return SAMPLE_RATE[mpegVer][code];
+ }
+
+ /**
+ * Calculates the length of an MPEG frame based on the given parameters.
+ *
+ * @param layer the layer
+ * @param bitRate the bit rate
+ * @param sampleRate the sample rate
+ * @param padding the padding flag
+ * @return the length of the frame in bytes
+ */
+ private static int calculateFrameLength(int layer, int bitRate, int sampleRate, int padding) {
+ if (layer == AudioFrame.LAYER_1) {
+ return (12 * bitRate / sampleRate + padding) * 4;
+ } else {
+ return 144 * bitRate / sampleRate + padding;
+ }
+ }
+
+ /**
+ * Calculates the duration of a MPEG frame based on the given parameters.
+ *
+ * @param layer the layer
+ * @param sampleRate the sample rate
+ * @return the duration of this frame in milliseconds
+ */
+ private static float calculateDuration(int layer, int sampleRate) {
+ int sampleCount = (layer == AudioFrame.LAYER_1) ? SAMPLE_COUNT_L1 : SAMPLE_COUNT_L2;
+ return (1000.0f / sampleRate) * sampleCount;
+ }
+
+ /**
+ * Calculates the number of channels based on the given parameters.
+ *
+ * @param chan the code for the channels
+ * @return the number of channels
+ */
+ private static int calculateChannels(int chan) {
+ return chan < 3 ? 2 : 1;
+ }
+
+ /**
+ * Creates the complete array for the sample rate mapping.
+ *
+ * @return the table for the sample rates
+ */
+ private static int[][] createSampleRateTable() {
+ int[][] arr = new int[4][];
+ arr[AudioFrame.MPEG_V1] = SAMPLE_RATE_MPEG1;
+ arr[AudioFrame.MPEG_V2] = SAMPLE_RATE_MPEG2;
+ arr[AudioFrame.MPEG_V2_5] = SAMPLE_RATE_MPEG2_5;
+ return arr;
+ }
+
+ /**
* Searches for the next MPEG frame header from the current stream position
* on. This method advances the underlying input stream until it finds a
* valid frame header or the end of the stream is reached. In the former
* case a corresponding {@code AudioFrame} object is created. In the latter
* case there are no more headers, so the end of the stream is probably
* reached.
- *
+ *
* @return the next {@code AudioFrame} or <b>null</b>
* @throws IOException if an IO error occurs
*/
- public AudioFrame nextFrame() throws IOException
- {
+ public AudioFrame nextFrame() throws IOException {
AudioFrame frame = null;
- while (!endOfStream && frame == null)
- {
+ while (!endOfStream && frame == null) {
findFrameSyncByte();
- if (!endOfStream)
- {
+ if (!endOfStream) {
HeaderBitField headerField = createHeaderField();
- if (!endOfStream)
- {
+ if (!endOfStream) {
frame = createHeader(headerField);
- if (frame == null)
- {
+ if (frame == null) {
pushBack(headerField);
}
}
@@ -152,14 +257,12 @@ class MpegStream extends PushbackInputStream
* the underlying stream is advanced to the end of the associated MPEG
* frame or until the EOF is reached. The return value indicates
* whether the full frame could be skipped.
- *
+ *
* @return <b>true</b> if a frame could be skipped, <b>false</b> otherwise, perhaps EOF?
* @throws IOException if an IO error occurs
*/
- public boolean skipFrame() throws IOException
- {
- if (currentHeader != null)
- {
+ public boolean skipFrame() throws IOException {
+ if (currentHeader != null) {
long toSkip = currentHeader.getLength() - HEADER_SIZE;
long skipped = IOUtils.skip(in, toSkip);
currentHeader = null;
@@ -174,16 +277,13 @@ class MpegStream extends PushbackInputStream
/**
* Advances the underlying stream until the first byte of frame sync is
* found.
- *
+ *
* @throws IOException if an error occurs
*/
- private void findFrameSyncByte() throws IOException
- {
+ private void findFrameSyncByte() throws IOException {
boolean found = false;
- while (!found && !endOfStream)
- {
- if (nextByte() == 0xFF)
- {
+ while (!found && !endOfStream) {
+ if (nextByte() == 0xFF) {
found = true;
}
}
@@ -191,12 +291,11 @@ class MpegStream extends PushbackInputStream
/**
* Creates a bit field for the MPEG frame header.
- *
+ *
* @return the bit field
* @throws IOException if an error occurs
*/
- private HeaderBitField createHeaderField() throws IOException
- {
+ private HeaderBitField createHeaderField() throws IOException {
HeaderBitField field = new HeaderBitField();
field.add(nextByte());
field.add(nextByte());
@@ -207,14 +306,12 @@ class MpegStream extends PushbackInputStream
/**
* Creates an {@code AudioFrame} object based on the given header field. If
* the header field contains invalid values, result is <b>null</b>.
- *
+ *
* @param bits the header bit field
* @return the {@code AudioFrame}
*/
- private AudioFrame createHeader(HeaderBitField bits)
- {
- if (bits.get(21, 23) != 7)
- {
+ private AudioFrame createHeader(HeaderBitField bits) {
+ if (bits.get(21, 23) != 7) {
return null;
}
@@ -224,9 +321,8 @@ class MpegStream extends PushbackInputStream
int sampleRateCode = bits.get(10, 11);
int padding = bits.get(9);
- if (mpegVer == 1 || layer == 0 || bitRateCode == 0 || bitRateCode == 15
- || sampleRateCode == 3)
- {
+ if (mpegVer == 1 || layer == 0 || bitRateCode == 0 || bitRateCode == 15 ||
+ sampleRateCode == 3) {
// invalid header values
return null;
}
@@ -236,24 +332,20 @@ class MpegStream extends PushbackInputStream
int length = calculateFrameLength(layer, bitRate, sampleRate, padding);
float duration = calculateDuration(layer, sampleRate);
int channels = calculateChannels(bits.get(6, 7));
- return new AudioFrame(mpegVer, layer, bitRate, sampleRate, channels,
- length, duration);
+ return new AudioFrame(mpegVer, layer, bitRate, sampleRate, channels, length, duration);
}
/**
* Reads the next byte.
- *
+ *
* @return the next byte
* @throws IOException if an error occurs
*/
- private int nextByte() throws IOException
- {
+ private int nextByte() throws IOException {
int result = 0;
- if (!endOfStream)
- {
+ if (!endOfStream) {
result = read();
- if (result == -1)
- {
+ if (result == -1) {
endOfStream = true;
}
}
@@ -264,146 +356,30 @@ class MpegStream extends PushbackInputStream
* Pushes the given header field back in the stream so that the bytes are
* read again. This method is called if an invalid header was detected. Then
* search has to continue at the next byte after the frame sync byte.
- *
+ *
* @param field the header bit field with the invalid frame header
* @throws IOException if an error occurs
*/
- private void pushBack(HeaderBitField field) throws IOException
- {
+ private void pushBack(HeaderBitField field) throws IOException {
unread(field.toArray());
}
/**
- * Calculates the bit rate based on the given parameters.
- *
- * @param mpegVer the MPEG version
- * @param layer the layer
- * @param code the code for the bit rate
- * @return the bit rate in bits per second
- */
- private static int calculateBitRate(int mpegVer, int layer, int code)
- {
- int[] arr = null;
-
- if (mpegVer == AudioFrame.MPEG_V1)
- {
- switch (layer)
- {
- case AudioFrame.LAYER_1:
- arr = BIT_RATE_MPEG1_L1;
- break;
- case AudioFrame.LAYER_2:
- arr = BIT_RATE_MPEG1_L2;
- break;
- case AudioFrame.LAYER_3:
- arr = BIT_RATE_MPEG1_L3;
- break;
- }
- }
- else
- {
- if (layer == AudioFrame.LAYER_1)
- {
- arr = BIT_RATE_MPEG2_L1;
- }
- else
- {
- arr = BIT_RATE_MPEG2_L2;
- }
- }
- return arr[code];
- }
-
- /**
- * Calculates the sample rate based on the given parameters.
- *
- * @param mpegVer the MPEG version
- * @param code the code for the sample rate
- * @return the sample rate in samples per second
- */
- private static int calculateSampleRate(int mpegVer, int code)
- {
- return SAMPLE_RATE[mpegVer][code];
- }
-
- /**
- * Calculates the length of an MPEG frame based on the given parameters.
- *
- * @param layer the layer
- * @param bitRate the bit rate
- * @param sampleRate the sample rate
- * @param padding the padding flag
- * @return the length of the frame in bytes
- */
- private static int calculateFrameLength(int layer, int bitRate,
- int sampleRate, int padding)
- {
- if (layer == AudioFrame.LAYER_1)
- {
- return (12 * bitRate / sampleRate + padding) * 4;
- }
- else
- {
- return 144 * bitRate / sampleRate + padding;
- }
- }
-
- /**
- * Calculates the duration of a MPEG frame based on the given parameters.
- *
- * @param layer the layer
- * @param sampleRate the sample rate
- * @return the duration of this frame in milliseconds
- */
- private static float calculateDuration(int layer, int sampleRate)
- {
- int sampleCount =
- (layer == AudioFrame.LAYER_1) ? SAMPLE_COUNT_L1
- : SAMPLE_COUNT_L2;
- return (1000.0f / sampleRate) * sampleCount;
- }
-
- /**
- * Calculates the number of channels based on the given parameters.
- *
- * @param chan the code for the channels
- * @return the number of channels
- */
- private static int calculateChannels(int chan)
- {
- return chan < 3 ? 2 : 1;
- }
-
- /**
- * Creates the complete array for the sample rate mapping.
- *
- * @return the table for the sample rates
- */
- private static int[][] createSampleRateTable()
- {
- int[][] arr = new int[4][];
- arr[AudioFrame.MPEG_V1] = SAMPLE_RATE_MPEG1;
- arr[AudioFrame.MPEG_V2] = SAMPLE_RATE_MPEG2;
- arr[AudioFrame.MPEG_V2_5] = SAMPLE_RATE_MPEG2_5;
- return arr;
- }
-
- /**
* A class representing the bit field of an MPEG header. It allows
* convenient access to specific bit groups.
*/
- private static class HeaderBitField
- {
- /** The internal value. */
+ private static class HeaderBitField {
+ /**
+ * The internal value.
+ */
private int value;
/**
* Adds a byte to this field.
- *
+ *
* @param b the byte to be added
*/
- public void add(int b)
- {
+ public void add(int b) {
value <<= 8;
value |= b;
}
@@ -412,13 +388,12 @@ class MpegStream extends PushbackInputStream
* Returns the value of the bit group from the given start and end
* index. E.g. ''from'' = 0, ''to'' = 3 will return the value of the
* first 4 bits.
- *
+ *
* @param from index
- * @param to the to index
+ * @param to the to index
* @return the value of this group of bits
*/
- public int get(int from, int to)
- {
+ public int get(int from, int to) {
int shiftVal = value >> from;
int mask = (1 << (to - from + 1)) - 1;
return shiftVal & mask;
@@ -427,23 +402,21 @@ class MpegStream extends PushbackInputStream
/**
* Returns the value of the bit with the given index. The bit index is
* 0-based. Result is either 0 or 1, depending on the value of this bit.
- *
+ *
* @param bit the bit index
* @return the value of this bit
*/
- public int get(int bit)
- {
+ public int get(int bit) {
return get(bit, bit);
}
/**
* Returns the internal value of this field as an array. The array
* contains 3 bytes.
- *
+ *
* @return the internal value of this field as byte array
*/
- public byte[] toArray()
- {
+ public byte[] toArray() {
byte[] result = new byte[3];
result[0] = (byte) get(16, 23);
result[1] = (byte) get(8, 15);
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/ISO6709Extractor.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/ISO6709Extractor.java
index 9be34f9..4e232ef 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/ISO6709Extractor.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/ISO6709Extractor.java
@@ -49,11 +49,13 @@ class ISO6709Extractor implements Serializable {
private String getLng(String sign, String integer, String flot) {
String flotNormed = (flot == null) ? "" : flot;
if (integer.length() == 3) {
- return sign+integer+flotNormed;
+ return sign + integer + flotNormed;
} else if (integer.length() == 5) {
- return calcDecimalDegrees(sign, integer.substring(0,3), integer.substring(3,5)+flotNormed);
+ return calcDecimalDegrees(sign, integer.substring(0, 3),
+ integer.substring(3, 5) + flotNormed);
} else if (integer.length() == 7) {
- return calcDecimalDegrees(sign, integer.substring(0,3), integer.substring(3,5), integer.substring(5,7)+flotNormed);
+ return calcDecimalDegrees(sign, integer.substring(0, 3), integer.substring(3, 5),
+ integer.substring(5, 7) + flotNormed);
} else {
//ignore problems for now?
}
@@ -63,11 +65,13 @@ class ISO6709Extractor implements Serializable {
private String getLat(String sign, String integer, String flot) {
String flotNormed = (flot == null) ? "" : flot;
if (integer.length() == 2) {
- return sign+integer+flotNormed;
+ return sign + integer + flotNormed;
} else if (integer.length() == 4) {
- return calcDecimalDegrees(sign, integer.substring(0,2), integer.substring(2,4)+flotNormed);
+ return calcDecimalDegrees(sign, integer.substring(0, 2),
+ integer.substring(2, 4) + flotNormed);
} else if (integer.length() == 6) {
- return calcDecimalDegrees(sign, integer.substring(0,2), integer.substring(2,4), integer.substring(4,6)+flotNormed);
+ return calcDecimalDegrees(sign, integer.substring(0, 2), integer.substring(2, 4),
+ integer.substring(4, 6) + flotNormed);
} else {
//ignore problems for now?
}
@@ -76,14 +80,14 @@ class ISO6709Extractor implements Serializable {
private String calcDecimalDegrees(String sign, String degrees, String minutes) {
double d = Integer.parseInt(degrees);
- d += (Double.parseDouble(minutes)/60);
- return sign+String.format(Locale.ROOT, "%.8f", d);
+ d += (Double.parseDouble(minutes) / 60);
+ return sign + String.format(Locale.ROOT, "%.8f", d);
}
private String calcDecimalDegrees(String sign, String degrees, String minutes, String seconds) {
double d = Integer.parseInt(degrees);
- d += (Double.parseDouble(minutes)/60);
- d += (Double.parseDouble(seconds)/3600);
- return sign+String.format(Locale.ROOT, "%.8f", d);
+ d += (Double.parseDouble(minutes) / 60);
+ d += (Double.parseDouble(seconds) / 3600);
+ return sign + String.format(Locale.ROOT, "%.8f", d);
}
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
index dec7abd..e9e21c4 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
@@ -16,19 +16,19 @@
*/
package org.apache.tika.parser.mp4;
-import org.apache.tika.config.Field;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.XMP;
-import org.apache.tika.metadata.XMPDM;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.DecimalFormat;
+import java.text.NumberFormat;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+
import org.mp4parser.Box;
import org.mp4parser.Container;
import org.mp4parser.IsoFile;
@@ -60,68 +60,86 @@ import org.mp4parser.boxes.sampleentry.AudioSampleEntry;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.text.DecimalFormat;
-import java.text.NumberFormat;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Optional;
-import java.util.Set;
+import org.apache.tika.config.Field;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMP;
+import org.apache.tika.metadata.XMPDM;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
/**
* Parser for the MP4 media container format, as well as the older
- * QuickTime format that MP4 is based on.
- *
+ * QuickTime format that MP4 is based on.
+ * <p>
* This uses the MP4Parser project from http://code.google.com/p/mp4parser/
- * to do the underlying parsing
+ * to do the underlying parsing
*/
public class MP4Parser extends AbstractParser {
- /** Serial version UID */
+ /**
+ * Serial version UID
+ */
private static final long serialVersionUID = 84011216792285L;
- /** TODO Replace this with a 2dp Duration Property Converter */
- private static final DecimalFormat DURATION_FORMAT =
- (DecimalFormat)NumberFormat.getNumberInstance(Locale.ROOT);
- static {
- DURATION_FORMAT.applyPattern("0.0#");
- }
+ /**
+ * TODO Replace this with a 2dp Duration Property Converter
+ */
+ private static final DecimalFormat DURATION_FORMAT =
+ (DecimalFormat) NumberFormat.getNumberInstance(Locale.ROOT);
// Ensure this stays in Sync with the entries in tika-mimetypes.xml
private static final Map<MediaType, List<String>> typesMap = new HashMap<>();
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(typesMap.keySet());
static {
- // All types should be 4 bytes long, space padded as needed
- typesMap.put(MediaType.audio("mp4"), Arrays.asList(
- "M4A ", "M4B ", "F4A ", "F4B "));
- typesMap.put(MediaType.video("3gpp"), Arrays.asList(
- "3ge6", "3ge7", "3gg6", "3gp1", "3gp2", "3gp3", "3gp4", "3gp5", "3gp6", "3gs7"));
- typesMap.put(MediaType.video("3gpp2"), Arrays.asList(
- "3g2a", "3g2b", "3g2c"));
- typesMap.put(MediaType.video("mp4"), Arrays.asList(
- "mp41", "mp42"));
- typesMap.put(MediaType.video("x-m4v"), Arrays.asList(
- "M4V ", "M4VH", "M4VP"));
-
- typesMap.put(MediaType.video("quicktime"), Collections.emptyList());
- typesMap.put(MediaType.application("mp4"), Collections.emptyList());
+ DURATION_FORMAT.applyPattern("0.0#");
}
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.unmodifiableSet(typesMap.keySet());
+ static {
+ // All types should be 4 bytes long, space padded as needed
+ typesMap.put(MediaType.audio("mp4"), Arrays.asList("M4A ", "M4B ", "F4A ", "F4B "));
+ typesMap.put(MediaType.video("3gpp"),
+ Arrays.asList("3ge6", "3ge7", "3gg6", "3gp1", "3gp2", "3gp3", "3gp4", "3gp5",
+ "3gp6", "3gs7"));
+ typesMap.put(MediaType.video("3gpp2"), Arrays.asList("3g2a", "3g2b", "3g2c"));
+ typesMap.put(MediaType.video("mp4"), Arrays.asList("mp41", "mp42"));
+ typesMap.put(MediaType.video("x-m4v"), Arrays.asList("M4V ", "M4VH", "M4VP"));
+
+ typesMap.put(MediaType.video("quicktime"), Collections.emptyList());
+ typesMap.put(MediaType.application("mp4"), Collections.emptyList());
+ }
private ISO6709Extractor iso6709Extractor = new ISO6709Extractor();
+ private static void addMetadata(Property prop, Metadata m, Utf8AppleDataBox metadata) {
+ if (metadata != null) {
+ m.set(prop, metadata.getValue());
+ }
+ }
+
+ private static <T extends Box> T getOrNull(Container box, Class<T> clazz) {
+ if (box == null) {
+ return null;
+ }
+
+ List<T> boxes = box.getBoxes(clazz);
+ if (boxes.size() == 0) {
+ return null;
+ }
+ return boxes.get(0);
+ }
+
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
// The MP4Parser library accepts either a File, or a byte array
// As MP4 video files are typically large, always use a file to
@@ -135,20 +153,14 @@ public class MP4Parser extends AbstractParser {
FileTypeBox fileType = getOrNull(isoFile, FileTypeBox.class);
if (fileType != null) {
// Identify the type based on the major brand
- Optional<MediaType> typeHolder = typesMap.entrySet()
- .stream()
- .filter(e -> e.getValue().contains(fileType.getMajorBrand()))
- .findFirst()
+ Optional<MediaType> typeHolder = typesMap.entrySet().stream()
+ .filter(e -> e.getValue().contains(fileType.getMajorBrand())).findFirst()
.map(Map.Entry::getKey);
if (!typeHolder.isPresent()) {
// If no match for major brand, see if any of the compatible brands match
- typeHolder = typesMap.entrySet()
- .stream()
- .filter(e -> e.getValue()
- .stream()
- .anyMatch(fileType.getCompatibleBrands()::contains))
- .findFirst()
+ typeHolder = typesMap.entrySet().stream().filter(e -> e.getValue().stream()
+ .anyMatch(fileType.getCompatibleBrands()::contains)).findFirst()
.map(Map.Entry::getKey);
}
@@ -227,8 +239,10 @@ public class MP4Parser extends AbstractParser {
// Look for the first Audio Sample, if present
AudioSampleEntry sample = getOrNull(sampleDesc, AudioSampleEntry.class);
if (sample != null) {
- XMPDM.ChannelTypePropertyConverter.convertAndSet(metadata, sample.getChannelCount());
- //metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, sample.getSampleSize()); // TODO Num -> Type mapping
+ XMPDM.ChannelTypePropertyConverter
+ .convertAndSet(metadata, sample.getChannelCount());
+ //metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, sample.getSampleSize());
+ // TODO Num -> Type mapping
metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int) sample.getSampleRate());
//metadata.set(XMPDM.AUDIO_, sample.getSamplesPerPacket());
//metadata.set(XMPDM.AUDIO_, sample.getBytesPerSample());
@@ -255,7 +269,8 @@ public class MP4Parser extends AbstractParser {
metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int) mHeader.getTimescale());
}
- private void handleApple(MetaBox metaBox, Metadata metadata, XHTMLContentHandler xhtml) throws SAXException {
+ private void handleApple(MetaBox metaBox, Metadata metadata, XHTMLContentHandler xhtml)
+ throws SAXException {
AppleItemListBox apple = getOrNull(metaBox, AppleItemListBox.class);
if (apple == null) {
return;
@@ -350,20 +365,4 @@ public class MP4Parser extends AbstractParser {
String iso6709 = coordBox.getValue();
iso6709Extractor.extract(iso6709, metadata);
}
-
- private static void addMetadata(Property prop, Metadata m, Utf8AppleDataBox metadata) {
- if (metadata != null) {
- m.set(prop, metadata.getValue());
- }
- }
-
- private static <T extends Box> T getOrNull(Container box, Class<T> clazz) {
- if (box == null) return null;
-
- List<T> boxes = box.getBoxes(clazz);
- if (boxes.size() == 0) {
- return null;
- }
- return boxes.get(0);
- }
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/video/FLVParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/video/FLVParser.java
index 947b694..1221ef3 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/video/FLVParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/video/FLVParser.java
@@ -16,6 +16,8 @@
*/
package org.apache.tika.parser.video;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;
@@ -28,16 +30,15 @@ import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
/**
* <p>
@@ -64,16 +65,16 @@ import static java.nio.charset.StandardCharsets.UTF_8;
*/
public class FLVParser extends AbstractParser {
- /** Serial version UID */
+ /**
+ * Serial version UID
+ */
private static final long serialVersionUID = -8718013155719197679L;
-
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.singleton(MediaType.video("x-flv"));
private static int TYPE_METADATA = 0x12;
private static byte MASK_AUDIO = 1;
private static byte MASK_VIDEO = 4;
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.singleton(MediaType.video("x-flv"));
-
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
@@ -83,38 +84,37 @@ public class FLVParser extends AbstractParser {
}
private int readUInt24(DataInputStream input) throws IOException {
- int uint = input.read()<<16;
- uint += input.read()<<8;
- uint += input.read();
+ int uint = input.read() << 16;
+ uint += input.read() << 8;
+ uint += input.read();
return uint;
}
- private Object readAMFData(DataInputStream input, int type)
- throws IOException {
+ private Object readAMFData(DataInputStream input, int type) throws IOException {
if (type == -1) {
type = input.readUnsignedByte();
}
switch (type) {
- case 0:
- return input.readDouble();
- case 1:
- return input.readUnsignedByte() == 1;
- case 2:
- return readAMFString(input);
- case 3:
- return readAMFObject(input);
- case 8:
- return readAMFEcmaArray(input);
- case 10:
- return readAMFStrictArray(input);
- case 11:
- final Date date = new Date((long) input.readDouble());
- input.readShort(); // time zone
- return date;
- case 13:
- return "UNDEFINED";
- default:
- return null;
+ case 0:
+ return input.readDouble();
+ case 1:
+ return input.readUnsignedByte() == 1;
+ case 2:
+ return readAMFString(input);
+ case 3:
+ return readAMFObject(input);
+ case 8:
+ return readAMFEcmaArray(input);
+ case 10:
+ return readAMFStrictArray(input);
+ case 11:
+ final Date date = new Date((long) input.readDouble());
+ input.readShort(); // time zone
+ return date;
+ case 13:
+ return "UNDEFINED";
+ default:
+ return null;
}
}
@@ -163,10 +163,8 @@ public class FLVParser extends AbstractParser {
return fis.read() == 'F' && fis.read() == 'L' && fis.read() == 'V';
}
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
DataInputStream datainput = new DataInputStream(stream);
if (!checkSignature(datainput)) {
throw new TikaException("FLV signature not detected");
@@ -190,8 +188,7 @@ public class FLVParser extends AbstractParser {
long sizePrev = readUInt32(datainput);
if (sizePrev != 0) {
// should be 0, perhaps this is not flv?
- throw new TikaException(
- "Unpexpected FLV first previous block size: " + sizePrev);
+ throw new TikaException("Unpexpected FLV first previous block size: " + sizePrev);
}
metadata.set(Metadata.CONTENT_TYPE, "video/x-flv");
@@ -216,9 +213,9 @@ public class FLVParser extends AbstractParser {
if (type == TYPE_METADATA) {
// found metadata Tag, read content to buffer
byte[] metaBytes = new byte[datalen];
- for (int readCount = 0; readCount < datalen;) {
+ for (int readCount = 0; readCount < datalen; ) {
int r = stream.read(metaBytes, readCount, datalen - readCount);
- if(r!=-1) {
+ if (r != -1) {
readCount += r;
} else {
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java
index 9cfbab1..40e90b8 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java
@@ -18,9 +18,10 @@ package org.apache.tika.parser.audio;
import static org.junit.Assert.assertEquals;
+import org.junit.Test;
+
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
-import org.junit.Test;
public class AudioParserTest {
@@ -28,8 +29,8 @@ public class AudioParserTest {
public void testWAV() throws Exception {
String path = "/test-documents/testWAV.wav";
Metadata metadata = new Metadata();
- String content = new Tika().parseToString(
- AudioParserTest.class.getResourceAsStream(path), metadata);
+ String content =
+ new Tika().parseToString(AudioParserTest.class.getResourceAsStream(path), metadata);
assertEquals("audio/vnd.wave", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("44100.0", metadata.get("samplerate"));
@@ -44,8 +45,8 @@ public class AudioParserTest {
public void testAIFF() throws Exception {
String path = "/test-documents/testAIFF.aif";
Metadata metadata = new Metadata();
- String content = new Tika().parseToString(
- AudioParserTest.class.getResourceAsStream(path), metadata);
+ String content =
+ new Tika().parseToString(AudioParserTest.class.getResourceAsStream(path), metadata);
assertEquals("audio/x-aiff", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("44100.0", metadata.get("samplerate"));
@@ -60,8 +61,8 @@ public class AudioParserTest {
public void testAU() throws Exception {
String path = "/test-documents/testAU.au";
Metadata metadata = new Metadata();
- String content = new Tika().parseToString(
- AudioParserTest.class.getResourceAsStream(path), metadata);
+ String content =
+ new Tika().parseToString(AudioParserTest.class.getResourceAsStream(path), metadata);
assertEquals("audio/basic", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("44100.0", metadata.get("samplerate"));
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java
index 344f2d7..3066722 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java
@@ -16,12 +16,13 @@
*/
package org.apache.tika.parser.audio;
-import static org.junit.Assert.assertEquals;
import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+
+import org.junit.Test;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
-import org.junit.Test;
public class MidiParserTest {
@@ -29,8 +30,8 @@ public class MidiParserTest {
public void testMID() throws Exception {
String path = "/test-documents/testMID.mid";
Metadata metadata = new Metadata();
- String content = new Tika().parseToString(
- MidiParserTest.class.getResourceAsStream(path), metadata);
+ String content =
+ new Tika().parseToString(MidiParserTest.class.getResourceAsStream(path), metadata);
assertEquals("audio/midi", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("2", metadata.get("tracks"));
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
index ed0b16c..34dc34f 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
@@ -20,12 +20,13 @@ import static org.junit.Assert.assertEquals;
import java.io.ByteArrayInputStream;
+import org.junit.Assume;
+import org.junit.Test;
+
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.XMPDM;
-import org.junit.Assume;
-import org.junit.Test;
/**
* Test case for parsing mp3 files.
@@ -34,16 +35,17 @@ public class Mp3ParserTest extends TikaTest {
/**
* Checks the duration of an MP3 file.
+ *
* @param metadata the metadata object
* @param expected the expected duration, rounded as seconds
*/
private static void checkDuration(Metadata metadata, int expected) {
- assertEquals("Wrong duration", expected,
+ assertEquals("Wrong duration", expected,
Math.round(Float.valueOf(metadata.get(XMPDM.DURATION))));
}
/**
- * Test that with only ID3v1 tags, we get some information out
+ * Test that with only ID3v1 tags, we get some information out
*/
@Test
public void testMp3ParsingID3v1() throws Exception {
@@ -61,7 +63,7 @@ public class Mp3ParserTest extends TikaTest {
assertContains("2008", content);
assertContains("Test Comment", content);
assertContains("Rock", content);
-
+
assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
assertEquals("44100", metadata.get("samplerate"));
assertEquals("1", metadata.get("channels"));
@@ -70,7 +72,7 @@ public class Mp3ParserTest extends TikaTest {
/**
* Test that with only ID3v2 tags, we get the full
- * set of information out.
+ * set of information out.
*/
@Test
public void testMp3ParsingID3v2() throws Exception {
@@ -91,12 +93,12 @@ public class Mp3ParserTest extends TikaTest {
assertContains("Rock", content);
assertContains(", track 1", content);
assertContains(", disc 1", content);
-
+
// Check un-typed audio properties
assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
assertEquals("44100", metadata.get("samplerate"));
assertEquals("1", metadata.get("channels"));
-
+
// Check XMPDM-typed audio properties
assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
@@ -104,11 +106,12 @@ public class Mp3ParserTest extends TikaTest {
assertEquals(null, metadata.get(XMPDM.COMPOSER));
assertEquals("2008", metadata.get(XMPDM.RELEASE_DATE));
assertEquals("Rock", metadata.get(XMPDM.GENRE));
- assertEquals("XXX - ID3v1 Comment\nTest Comment", metadata.get(XMPDM.LOG_COMMENT.getName()));
+ assertEquals("XXX - ID3v1 Comment\nTest Comment",
+ metadata.get(XMPDM.LOG_COMMENT.getName()));
assertEquals("1", metadata.get(XMPDM.TRACK_NUMBER));
assertEquals("1/1", metadata.get(XMPDM.DISC_NUMBER));
assertEquals("1", metadata.get(XMPDM.COMPILATION));
-
+
assertEquals("44100", metadata.get(XMPDM.AUDIO_SAMPLE_RATE));
assertEquals("Mono", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE));
assertEquals("MP3", metadata.get(XMPDM.AUDIO_COMPRESSOR));
@@ -122,15 +125,14 @@ public class Mp3ParserTest extends TikaTest {
@Test
public void testAddingToMetadataBeforeWriting() throws Exception {
String content = getXML("testMP3id3v1.mp3").xml;
- assertContains("<meta name=\"xmpDM:audioSampleRate\" content=\"44100\"",
- content);
- assertContains("<meta name=\"xmpDM:duration\" content=\"2.455",
- content);
+ assertContains("<meta name=\"xmpDM:audioSampleRate\" content=\"44100\"", content);
+ assertContains("<meta name=\"xmpDM:duration\" content=\"2.455", content);
assertContains("meta name=\"xmpDM:audioChannelType\" content=\"Mono\"", content);
}
+
/**
* Test that with both id3v2 and id3v1, we prefer the
- * details from id3v2
+ * details from id3v2
*/
@Test
public void testMp3ParsingID3v1v2() throws Exception {
@@ -147,7 +149,7 @@ public class Mp3ParserTest extends TikaTest {
assertContains("2008", content);
assertContains("Test Comment", content);
assertContains("Rock", content);
-
+
assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
assertEquals("44100", metadata.get("samplerate"));
assertEquals("1", metadata.get("channels"));
@@ -156,7 +158,7 @@ public class Mp3ParserTest extends TikaTest {
/**
* Test that with only ID3v2 tags, of version 2.4, we get the full
- * set of information out.
+ * set of information out.
*/
@Test
public void testMp3ParsingID3v24() throws Exception {
@@ -174,7 +176,7 @@ public class Mp3ParserTest extends TikaTest {
assertContains("Test Comment", content);
assertContains("Rock", content);
assertContains(", disc 1", content);
-
+
assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
assertEquals("44100", metadata.get("samplerate"));
assertEquals("1", metadata.get("channels"));
@@ -188,36 +190,35 @@ public class Mp3ParserTest extends TikaTest {
assertEquals("2008", metadata.get(XMPDM.RELEASE_DATE));
assertEquals("Rock", metadata.get(XMPDM.GENRE));
assertEquals("1", metadata.get(XMPDM.COMPILATION));
-
+
assertEquals(null, metadata.get(XMPDM.TRACK_NUMBER));
assertEquals("1", metadata.get(XMPDM.DISC_NUMBER));
}
-
+
/**
* Tests that a file with characters not in the ISO 8859-1
- * range is correctly handled
+ * range is correctly handled
*/
@Test
public void testMp3ParsingID3i18n() throws Exception {
Metadata metadata = new Metadata();
String content = getText("testMP3i18n.mp3", metadata);
- assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Une chason en Fran\u00e7ais", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Test Artist \u2468\u2460", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Test Artist \u2468\u2460", metadata.get(XMPDM.ARTIST));
- assertEquals("Test Album \u2460\u2468", metadata.get(XMPDM.ALBUM));
-
- assertEquals(
- "Eng - Comment Desc\nThis is a \u1357\u2468\u2460 Comment",
- metadata.get(XMPDM.LOG_COMMENT)
- );
-
- assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
- assertEquals("44100", metadata.get("samplerate"));
- assertEquals("1", metadata.get("channels"));
- checkDuration(metadata, 2);
- }
+ assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Une chason en Fran\u00e7ais", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Test Artist \u2468\u2460", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Test Artist \u2468\u2460", metadata.get(XMPDM.ARTIST));
+ assertEquals("Test Album \u2460\u2468", metadata.get(XMPDM.ALBUM));
+
+ assertEquals("Eng - Comment Desc\nThis is a \u1357\u2468\u2460 Comment",
+ metadata.get(XMPDM.LOG_COMMENT));
+
+ assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
+ assertEquals("44100", metadata.get("samplerate"));
+ assertEquals("1", metadata.get("channels"));
+ checkDuration(metadata, 2);
+ }
+
/**
* Tests that a file with the last frame slightly
* truncated does not cause an EOF and does
@@ -234,20 +235,18 @@ public class Mp3ParserTest extends TikaTest {
assertEquals("Test Artist \u2468\u2460", metadata.get(XMPDM.ARTIST));
assertEquals("Test Album \u2460\u2468", metadata.get(XMPDM.ALBUM));
- assertEquals(
- "Eng - Comment Desc\nThis is a \u1357\u2468\u2460 Comment",
- metadata.get(XMPDM.LOG_COMMENT)
- );
+ assertEquals("Eng - Comment Desc\nThis is a \u1357\u2468\u2460 Comment",
+ metadata.get(XMPDM.LOG_COMMENT));
assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
assertEquals("44100", metadata.get("samplerate"));
assertEquals("1", metadata.get("channels"));
checkDuration(metadata, 2);
}
-
+
/**
* Tests that a file with both lyrics and
- * ID3v2 tags gets both extracted correctly
+ * ID3v2 tags gets both extracted correctly
*/
@Test
public void testMp3ParsingLyrics() throws Exception {
@@ -268,53 +267,49 @@ public class Mp3ParserTest extends TikaTest {
assertContains("2008", content);
assertContains("Test Comment", content);
assertContains("Rock", content);
-
+
assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
assertEquals("44100", metadata.get("samplerate"));
assertEquals("2", metadata.get("channels"));
checkDuration(metadata, 1);
}
-
+
@Test
public void testID3v2Frame() throws Exception {
- byte[] empty = new byte[] {
- 0x49, 0x44, 0x33, 3, 1, 0,
- 0, 0, 0, 0
- };
-
- assertEquals(11, ID3v2Frame.getInt(new byte[] {0,0,0,0x0b}));
- assertEquals(257, ID3v2Frame.getInt(new byte[] {0,0,1,1}));
-
- ID3v2Frame f = (ID3v2Frame)
- ID3v2Frame.createFrameIfPresent(new ByteArrayInputStream(empty));
- assertEquals(3, f.getMajorVersion());
- assertEquals(1, f.getMinorVersion());
- assertEquals(0, f.getFlags());
- assertEquals(0, f.getLength());
- assertEquals(0, f.getData().length);
-
- assertEquals("", ID3v2Frame.getTagString(f.getData(), 0, 0));
- assertEquals("", ID3v2Frame.getTagString(new byte[] {0,0,0,0}, 0, 3));
- assertEquals("A", ID3v2Frame.getTagString(new byte[] {(byte)'A',0,0,0}, 0, 3));
+ byte[] empty = new byte[]{0x49, 0x44, 0x33, 3, 1, 0, 0, 0, 0, 0};
+
+ assertEquals(11, ID3v2Frame.getInt(new byte[]{0, 0, 0, 0x0b}));
+ assertEquals(257, ID3v2Frame.getInt(new byte[]{0, 0, 1, 1}));
+
+ ID3v2Frame f =
+ (ID3v2Frame) ID3v2Frame.createFrameIfPresent(new ByteArrayInputStream(empty));
+ assertEquals(3, f.getMajorVersion());
+ assertEquals(1, f.getMinorVersion());
+ assertEquals(0, f.getFlags());
+ assertEquals(0, f.getLength());
+ assertEquals(0, f.getData().length);
+
+ assertEquals("", ID3v2Frame.getTagString(f.getData(), 0, 0));
+ assertEquals("", ID3v2Frame.getTagString(new byte[]{0, 0, 0, 0}, 0, 3));
+ assertEquals("A", ID3v2Frame.getTagString(new byte[]{(byte) 'A', 0, 0, 0}, 0, 3));
}
@Test
public void testTIKA1589_noId3ReturnsDurationCorrectly() throws Exception {
- assertEquals("2.4555110931396484",
- getXML("testMP3noid3.mp3").metadata.get(XMPDM.DURATION));
+ assertEquals("2.4555110931396484", getXML("testMP3noid3.mp3").metadata.get(XMPDM.DURATION));
}
-
+
/**
* This test will do nothing, unless you've downloaded the
- * mp3 file from TIKA-424 - the file cannot be
- * distributed with Tika.
+ * mp3 file from TIKA-424 - the file cannot be
+ * distributed with Tika.
* This test will check for the complicated set of ID3v2.4
- * tags.
+ * tags.
*/
@Test
public void testTIKA424() throws Exception {
- Assume.assumeTrue(Mp3ParserTest.class.getResourceAsStream(
- "/test-documents/test2.mp3") != null);
+ Assume.assumeTrue(
+ Mp3ParserTest.class.getResourceAsStream("/test-documents/test2.mp3") != null);
Metadata metadata = new Metadata();
String content = getText("test2.mp3", metadata);
@@ -324,18 +319,18 @@ public class Mp3ParserTest extends TikaTest {
assertEquals("Merzhin", metadata.get(TikaCoreProperties.CREATOR));
assertContains("Plus loin vers l'ouest", content);
-
+
assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
assertEquals("44100", metadata.get("samplerate"));
assertEquals("2", metadata.get("channels"));
}
-
+
/**
* This tests that we can handle without errors (but perhaps not
- * all content) a file with a very very large ID3 frame that
- * has been truncated before the end of the ID3 tags.
+ * all content) a file with a very very large ID3 frame that
+ * has been truncated before the end of the ID3 tags.
* In this case, it is a file with JPEG data in the ID3, which
- * is truncated before the end of the JPEG bit of the ID3 frame.
+ * is truncated before the end of the JPEG bit of the ID3 frame.
*/
@Test
public void testTIKA474() throws Exception {
@@ -351,7 +346,7 @@ public class Mp3ParserTest extends TikaTest {
assertContains("The White Stripes", content);
assertContains("Elephant", content);
assertContains("2003", content);
-
+
// File lacks any audio frames, so we can't know these
assertEquals(null, metadata.get("version"));
assertEquals(null, metadata.get("samplerate"));
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp3/MpegStreamTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp3/MpegStreamTest.java
index 622dcf7..e8ce8b7 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp3/MpegStreamTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp3/MpegStreamTest.java
@@ -33,81 +33,71 @@ import org.junit.Test;
/**
* Test class for {@code MpegStream}.
*/
-public class MpegStreamTest
-{
- /** The stream to be tested. */
- private MpegStream stream;
-
- @After
- public void tearDown() throws Exception
- {
- if (stream != null)
- {
- stream.close();
- }
- }
-
+public class MpegStreamTest {
/**
- * Tests whether the default test header can be found in a stream.
- *
- * @param bos the stream
- * @throws IOException if an error occurs
+ * The stream to be tested.
*/
- private void checkDefaultHeader(ByteArrayOutputStream bos)
- throws IOException
- {
- ByteArrayInputStream in = new ByteArrayInputStream(bos.toByteArray());
- stream = new MpegStream(in);
- AudioFrame header = stream.nextFrame();
- assertNotNull("No header found", header);
- assertEquals("Wrong MPEG version", AudioFrame.MPEG_V2,
- header.getVersionCode());
- assertEquals("Wrong layer", AudioFrame.LAYER_3, header.getLayer());
- assertEquals("Wrong bit rate", 80000, header.getBitRate());
- assertEquals("Wrong sample rate", 24000, header.getSampleRate());
- }
+ private MpegStream stream;
/**
* Writes the given byte the given number of times into an output stream.
- *
- * @param out the output stream
+ *
+ * @param out the output stream
* @param value the value to write
* @param count the number of bytes to write
* @throws IOException if an error occurs
*/
- private static void writeBytes(OutputStream out, int value, int count)
- throws IOException
- {
- for (int i = 0; i < count; i++)
- {
+ private static void writeBytes(OutputStream out, int value, int count) throws IOException {
+ for (int i = 0; i < count; i++) {
out.write(value);
}
}
/**
* Writes a frame header in the given output stream.
- *
+ *
* @param out the output stream
- * @param b2 byte 2 of the header
- * @param b3 byte 3 of the header
- * @param b4 byte 4 of the header
+ * @param b2 byte 2 of the header
+ * @param b3 byte 3 of the header
+ * @param b4 byte 4 of the header
* @throws IOException if an error occurs
*/
- private static void writeFrame(OutputStream out, int b2, int b3, int b4)
- throws IOException
- {
+ private static void writeFrame(OutputStream out, int b2, int b3, int b4) throws IOException {
out.write(0xFF);
out.write(b2);
out.write(b3);
out.write(b4);
}
+ @After
+ public void tearDown() throws Exception {
+ if (stream != null) {
+ stream.close();
+ }
+ }
+
+ /**
+ * Tests whether the default test header can be found in a stream.
+ *
+ * @param bos the stream
+ * @throws IOException if an error occurs
+ */
+ private void checkDefaultHeader(ByteArrayOutputStream bos) throws IOException {
+ ByteArrayInputStream in = new ByteArrayInputStream(bos.toByteArray());
+ stream = new MpegStream(in);
+ AudioFrame header = stream.nextFrame();
+ assertNotNull("No header found", header);
+ assertEquals("Wrong MPEG version", AudioFrame.MPEG_V2, header.getVersionCode());
+ assertEquals("Wrong layer", AudioFrame.LAYER_3, header.getLayer());
+ assertEquals("Wrong bit rate", 80000, header.getBitRate());
+ assertEquals("Wrong sample rate", 24000, header.getSampleRate());
+ }
+
/**
* Tests whether an audio frame header can be found somewhere in a stream.
*/
@Test
- public void testSearchNextFrame() throws IOException
- {
+ public void testSearchNextFrame() throws IOException {
ByteArrayOutputStream bos = new ByteArrayOutputStream();
writeBytes(bos, 0xFF, 32);
writeBytes(bos, 0, 16);
@@ -122,8 +112,7 @@ public class MpegStreamTest
* Tests whether invalid frame headers are detected and skipped.
*/
@Test
- public void testSearchNextFrameInvalid() throws IOException
- {
+ public void testSearchNextFrameInvalid() throws IOException {
ByteArrayOutputStream bos = new ByteArrayOutputStream();
writeFrame(bos, 0xEB, 0x96, 0);
writeFrame(bos, 0xF9, 0x96, 0);
@@ -139,8 +128,7 @@ public class MpegStreamTest
* ends.
*/
@Test
- public void testSeachNextFrameEOS() throws IOException
- {
+ public void testSeachNextFrameEOS() throws IOException {
ByteArrayOutputStream bos = new ByteArrayOutputStream();
bos.write(0xFF);
bos.write(0xFF);
@@ -155,8 +143,7 @@ public class MpegStreamTest
* Tries to skip a frame if no current header is available.
*/
@Test
- public void testSkipNoCurrentHeader() throws IOException
- {
+ public void testSkipNoCurrentHeader() throws IOException {
ByteArrayOutputStream bos = new ByteArrayOutputStream();
bos.write("This is a test".getBytes(UTF_8));
ByteArrayInputStream in = new ByteArrayInputStream(bos.toByteArray());
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java
index cba30cf..27d25fd 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java
@@ -18,8 +18,8 @@ package org.apache.tika.parser.mp4;
import static org.junit.Assert.assertEquals;
-import java.io.InputStream;
-import java.nio.file.Paths;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
import org.apache.tika.TikaTest;
import org.apache.tika.io.TikaInputStream;
@@ -29,9 +29,6 @@ import org.apache.tika.metadata.XMP;
import org.apache.tika.metadata.XMPDM;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.ContentHandlerFactory;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
/**
* Test case for parsing mp4 files.
@@ -39,7 +36,7 @@ import org.xml.sax.ContentHandler;
public class MP4ParserTest extends TikaTest {
/**
* Test that we can extract information from
- * a M4A MP4 Audio file
+ * a M4A MP4 Audio file
*/
@Test
public void testMP4ParsingAudio() throws Exception {
@@ -60,7 +57,7 @@ public class MP4ParserTest extends TikaTest {
assertContains("2008", content);
assertContains("Test Comment", content);
assertContains("Test Genre", content);
-
+
// Check XMPDM-typed audio properties
assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
@@ -72,28 +69,29 @@ public class MP4ParserTest extends TikaTest {
assertEquals("Test Album Artist", metadata.get(XMPDM.ALBUM_ARTIST));
assertEquals("6", metadata.get(XMPDM.DISC_NUMBER));
assertEquals("0", metadata.get(XMPDM.COMPILATION));
-
-
+
+
assertEquals("44100", metadata.get(XMPDM.AUDIO_SAMPLE_RATE));
assertEquals("Stereo", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE));
assertEquals("M4A", metadata.get(XMPDM.AUDIO_COMPRESSOR));
assertEquals("0.07", metadata.get(XMPDM.DURATION));
-
+
assertEquals("iTunes 10.5.3.3", metadata.get(XMP.CREATOR_TOOL));
-
-
+
+
// Check again by file, rather than stream
- TikaInputStream tstream = TikaInputStream.get(getResourceAsStream("/test-documents/testMP4.m4a"));
+ TikaInputStream tstream =
+ TikaInputStream.get(getResourceAsStream("/test-documents/testMP4.m4a"));
tstream.getFile();
ContentHandler handler = new BodyContentHandler();
try {
- AUTO_DETECT_PARSER.parse(tstream, handler, metadata, new ParseContext());
+ AUTO_DETECT_PARSER.parse(tstream, handler, metadata, new ParseContext());
} finally {
- tstream.close();
+ tstream.close();
}
//TODO: why don't we check the output here?
}
-
+
// TODO Test a MP4 Video file
// TODO Test an old QuickTime Video File
@Test(timeout = 30000)
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/video/FLVParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/video/FLVParserTest.java
index d3a876e..e09f018 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/video/FLVParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/video/FLVParserTest.java
@@ -18,9 +18,10 @@ package org.apache.tika.parser.video;
import static org.junit.Assert.assertEquals;
+import org.junit.Test;
+
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
-import org.junit.Test;
public class FLVParserTest {
@@ -29,8 +30,8 @@ public class FLVParserTest {
String path = "/test-documents/testFLV.flv";
Metadata metadata = new Metadata();
- String content = new Tika().parseToString(
- FLVParserTest.class.getResourceAsStream(path), metadata);
+ String content =
+ new Tika().parseToString(FLVParserTest.class.getResourceAsStream(path), metadata);
assertEquals("", content);
assertEquals("video/x-flv", metadata.get(Metadata.CONTENT_TYPE));
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
index c54baf4..d3128b2 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
@@ -23,6 +23,9 @@ import java.util.Set;
import org.apache.poi.util.IOUtils;
import org.apache.poi.util.StringUtil;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.EndianUtils;
import org.apache.tika.metadata.Metadata;
@@ -32,78 +35,65 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
/**
* DWG (CAD Drawing) parser. This is a very basic parser, which just
- * looks for bits of the headers.
+ * looks for bits of the headers.
* Note that we use Apache POI for various parts of the processing, as
- * lots of the low level string/int/short concepts are the same.
+ * lots of the low level string/int/short concepts are the same.
*/
public class DWGParser extends AbstractParser {
- /** Serial version UID */
+ /**
+ * Serial version UID
+ */
private static final long serialVersionUID = -7744232583079169119L;
-
- private static MediaType TYPE = MediaType.image("vnd.dwg");
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return Collections.singleton(TYPE);
- }
-
- /** The order of the fields in the header */
- private static final Property[] HEADER_PROPERTIES_ENTRIES = {
- TikaCoreProperties.TITLE,
- TikaCoreProperties.DESCRIPTION,
- TikaCoreProperties.CREATOR,
- TikaCoreProperties.SUBJECT,
- TikaCoreProperties.COMMENTS,
- TikaCoreProperties.MODIFIER,
- null, // Unknown?
- TikaCoreProperties.RELATION, // Hyperlink
- };
-
- /** For the 2000 file, they're indexed */
- private static final Property[] HEADER_2000_PROPERTIES_ENTRIES = {
- null,
- TikaCoreProperties.RELATION, // 0x01
- TikaCoreProperties.TITLE, // 0x02
- TikaCoreProperties.DESCRIPTION, // 0x03
- TikaCoreProperties.CREATOR, // 0x04
- null,
- TikaCoreProperties.COMMENTS,// 0x06
- TikaCoreProperties.SUBJECT, // 0x07
- TikaCoreProperties.MODIFIER, // 0x08
- };
-
- private static final String HEADER_2000_PROPERTIES_MARKER_STR =
- "DWGPROPS COOKIE";
-
+ /**
+ * The order of the fields in the header
+ */
+ private static final Property[] HEADER_PROPERTIES_ENTRIES =
+ {TikaCoreProperties.TITLE, TikaCoreProperties.DESCRIPTION, TikaCoreProperties.CREATOR,
+ TikaCoreProperties.SUBJECT, TikaCoreProperties.COMMENTS,
+ TikaCoreProperties.MODIFIER, null, // Unknown?
+ TikaCoreProperties.RELATION, // Hyperlink
+ };
+ /**
+ * For the 2000 file, they're indexed
+ */
+ private static final Property[] HEADER_2000_PROPERTIES_ENTRIES =
+ {null, TikaCoreProperties.RELATION, // 0x01
+ TikaCoreProperties.TITLE, // 0x02
+ TikaCoreProperties.DESCRIPTION, // 0x03
+ TikaCoreProperties.CREATOR, // 0x04
+ null, TikaCoreProperties.COMMENTS,// 0x06
+ TikaCoreProperties.SUBJECT, // 0x07
+ TikaCoreProperties.MODIFIER, // 0x08
+ };
+ private static final String HEADER_2000_PROPERTIES_MARKER_STR = "DWGPROPS COOKIE";
private static final byte[] HEADER_2000_PROPERTIES_MARKER =
new byte[HEADER_2000_PROPERTIES_MARKER_STR.length()];
-
- static {
- StringUtil.putCompressedUnicode(
- HEADER_2000_PROPERTIES_MARKER_STR,
- HEADER_2000_PROPERTIES_MARKER, 0);
- }
-
- /**
+ /**
* How far to skip after the last standard property, before
- * we find any custom properties that might be there.
+ * we find any custom properties that might be there.
*/
private static final int CUSTOM_PROPERTIES_SKIP = 20;
-
- /**
+ /**
* The value of padding bytes other than 0 in some DWG files.
*/
- private static final int[] CUSTOM_PROPERTIES_ALT_PADDING_VALUES = new int[] {0x2, 0, 0, 0};
+ private static final int[] CUSTOM_PROPERTIES_ALT_PADDING_VALUES = new int[]{0x2, 0, 0, 0};
+ private static MediaType TYPE = MediaType.image("vnd.dwg");
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, TikaException, SAXException {
+ static {
+ StringUtil.putCompressedUnicode(HEADER_2000_PROPERTIES_MARKER_STR,
+ HEADER_2000_PROPERTIES_MARKER, 0);
+ }
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return Collections.singleton(TYPE);
+ }
+
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, TikaException, SAXException {
// First up, which version of the format are we handling?
byte[] header = new byte[128];
IOUtils.readFully(stream, header);
@@ -115,21 +105,20 @@ public class DWGParser extends AbstractParser {
if (version.equals("AC1015")) {
metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
if (skipTo2000PropertyInfoSection(stream, header)) {
- get2000Props(stream,metadata,xhtml);
+ get2000Props(stream, metadata, xhtml);
}
} else if (version.equals("AC1018")) {
metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
if (skipToPropertyInfoSection(stream, header)) {
- get2004Props(stream,metadata,xhtml);
+ get2004Props(stream, metadata, xhtml);
}
} else if (version.equals("AC1021") || version.equals("AC1024")) {
metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
if (skipToPropertyInfoSection(stream, header)) {
- get2007and2010Props(stream,metadata,xhtml);
+ get2007and2010Props(stream, metadata, xhtml);
}
} else {
- throw new TikaException(
- "Unsupported AutoCAD drawing version: " + version);
+ throw new TikaException("Unsupported AutoCAD drawing version: " + version);
}
xhtml.endDocument();
@@ -138,10 +127,9 @@ public class DWGParser extends AbstractParser {
/**
* Stored as US-ASCII
*/
- private void get2004Props(
- InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
+ private void get2004Props(InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
throws IOException, TikaException, SAXException {
- // Standard properties
+ // Standard properties
for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
String headerValue = read2004String(stream);
handleHeader(i, headerValue, metadata, xhtml);
@@ -150,33 +138,33 @@ public class DWGParser extends AbstractParser {
// Custom properties
int customCount = skipToCustomProperties(stream);
for (int i = 0; i < customCount; i++) {
- String propName = read2004String(stream);
- String propValue = read2004String(stream);
- if(propName.length() > 0 && propValue.length() > 0) {
- metadata.add(propName, propValue);
- }
+ String propName = read2004String(stream);
+ String propValue = read2004String(stream);
+ if (propName.length() > 0 && propValue.length() > 0) {
+ metadata.add(propName, propValue);
+ }
}
}
private String read2004String(InputStream stream) throws IOException, TikaException {
- int stringLen = EndianUtils.readUShortLE(stream);
+ int stringLen = EndianUtils.readUShortLE(stream);
- byte[] stringData = new byte[stringLen];
- IOUtils.readFully(stream, stringData);
+ byte[] stringData = new byte[stringLen];
+ IOUtils.readFully(stream, stringData);
- // Often but not always null terminated
- if (stringData[stringLen-1] == 0) {
- stringLen--;
- }
- String value = StringUtil.getFromCompressedUnicode(stringData, 0, stringLen);
- return value;
+ // Often but not always null terminated
+ if (stringData[stringLen - 1] == 0) {
+ stringLen--;
+ }
+ String value = StringUtil.getFromCompressedUnicode(stringData, 0, stringLen);
+ return value;
}
/**
* Stored as UCS2, so 16 bit "unicode"
*/
- private void get2007and2010Props(
- InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
+ private void get2007and2010Props(InputStream stream, Metadata metadata,
+ XHTMLContentHandler xhtml)
throws IOException, TikaException, SAXException {
// Standard properties
for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
@@ -187,81 +175,79 @@ public class DWGParser extends AbstractParser {
// Custom properties
int customCount = skipToCustomProperties(stream);
for (int i = 0; i < customCount; i++) {
- String propName = read2007and2010String(stream);
- String propValue = read2007and2010String(stream);
- if(propName.length() > 0 && propValue.length() > 0) {
- metadata.add(propName, propValue);
- }
+ String propName = read2007and2010String(stream);
+ String propValue = read2007and2010String(stream);
+ if (propName.length() > 0 && propValue.length() > 0) {
+ metadata.add(propName, propValue);
+ }
}
}
private String read2007and2010String(InputStream stream) throws IOException, TikaException {
- int stringLen = EndianUtils.readUShortLE(stream);
+ int stringLen = EndianUtils.readUShortLE(stream);
- byte[] stringData = new byte[stringLen * 2];
- IOUtils.readFully(stream, stringData);
- String value = StringUtil.getFromUnicodeLE(stringData);
+ byte[] stringData = new byte[stringLen * 2];
+ IOUtils.readFully(stream, stringData);
+ String value = StringUtil.getFromUnicodeLE(stringData);
- // Some strings are null terminated
- if(value.charAt(value.length()-1) == 0) {
- value = value.substring(0, value.length()-1);
- }
+ // Some strings are null terminated
+ if (value.charAt(value.length() - 1) == 0) {
+ value = value.substring(0, value.length() - 1);
+ }
- return value;
+ return value;
}
- private void get2000Props(
- InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
+ private void get2000Props(InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
throws IOException, TikaException, SAXException {
int propCount = 0;
- while(propCount < 30) {
+ while (propCount < 30) {
int propIdx = EndianUtils.readUShortLE(stream);
int length = EndianUtils.readUShortLE(stream);
int valueType = stream.read();
-
- if(propIdx == 0x28) {
- // This one seems not to follow the pattern
- length = 0x19;
- } else if(propIdx == 90) {
- // We think this means the end of properties
- break;
+
+ if (propIdx == 0x28) {
+ // This one seems not to follow the pattern
+ length = 0x19;
+ } else if (propIdx == 90) {
+ // We think this means the end of properties
+ break;
}
byte[] value = new byte[length];
IOUtils.readFully(stream, value);
- if(valueType == 0x1e) {
+ if (valueType == 0x1e) {
// Normal string, good
String val = StringUtil.getFromCompressedUnicode(value, 0, length);
-
+
// Is it one we can look up by index?
- if(propIdx < HEADER_2000_PROPERTIES_ENTRIES.length) {
- metadata.add(HEADER_2000_PROPERTIES_ENTRIES[propIdx], val);
- xhtml.element("p", val);
- } else if(propIdx == 0x012c) {
- int splitAt = val.indexOf('=');
- if(splitAt > -1) {
- String propName = val.substring(0, splitAt);
- String propVal = val.substring(splitAt+1);
- metadata.add(propName, propVal);
- }
+ if (propIdx < HEADER_2000_PROPERTIES_ENTRIES.length) {
+ metadata.add(HEADER_2000_PROPERTIES_ENTRIES[propIdx], val);
+ xhtml.element("p", val);
+ } else if (propIdx == 0x012c) {
+ int splitAt = val.indexOf('=');
+ if (splitAt > -1) {
+ String propName = val.substring(0, splitAt);
+ String propVal = val.substring(splitAt + 1);
+ metadata.add(propName, propVal);
+ }
}
} else {
// No idea...
}
-
+
propCount++;
}
}
- private void handleHeader(
- int headerNumber, String value, Metadata metadata,
- XHTMLContentHandler xhtml) throws SAXException {
- if(value == null || value.length() == 0) {
+ private void handleHeader(int headerNumber, String value, Metadata metadata,
+ XHTMLContentHandler xhtml) throws SAXException {
+ if (value == null || value.length() == 0) {
return;
}
Property headerProp = HEADER_PROPERTIES_ENTRIES[headerNumber];
- if(headerProp != null) {
+ if (headerProp != null) {
metadata.set(headerProp, value);
}
@@ -275,20 +261,20 @@ public class DWGParser extends AbstractParser {
throws IOException, TikaException {
// The offset is stored in the header from 0x20 onwards
long offsetToSection = EndianUtils.getLongLE(header, 0x20);
-
+
// Sanity check the offset. Some files seem to use a different format,
// and the offset isn't available at 0x20. Until we can work out how
// to find the offset in those files, skip them if detected
if (offsetToSection > 0xa00000l) {
- // Header should never be more than 10mb into the file, something is wrong
- offsetToSection = 0;
+ // Header should never be more than 10mb into the file, something is wrong
+ offsetToSection = 0;
}
-
+
// Work out how far to skip, and sanity check
long toSkip = offsetToSection - header.length;
- if(offsetToSection == 0){
+ if (offsetToSection == 0) {
return false;
- }
+ }
while (toSkip > 0) {
byte[] skip = new byte[Math.min((int) toSkip, 0x4000)];
IOUtils.readFully(stream, skip);
@@ -302,55 +288,55 @@ public class DWGParser extends AbstractParser {
*/
private boolean skipTo2000PropertyInfoSection(InputStream stream, byte[] header)
throws IOException {
- int val = 0;
- while(val != -1) {
- val = stream.read();
- if(val == HEADER_2000_PROPERTIES_MARKER[0]) {
- boolean going = true;
- for(int i=1; i<HEADER_2000_PROPERTIES_MARKER.length && going; i++) {
- val = stream.read();
- if(val != HEADER_2000_PROPERTIES_MARKER[i]) going = false;
- }
- if(going) {
- // Bingo, found it
- return true;
- }
- }
- }
- return false;
+ int val = 0;
+ while (val != -1) {
+ val = stream.read();
+ if (val == HEADER_2000_PROPERTIES_MARKER[0]) {
+ boolean going = true;
+ for (int i = 1; i < HEADER_2000_PROPERTIES_MARKER.length && going; i++) {
+ val = stream.read();
+ if (val != HEADER_2000_PROPERTIES_MARKER[i]) {
+ going = false;
+ }
+ }
+ if (going) {
+ // Bingo, found it
+ return true;
+ }
+ }
+ }
+ return false;
}
- private int skipToCustomProperties(InputStream stream)
- throws IOException, TikaException {
- // There should be 4 zero bytes or CUSTOM_PROPERTIES_ALT_PADDING_VALUES next
- byte[] padding = new byte[4];
- IOUtils.readFully(stream, padding);
- if((padding[0] == 0 && padding[1] == 0 &&
- padding[2] == 0 && padding[3] == 0) ||
- (padding[0] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[0] &&
- padding[1] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[1] &&
- padding[2] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[2] &&
- padding[3] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[3])) {
-
- // Looks hopeful, skip on
- padding = new byte[CUSTOM_PROPERTIES_SKIP];
- IOUtils.readFully(stream, padding);
-
- // We should now have the count
- int count = EndianUtils.readUShortLE(stream);
-
- // Sanity check it
- if(count > 0 && count < 0x7f) {
- // Looks plausible
- return count;
- } else {
- // No properties / count is too high to trust
- return 0;
- }
- } else {
- // No padding. That probably means no custom props
- return 0;
- }
+ private int skipToCustomProperties(InputStream stream) throws IOException, TikaException {
+ // There should be 4 zero bytes or CUSTOM_PROPERTIES_ALT_PADDING_VALUES next
+ byte[] padding = new byte[4];
+ IOUtils.readFully(stream, padding);
+ if ((padding[0] == 0 && padding[1] == 0 && padding[2] == 0 && padding[3] == 0) ||
+ (padding[0] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[0] &&
+ padding[1] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[1] &&
+ padding[2] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[2] &&
+ padding[3] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[3])) {
+
+ // Looks hopeful, skip on
+ padding = new byte[CUSTOM_PROPERTIES_SKIP];
+ IOUtils.readFully(stream, padding);
+
+ // We should now have the count
+ int count = EndianUtils.readUShortLE(stream);
+
+ // Sanity check it
+ if (count > 0 && count < 0x7f) {
+ // Looks plausible
+ return count;
+ } else {
+ // No properties / count is too high to trust
+ return 0;
+ }
+ } else {
+ // No padding. That probably means no custom props
+ return 0;
+ }
}
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
index a7c8bd0..dafe6f1 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
@@ -16,6 +16,8 @@
*/
package org.apache.tika.parser.prt;
+import static java.nio.charset.StandardCharsets.US_ASCII;
+
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
@@ -23,6 +25,9 @@ import java.util.Collections;
import java.util.Set;
import org.apache.poi.util.IOUtils;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.EndianUtils;
import org.apache.tika.metadata.Metadata;
@@ -31,245 +36,247 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import static java.nio.charset.StandardCharsets.US_ASCII;
/**
* A basic text extracting parser for the CADKey PRT (CAD Drawing)
- * format. It outputs text from note entries.
+ * format. It outputs text from note entries.
*/
public class PRTParser extends AbstractParser {
- /** Serial version UID */
- private static final long serialVersionUID = 4659638314375035178L;
-
- private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("x-prt"));
public static final String PRT_MIME_TYPE = "application/x-prt";
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = 4659638314375035178L;
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.singleton(MediaType.application("x-prt"));
/**
* How long do we allow a text run to claim to be, before we
* decide we're confused and it's not really text after all?
*/
private static final int MAX_SANE_TEXT_LENGTH = 0x0800;
-
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
/*
* Text types:
* 00 00 00 00 f0 [3b]f sz sz TEXT *view name*
* 00 00 00 00 f0 3f 00 00 00 00 00 00 00 00 sz sz TEXT *view name*
* (anything) e0 3f sz sz TEXT *view name*
- * 3x 33 33 33 33 33 e3 3f 0x 00 00 0x 00 00 0x 0x 1f sz sz TEXT *note entries*
- *
+ * 3x 33 33 33 33 33 e3 3f 0x 00 00 0x 00 00 0x 0x 1f sz sz TEXT *note entries*
+ *
* Note - all text is null terminated
*/
-
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
-
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- Last5 l5 = new Last5();
- int read;
-
- // Try to get the creation date, which is YYYYMMDDhhmm
- byte[] header = new byte[30];
- IOUtils.readFully(stream, header);
- byte[] date = new byte[12];
- IOUtils.readFully(stream, date);
-
- String dateStr = new String(date, US_ASCII);
- if(dateStr.startsWith("19") || dateStr.startsWith("20")) {
- String formattedDate = dateStr.substring(0, 4) + "-" + dateStr.substring(4,6) +
- "-" + dateStr.substring(6,8) + "T" + dateStr.substring(8,10) + ":" +
- dateStr.substring(10, 12) + ":00";
- metadata.set(TikaCoreProperties.CREATED, formattedDate);
- // TODO Metadata.DATE is used as modified, should it be here?
- metadata.set(TikaCoreProperties.CREATED, formattedDate);
- }
- metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE);
-
- // The description, if set, is the next up-to-500 bytes
- byte[] desc = new byte[500];
- IOUtils.readFully(stream, desc);
- String description = extractText(desc, true);
- if(description.length() > 0) {
- metadata.set(TikaCoreProperties.DESCRIPTION, description);
- }
-
- // Now look for text
- while( (read = stream.read()) > -1) {
- if(read == 0xe0 || read == 0xe3 || read == 0xf0) {
- int nread = stream.read();
- if(nread == 0x3f || nread == 0xbf) {
- // Looks promising, check back for a suitable value
- if(read == 0xe3 && nread == 0x3f) {
- if(l5.is33()) {
- // Bingo, note text
- handleNoteText(stream, xhtml);
- }
- } else if(l5.is00()) {
- // Likely view name
- handleViewName(read, nread, stream, xhtml, l5);
+
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ Last5 l5 = new Last5();
+ int read;
+
+ // Try to get the creation date, which is YYYYMMDDhhmm
+ byte[] header = new byte[30];
+ IOUtils.readFully(stream, header);
+ byte[] date = new byte[12];
+ IOUtils.readFully(stream, date);
+
+ String dateStr = new String(date, US_ASCII);
+ if (dateStr.startsWith("19") || dateStr.startsWith("20")) {
+ String formattedDate = dateStr.substring(0, 4) + "-" + dateStr.substring(4, 6) + "-" +
+ dateStr.substring(6, 8) + "T" + dateStr.substring(8, 10) + ":" +
+ dateStr.substring(10, 12) + ":00";
+ metadata.set(TikaCoreProperties.CREATED, formattedDate);
+ // TODO Metadata.DATE is used as modified, should it be here?
+ metadata.set(TikaCoreProperties.CREATED, formattedDate);
+ }
+ metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE);
+
+ // The description, if set, is the next up-to-500 bytes
+ byte[] desc = new byte[500];
+ IOUtils.readFully(stream, desc);
+ String description = extractText(desc, true);
+ if (description.length() > 0) {
+ metadata.set(TikaCoreProperties.DESCRIPTION, description);
+ }
+
+ // Now look for text
+ while ((read = stream.read()) > -1) {
+ if (read == 0xe0 || read == 0xe3 || read == 0xf0) {
+ int nread = stream.read();
+ if (nread == 0x3f || nread == 0xbf) {
+ // Looks promising, check back for a suitable value
+ if (read == 0xe3 && nread == 0x3f) {
+ if (l5.is33()) {
+ // Bingo, note text
+ handleNoteText(stream, xhtml);
+ }
+ } else if (l5.is00()) {
+ // Likely view name
+ handleViewName(read, nread, stream, xhtml, l5);
+ }
}
- }
- } else {
- l5.record(read);
- }
- }
- }
-
- private void handleNoteText(InputStream stream, XHTMLContentHandler xhtml)
- throws IOException, SAXException, TikaException {
- // Ensure we have the right padding text
- int read;
- for(int i=0; i<10; i++) {
- read = stream.read();
- if(read >= 0 && read <= 0x0f) {
- // Promising
- } else {
- // Wrong, false detection
- return;
- }
- }
- read = stream.read();
- if(read != 0x1f) {
- // Wrong, false detection
- return;
- }
-
- int length = EndianUtils.readUShortLE(stream);
- if(length <= MAX_SANE_TEXT_LENGTH) {
- // Length sanity check passed
- handleText(length, stream, xhtml);
- }
+ } else {
+ l5.record(read);
+ }
+ }
}
-
- private void handleViewName(int typeA, int typeB, InputStream stream,
- XHTMLContentHandler xhtml, Last5 l5)
- throws IOException, SAXException, TikaException {
- // Is it 8 byte zero padded?
- int maybeLength = EndianUtils.readUShortLE(stream);
- if(maybeLength == 0) {
- // Check the next 6 bytes too
- for(int i=0; i<6; i++) {
- int read = stream.read();
- if(read >= 0 && read <= 0x0f) {
+
+ private void handleNoteText(InputStream stream, XHTMLContentHandler xhtml)
+ throws IOException, SAXException, TikaException {
+ // Ensure we have the right padding text
+ int read;
+ for (int i = 0; i < 10; i++) {
+ read = stream.read();
+ if (read >= 0 && read <= 0x0f) {
// Promising
- } else {
+ } else {
// Wrong, false detection
return;
- }
- }
-
- byte[] b2 = new byte[2];
- IOUtils.readFully(stream, b2);
- int length = EndianUtils.getUShortLE(b2);
- if(length > 1 && length <= MAX_SANE_TEXT_LENGTH) {
- // Length sanity check passed
- handleText(length, stream, xhtml);
- } else {
- // Was probably something else
- l5.record(b2[0]);
- l5.record(b2[1]);
- }
- } else if(maybeLength > 0 && maybeLength < MAX_SANE_TEXT_LENGTH) {
- // Looks like it's straight into the text
- handleText(maybeLength, stream, xhtml);
- }
+ }
+ }
+ read = stream.read();
+ if (read != 0x1f) {
+ // Wrong, false detection
+ return;
+ }
+
+ int length = EndianUtils.readUShortLE(stream);
+ if (length <= MAX_SANE_TEXT_LENGTH) {
+ // Length sanity check passed
+ handleText(length, stream, xhtml);
+ }
}
-
- private void handleText(int length, InputStream stream, XHTMLContentHandler xhtml)
- throws IOException, SAXException, TikaException {
- byte[] str = new byte[length];
- IOUtils.readFully(stream, str);
- if(str[length-1] != 0) {
- // Not properly null terminated, must be wrong
- return;
- }
-
- String text = extractText(str, false);
-
- xhtml.startElement("p");
- xhtml.characters(text);
- xhtml.endElement("p");
+
+ private void handleViewName(int typeA, int typeB, InputStream stream, XHTMLContentHandler xhtml,
+ Last5 l5) throws IOException, SAXException, TikaException {
+ // Is it 8 byte zero padded?
+ int maybeLength = EndianUtils.readUShortLE(stream);
+ if (maybeLength == 0) {
+ // Check the next 6 bytes too
+ for (int i = 0; i < 6; i++) {
+ int read = stream.read();
+ if (read >= 0 && read <= 0x0f) {
+ // Promising
+ } else {
+ // Wrong, false detection
+ return;
+ }
+ }
+
+ byte[] b2 = new byte[2];
+ IOUtils.readFully(stream, b2);
+ int length = EndianUtils.getUShortLE(b2);
+ if (length > 1 && length <= MAX_SANE_TEXT_LENGTH) {
+ // Length sanity check passed
+ handleText(length, stream, xhtml);
+ } else {
+ // Was probably something else
+ l5.record(b2[0]);
+ l5.record(b2[1]);
+ }
+ } else if (maybeLength > 0 && maybeLength < MAX_SANE_TEXT_LENGTH) {
+ // Looks like it's straight into the text
+ handleText(maybeLength, stream, xhtml);
+ }
+ }
+
+ private void handleText(int length, InputStream stream, XHTMLContentHandler xhtml)
+ throws IOException, SAXException, TikaException {
+ byte[] str = new byte[length];
+ IOUtils.readFully(stream, str);
+ if (str[length - 1] != 0) {
+ // Not properly null terminated, must be wrong
+ return;
+ }
+
+ String text = extractText(str, false);
+
+ xhtml.startElement("p");
+ xhtml.characters(text);
+ xhtml.endElement("p");
}
-
+
/**
* Does our best to turn the bytes into text
*/
private String extractText(byte[] data, boolean trim) throws TikaException {
- // The text is always stored null terminated, but sometimes
- // may have extra null padding too
- int length = data.length - 1;
- if(trim) {
- for(int i=0; i<data.length; i++) {
- if(data[i] == 0) {
- length = i;
- break;
- }
- }
- }
-
- // We believe that the text is basically stored as CP437
- // That said, there are a few characters slightly wrong for that...
- String text;
- try {
- text = new String(data, 0, length, "cp437");
- } catch(UnsupportedEncodingException e) {
- throw new TikaException("JVM Broken, core codepage CP437 missing!");
- }
-
- // Fix up the known character issues
- text = text.replace("\u03C6","\u00D8");
+ // The text is always stored null terminated, but sometimes
+ // may have extra null padding too
+ int length = data.length - 1;
+ if (trim) {
+ for (int i = 0; i < data.length; i++) {
+ if (data[i] == 0) {
+ length = i;
+ break;
+ }
+ }
+ }
+
+ // We believe that the text is basically stored as CP437
+ // That said, there are a few characters slightly wrong for that...
+ String text;
+ try {
+ text = new String(data, 0, length, "cp437");
+ } catch (UnsupportedEncodingException e) {
+ throw new TikaException("JVM Broken, core codepage CP437 missing!");
+ }
- // All done, as best as we can!
- return text;
+ // Fix up the known character issues
+ text = text.replace("\u03C6", "\u00D8");
+
+ // All done, as best as we can!
+ return text;
}
-
+
/**
* Provides a view on the previous 5 bytes
*/
private static class Last5 {
- byte[] data = new byte[5];
- int pos = 0;
-
- private void record(int b) {
- data[pos] = (byte)b;
- pos++;
- if(pos >= data.length) {
- pos = 0;
- }
- }
-
- private byte[] get() {
- byte[] ret = new byte[5];
- for(int i=0; i<ret.length; i++) {
- int p = pos - i;
- if(p < 0) { p += ret.length; }
- ret[i] = data[p];
- }
- return ret;
- }
-
- private boolean is33() {
- byte[] last5 = get();
- for(byte b : last5) {
- if(b != 0x33) return false;
- }
- return true;
- }
-
- private boolean is00() {
- byte[] last5 = get();
- for(byte b : last5) {
- if(b != 0x00) return false;
- }
- return true;
- }
+ byte[] data = new byte[5];
+ int pos = 0;
+
+ private void record(int b) {
+ data[pos] = (byte) b;
+ pos++;
+ if (pos >= data.length) {
+ pos = 0;
+ }
+ }
+
+ private byte[] get() {
+ byte[] ret = new byte[5];
+ for (int i = 0; i < ret.length; i++) {
+ int p = pos - i;
+ if (p < 0) {
+ p += ret.length;
+ }
+ ret[i] = data[p];
+ }
+ return ret;
+ }
+
+ private boolean is33() {
+ byte[] last5 = get();
+ for (byte b : last5) {
+ if (b != 0x33) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ private boolean is00() {
+ byte[] last5 = get();
+ for (byte b : last5) {
+ if (b != 0x00) {
+ return false;
+ }
+ }
+ return true;
+ }
}
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
index 56b2787..f3a5412 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
@@ -16,87 +16,85 @@
*/
package org.apache.tika.parser.dwg;
+import static org.apache.tika.TikaTest.assertContains;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
-import static org.apache.tika.TikaTest.assertContains;
import java.io.InputStream;
import java.util.Arrays;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
public class DWGParserTest {
-
+
@Test
public void testDWG2000Parser() throws Exception {
- InputStream input = DWGParserTest.class.getResourceAsStream(
- "/test-documents/testDWG2000.dwg");
+ InputStream input =
+ DWGParserTest.class.getResourceAsStream("/test-documents/testDWG2000.dwg");
testParserAlt(input);
}
@Test
public void testDWG2004Parser() throws Exception {
- InputStream input = DWGParserTest.class.getResourceAsStream(
- "/test-documents/testDWG2004.dwg");
+ InputStream input =
+ DWGParserTest.class.getResourceAsStream("/test-documents/testDWG2004.dwg");
testParser(input);
}
@Test
public void testDWG2004ParserNoHeaderAddress() throws Exception {
- InputStream input = DWGParserTest.class.getResourceAsStream(
- "/test-documents/testDWG2004_no_header.dwg");
+ InputStream input = DWGParserTest.class
+ .getResourceAsStream("/test-documents/testDWG2004_no_header.dwg");
testParserNoHeader(input);
}
@Test
public void testDWG2007Parser() throws Exception {
- InputStream input = DWGParserTest.class.getResourceAsStream(
- "/test-documents/testDWG2007.dwg");
+ InputStream input =
+ DWGParserTest.class.getResourceAsStream("/test-documents/testDWG2007.dwg");
testParser(input);
}
@Test
public void testDWG2010Parser() throws Exception {
- InputStream input = DWGParserTest.class.getResourceAsStream(
- "/test-documents/testDWG2010.dwg");
+ InputStream input =
+ DWGParserTest.class.getResourceAsStream("/test-documents/testDWG2010.dwg");
testParser(input);
}
-
+
@Test
public void testDWG2010CustomPropertiesParser() throws Exception {
// Check that standard parsing works
- InputStream testInput = DWGParserTest.class.getResourceAsStream(
- "/test-documents/testDWG2010_custom_props.dwg");
+ InputStream testInput = DWGParserTest.class
+ .getResourceAsStream("/test-documents/testDWG2010_custom_props.dwg");
testParser(testInput);
-
+
// Check that custom properties with alternate padding work
- try (InputStream input = DWGParserTest.class.getResourceAsStream(
- "/test-documents/testDWG2010_custom_props.dwg")) {
+ try (InputStream input = DWGParserTest.class
+ .getResourceAsStream("/test-documents/testDWG2010_custom_props.dwg")) {
Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
new DWGParser().parse(input, handler, metadata, null);
- assertEquals("valueforcustomprop1",
- metadata.get("customprop1"));
- assertEquals("valueforcustomprop2",
- metadata.get("customprop2"));
+ assertEquals("valueforcustomprop1", metadata.get("customprop1"));
+ assertEquals("valueforcustomprop2", metadata.get("customprop2"));
}
}
@Test
public void testDWGMechParser() throws Exception {
- String[] types = new String[] {
- "6", "2004", "2004DX", "2005", "2006",
- "2007", "2008", "2009", "2010", "2011"
- };
+ String[] types =
+ new String[]{"6", "2004", "2004DX", "2005", "2006", "2007", "2008", "2009", "2010",
+ "2011"};
for (String type : types) {
- InputStream input = DWGParserTest.class.getResourceAsStream(
- "/test-documents/testDWGmech"+type+".dwg");
- testParserAlt(input);
+ InputStream input = DWGParserTest.class
+ .getResourceAsStream("/test-documents/testDWGmech" + type + ".dwg");
+ testParserAlt(input);
}
}
@@ -109,18 +107,15 @@ public class DWGParserTest {
assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("The quick brown fox jumps over the lazy dog",
+ assertEquals("The quick brown fox jumps over the lazy dog",
metadata.get(TikaCoreProperties.TITLE));
assertEquals("Gym class featuring a brown fox and lazy dog",
metadata.get(TikaCoreProperties.DESCRIPTION));
- assertEquals("Nevin Nollop",
- metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Nevin Nollop", metadata.get(TikaCoreProperties.CREATOR));
assertContains("Pangram, fox, dog",
- Arrays.asList(metadata.getValues(TikaCoreProperties.SUBJECT)));
- assertEquals("Lorem ipsum",
- metadata.get(TikaCoreProperties.COMMENTS).substring(0,11));
- assertEquals("http://www.alfresco.com",
- metadata.get(TikaCoreProperties.RELATION));
+ Arrays.asList(metadata.getValues(TikaCoreProperties.SUBJECT)));
+ assertEquals("Lorem ipsum", metadata.get(TikaCoreProperties.COMMENTS).substring(0, 11));
+ assertEquals("http://www.alfresco.com", metadata.get(TikaCoreProperties.RELATION));
String content = handler.toString();
assertContains("The quick brown fox jumps over the lazy dog", content);
@@ -139,7 +134,7 @@ public class DWGParserTest {
new DWGParser().parse(input, handler, metadata);
assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
-
+
assertNull(metadata.get(TikaCoreProperties.TITLE));
assertNull(metadata.get(TikaCoreProperties.DESCRIPTION));
assertNull(metadata.get(TikaCoreProperties.CREATOR));
@@ -163,22 +158,14 @@ public class DWGParserTest {
assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Test Title",
- metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Test Subject",
- metadata.get(TikaCoreProperties.DESCRIPTION));
- assertEquals("My Author",
- metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("My keyword1, MyKeyword2",
- metadata.get(TikaCoreProperties.SUBJECT));
- assertEquals("This is a comment",
- metadata.get(TikaCoreProperties.COMMENTS));
- assertEquals("bejanpol",
- metadata.get(TikaCoreProperties.MODIFIER));
- assertEquals("http://mycompany/drawings",
- metadata.get(TikaCoreProperties.RELATION));
- assertEquals("MyCustomPropertyValue",
- metadata.get("MyCustomProperty"));
+ assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Test Subject", metadata.get(TikaCoreProperties.DESCRIPTION));
+ assertEquals("My Author", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("My keyword1, MyKeyword2", metadata.get(TikaCoreProperties.SUBJECT));
+ assertEquals("This is a comment", metadata.get(TikaCoreProperties.COMMENTS));
+ assertEquals("bejanpol", metadata.get(TikaCoreProperties.MODIFIER));
+ assertEquals("http://mycompany/drawings", metadata.get(TikaCoreProperties.RELATION));
+ assertEquals("MyCustomPropertyValue", metadata.get("MyCustomProperty"));
String content = handler.toString();
assertContains("This is a comment", content);
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
index 53e95a6..0a2db65 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
@@ -20,12 +20,13 @@ import static org.junit.Assert.assertEquals;
import java.io.InputStream;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
public class PRTParserTest extends TikaTest {
/**
@@ -33,34 +34,33 @@ public class PRTParserTest extends TikaTest {
*/
@Test
public void testPRTParserBasics() throws Exception {
- try (InputStream input = getResourceAsStream("/test-documents/testCADKEY.prt")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- new PRTParser().parse(input, handler, metadata);
-
- assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE));
-
- // This file has a date
- assertEquals("2011-06-20T16:54:00",
- metadata.get(TikaCoreProperties.CREATED));
-
- // But no description
- assertEquals(null, metadata.get(TikaCoreProperties.DESCRIPTION));
-
- String contents = handler.toString();
-
- assertContains("Front View", contents);
- assertContains("Back View", contents);
- assertContains("Bottom View", contents);
- assertContains("Right View", contents);
- assertContains("Left View", contents);
- //assertContains("Isometric View", contents); // Can't detect yet
- assertContains("Axonometric View", contents);
-
- assertContains("You've managed to extract all the text!", contents);
- assertContains("This is more text", contents);
- assertContains("Text Inside a PRT file", contents);
- }
+ try (InputStream input = getResourceAsStream("/test-documents/testCADKEY.prt")) {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new PRTParser().parse(input, handler, metadata);
+
+ assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE));
+
+ // This file has a date
+ assertEquals("2011-06-20T16:54:00", metadata.get(TikaCoreProperties.CREATED));
+
+ // But no description
+ assertEquals(null, metadata.get(TikaCoreProperties.DESCRIPTION));
+
+ String contents = handler.toString();
+
+ assertContains("Front View", contents);
+ assertContains("Back View", contents);
+ assertContains("Bottom View", contents);
+ assertContains("Right View", contents);
+ assertContains("Left View", contents);
+ //assertContains("Isometric View", contents); // Can't detect yet
+ assertContains("Axonometric View", contents);
+
+ assertContains("You've managed to extract all the text!", contents);
+ assertContains("This is more text", contents);
+ assertContains("Text Inside a PRT file", contents);
+ }
}
/**
@@ -68,43 +68,42 @@ public class PRTParserTest extends TikaTest {
*/
@Test
public void testPRTParserComplex() throws Exception {
- try (InputStream input = getResourceAsStream("/test-documents/testCADKEY2.prt")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- new PRTParser().parse(input, handler, metadata);
-
- assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE));
-
- // File has both a date and a description
- assertEquals("1997-04-01T08:59:00",
- metadata.get(TikaCoreProperties.CREATED));
- assertEquals("TIKA TEST PART DESCRIPTION INFORMATION\r\n",
- metadata.get(TikaCoreProperties.DESCRIPTION));
-
- String contents = handler.toString();
-
- assertContains("ITEM", contents);
- assertContains("REQ.", contents);
- assertContains("DESCRIPTION", contents);
- assertContains("MAT'L", contents);
- assertContains("TOLERANCES UNLESS", contents);
- assertContains("FRACTIONS", contents);
- assertContains("ANGLES", contents);
- assertContains("Acme Corporation", contents);
-
- assertContains("DATE", contents);
- assertContains("CHANGE", contents);
- assertContains("DRAWN BY", contents);
- assertContains("SCALE", contents);
- assertContains("TIKA TEST DRAWING", contents);
- assertContains("TIKA LETTERS", contents);
- assertContains("5.82", contents);
- assertContains("112" + '\u00b0', contents); // Degrees
- assertContains("TIKA TEST LETTER", contents);
- assertContains("17.11", contents);
- assertContains('\u00d8' + "\ufffd2.000", contents); // Diameter
- assertContains("Diameter", contents);
- assertContains("The Apache Tika toolkit", contents);
- }
+ try (InputStream input = getResourceAsStream("/test-documents/testCADKEY2.prt")) {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new PRTParser().parse(input, handler, metadata);
+
+ assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE));
+
+ // File has both a date and a description
+ assertEquals("1997-04-01T08:59:00", metadata.get(TikaCoreProperties.CREATED));
+ assertEquals("TIKA TEST PART DESCRIPTION INFORMATION\r\n",
+ metadata.get(TikaCoreProperties.DESCRIPTION));
+
+ String contents = handler.toString();
+
+ assertContains("ITEM", contents);
+ assertContains("REQ.", contents);
+ assertContains("DESCRIPTION", contents);
+ assertContains("MAT'L", contents);
+ assertContains("TOLERANCES UNLESS", contents);
+ assertContains("FRACTIONS", contents);
+ assertContains("ANGLES", contents);
+ assertContains("Acme Corporation", contents);
+
+ assertContains("DATE", contents);
+ assertContains("CHANGE", contents);
+ assertContains("DRAWN BY", contents);
+ assertContains("SCALE", contents);
+ assertContains("TIKA TEST DRAWING", contents);
+ assertContains("TIKA LETTERS", contents);
+ assertContains("5.82", contents);
+ assertContains("112" + '\u00b0', contents); // Degrees
+ assertContains("TIKA TEST LETTER", contents);
+ assertContains("17.11", contents);
+ assertContains('\u00d8' + "\ufffd2.000", contents); // Diameter
+ assertContains("Diameter", contents);
+ assertContains("The Apache Tika toolkit", contents);
+ }
}
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java
index 481046f..47f6d0c 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java
@@ -21,33 +21,34 @@ import java.io.InputStream;
import java.util.Collections;
import java.util.Set;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
/**
* Parser for Java .class files.
*/
public class ClassParser extends AbstractParser {
- /** Serial version UID */
+ /**
+ * Serial version UID
+ */
private static final long serialVersionUID = -3531388963354454357L;
private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.singleton(MediaType.application("java-vm"));
+ Collections.singleton(MediaType.application("java-vm"));
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
new XHTMLClassVisitor(handler, metadata).parse(stream);
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java
index 4b6c53f..199d5ca 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java
@@ -19,10 +19,6 @@ package org.apache.tika.parser.asm;
import java.io.IOException;
import java.io.InputStream;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.sax.XHTMLContentHandler;
import org.objectweb.asm.AnnotationVisitor;
import org.objectweb.asm.Attribute;
import org.objectweb.asm.ClassReader;
@@ -34,6 +30,11 @@ import org.objectweb.asm.Type;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+
/**
* Class visitor that generates XHTML SAX events to describe the
* contents of the visited class.
@@ -54,8 +55,11 @@ class XHTMLClassVisitor extends ClassVisitor {
this.metadata = metadata;
}
- public void parse(InputStream stream)
- throws TikaException, SAXException, IOException {
+ private static boolean isSet(int value, int flag) {
+ return (value & flag) != 0;
+ }
+
+ public void parse(InputStream stream) throws TikaException, SAXException, IOException {
try {
ClassReader reader = new ClassReader(stream);
reader.accept(this, ClassReader.SKIP_FRAMES | ClassReader.SKIP_CODE);
@@ -68,9 +72,8 @@ class XHTMLClassVisitor extends ClassVisitor {
}
}
- public void visit(
- int version, int access, String name, String signature,
- String superName, String[] interfaces) {
+ public void visit(int version, int access, String name, String signature, String superName,
+ String[] interfaces) {
type = Type.getObjectType(name);
String className = type.getClassName();
@@ -126,8 +129,7 @@ class XHTMLClassVisitor extends ClassVisitor {
}
}
- private void writeInterfaces(String keyword, String[] interfaces)
- throws SAXException {
+ private void writeInterfaces(String keyword, String[] interfaces) throws SAXException {
if (interfaces != null && interfaces.length > 0) {
writeKeyword(keyword);
String separator = " ";
@@ -162,7 +164,6 @@ class XHTMLClassVisitor extends ClassVisitor {
public void visitSource(String source, String debug) {
}
-
/**
* Ignored.
*/
@@ -179,16 +180,14 @@ class XHTMLClassVisitor extends ClassVisitor {
/**
* Ignored.
*/
- public void visitInnerClass(
- String name, String outerName, String innerName, int access) {
+ public void visitInnerClass(String name, String outerName, String innerName, int access) {
}
/**
* Visits a field.
*/
- public FieldVisitor visitField(
- int access, String name, String desc, String signature,
- Object value) {
+ public FieldVisitor visitField(int access, String name, String desc, String signature,
+ Object value) {
if (!isSet(access, Opcodes.ACC_SYNTHETIC)) {
try {
xhtml.characters(" ");
@@ -215,9 +214,8 @@ class XHTMLClassVisitor extends ClassVisitor {
/**
* Visits a method.
*/
- public MethodVisitor visitMethod(
- int access, String name, String desc, String signature,
- String[] exceptions) {
+ public MethodVisitor visitMethod(int access, String name, String desc, String signature,
+ String[] exceptions) {
if (!isSet(access, Opcodes.ACC_SYNTHETIC)) {
try {
xhtml.characters(" ");
@@ -297,8 +295,7 @@ class XHTMLClassVisitor extends ClassVisitor {
writeAccess(access, Opcodes.ACC_NATIVE, "native");
}
- private void writeAccess(int access, int code, String keyword)
- throws SAXException {
+ private void writeAccess(int access, int code, String keyword) throws SAXException {
if (isSet(access, code)) {
writeKeyword(keyword);
xhtml.characters(" ");
@@ -316,8 +313,4 @@ class XHTMLClassVisitor extends ClassVisitor {
}
}
- private static boolean isSet(int value, int flag) {
- return (value & flag) != 0;
- }
-
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
index 162b1be..00b39c2 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
@@ -33,6 +33,12 @@ import java.util.regex.Pattern;
import com.uwyn.jhighlight.renderer.Renderer;
import com.uwyn.jhighlight.renderer.XhtmlRendererFactory;
import org.apache.commons.io.input.CloseShieldInputStream;
+import org.ccil.cowan.tagsoup.HTMLSchema;
+import org.ccil.cowan.tagsoup.Schema;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
import org.apache.tika.detect.AutoDetectReader;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.exception.TikaException;
@@ -41,11 +47,6 @@ import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractEncodingDetectorParser;
import org.apache.tika.parser.ParseContext;
-import org.ccil.cowan.tagsoup.HTMLSchema;
-import org.ccil.cowan.tagsoup.Schema;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
/**
* Generic Source code parser for Java, Groovy, C++.
@@ -60,15 +61,16 @@ public class SourceCodeParser extends AbstractEncodingDetectorParser {
private static final Pattern authorPattern = Pattern.compile("(?im)@author (.*) *$");
- private static final Map<MediaType, String> TYPES_TO_RENDERER = new HashMap<MediaType, String>() {
- private static final long serialVersionUID = -741976157563751152L;
+ private static final Map<MediaType, String> TYPES_TO_RENDERER =
+ new HashMap<MediaType, String>() {
+ private static final long serialVersionUID = -741976157563751152L;
- {
- put(MediaType.text("x-c++src"), CPP);
- put(MediaType.text("x-java-source"), JAVA);
- put(MediaType.text("x-groovy"), GROOVY);
- }
- };
+ {
+ put(MediaType.text("x-c++src"), CPP);
+ put(MediaType.text("x-java-source"), JAVA);
+ put(MediaType.text("x-groovy"), GROOVY);
+ }
+ };
//Parse the HTML document
private static final Schema HTML_SCHEMA = new HTMLSchema();
@@ -87,10 +89,10 @@ public class SourceCodeParser extends AbstractEncodingDetectorParser {
}
@Override
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- try (AutoDetectReader reader = new AutoDetectReader(
- new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
+ try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream),
+ metadata, getEncodingDetector(context))) {
Charset charset = reader.getCharset();
String mediaType = metadata.get(Metadata.CONTENT_TYPE);
String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java
index 7050795..b863abd 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java
@@ -25,6 +25,9 @@ import java.util.HashSet;
import java.util.Set;
import org.apache.commons.io.IOUtils;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.EndianUtils;
import org.apache.tika.metadata.MachineMetadata;
@@ -34,14 +37,14 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
/**
* Parser for executable files. Currently supports ELF and PE
*/
public class ExecutableParser extends AbstractParser implements MachineMetadata {
- /** Serial version UID */
+ /**
+ * Serial version UID
+ */
private static final long serialVersionUID = 32128791892482l;
private static final MediaType PE_EXE = MediaType.application("x-msdownload");
@@ -50,35 +53,32 @@ public class ExecutableParser extends AbstractParser implements MachineMetadata
private static final MediaType ELF_EXECUTABLE = MediaType.application("x-executable");
private static final MediaType ELF_SHAREDLIB = MediaType.application("x-sharedlib");
private static final MediaType ELF_COREDUMP = MediaType.application("x-coredump");
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- PE_EXE,
- ELF_GENERAL,
- ELF_OBJECT, ELF_EXECUTABLE, ELF_SHAREDLIB, ELF_COREDUMP
- )));
+ private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
+ new HashSet<MediaType>(
+ Arrays.asList(PE_EXE, ELF_GENERAL, ELF_OBJECT, ELF_EXECUTABLE, ELF_SHAREDLIB,
+ ELF_COREDUMP)));
+
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
// We only do metadata, for now
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
// What kind is it?
byte[] first4 = new byte[4];
IOUtils.readFully(stream, first4);
-
- if (first4[0] == (byte)'M' && first4[1] == (byte)'Z') {
- parsePE(xhtml, metadata, stream, first4);
- } else if (first4[0] == (byte)0x7f && first4[1] == (byte)'E' &&
- first4[2] == (byte)'L' && first4[3] == (byte)'F') {
- parseELF(xhtml, metadata, stream, first4);
+
+ if (first4[0] == (byte) 'M' && first4[1] == (byte) 'Z') {
+ parsePE(xhtml, metadata, stream, first4);
+ } else if (first4[0] == (byte) 0x7f && first4[1] == (byte) 'E' && first4[2] == (byte) 'L' &&
+ first4[3] == (byte) 'F') {
+ parseELF(xhtml, metadata, stream, first4);
}
-
-
+
+
// Finish everything
xhtml.endDocument();
}
@@ -86,322 +86,322 @@ public class ExecutableParser extends AbstractParser implements MachineMetadata
/**
* Parses a DOS or Windows PE file
*/
- public void parsePE(XHTMLContentHandler xhtml, Metadata metadata,
- InputStream stream, byte[] first4) throws TikaException, IOException {
- metadata.set(Metadata.CONTENT_TYPE, PE_EXE.toString());
- metadata.set(PLATFORM, PLATFORM_WINDOWS);
-
- // Skip over the MS-DOS bit
- byte[] msdosSection = new byte[0x3c-4];
- IOUtils.readFully(stream, msdosSection);
-
- // Grab the PE header offset
- int peOffset = EndianUtils.readIntLE(stream);
-
- // Sanity check - while it may go anywhere, it's normally in the first few kb
- if (peOffset > 4096 || peOffset < 0x3f) return;
-
- // Skip the rest of the MS-DOS stub (if PE), until we reach what should
- // be the PE header (if this is a PE executable)
- stream.skip(peOffset - 0x40);
-
- // Read the PE header
- byte[] pe = new byte[24];
- IOUtils.readFully(stream, pe);
-
- // Check it really is a PE header
- if (pe[0] == (byte)'P' && pe[1] == (byte)'E' && pe[2]==0 && pe[3]==0) {
- // Good, has a valid PE signature
- } else {
- // Old style MS-DOS
- return;
- }
-
- // Read the header values
- int machine = EndianUtils.getUShortLE(pe, 4);
- int numSectors = EndianUtils.getUShortLE(pe, 6);
- long createdAt = EndianUtils.getIntLE(pe, 8);
- long symbolTableOffset = EndianUtils.getIntLE(pe, 12);
- long numSymbols = EndianUtils.getIntLE(pe, 16);
- int sizeOptHdrs = EndianUtils.getUShortLE(pe, 20);
- int characteristcs = EndianUtils.getUShortLE(pe, 22);
-
- // Turn this into helpful metadata
- Date createdAtD = new Date(createdAt*1000l);
- metadata.set(TikaCoreProperties.CREATED, createdAtD);
-
- switch(machine) {
- case 0x14c:
- metadata.set(MACHINE_TYPE, MACHINE_x86_32);
- metadata.set(ENDIAN, Endian.LITTLE.getName());
- metadata.set(ARCHITECTURE_BITS, "32");
- break;
- case 0x8664:
- metadata.set(MACHINE_TYPE, MACHINE_x86_32);
- metadata.set(ENDIAN, Endian.LITTLE.getName());
- metadata.set(ARCHITECTURE_BITS, "64");
- break;
- case 0x200:
- metadata.set(MACHINE_TYPE, MACHINE_IA_64);
- metadata.set(ENDIAN, Endian.LITTLE.getName());
- metadata.set(ARCHITECTURE_BITS, "64");
- break;
-
- case 0x184:
- metadata.set(MACHINE_TYPE, MACHINE_ALPHA);
- metadata.set(ENDIAN, Endian.LITTLE.getName());
- metadata.set(ARCHITECTURE_BITS, "32");
- break;
- case 0x284:
- metadata.set(MACHINE_TYPE, MACHINE_ALPHA);
- metadata.set(ENDIAN, Endian.LITTLE.getName());
- metadata.set(ARCHITECTURE_BITS, "64");
- break;
-
- case 0x1c0:
- case 0x1c4:
- metadata.set(MACHINE_TYPE, MACHINE_ARM);
- metadata.set(ENDIAN, Endian.LITTLE.getName());
- metadata.set(ARCHITECTURE_BITS, "32");
- break;
+ public void parsePE(XHTMLContentHandler xhtml, Metadata metadata, InputStream stream,
+ byte[] first4) throws TikaException, IOException {
+ metadata.set(Metadata.CONTENT_TYPE, PE_EXE.toString());
+ metadata.set(PLATFORM, PLATFORM_WINDOWS);
- case 0x268:
- metadata.set(MACHINE_TYPE, MACHINE_M68K);
- metadata.set(ENDIAN, Endian.BIG.getName());
- metadata.set(ARCHITECTURE_BITS, "32");
- break;
+ // Skip over the MS-DOS bit
+ byte[] msdosSection = new byte[0x3c - 4];
+ IOUtils.readFully(stream, msdosSection);
- case 0x266:
- case 0x366:
- case 0x466:
- metadata.set(MACHINE_TYPE, MACHINE_MIPS);
- metadata.set(ENDIAN, Endian.BIG.getName());
- metadata.set(ARCHITECTURE_BITS, "16");
- break;
- case 0x162:
- case 0x166:
- case 0x168:
- case 0x169:
- metadata.set(MACHINE_TYPE, MACHINE_MIPS);
- metadata.set(ENDIAN, Endian.LITTLE.getName());
- metadata.set(ARCHITECTURE_BITS, "16");
- break;
-
- case 0x1f0:
- case 0x1f1:
- metadata.set(MACHINE_TYPE, MACHINE_PPC);
- metadata.set(ENDIAN, Endian.LITTLE.getName());
- metadata.set(ARCHITECTURE_BITS, "32");
- break;
-
- case 0x1a2:
- case 0x1a3:
- metadata.set(MACHINE_TYPE, MACHINE_SH3);
- metadata.set(ENDIAN, Endian.BIG.getName());
- metadata.set(ARCHITECTURE_BITS, "32");
- break;
- case 0x1a6:
- metadata.set(MACHINE_TYPE, MACHINE_SH4);
- metadata.set(ENDIAN, Endian.BIG.getName());
- metadata.set(ARCHITECTURE_BITS, "32");
- break;
- case 0x1a8:
- metadata.set(MACHINE_TYPE, MACHINE_SH3);
- metadata.set(ENDIAN, Endian.BIG.getName());
- metadata.set(ARCHITECTURE_BITS, "32");
- break;
+ // Grab the PE header offset
+ int peOffset = EndianUtils.readIntLE(stream);
- case 0x9041:
- metadata.set(MACHINE_TYPE, MACHINE_M32R);
- metadata.set(ENDIAN, Endian.BIG.getName());
- metadata.set(ARCHITECTURE_BITS, "32");
- break;
+ // Sanity check - while it may go anywhere, it's normally in the first few kb
+ if (peOffset > 4096 || peOffset < 0x3f) {
+ return;
+ }
+
+ // Skip the rest of the MS-DOS stub (if PE), until we reach what should
+ // be the PE header (if this is a PE executable)
+ stream.skip(peOffset - 0x40);
+
+ // Read the PE header
+ byte[] pe = new byte[24];
+ IOUtils.readFully(stream, pe);
+
+ // Check it really is a PE header
+ if (pe[0] == (byte) 'P' && pe[1] == (byte) 'E' && pe[2] == 0 && pe[3] == 0) {
+ // Good, has a valid PE signature
+ } else {
+ // Old style MS-DOS
+ return;
+ }
+
+ // Read the header values
+ int machine = EndianUtils.getUShortLE(pe, 4);
+ int numSectors = EndianUtils.getUShortLE(pe, 6);
+ long createdAt = EndianUtils.getIntLE(pe, 8);
+ long symbolTableOffset = EndianUtils.getIntLE(pe, 12);
+ long numSymbols = EndianUtils.getIntLE(pe, 16);
+ int sizeOptHdrs = EndianUtils.getUShortLE(pe, 20);
+ int characteristcs = EndianUtils.getUShortLE(pe, 22);
+
+ // Turn this into helpful metadata
+ Date createdAtD = new Date(createdAt * 1000l);
+ metadata.set(TikaCoreProperties.CREATED, createdAtD);
- case 0xebc:
- metadata.set(MACHINE_TYPE, MACHINE_EFI);
- break;
+ switch (machine) {
+ case 0x14c:
+ metadata.set(MACHINE_TYPE, MACHINE_x86_32);
+ metadata.set(ENDIAN, Endian.LITTLE.getName());
+ metadata.set(ARCHITECTURE_BITS, "32");
+ break;
+ case 0x8664:
+ metadata.set(MACHINE_TYPE, MACHINE_x86_32);
+ metadata.set(ENDIAN, Endian.LITTLE.getName());
+ metadata.set(ARCHITECTURE_BITS, "64");
+ break;
+ case 0x200:
+ metadata.set(MACHINE_TYPE, MACHINE_IA_64);
+ metadata.set(ENDIAN, Endian.LITTLE.getName());
+ metadata.set(ARCHITECTURE_BITS, "64");
+ break;
- default:
- metadata.set(MACHINE_TYPE, MACHINE_UNKNOWN);
- break;
- }
+ case 0x184:
+ metadata.set(MACHINE_TYPE, MACHINE_ALPHA);
+ metadata.set(ENDIAN, Endian.LITTLE.getName());
+ metadata.set(ARCHITECTURE_BITS, "32");
+ break;
+ case 0x284:
+ metadata.set(MACHINE_TYPE, MACHINE_ALPHA);
+ metadata.set(ENDIAN, Endian.LITTLE.getName());
+ metadata.set(ARCHITECTURE_BITS, "64");
+ break;
+
+ case 0x1c0:
+ case 0x1c4:
+ metadata.set(MACHINE_TYPE, MACHINE_ARM);
+ metadata.set(ENDIAN, Endian.LITTLE.getName());
+ metadata.set(ARCHITECTURE_BITS, "32");
+ break;
+
+ case 0x268:
+ metadata.set(MACHINE_TYPE, MACHINE_M68K);
+ metadata.set(ENDIAN, Endian.BIG.getName());
+ metadata.set(ARCHITECTURE_BITS, "32");
+ break;
+
+ case 0x266:
+ case 0x366:
+ case 0x466:
+ metadata.set(MACHINE_TYPE, MACHINE_MIPS);
+ metadata.set(ENDIAN, Endian.BIG.getName());
+ metadata.set(ARCHITECTURE_BITS, "16");
+ break;
+ case 0x162:
+ case 0x166:
+ case 0x168:
+ case 0x169:
+ metadata.set(MACHINE_TYPE, MACHINE_MIPS);
+ metadata.set(ENDIAN, Endian.LITTLE.getName());
+ metadata.set(ARCHITECTURE_BITS, "16");
+ break;
+
+ case 0x1f0:
+ case 0x1f1:
+ metadata.set(MACHINE_TYPE, MACHINE_PPC);
+ metadata.set(ENDIAN, Endian.LITTLE.getName());
+ metadata.set(ARCHITECTURE_BITS, "32");
+ break;
+
+ case 0x1a2:
+ case 0x1a3:
+ metadata.set(MACHINE_TYPE, MACHINE_SH3);
+ metadata.set(ENDIAN, Endian.BIG.getName());
+ metadata.set(ARCHITECTURE_BITS, "32");
+ break;
+ case 0x1a6:
+ metadata.set(MACHINE_TYPE, MACHINE_SH4);
+ metadata.set(ENDIAN, Endian.BIG.getName());
+ metadata.set(ARCHITECTURE_BITS, "32");
+ break;
+ case 0x1a8:
+ metadata.set(MACHINE_TYPE, MACHINE_SH3);
+ metadata.set(ENDIAN, Endian.BIG.getName());
+ metadata.set(ARCHITECTURE_BITS, "32");
+ break;
+
+ case 0x9041:
+ metadata.set(MACHINE_TYPE, MACHINE_M32R);
+ metadata.set(ENDIAN, Endian.BIG.getName());
+ metadata.set(ARCHITECTURE_BITS, "32");
+ break;
+
+ case 0xebc:
+ metadata.set(MACHINE_TYPE, MACHINE_EFI);
+ break;
+
+ default:
+ metadata.set(MACHINE_TYPE, MACHINE_UNKNOWN);
+ break;
+ }
}
/**
* Parses a Unix ELF file
*/
- public void parseELF(XHTMLContentHandler xhtml, Metadata metadata,
- InputStream stream, byte[] first4) throws TikaException, IOException {
- // Byte 5 is the architecture
- int architecture = stream.read();
- if (architecture == 1) {
- metadata.set(ARCHITECTURE_BITS, "32");
- } else if (architecture == 2) {
- metadata.set(ARCHITECTURE_BITS, "64");
- }
-
- // Byte 6 is the endian-ness
- int endian = stream.read();
- if (endian == 1) {
- metadata.set(ENDIAN, Endian.LITTLE.getName());
- } else if (endian == 2) {
- metadata.set(ENDIAN, Endian.BIG.getName());
- }
-
- // Byte 7 is the elf version
- int elfVer = stream.read();
-
- // Byte 8 is the OS, if set (lots of compilers don't)
- // Byte 9 is the OS (specific) ABI version
- int os = stream.read();
- int osVer = stream.read();
- if (os > 0 || osVer > 0)
- {
- switch (os) {
- case 0:
- metadata.set(PLATFORM, PLATFORM_SYSV);
- break;
-
- case 1:
- metadata.set(PLATFORM, PLATFORM_HPUX);
- break;
-
- case 2:
- metadata.set(PLATFORM, PLATFORM_NETBSD);
- break;
-
- case 3:
- metadata.set(PLATFORM, PLATFORM_LINUX);
- break;
-
- case 6:
- metadata.set(PLATFORM, PLATFORM_SOLARIS);
- break;
-
- case 7:
- metadata.set(PLATFORM, PLATFORM_AIX);
- break;
-
- case 8:
- metadata.set(PLATFORM, PLATFORM_IRIX);
- break;
-
- case 9:
- metadata.set(PLATFORM, PLATFORM_FREEBSD);
- break;
-
- case 10:
- metadata.set(PLATFORM, PLATFORM_TRU64);
- break;
-
- case 12:
- metadata.set(PLATFORM, PLATFORM_FREEBSD);
- break;
-
- case 64:
- case 97:
- metadata.set(PLATFORM, PLATFORM_ARM);
- break;
-
- case 255:
- metadata.set(PLATFORM, PLATFORM_EMBEDDED);
- break;
- }
- }
-
- // Bytes 10-16 are padding and lengths
- byte[] padLength = new byte[7];
- IOUtils.readFully(stream, padLength);
-
- // Bytes 16-17 are the object type (LE/BE)
- int type;
- if (endian == 1) {
- type = EndianUtils.readUShortLE(stream);
- } else {
- type = EndianUtils.readUShortBE(stream);
- }
- switch(type) {
- case 1:
- metadata.set(Metadata.CONTENT_TYPE, ELF_OBJECT.toString());
- break;
-
- case 2:
- metadata.set(Metadata.CONTENT_TYPE, ELF_EXECUTABLE.toString());
- break;
-
- case 3:
- metadata.set(Metadata.CONTENT_TYPE, ELF_SHAREDLIB.toString());
- break;
-
- case 4:
- metadata.set(Metadata.CONTENT_TYPE, ELF_COREDUMP.toString());
- break;
-
- default:
- metadata.set(Metadata.CONTENT_TYPE, ELF_GENERAL.toString());
- break;
- }
-
- // Bytes 18-19 are the machine (EM_*)
- int machine;
- if (endian == 1) {
- machine = EndianUtils.readUShortLE(stream);
- } else {
- machine = EndianUtils.readUShortBE(stream);
- }
- switch(machine) {
- case 2:
- case 18:
- case 43:
- metadata.set(MACHINE_TYPE, MACHINE_SPARC);
- break;
- case 3:
- metadata.set(MACHINE_TYPE, MACHINE_x86_32);
- break;
- case 4:
- metadata.set(MACHINE_TYPE, MACHINE_M68K);
- break;
- case 5:
- metadata.set(MACHINE_TYPE, MACHINE_M88K);
- break;
- case 8:
- case 10:
- metadata.set(MACHINE_TYPE, MACHINE_MIPS);
- break;
- case 7:
- metadata.set(MACHINE_TYPE, MACHINE_S370);
- break;
- case 20:
- case 21:
- metadata.set(MACHINE_TYPE, MACHINE_PPC);
- break;
- case 22:
- metadata.set(MACHINE_TYPE, MACHINE_S390);
- break;
- case 40:
- metadata.set(MACHINE_TYPE, MACHINE_ARM);
- break;
- case 41:
- case 0x9026:
- metadata.set(MACHINE_TYPE, MACHINE_ALPHA);
- break;
- case 50:
- metadata.set(MACHINE_TYPE, MACHINE_IA_64);
- break;
- case 62:
- metadata.set(MACHINE_TYPE, MACHINE_x86_64);
- break;
- case 75:
- metadata.set(MACHINE_TYPE, MACHINE_VAX);
- break;
- case 88:
- metadata.set(MACHINE_TYPE, MACHINE_M32R);
- break;
- }
-
-
-
- // Bytes 20-23 are the version
- // TODO
+ public void parseELF(XHTMLContentHandler xhtml, Metadata metadata, InputStream stream,
+ byte[] first4) throws TikaException, IOException {
+ // Byte 5 is the architecture
+ int architecture = stream.read();
+ if (architecture == 1) {
+ metadata.set(ARCHITECTURE_BITS, "32");
+ } else if (architecture == 2) {
+ metadata.set(ARCHITECTURE_BITS, "64");
+ }
+
+ // Byte 6 is the endian-ness
+ int endian = stream.read();
+ if (endian == 1) {
+ metadata.set(ENDIAN, Endian.LITTLE.getName());
+ } else if (endian == 2) {
+ metadata.set(ENDIAN, Endian.BIG.getName());
+ }
+
+ // Byte 7 is the elf version
+ int elfVer = stream.read();
+
+ // Byte 8 is the OS, if set (lots of compilers don't)
+ // Byte 9 is the OS (specific) ABI version
+ int os = stream.read();
+ int osVer = stream.read();
+ if (os > 0 || osVer > 0) {
+ switch (os) {
+ case 0:
+ metadata.set(PLATFORM, PLATFORM_SYSV);
+ break;
+
+ case 1:
+ metadata.set(PLATFORM, PLATFORM_HPUX);
+ break;
+
+ case 2:
+ metadata.set(PLATFORM, PLATFORM_NETBSD);
+ break;
+
+ case 3:
+ metadata.set(PLATFORM, PLATFORM_LINUX);
+ break;
+
+ case 6:
+ metadata.set(PLATFORM, PLATFORM_SOLARIS);
+ break;
+
+ case 7:
+ metadata.set(PLATFORM, PLATFORM_AIX);
+ break;
+
+ case 8:
+ metadata.set(PLATFORM, PLATFORM_IRIX);
+ break;
+
+ case 9:
+ metadata.set(PLATFORM, PLATFORM_FREEBSD);
+ break;
+
+ case 10:
+ metadata.set(PLATFORM, PLATFORM_TRU64);
+ break;
+
+ case 12:
+ metadata.set(PLATFORM, PLATFORM_FREEBSD);
+ break;
+
+ case 64:
+ case 97:
+ metadata.set(PLATFORM, PLATFORM_ARM);
+ break;
+
+ case 255:
+ metadata.set(PLATFORM, PLATFORM_EMBEDDED);
+ break;
+ }
+ }
+
+ // Bytes 10-16 are padding and lengths
+ byte[] padLength = new byte[7];
+ IOUtils.readFully(stream, padLength);
+
+ // Bytes 16-17 are the object type (LE/BE)
+ int type;
+ if (endian == 1) {
+ type = EndianUtils.readUShortLE(stream);
+ } else {
+ type = EndianUtils.readUShortBE(stream);
+ }
+ switch (type) {
+ case 1:
+ metadata.set(Metadata.CONTENT_TYPE, ELF_OBJECT.toString());
+ break;
+
+ case 2:
+ metadata.set(Metadata.CONTENT_TYPE, ELF_EXECUTABLE.toString());
+ break;
+
+ case 3:
+ metadata.set(Metadata.CONTENT_TYPE, ELF_SHAREDLIB.toString());
+ break;
+
+ case 4:
+ metadata.set(Metadata.CONTENT_TYPE, ELF_COREDUMP.toString());
+ break;
+
+ default:
+ metadata.set(Metadata.CONTENT_TYPE, ELF_GENERAL.toString());
+ break;
+ }
+
+ // Bytes 18-19 are the machine (EM_*)
+ int machine;
+ if (endian == 1) {
+ machine = EndianUtils.readUShortLE(stream);
+ } else {
+ machine = EndianUtils.readUShortBE(stream);
+ }
+ switch (machine) {
+ case 2:
+ case 18:
+ case 43:
+ metadata.set(MACHINE_TYPE, MACHINE_SPARC);
+ break;
+ case 3:
+ metadata.set(MACHINE_TYPE, MACHINE_x86_32);
+ break;
+ case 4:
+ metadata.set(MACHINE_TYPE, MACHINE_M68K);
+ break;
+ case 5:
+ metadata.set(MACHINE_TYPE, MACHINE_M88K);
+ break;
+ case 8:
+ case 10:
+ metadata.set(MACHINE_TYPE, MACHINE_MIPS);
+ break;
+ case 7:
+ metadata.set(MACHINE_TYPE, MACHINE_S370);
+ break;
+ case 20:
+ case 21:
+ metadata.set(MACHINE_TYPE, MACHINE_PPC);
+ break;
+ case 22:
+ metadata.set(MACHINE_TYPE, MACHINE_S390);
+ break;
+ case 40:
+ metadata.set(MACHINE_TYPE, MACHINE_ARM);
+ break;
+ case 41:
+ case 0x9026:
+ metadata.set(MACHINE_TYPE, MACHINE_ALPHA);
+ break;
+ case 50:
+ metadata.set(MACHINE_TYPE, MACHINE_IA_64);
+ break;
+ case 62:
+ metadata.set(MACHINE_TYPE, MACHINE_x86_64);
+ break;
+ case 75:
+ metadata.set(MACHINE_TYPE, MACHINE_VAX);
+ break;
+ case 88:
+ metadata.set(MACHINE_TYPE, MACHINE_M32R);
+ break;
+ }
+
+
+ // Bytes 20-23 are the version
+ // TODO
}
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/mat/MatParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/mat/MatParser.java
index 2db8bef..fef6959 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/mat/MatParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/mat/MatParser.java
@@ -17,6 +17,7 @@
package org.apache.tika.parser.mat;
//JDK imports
+
import static java.nio.charset.StandardCharsets.UTF_8;
import java.io.IOException;
@@ -29,6 +30,9 @@ import com.jmatio.io.MatFileHeader;
import com.jmatio.io.MatFileReader;
import com.jmatio.types.MLArray;
import com.jmatio.types.MLStructure;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
@@ -37,37 +41,33 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
//JMatIO imports
public class MatParser extends AbstractParser {
+ public static final String MATLAB_MIME_TYPE = "application/x-matlab-data";
+
static {
//make sure that this is set to false
MatFileReader.setAllowObjectDeserialization(false);
}
- public static final String MATLAB_MIME_TYPE =
- "application/x-matlab-data";
-
private final Set<MediaType> SUPPORTED_TYPES =
Collections.singleton(MediaType.application("x-matlab-data"));
- public Set<MediaType> getSupportedTypes(ParseContext context){
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
//Set MIME type as Matlab
metadata.set(Metadata.CONTENT_TYPE, MATLAB_MIME_TYPE);
TemporaryResources tmp =
- TikaInputStream.isTikaInputStream(stream) ? null :
- new TemporaryResources();
+ TikaInputStream.isTikaInputStream(stream) ? null : new TemporaryResources();
try {
// Use TIS so we can spool a temp file for parsing.
TikaInputStream tis = TikaInputStream.get(stream, tmp);
@@ -77,8 +77,10 @@ public class MatParser extends AbstractParser {
MatFileHeader hdr = mfr.getMatFileHeader(); //.mat header information
- // Example header: "MATLAB 5.0 MAT-file, Platform: MACI64, Created on: Sun Mar 2 23:41:57 2014"
- String[] parts = hdr.getDescription().split(","); // Break header information into its parts
+ // Example header: "MATLAB 5.0 MAT-file, Platform: MACI64, Created on: Sun Mar 2
+ // 23:41:57 2014"
+ String[] parts =
+ hdr.getDescription().split(","); // Break header information into its parts
if (parts[2].contains("Created")) {
int lastIndex1 = parts[2].lastIndexOf("Created on:");
@@ -89,7 +91,7 @@ public class MatParser extends AbstractParser {
if (parts[1].contains("Platform")) {
int lastIndex2 = parts[1].lastIndexOf("Platform:");
String platform = parts[1].substring(lastIndex2 + "Platform:".length()).trim();
- metadata.set("platform" , platform);
+ metadata.set("platform", platform);
}
if (parts[0].contains("MATLAB")) {
@@ -97,11 +99,13 @@ public class MatParser extends AbstractParser {
}
// Get endian indicator from header file
- String endianBytes = new String(hdr.getEndianIndicator(), UTF_8); // Retrieve endian bytes and convert to string
- String endianCode = String.valueOf(endianBytes.toCharArray()); // Convert bytes to characters to string
+ String endianBytes = new String(hdr.getEndianIndicator(),
+ UTF_8); // Retrieve endian bytes and convert to string
+ String endianCode = String.valueOf(
+ endianBytes.toCharArray()); // Convert bytes to characters to string
metadata.set("endian", endianCode);
- //Text output
+ //Text output
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
xhtml.newline();
@@ -113,16 +117,16 @@ public class MatParser extends AbstractParser {
xhtml.element("p", varName + ":" + String.valueOf(varData));
// If the variable is a structure, extract variable info from structure
- if (varData.isStruct()){
+ if (varData.isStruct()) {
MLStructure mlStructure = (MLStructure) mfr.getMLArray(varName);
xhtml.startElement("ul");
xhtml.newline();
- for (MLArray element : mlStructure.getAllFields()){
+ for (MLArray element : mlStructure.getAllFields()) {
xhtml.startElement("li");
xhtml.characters(String.valueOf(element));
// If there is an embedded structure, extract variable info.
- if (element.isStruct()){
+ if (element.isStruct()) {
xhtml.startElement("ul");
// Should this actually be a recursive call?
xhtml.element("li", element.contentToString());
@@ -143,4 +147,4 @@ public class MatParser extends AbstractParser {
}
}
}
-}
\ No newline at end of file
+}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java
index a065358..4740d6c 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java
@@ -24,6 +24,14 @@ import java.util.HashMap;
import java.util.Map;
import java.util.Set;
+import com.epam.parso.Column;
+import com.epam.parso.DataWriterUtil;
+import com.epam.parso.SasFileProperties;
+import com.epam.parso.SasFileReader;
+import com.epam.parso.impl.SasFileReaderImpl;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Database;
import org.apache.tika.metadata.HttpHeaders;
@@ -36,26 +44,16 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import com.epam.parso.Column;
-import com.epam.parso.DataWriterUtil;
-import com.epam.parso.SasFileProperties;
-import com.epam.parso.SasFileReader;
-import com.epam.parso.impl.SasFileReaderImpl;
/**
- * Processes the SAS7BDAT data columnar database file used by SAS and
- * other similar languages.
+ * Processes the SAS7BDAT data columnar database file used by SAS and
+ * other similar languages.
*/
public class SAS7BDATParser extends AbstractParser {
private static final long serialVersionUID = -2775485539937983150L;
-
- private static final MediaType TYPE_SAS7BDAT =
- MediaType.application("x-sas-data");
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.singleton(TYPE_SAS7BDAT);
+
+ private static final MediaType TYPE_SAS7BDAT = MediaType.application("x-sas-data");
+ private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(TYPE_SAS7BDAT);
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
@@ -63,14 +61,13 @@ public class SAS7BDATParser extends AbstractParser {
}
@Override
- public void parse(InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
metadata.set(Metadata.CONTENT_TYPE, TYPE_SAS7BDAT.toString());
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
-
+
SasFileReader sas = new SasFileReaderImpl(stream);
SasFileProperties props = sas.getSasFileProperties();
@@ -79,20 +76,19 @@ public class SAS7BDATParser extends AbstractParser {
metadata.set(TikaCoreProperties.CREATED, props.getDateCreated());
metadata.set(TikaCoreProperties.MODIFIED, props.getDateModified());
- metadata.set(PagedText.N_PAGES, (int)props.getPageCount());
- metadata.set(Database.COLUMN_COUNT, (int)props.getColumnsCount());
- metadata.set(Database.ROW_COUNT, (int)props.getRowCount());
+ metadata.set(PagedText.N_PAGES, (int) props.getPageCount());
+ metadata.set(Database.COLUMN_COUNT, (int) props.getColumnsCount());
+ metadata.set(Database.ROW_COUNT, (int) props.getRowCount());
// TODO Can we find more general properties for these / move
// these to more general places?
metadata.set(HttpHeaders.CONTENT_ENCODING, props.getEncoding());
metadata.set(OfficeOpenXMLExtended.APPLICATION, props.getServerType());
metadata.set(OfficeOpenXMLExtended.APP_VERSION, props.getSasRelease());
- metadata.set(MachineMetadata.ARCHITECTURE_BITS,
- props.isU64() ? "64" : "32");
- metadata.set(MachineMetadata.ENDIAN, props.getEndianness() == 1 ?
- MachineMetadata.Endian.LITTLE.getName() :
- MachineMetadata.Endian.BIG.getName());
+ metadata.set(MachineMetadata.ARCHITECTURE_BITS, props.isU64() ? "64" : "32");
+ metadata.set(MachineMetadata.ENDIAN,
+ props.getEndianness() == 1 ? MachineMetadata.Endian.LITTLE.getName() :
+ MachineMetadata.Endian.BIG.getName());
// The following SAS Metadata fields are currently ignored:
// compressionMethod
@@ -109,7 +105,9 @@ public class SAS7BDATParser extends AbstractParser {
// TODO Find keys to record the format and the type
for (Column c : sas.getColumns()) {
String name = c.getLabel();
- if (name == null || name.isEmpty()) name = c.getName();
+ if (name == null || name.isEmpty()) {
+ name = c.getName();
+ }
metadata.add(Database.COLUMN_NAME, name);
}
@@ -118,12 +116,14 @@ public class SAS7BDATParser extends AbstractParser {
xhtml.element("h1", props.getName());
xhtml.startElement("table");
xhtml.newline();
-
+
// Do the column headings
xhtml.startElement("tr");
for (Column c : sas.getColumns()) {
String label = c.getLabel();
- if (label == null || label.isEmpty()) label = c.getName();
+ if (label == null || label.isEmpty()) {
+ label = c.getName();
+ }
xhtml.startElement("th", "title", c.getName());
xhtml.characters(label);
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java
index 5e08749..f255199 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java
@@ -18,10 +18,12 @@ package org.apache.tika.parser.asm;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
+
+import org.junit.Test;
+
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.junit.Test;
/**
* Test case for parsing Java class files.
@@ -33,24 +35,18 @@ public class ClassParserTest extends TikaTest {
Metadata metadata = new Metadata();
String content = getText("AutoDetectParser.class", metadata);
assertEquals("AutoDetectParser", metadata.get(TikaCoreProperties.TITLE));
- assertEquals(
- "AutoDetectParser.class",
- metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY));
+ assertEquals("AutoDetectParser.class", metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY));
assertTrue(content.contains("package org.apache.tika.parser;"));
+ assertTrue(content.contains("class AutoDetectParser extends CompositeParser"));
+ assertTrue(content.contains("private org.apache.tika.mime.MimeTypes types"));
assertTrue(content.contains(
- "class AutoDetectParser extends CompositeParser"));
- assertTrue(content.contains(
- "private org.apache.tika.mime.MimeTypes types"));
- assertTrue(content.contains(
- "public void parse("
- + "java.io.InputStream, org.xml.sax.ContentHandler,"
- + " org.apache.tika.metadata.Metadata) throws"
- + " java.io.IOException, org.xml.sax.SAXException,"
- + " org.apache.tika.exception.TikaException;"));
- assertTrue(content.contains(
- "private byte[] getPrefix(java.io.InputStream, int)"
- + " throws java.io.IOException;"));
+ "public void parse(" + "java.io.InputStream, org.xml.sax.ContentHandler," +
+ " org.apache.tika.metadata.Metadata) throws" +
+ " java.io.IOException, org.xml.sax.SAXException," +
+ " org.apache.tika.exception.TikaException;"));
+ assertTrue(content.contains("private byte[] getPrefix(java.io.InputStream, int)" +
+ " throws java.io.IOException;"));
}
@Test
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
index 9d3a97e..8b80cd8 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
@@ -16,6 +16,17 @@
*/
package org.apache.tika.parser.code;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.util.Set;
+
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
import org.apache.tika.TikaTest;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -23,16 +34,6 @@ import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.ToTextContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-import java.io.ByteArrayInputStream;
-import java.util.Set;
... 63883 lines suppressed ...