You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/27 13:28:42 UTC
[1/2] tika git commit: TIKA-2022 -- add applefile parser
Repository: tika
Updated Branches:
refs/heads/master 2031de70c -> 0f3b0bdb5
TIKA-2022 -- add applefile parser
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/47221b90
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/47221b90
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/47221b90
Branch: refs/heads/master
Commit: 47221b90624eb1bba990a1930cb4163489883d8b
Parents: 2031de7
Author: tballison <ta...@mitre.org>
Authored: Mon Jun 27 09:27:29 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Mon Jun 27 09:27:29 2016 -0400
----------------------------------------------------------------------
CHANGES.txt | 2 ++
1 file changed, 2 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/47221b90/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 1a46467..7e00048 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
Release 1.14 - ???
+ * Add parser for applefile (AppleSingle) (TIKA-2022)
+
* Add mime types, mime magic and/or globs for:
* Endnote Import File (TIKA-2011)
* DJVU files (TIKA-2009)
[2/2] tika git commit: TIKA-2022 -- add applefile parser
Posted by ta...@apache.org.
TIKA-2022 -- add applefile parser
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/0f3b0bdb
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/0f3b0bdb
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/0f3b0bdb
Branch: refs/heads/master
Commit: 0f3b0bdb5b78177e9f0fca88f889e7919823c177
Parents: 47221b9
Author: tballison <ta...@mitre.org>
Authored: Mon Jun 27 09:28:07 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Mon Jun 27 09:28:07 2016 -0400
----------------------------------------------------------------------
.../parser/apple/AppleSingleFileParser.java | 205 +++++++++++++++++++
.../services/org.apache.tika.parser.Parser | 1 +
.../parser/apple/AppleSingleFileParserTest.java | 46 +++++
.../test-documents/testAppleSingleFile.pdf | Bin 0 -> 54926 bytes
4 files changed, 252 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/0f3b0bdb/tika-parsers/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
new file mode 100644
index 0000000..789629e
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
@@ -0,0 +1,205 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.apple;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.CloseShieldInputStream;
+import org.apache.tika.io.EndianUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser that strips the header off of AppleSingle and AppleDouble
+ * files.
+ * <p>
+ * The parser reads the fixed-size header and the list of entry descriptors,
+ * walks the stream forward to the data fork (entry id {@link #DATA_FORK})
+ * and hands the remainder of the stream to the
+ * {@link EmbeddedDocumentExtractor} as a single embedded document.
+ * If there is no data fork, an empty XHTML document is emitted.
+ * <p>
+ * See <a href="http://kaiser-edv.de/documents/AppleSingle_AppleDouble.pdf">spec document</a>.
+ */
+public class AppleSingleFileParser extends AbstractParser {
+
+    /**
+     * Entry types
+     */
+    public static final int DATA_FORK = 1;
+    public static final int RESOURCE_FORK = 2;
+    public static final int REAL_NAME = 3;
+    public static final int COMMENT = 4;
+    public static final int ICON_BW = 5;
+    public static final int ICON_COLOR = 6;
+    //7?! -- no constant is defined for entry id 7
+    public static final int FILE_DATES_INFO = 8;
+    public static final int FINDER_INFO = 9;
+    public static final int MACINTOSH_FILE_INFO = 10;
+    public static final int PRODOS_FILE_INFO = 11;
+    public static final int MSDOS_FILE_INFO = 12;
+    public static final int SHORT_NAME = 13;
+    public static final int AFP_FILE_INFO = 14;
+    public static final int DIRECTORY_ID = 15;
+
+    /** Bytes consumed by the fixed header: 4 (magic) + 4 (version) + 16 (filler) + 2 (entry count). */
+    private static final int HEADER_LENGTH = 26;
+
+    /** Bytes consumed by one entry descriptor: three 32-bit values (id, offset, length). */
+    private static final int ENTRY_DESCRIPTOR_LENGTH = 12;
+
+    /** The only header version this parser accepts. */
+    private static final int EXPECTED_VERSION = 0x00020000;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.application("applefile"));
+
+    /**
+     * @param context parse context (not consulted)
+     * @return the single supported type, {@code application/applefile}
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Reads the header and entry descriptors, consumes everything that
+     * precedes the data fork and passes the data fork to the embedded
+     * document extractor found in {@code context} (or to a new
+     * {@link ParsingEmbeddedDocumentExtractor} if none is registered).
+     *
+     * @param stream   the stream to parse; it is only read forward and is not closed here
+     * @param handler  receives the XHTML events
+     * @param metadata metadata of the container document
+     * @param context  parse context, consulted for an {@link EmbeddedDocumentExtractor}
+     * @throws IOException   if the stream cannot be read or ends prematurely
+     * @throws SAXException  if the handler fails
+     * @throws TikaException if the version is unexpected, there are no entries,
+     *                       or an entry offset lies before the current stream position
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+                      Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
+        if (ex == null) {
+            ex = new ParsingEmbeddedDocumentExtractor(context);
+        }
+
+        int numEntries = readThroughNumEntries(stream);
+        long bytesRead = HEADER_LENGTH;
+        List<FieldInfo> fieldInfoList = getSortedFieldInfoList(stream, numEntries);
+        bytesRead += (long) ENTRY_DESCRIPTOR_LENGTH * numEntries;
+
+        Metadata embeddedMetadata = new Metadata();
+        bytesRead = processFieldEntries(stream, fieldInfoList, embeddedMetadata, bytesRead);
+        FieldInfo contentFieldInfo = getContentFieldInfo(fieldInfoList);
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        if (contentFieldInfo != null) {
+            skipTo(stream, contentFieldInfo.offset, bytesRead);
+            if (ex.shouldParseEmbedded(embeddedMetadata)) {
+                // TODO: we should probably add a readlimiting wrapper around this
+                // stream to ensure that not more than contentFieldInfo.length bytes
+                // are read
+                ex.parseEmbedded(new CloseShieldInputStream(stream),
+                        xhtml, embeddedMetadata, false);
+            }
+        }
+        xhtml.endDocument();
+    }
+
+    /**
+     * @return the first entry (in offset order) whose id is {@link #DATA_FORK},
+     *         or {@code null} if there is none
+     */
+    private FieldInfo getContentFieldInfo(List<FieldInfo> fieldInfoList) {
+        for (FieldInfo fieldInfo : fieldInfoList) {
+            if (fieldInfo.entryId == DATA_FORK) {
+                return fieldInfo;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Consumes the entries that precede the data fork. Processing stops as
+     * soon as the stream is positioned at the start of the data fork: the
+     * stream can only move forward, so reading past it would make the data
+     * fork unreachable.
+     *
+     * @param stream           stream positioned right after the entry descriptors
+     * @param fieldInfoList    entries sorted by offset
+     * @param embeddedMetadata metadata for the embedded document; currently not
+     *                         written to (see the TODO on the real name entry)
+     * @param bytesRead        number of bytes consumed from the stream so far
+     * @return the number of bytes consumed from the stream after processing
+     * @throws IOException   if the stream ends prematurely
+     * @throws TikaException if an entry starts before the current position or
+     *                       the real name is longer than {@link Integer#MAX_VALUE}
+     */
+    private long processFieldEntries(InputStream stream, List<FieldInfo> fieldInfoList,
+                                     Metadata embeddedMetadata, long bytesRead)
+            throws IOException, TikaException {
+        for (FieldInfo f : fieldInfoList) {
+            //just in case there is a gap between entries
+            skipTo(stream, f.offset, bytesRead);
+            bytesRead = f.offset;
+            if (f.entryId == DATA_FORK) {
+                //leave the stream at the first byte of the content
+                break;
+            }
+            if (f.entryId == REAL_NAME) {
+                if (f.length > Integer.MAX_VALUE) {
+                    throw new TikaException("File name length can't be > integer max");
+                }
+                byte[] buffer = new byte[(int) f.length];
+                IOUtils.readFully(stream, buffer);
+                //TODO: figure out correct metadata key, then decode buffer
+                //as US-ASCII and store the original file name in embeddedMetadata
+            } else {
+                IOUtils.skipFully(stream, f.length);
+            }
+            bytesRead += f.length;
+        }
+        return bytesRead;
+    }
+
+    /**
+     * Skips forward from the current position to {@code targetOffset}.
+     *
+     * @param stream       the stream to advance
+     * @param targetOffset absolute offset to reach, counted from the start of the file
+     * @param bytesRead    number of bytes consumed from the stream so far
+     * @throws IOException   if the stream ends before the target is reached
+     * @throws TikaException if the target lies before the current position
+     */
+    private static void skipTo(InputStream stream, long targetOffset, long bytesRead)
+            throws IOException, TikaException {
+        long toSkip = targetOffset - bytesRead;
+        if (toSkip < 0) {
+            //IOUtils.skipFully would reject this with an unchecked exception
+            throw new TikaException("AppleSingleFile entry offset (" + targetOffset
+                    + ") is before the current stream position (" + bytesRead + ")");
+        }
+        IOUtils.skipFully(stream, toSkip);
+    }
+
+    /**
+     * Reads {@code numEntries} entry descriptors and returns them ordered by
+     * offset, so that they can be consumed from a forward-only stream.
+     *
+     * @param stream     stream positioned right after the fixed header
+     * @param numEntries number of descriptors to read
+     * @return the descriptors sorted by ascending offset; never empty
+     * @throws IOException   if the stream cannot be read
+     * @throws TikaException if there are no entries
+     */
+    private List<FieldInfo> getSortedFieldInfoList(InputStream stream, int numEntries)
+            throws IOException, TikaException {
+        //this is probably overkill. I'd hope that these were already
+        //in order. This ensures it.
+        List<FieldInfo> fieldInfoList = new ArrayList<>(numEntries);
+        for (int i = 0; i < numEntries; i++) {
+            //convert 32-bit unsigned ints to longs
+            long entryId = EndianUtils.readIntBE(stream) & 0x00000000ffffffffL;
+            long offset = EndianUtils.readIntBE(stream) & 0x00000000ffffffffL;
+            long length = EndianUtils.readIntBE(stream) & 0x00000000ffffffffL;
+            fieldInfoList.add(new FieldInfo(entryId, offset, length));
+        }
+        if (fieldInfoList.isEmpty()) {
+            throw new TikaException("AppleSingleFile missing field info");
+        }
+        //make absolutely sure these are in order!
+        Collections.sort(fieldInfoList, new FieldInfoComparator());
+        return fieldInfoList;
+    }
+
+    /**
+     * Reads through the fixed header until the number of entries.
+     *
+     * @param stream stream positioned at the start of the file
+     * @return the number of entries, read as an unsigned 16-bit value
+     * @throws TikaException if the version is not {@code 0x00020000}
+     * @throws IOException   if the stream cannot be read
+     */
+    private int readThroughNumEntries(InputStream stream) throws TikaException, IOException {
+        //magic number; read but not validated
+        EndianUtils.readIntBE(stream);
+        //version
+        int version = EndianUtils.readIntBE(stream);
+        if (version != EXPECTED_VERSION) {
+            throw new TikaException("Version should have been 0x00020000, but was:"+version);
+        }
+        IOUtils.skipFully(stream, 16);//filler
+        //mask so that a count above Short.MAX_VALUE does not turn negative
+        return EndianUtils.readShortBE(stream) & 0xFFFF;//number of entries
+    }
+
+    /**
+     * One entry descriptor: the entry id plus the offset and length of the
+     * entry's data, each held as an unsigned 32-bit value in a long.
+     */
+    private static final class FieldInfo {
+
+        private final long entryId;
+        private final long offset;
+        private final long length;
+
+        private FieldInfo(long entryId, long offset, long length) {
+            this.entryId = entryId;
+            this.offset = offset;
+            this.length = length;
+        }
+    }
+
+    /**
+     * Orders entry descriptors by ascending offset.
+     */
+    private static class FieldInfoComparator implements Comparator<FieldInfo> {
+
+        @Override
+        public int compare(FieldInfo o1, FieldInfo o2) {
+            return Long.compare(o1.offset, o2.offset);
+        }
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/0f3b0bdb/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index 10a5a7e..6ed2f6c 100644
--- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+org.apache.tika.parser.apple.AppleSingleFileParser
org.apache.tika.parser.asm.ClassParser
org.apache.tika.parser.audio.AudioParser
org.apache.tika.parser.audio.MidiParser
http://git-wip-us.apache.org/repos/asf/tika/blob/0f3b0bdb/tika-parsers/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java
new file mode 100644
index 0000000..5890e7e
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.apple;
+
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.parser.pdf.PDFParser;
+import org.junit.Test;
+
+/**
+ * Tests for {@link AppleSingleFileParser}.
+ */
+public class AppleSingleFileParserTest extends TikaTest {
+
+    /**
+     * Parses the sample file recursively and expects exactly two metadata
+     * entries: the container, handled by {@link AppleSingleFileParser}, and
+     * the embedded document, handled by {@link PDFParser}.
+     */
+    @Test
+    public void testBasic() throws Exception {
+        List<Metadata> list = getRecursiveJson("testAppleSingleFile.pdf");
+        //JUnit expects (expected, actual); the other order garbles the failure message
+        assertEquals(2, list.size());
+        assertContains(AppleSingleFileParser.class.getName(),
+                Arrays.asList(list.get(0).getValues("X-Parsed-By")));
+        assertContains(PDFParser.class.getName(),
+                Arrays.asList(list.get(1).getValues("X-Parsed-By")));
+        assertContains("END OF SORTIE NUMBER TWO", list.get(1).get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertEquals("fltsyllabussortie2rev1.2", list.get(1).get(TikaCoreProperties.TITLE));
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/0f3b0bdb/tika-parsers/src/test/resources/test-documents/testAppleSingleFile.pdf
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testAppleSingleFile.pdf b/tika-parsers/src/test/resources/test-documents/testAppleSingleFile.pdf
new file mode 100644
index 0000000..a385313
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testAppleSingleFile.pdf differ