You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2012/05/13 20:47:20 UTC
svn commit: r1337962 - in /tika/trunk:
tika-core/src/main/resources/org/apache/tika/mime/
tika-parsers/src/main/java/org/apache/tika/parser/executable/
tika-parsers/src/main/resources/META-INF/services/
tika-parsers/src/test/java/org/apache/tika/parser...
Author: nick
Date: Sun May 13 18:47:19 2012
New Revision: 1337962
URL: http://svn.apache.org/viewvc?rev=1337962&view=rev
Log:
TIKA-917 Start on a parser for PE and ELF executables, to output metadata
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/executable/
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/executable/
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/executable/ExecutableParserTest.java
Modified:
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1337962&r1=1337961&r2=1337962&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Sun May 13 18:47:19 2012
@@ -2450,7 +2450,8 @@
<sub-class-of type="application/x-elf"/>
<magic priority="50">
<match value="\177ELF" type="string" offset="0">
- <match value="0x01" type="string" offset="16"/>
+ <match value="0x0100" type="string" offset="16"/>
+ <match value="0x0001" type="string" offset="16"/>
</match>
</magic>
</mime-type>
@@ -2458,7 +2459,8 @@
<sub-class-of type="application/x-elf"/>
<magic priority="50">
<match value="\177ELF" type="string" offset="0">
- <match value="0x02" type="string" offset="16"/>
+ <match value="0x0200" type="string" offset="16"/>
+ <match value="0x0002" type="string" offset="16"/>
</match>
</magic>
</mime-type>
@@ -2466,7 +2468,8 @@
<sub-class-of type="application/x-elf"/>
<magic priority="50">
<match value="\177ELF" type="string" offset="0">
- <match value="0x03" type="string" offset="16"/>
+ <match value="0x0300" type="string" offset="16"/>
+ <match value="0x0003" type="string" offset="16"/>
</match>
</magic>
</mime-type>
@@ -2474,7 +2477,8 @@
<sub-class-of type="application/x-elf"/>
<magic priority="50">
<match value="\177ELF" type="string" offset="0">
- <match value="0x04" type="string" offset="16"/>
+ <match value="0x0400" type="string" offset="16"/>
+ <match value="0x0004" type="string" offset="16"/>
</match>
</magic>
</mime-type>
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java?rev=1337962&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java Sun May 13 18:47:19 2012
@@ -0,0 +1,268 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.executable;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.sql.Date;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.poi.util.IOUtils;
+import org.apache.poi.util.LittleEndian;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.EndianUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for executable files. Currently supports ELF and PE.
+ * <p>
+ * No text content is extracted yet; the parser only inspects the file
+ * headers and records the content type, machine type, word size
+ * (architecture) and byte order as metadata. For PE files the COFF
+ * header timestamp is also recorded as the creation date.
+ */
+public class ExecutableParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 32128791892482L;
+
+    // TODO Put these somewhere more general
+    public static final String MACHINE_x86_32 = "x86-32";
+    public static final String MACHINE_x86_64 = "x86-64";
+    public static final String MACHINE_IA_64 = "IA-64";
+    public static final String MACHINE_UNKNOWN = "Unknown";
+    // TODO The rest
+
+    /**
+     * The machine (CPU) type the executable targets. Every value that the
+     * PE and ELF parsers below can set is listed as a permitted choice.
+     */
+    public static final Property MACHINE_TYPE = Property.internalClosedChoise("machine",
+            new String[] { MACHINE_x86_32, MACHINE_x86_64, MACHINE_IA_64, MACHINE_UNKNOWN });
+
+    /** The word size of the executable, "32" or "64". */
+    public static final Property ARCHITECTURE = Property.internalClosedChoise("architecture",
+            new String[] { "32", "64" });
+
+    /**
+     * The two byte orders an executable may declare, with a display name
+     * ("Little" / "Big") and a flag saying whether it is most-significant
+     * byte first.
+     */
+    public static final class Endian {
+        private final String name;
+        private final boolean msb;
+
+        private Endian(String name, boolean msb) {
+            this.name = name;
+            this.msb = msb;
+        }
+
+        public String getName() { return name; }
+        public boolean isMSB() { return msb; }
+        public String getMSB() { return msb ? "MSB" : "LSB"; }
+
+        public static final Endian LITTLE_ENDIAN = new Endian("Little", false);
+        public static final Endian BIG_ENDIAN = new Endian("Big", true);
+    }
+
+    /** The byte order of the executable, "Little" or "Big". */
+    public static final Property ENDIAN = Property.internalClosedChoise("endian",
+            new String[] { Endian.LITTLE_ENDIAN.name, Endian.BIG_ENDIAN.name });
+
+    private static final MediaType PE_EXE = MediaType.application("x-msdownload");
+    private static final MediaType ELF_GENERAL = MediaType.application("x-elf");
+    private static final MediaType ELF_OBJECT = MediaType.application("x-object");
+    private static final MediaType ELF_EXECUTABLE = MediaType.application("x-executable");
+    private static final MediaType ELF_SHAREDLIB = MediaType.application("x-sharedlib");
+    private static final MediaType ELF_COREDUMP = MediaType.application("x-coredump");
+    private static final Set<MediaType> SUPPORTED_TYPES =
+        Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                PE_EXE,
+                ELF_GENERAL,
+                ELF_OBJECT, ELF_EXECUTABLE, ELF_SHAREDLIB, ELF_COREDUMP
+        )));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Reads the first four bytes of the stream to decide between PE ("MZ")
+     * and ELF (0x7f "ELF"), then hands over to the matching header parser.
+     * Streams matching neither signature produce no metadata.
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // We only do metadata, for now
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
+        // What kind is it?
+        byte[] first4 = new byte[4];
+        IOUtils.readFully(stream, first4);
+
+        if (first4[0] == (byte)'M' && first4[1] == (byte)'Z') {
+            parsePE(xhtml, metadata, stream, first4);
+        } else if (first4[0] == (byte)0x7f && first4[1] == (byte)'E' &&
+                   first4[2] == (byte)'L' && first4[3] == (byte)'F') {
+            parseELF(xhtml, metadata, stream, first4);
+        }
+
+        // Finish everything
+        xhtml.endDocument();
+    }
+
+    /**
+     * Parses a DOS or Windows PE file.
+     * <p>
+     * The stream must be positioned just after the first four bytes, which
+     * are passed in as <code>first4</code>. If no valid PE signature is found
+     * at the offset given by the MS-DOS header, only the content type is set.
+     */
+    public void parsePE(XHTMLContentHandler xhtml, Metadata metadata,
+            InputStream stream, byte[] first4) throws TikaException, IOException {
+        // Use set rather than add, so that a content type already supplied
+        // by a detector is replaced instead of becoming multi-valued
+        metadata.set(Metadata.CONTENT_TYPE, PE_EXE.toString());
+
+        // Skip over the MS-DOS bit, up to the PE header offset field at 0x3c
+        byte[] msdosSection = new byte[0x3c-4];
+        IOUtils.readFully(stream, msdosSection);
+
+        // Grab the PE header offset
+        int peOffset = LittleEndian.readInt(stream);
+
+        // Sanity check - while it may go anywhere, it's normally in the first
+        // few kb. We are now at 0x40, so it cannot lie before that either.
+        if (peOffset > 4096 || peOffset < 0x40) return;
+
+        // Skip the rest of the MS-DOS stub (if PE), until we reach what should
+        // be the PE header (if this is a PE executable)
+        skipFully(stream, peOffset - 0x40);
+
+        // Read the PE signature (4 bytes) and the COFF file header (20 bytes)
+        byte[] pe = new byte[24];
+        IOUtils.readFully(stream, pe);
+
+        // Check it really is a PE header
+        boolean hasPESignature =
+            pe[0] == (byte)'P' && pe[1] == (byte)'E' && pe[2] == 0 && pe[3] == 0;
+        if (!hasPESignature) {
+            // Old style MS-DOS
+            return;
+        }
+
+        // Read the header values we use. The remaining header fields are:
+        //  6: number of sections (ushort)   12: symbol table offset (uint)
+        // 16: number of symbols (uint)      20: optional header size (ushort)
+        // 22: characteristics (ushort)
+        int machine = LittleEndian.getUShort(pe, 4);
+        // The timestamp is an unsigned 32 bit count of seconds since 1970,
+        // so mask off the sign extension
+        long createdAt = LittleEndian.getInt(pe, 8) & 0xFFFFFFFFL;
+
+        // Turn this into helpful metadata
+        Date createdAtD = new Date(createdAt * 1000L);
+        metadata.set(Metadata.CREATION_DATE, createdAtD);
+
+        switch(machine) {
+            case 0x14c:  // Intel 386 and later
+                metadata.set(MACHINE_TYPE, MACHINE_x86_32);
+                metadata.set(ENDIAN, Endian.LITTLE_ENDIAN.name);
+                metadata.set(ARCHITECTURE, "32");
+                break;
+
+            case 0x8664: // AMD64 / x64
+                metadata.set(MACHINE_TYPE, MACHINE_x86_64);
+                metadata.set(ENDIAN, Endian.LITTLE_ENDIAN.name);
+                metadata.set(ARCHITECTURE, "64");
+                break;
+
+            case 0x200:  // Intel Itanium
+                metadata.set(MACHINE_TYPE, MACHINE_IA_64);
+                metadata.set(ENDIAN, Endian.LITTLE_ENDIAN.name);
+                metadata.set(ARCHITECTURE, "64");
+                break;
+
+            default:
+                metadata.set(MACHINE_TYPE, MACHINE_UNKNOWN);
+                break;
+        }
+    }
+
+    /**
+     * Parses a Unix ELF file.
+     * <p>
+     * The stream must be positioned just after the four byte ELF magic,
+     * which is passed in as <code>first4</code>.
+     */
+    public void parseELF(XHTMLContentHandler xhtml, Metadata metadata,
+            InputStream stream, byte[] first4) throws TikaException, IOException {
+        // Byte 5 is the architecture
+        int architecture = stream.read();
+        if (architecture == 1) {
+            metadata.set(ARCHITECTURE, "32");
+        } else if (architecture == 2) {
+            metadata.set(ARCHITECTURE, "64");
+        }
+
+        // Byte 6 is the endian-ness
+        int endian = stream.read();
+        if (endian == 1) {
+            metadata.set(ENDIAN, Endian.LITTLE_ENDIAN.name);
+        } else if (endian == 2) {
+            metadata.set(ENDIAN, Endian.BIG_ENDIAN.name);
+        }
+
+        // Byte 7 is the elf version
+        // Byte 8 is the OS
+        // Byte 9 is the OS (specific) ABI version
+        // None of these are reported yet, but they must still be consumed
+        stream.read();
+        stream.read();
+        stream.read();
+
+        // Bytes 10-16 are padding and lengths
+        byte[] padLength = new byte[7];
+        IOUtils.readFully(stream, padLength);
+
+        // Bytes 16-17 are the object type (LE/BE)
+        // Anything other than an explicit little endian marker is read
+        // as big endian
+        boolean littleEndian = (endian == 1);
+        int type = littleEndian
+            ? EndianUtils.readUShortLE(stream)
+            : EndianUtils.readUShortBE(stream);
+        switch(type) {
+            case 1:
+                metadata.add(Metadata.CONTENT_TYPE, ELF_OBJECT.toString());
+                break;
+
+            case 2:
+                metadata.add(Metadata.CONTENT_TYPE, ELF_EXECUTABLE.toString());
+                break;
+
+            case 3:
+                metadata.add(Metadata.CONTENT_TYPE, ELF_SHAREDLIB.toString());
+                break;
+
+            case 4:
+                metadata.add(Metadata.CONTENT_TYPE, ELF_COREDUMP.toString());
+                break;
+
+            default:
+                metadata.add(Metadata.CONTENT_TYPE, ELF_GENERAL.toString());
+                break;
+        }
+
+        // Bytes 18-19 are the machine (EM_*)
+        int machine = littleEndian
+            ? EndianUtils.readUShortLE(stream)
+            : EndianUtils.readUShortBE(stream);
+        switch(machine) {
+            case 3:
+                metadata.set(MACHINE_TYPE, MACHINE_x86_32);
+                break;
+            case 50:
+                metadata.set(MACHINE_TYPE, MACHINE_IA_64);
+                break;
+            case 62:
+                metadata.set(MACHINE_TYPE, MACHINE_x86_64);
+                break;
+        }
+
+        // Bytes 20-23 are the version
+        // TODO
+    }
+
+    /**
+     * Skips the given number of bytes, looping because a single call to
+     * {@link InputStream#skip(long)} may skip fewer bytes than requested.
+     * Stops quietly if the end of the stream is reached first.
+     */
+    private static void skipFully(InputStream stream, long count) throws IOException {
+        long remaining = count;
+        while (remaining > 0) {
+            long skipped = stream.skip(remaining);
+            if (skipped > 0) {
+                remaining -= skipped;
+            } else if (stream.read() == -1) {
+                // End of stream, nothing more to skip
+                return;
+            } else {
+                // skip() made no progress, but a single byte could be read
+                remaining--;
+            }
+        }
+    }
+}
Modified: tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1337962&r1=1337961&r2=1337962&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (original)
+++ tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Sun May 13 18:47:19 2012
@@ -18,6 +18,7 @@ org.apache.tika.parser.audio.AudioParser
org.apache.tika.parser.audio.MidiParser
org.apache.tika.parser.dwg.DWGParser
org.apache.tika.parser.epub.EpubParser
+org.apache.tika.parser.executable.ExecutableParser
org.apache.tika.parser.feed.FeedParser
org.apache.tika.parser.font.AdobeFontMetricParser
org.apache.tika.parser.font.TrueTypeParser
Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/executable/ExecutableParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/executable/ExecutableParserTest.java?rev=1337962&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/executable/ExecutableParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/executable/ExecutableParserTest.java Sun May 13 18:47:19 2012
@@ -0,0 +1,82 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.executable;
+
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Checks the metadata that {@link ExecutableParser} reports for the
+ * sample Windows PE and Linux ELF binaries in the test documents.
+ */
+public class ExecutableParserTest extends TestCase {
+
+    /**
+     * Runs the parser over the named classpath resource, filling in the
+     * supplied metadata, and returns the body text that was produced.
+     * The resource stream is always closed afterwards.
+     */
+    private static String parseResource(String resource, Metadata metadata)
+            throws Exception {
+        InputStream input = ExecutableParserTest.class.getResourceAsStream(resource);
+        try {
+            ContentHandler handler = new BodyContentHandler();
+            new ExecutableParser().parse(input, handler, metadata, new ParseContext());
+            return handler.toString();
+        } finally {
+            input.close();
+        }
+    }
+
+    /** A 32 bit x86 Windows executable should give PE metadata. */
+    public void testWin32Parser() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = parseResource(
+                "/test-documents/testWindows-x86-32.exe", metadata);
+
+        assertEquals("application/x-msdownload",
+                metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("2012-05-13T13:40:11Z",
+                metadata.get(Metadata.CREATION_DATE));
+
+        assertEquals(ExecutableParser.MACHINE_x86_32,
+                metadata.get(ExecutableParser.MACHINE_TYPE));
+        assertEquals("Little",
+                metadata.get(ExecutableParser.ENDIAN));
+        assertEquals("32",
+                metadata.get(ExecutableParser.ARCHITECTURE));
+
+        assertEquals("", content); // No text yet
+    }
+
+    /** A 32 bit x86 Linux executable should give ELF metadata. */
+    public void testElfParser_x86_32() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = parseResource(
+                "/test-documents/testLinux-x86-32", metadata);
+
+        assertEquals("application/x-executable",
+                metadata.get(Metadata.CONTENT_TYPE));
+
+        assertEquals(ExecutableParser.MACHINE_x86_32,
+                metadata.get(ExecutableParser.MACHINE_TYPE));
+        assertEquals("Little",
+                metadata.get(ExecutableParser.ENDIAN));
+        assertEquals("32",
+                metadata.get(ExecutableParser.ARCHITECTURE));
+
+        assertEquals("", content); // No text yet
+    }
+
+}