You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/16 19:23:06 UTC

svn commit: r1725014 [3/28] - in /tika/branches/2.x: tika-parser-bundles/tika-multimedia-bundle/ tika-parser-modules/ tika-parser-modules/tika-advanced-module/ tika-parser-modules/tika-advanced-parser-module/ tika-parser-modules/tika-advanced-parser-mo...

Added: tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,323 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.asm;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.objectweb.asm.AnnotationVisitor;
+import org.objectweb.asm.Attribute;
+import org.objectweb.asm.ClassReader;
+import org.objectweb.asm.ClassVisitor;
+import org.objectweb.asm.FieldVisitor;
+import org.objectweb.asm.MethodVisitor;
+import org.objectweb.asm.Opcodes;
+import org.objectweb.asm.Type;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Class visitor that generates XHTML SAX events to describe the
+ * contents of the visited class.
+ * <p>
+ * The description is written inside a single <code>pre</code> element and
+ * reads like a Java source outline: an optional package statement, the
+ * class/interface/enum header, and one line per non-synthetic field and
+ * method. The simple class name is also stored in the supplied metadata.
+ */
+class XHTMLClassVisitor extends ClassVisitor {
+
+    /** Sink for the generated XHTML events. */
+    private final XHTMLContentHandler xhtml;
+
+    /** Metadata object that receives the title and resource name. */
+    private final Metadata metadata;
+
+    /** Type of the class being visited; set in {@link #visit}. */
+    private Type type;
+
+    /**
+     * Package of the class being visited, or <code>null</code> when the
+     * class name contains no dot (default package).
+     */
+    private String packageName;
+
+    /**
+     * Creates a visitor that writes to the given handler.
+     *
+     * @param handler  receiver of the generated SAX events
+     * @param metadata metadata to be updated with the class name
+     */
+    public XHTMLClassVisitor(ContentHandler handler, Metadata metadata) {
+        super(Opcodes.ASM5);
+        this.xhtml = new XHTMLContentHandler(handler, metadata);
+        this.metadata = metadata;
+    }
+
+    /**
+     * Reads a class file from the given stream and visits it with this
+     * visitor. Method bodies and stack map frames are skipped.
+     *
+     * @param stream class file contents
+     * @throws SAXException  if a visit callback failed with a SAX error
+     * @throws TikaException if any other runtime failure occurred
+     * @throws IOException   if the stream could not be read
+     */
+    public void parse(InputStream stream)
+            throws TikaException, SAXException, IOException {
+        try {
+            ClassReader reader = new ClassReader(stream);
+            reader.accept(this, ClassReader.SKIP_FRAMES | ClassReader.SKIP_CODE);
+        } catch (RuntimeException e) {
+            // The visit callbacks cannot throw checked exceptions, so they
+            // tunnel SAXExceptions through RuntimeException; unwrap them here.
+            if (e.getCause() instanceof SAXException) {
+                throw (SAXException) e.getCause();
+            } else {
+                throw new TikaException("Failed to parse a Java class", e);
+            }
+        }
+    }
+
+    /**
+     * Starts the document, records the class name in the metadata and
+     * writes the package statement and the class header up to the
+     * opening brace.
+     */
+    @Override
+    public void visit(
+            int version, int access, String name, String signature,
+            String superName, String[] interfaces) {
+        type = Type.getObjectType(name);
+
+        String className = type.getClassName();
+        int dot = className.lastIndexOf('.');
+        if (dot != -1) {
+            packageName = className.substring(0, dot);
+            className = className.substring(dot + 1);
+        } else {
+            // Default package: make sure no stale package name is used
+            packageName = null;
+        }
+
+        metadata.set(TikaCoreProperties.TITLE, className);
+        metadata.set(Metadata.RESOURCE_NAME_KEY, className + ".class");
+
+        try {
+            xhtml.startDocument();
+            xhtml.startElement("pre");
+
+            if (packageName != null) {
+                writeKeyword("package");
+                xhtml.characters(" " + packageName + ";\n");
+            }
+
+            writeAccess(access);
+            if (isSet(access, Opcodes.ACC_INTERFACE)) {
+                writeKeyword("interface");
+                writeSpace();
+                writeType(type);
+                writeSpace();
+                // Super-interfaces of an interface are declared with "extends"
+                writeInterfaces("extends", interfaces);
+            } else if (isSet(access, Opcodes.ACC_ENUM)) {
+                writeKeyword("enum");
+                writeSpace();
+                writeType(type);
+                writeSpace();
+            } else {
+                writeKeyword("class");
+                writeSpace();
+                writeType(type);
+                writeSpace();
+                if (superName != null) {
+                    Type superType = Type.getObjectType(superName);
+                    // The implicit java.lang.Object superclass is not shown
+                    if (!superType.getClassName().equals("java.lang.Object")) {
+                        writeKeyword("extends");
+                        writeSpace();
+                        writeType(superType);
+                        writeSpace();
+                    }
+                }
+                writeInterfaces("implements", interfaces);
+            }
+            xhtml.characters("{\n");
+        } catch (SAXException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Writes the given keyword followed by a comma-separated list of the
+     * given interface types and a trailing space. Nothing is written when
+     * the list is <code>null</code> or empty.
+     *
+     * @param keyword    "extends" or "implements"
+     * @param interfaces internal names of the interfaces, may be null
+     */
+    private void writeInterfaces(String keyword, String[] interfaces)
+            throws SAXException {
+        if (interfaces != null && interfaces.length > 0) {
+            writeKeyword(keyword);
+            String separator = " ";
+            for (String iface : interfaces) {
+                xhtml.characters(separator);
+                writeType(Type.getObjectType(iface));
+                separator = ", ";
+            }
+            writeSpace();
+        }
+    }
+
+    /**
+     * Writes the closing brace and ends the <code>pre</code> element and
+     * the document.
+     */
+    @Override
+    public void visitEnd() {
+        try {
+            xhtml.characters("}\n");
+            xhtml.endElement("pre");
+            xhtml.endDocument();
+        } catch (SAXException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Ignored.
+     */
+    @Override
+    public void visitOuterClass(String owner, String name, String desc) {
+    }
+
+    /**
+     * Ignored.
+     */
+    @Override
+    public void visitSource(String source, String debug) {
+    }
+
+
+    /**
+     * Ignored.
+     */
+    @Override
+    public AnnotationVisitor visitAnnotation(String desc, boolean visible) {
+        return null;
+    }
+
+    /**
+     * Ignored.
+     */
+    @Override
+    public void visitAttribute(Attribute attr) {
+    }
+
+    /**
+     * Ignored.
+     */
+    @Override
+    public void visitInnerClass(
+            String name, String outerName, String innerName, int access) {
+    }
+
+    /**
+     * Visits a field. Non-synthetic fields are written as one indented
+     * line with modifiers, type and name; static fields that carry a
+     * constant value also get an initializer.
+     *
+     * @return always <code>null</code>, the field is not visited further
+     */
+    @Override
+    public FieldVisitor visitField(
+            int access, String name, String desc, String signature,
+            Object value) {
+        if (!isSet(access, Opcodes.ACC_SYNTHETIC)) {
+            try {
+                xhtml.characters("    ");
+                writeAccess(access);
+                writeType(Type.getType(desc));
+                writeSpace();
+                writeIdentifier(name);
+
+                if (isSet(access, Opcodes.ACC_STATIC) && value != null) {
+                    xhtml.characters(" = ");
+                    xhtml.characters(value.toString());
+                }
+
+                writeSemicolon();
+                writeNewline();
+            } catch (SAXException e) {
+                throw new RuntimeException(e);
+            }
+        }
+
+        return null;
+    }
+
+    /**
+     * Visits a method. Non-synthetic methods are written as one indented
+     * line with modifiers, return type, name, argument types and the
+     * declared exceptions.
+     *
+     * @return always <code>null</code>, the method is not visited further
+     */
+    @Override
+    public MethodVisitor visitMethod(
+            int access, String name, String desc, String signature,
+            String[] exceptions) {
+        if (!isSet(access, Opcodes.ACC_SYNTHETIC)) {
+            try {
+                xhtml.characters("    ");
+                writeAccess(access);
+                // NOTE(review): the descriptor's return type is written for
+                // every method, including <init>, before the name is chosen
+                writeType(Type.getReturnType(desc));
+                writeSpace();
+                if ("<init>".equals(name)) {
+                    // Constructors are shown under the name of the class
+                    writeType(type);
+                } else {
+                    writeIdentifier(name);
+                }
+
+                xhtml.characters("(");
+                String separator = "";
+                for (Type arg : Type.getArgumentTypes(desc)) {
+                    xhtml.characters(separator);
+                    writeType(arg);
+                    separator = ", ";
+                }
+                xhtml.characters(")");
+
+                if (exceptions != null && exceptions.length > 0) {
+                    writeSpace();
+                    writeKeyword("throws");
+                    separator = " ";
+                    for (String exception : exceptions) {
+                        xhtml.characters(separator);
+                        writeType(Type.getObjectType(exception));
+                        separator = ", ";
+                    }
+                }
+
+                writeSemicolon();
+                writeNewline();
+            } catch (SAXException e) {
+                throw new RuntimeException(e);
+            }
+        }
+
+        return null;
+    }
+
+    /** Writes an identifier wrapped in a "java-identifier" span. */
+    private void writeIdentifier(String identifier) throws SAXException {
+        xhtml.startElement("span", "class", "java-identifier");
+        xhtml.characters(identifier);
+        xhtml.endElement("span");
+    }
+
+    /** Writes a keyword wrapped in a "java-keyword" span. */
+    private void writeKeyword(String keyword) throws SAXException {
+        xhtml.startElement("span", "class", "java-keyword");
+        xhtml.characters(keyword);
+        xhtml.endElement("span");
+    }
+
+    private void writeSemicolon() throws SAXException {
+        xhtml.characters(";");
+    }
+
+    private void writeSpace() throws SAXException {
+        xhtml.characters(" ");
+    }
+
+    private void writeNewline() throws SAXException {
+        xhtml.characters("\n");
+    }
+
+    /**
+     * Writes a keyword, each followed by a space, for every modifier
+     * flag that is set in the given access mask.
+     */
+    private void writeAccess(int access) throws SAXException {
+        writeAccess(access, Opcodes.ACC_PRIVATE, "private");
+        writeAccess(access, Opcodes.ACC_PROTECTED, "protected");
+        writeAccess(access, Opcodes.ACC_PUBLIC, "public");
+        writeAccess(access, Opcodes.ACC_STATIC, "static");
+        writeAccess(access, Opcodes.ACC_FINAL, "final");
+        writeAccess(access, Opcodes.ACC_ABSTRACT, "abstract");
+        writeAccess(access, Opcodes.ACC_SYNCHRONIZED, "synchronized");
+        writeAccess(access, Opcodes.ACC_TRANSIENT, "transient");
+        writeAccess(access, Opcodes.ACC_VOLATILE, "volatile");
+        writeAccess(access, Opcodes.ACC_NATIVE, "native");
+    }
+
+    /** Writes the keyword and a space if the given flag is set. */
+    private void writeAccess(int access, int code, String keyword)
+            throws SAXException {
+        if (isSet(access, code)) {
+            writeKeyword(keyword);
+            xhtml.characters(" ");
+        }
+    }
+
+    /**
+     * Writes the name of the given type. The package prefix is dropped for
+     * types whose name starts with the visited class's package and for
+     * types whose name starts with <code>java.lang.</code>.
+     */
+    private void writeType(Type type) throws SAXException {
+        String name = type.getClassName();
+        // packageName is null for classes in the default package; without
+        // the guard the prefix would be the literal string "null."
+        if (packageName != null && name.startsWith(packageName + ".")) {
+            xhtml.characters(name.substring(packageName.length() + 1));
+        } else if (name.startsWith("java.lang.")) {
+            xhtml.characters(name.substring("java.lang.".length()));
+        } else {
+            xhtml.characters(name);
+        }
+    }
+
+    /** Returns true if any bit of the flag is set in the value. */
+    private static boolean isSet(int value, int flag) {
+        return (value & flag) != 0;
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.code;
+
+import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.CPP;
+import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.GROOVY;
+import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.JAVA;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringReader;
+import java.nio.charset.Charset;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.ccil.cowan.tagsoup.HTMLSchema;
+import org.ccil.cowan.tagsoup.Schema;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import com.uwyn.jhighlight.renderer.Renderer;
+import com.uwyn.jhighlight.renderer.XhtmlRendererFactory;
+/**
+ * Generic source code parser for Java, Groovy and C++.
+ * <p>
+ * Note: this parser uses the JHighlight library
+ * (https://github.com/codelibs/jhighlight), which is available under a
+ * CDDL/LGPL dual license.
+ * <p>
+ * The source text is rendered to highlighted HTML, which is then parsed
+ * with TagSoup and forwarded to the caller's content handler. The number
+ * of lines and any <code>@author</code> tags found are added to the metadata.
+ *
+ * @author Hong-Thai.Nguyen
+ * @since 1.6
+ */
+public class SourceCodeParser implements Parser {
+
+  private static final long serialVersionUID = -4543476498190054160L;
+
+  /** Matches an "@author" tag and captures the rest of the line. */
+  private static final Pattern authorPattern = Pattern.compile("(?im)@author (.*) *$");
+
+  /** Read-only map from each supported media type to its renderer name. */
+  private static final Map<MediaType, String> TYPES_TO_RENDERER;
+  static {
+    Map<MediaType, String> types = new HashMap<MediaType, String>();
+    types.put(MediaType.text("x-c++src"), CPP);
+    types.put(MediaType.text("x-java-source"), JAVA);
+    types.put(MediaType.text("x-groovy"), GROOVY);
+    // Unmodifiable, so that the key set handed out by getSupportedTypes()
+    // cannot be used to alter the shared static map
+    TYPES_TO_RENDERER = java.util.Collections.unmodifiableMap(types);
+  }
+
+  private static final ServiceLoader LOADER = new ServiceLoader(SourceCodeParser.class.getClassLoader());
+  
+  //Parse the HTML document
+  private static final Schema HTML_SCHEMA = new HTMLSchema();
+  
+  @Override
+  public Set<MediaType> getSupportedTypes(ParseContext context) {
+    return TYPES_TO_RENDERER.keySet();
+  }
+
+  /**
+   * Renders the source code as highlighted HTML and sends the resulting
+   * SAX events to the handler. Nothing is sent to the handler when the
+   * metadata carries no content type or no resource name.
+   *
+   * @throws TikaException if the content type in the metadata cannot be parsed
+   */
+  @Override
+  public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+      throws IOException, SAXException, TikaException {
+
+    try (AutoDetectReader reader = new AutoDetectReader(
+            new CloseShieldInputStream(stream), metadata,
+            context.get(ServiceLoader.class, LOADER))) {
+      Charset charset = reader.getCharset();
+      String mediaType = metadata.get(Metadata.CONTENT_TYPE);
+      String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+      if (mediaType != null && name != null) {
+        MediaType type = MediaType.parse(mediaType);
+        if (type == null) {
+          // Report a malformed type string instead of failing with an NPE
+          throw new TikaException("unparseable content type " + mediaType);
+        }
+        metadata.set(Metadata.CONTENT_TYPE, type.toString());
+        metadata.set(Metadata.CONTENT_ENCODING, charset.name());
+
+        // Looked up once rather than on every line
+        String lineSeparator = System.getProperty("line.separator");
+        StringBuilder out = new StringBuilder();
+        String line;
+        int nbLines = 0;
+        while ((line = reader.readLine()) != null) {
+          out.append(line).append(lineSeparator);
+          String author = parserAuthor(line);
+          if (author != null) {
+            metadata.add(TikaCoreProperties.CREATOR, author);
+          }
+          nbLines++;
+        }
+        metadata.set("LoC", String.valueOf(nbLines));
+        Renderer renderer = getRenderer(type.toString());
+
+        String codeAsHtml = renderer.highlight(name, out.toString(), charset.name(), false);
+
+        Schema schema = context.get(Schema.class, HTML_SCHEMA);
+
+        org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();
+        parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
+        parser.setContentHandler(handler);
+        parser.parse(new InputSource(new StringReader(codeAsHtml)));
+      }
+    }
+
+  }
+
+  /**
+   * Returns the renderer registered for the given media type string.
+   * Parameters such as "charset" are ignored for the lookup.
+   *
+   * @throws RuntimeException if no renderer is registered for the type
+   */
+  private Renderer getRenderer(String mimeType) {
+    MediaType mt = MediaType.parse(mimeType);
+    // The map is keyed by parameter-less types, so look up the base type
+    String type = (mt == null) ? null : TYPES_TO_RENDERER.get(mt.getBaseType());
+    if (type == null) {
+      throw new RuntimeException("unparseable content type " + mimeType);
+    }
+    return XhtmlRendererFactory.getRenderer(type);
+  }
+
+
+  /**
+   * Returns the trimmed text following an "@author" tag on the given
+   * line, or <code>null</code> if the line has no such tag.
+   */
+  private String parserAuthor(String line) {
+    Matcher m = authorPattern.matcher(line);
+    if (m.find()) {
+      return m.group(1).trim();
+    }
+
+    return null;
+  }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,406 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.executable;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.sql.Date;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.poi.util.IOUtils;
+import org.apache.poi.util.LittleEndian;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.EndianUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for executable files. Currently supports ELF and PE.
+ * <p>
+ * Only metadata is extracted (platform, machine type, endian-ness,
+ * architecture bits and, for PE files, the creation date); no text
+ * content is produced.
+ */
+public class ExecutableParser extends AbstractParser implements MachineMetadata {
+    /** Serial version UID */
+    private static final long serialVersionUID = 32128791892482L;
+
+    private static final MediaType PE_EXE = MediaType.application("x-msdownload");
+    private static final MediaType ELF_GENERAL = MediaType.application("x-elf");
+    private static final MediaType ELF_OBJECT = MediaType.application("x-object");
+    private static final MediaType ELF_EXECUTABLE = MediaType.application("x-executable");
+    private static final MediaType ELF_SHAREDLIB = MediaType.application("x-sharedlib");
+    private static final MediaType ELF_COREDUMP = MediaType.application("x-coredump");
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    PE_EXE,
+                    ELF_GENERAL,
+                    ELF_OBJECT, ELF_EXECUTABLE, ELF_SHAREDLIB, ELF_COREDUMP
+            )));
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Reads the first four bytes of the stream and dispatches to the PE
+     * or ELF header parser depending on the magic number found. Streams
+     * with any other magic number produce no metadata.
+     */
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // We only do metadata, for now
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
+        // What kind is it?
+        byte[] first4 = new byte[4];
+        IOUtils.readFully(stream, first4);
+
+        if (first4[0] == (byte)'M' && first4[1] == (byte)'Z') {
+            parsePE(xhtml, metadata, stream, first4);
+        } else if (first4[0] == (byte)0x7f && first4[1] == (byte)'E' &&
+                   first4[2] == (byte)'L' && first4[3] == (byte)'F') {
+            parseELF(xhtml, metadata, stream, first4);
+        }
+
+        // Finish everything
+        xhtml.endDocument();
+    }
+
+    /**
+     * Skips exactly the given number of bytes, falling back to single
+     * byte reads when {@link InputStream#skip(long)} makes no progress.
+     *
+     * @return <code>true</code> if all bytes were skipped,
+     *         <code>false</code> if the stream ended first
+     */
+    private static boolean skipFully(InputStream stream, long count)
+            throws IOException {
+        long remaining = count;
+        while (remaining > 0) {
+            long skipped = stream.skip(remaining);
+            if (skipped > 0) {
+                remaining -= skipped;
+            } else if (stream.read() == -1) {
+                return false;
+            } else {
+                remaining--;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Parses a DOS or Windows PE file. The stream must be positioned
+     * directly after the four magic bytes. Files without a valid PE
+     * signature (old style MS-DOS executables) only get the content type
+     * and platform set.
+     */
+    public void parsePE(XHTMLContentHandler xhtml, Metadata metadata,
+          InputStream stream, byte[] first4) throws TikaException, IOException {
+        metadata.add(Metadata.CONTENT_TYPE, PE_EXE.toString());
+        metadata.set(PLATFORM, PLATFORM_WINDOWS);
+
+        // Skip over the MS-DOS bit
+        byte[] msdosSection = new byte[0x3c-4];
+        IOUtils.readFully(stream, msdosSection);
+
+        // Grab the PE header offset
+        int peOffset = LittleEndian.readInt(stream);
+
+        // Sanity check - while it may go anywhere, it's normally in the first
+        //  few kb. The first 0x40 bytes have already been consumed, so the
+        //  header cannot start before that point.
+        if (peOffset > 4096 || peOffset < 0x40) return;
+
+        // Skip the rest of the MS-DOS stub (if PE), until we reach what should
+        //  be the PE header (if this is a PE executable). A single skip() call
+        //  may skip fewer bytes than requested, hence the helper.
+        if (!skipFully(stream, peOffset - 0x40)) return;
+
+        // Read the PE header
+        byte[] pe = new byte[24];
+        IOUtils.readFully(stream, pe);
+
+        // Check it really is a PE header
+        if (pe[0] == (byte)'P' && pe[1] == (byte)'E' && pe[2]==0 && pe[3]==0) {
+            // Good, has a valid PE signature
+        } else {
+            // Old style MS-DOS
+            return;
+        }
+
+        // Read the header values that are used. The remaining fields are
+        //  at offset 6 (ushort), 12 and 16 (int), 20 and 22 (ushort).
+        int machine = LittleEndian.getUShort(pe, 4);
+        // The timestamp is an unsigned 32 bit count of seconds
+        long createdAt = LittleEndian.getInt(pe, 8) & 0xFFFFFFFFL;
+
+        // Turn this into helpful metadata
+        Date createdAtD = new Date(createdAt*1000L);
+        metadata.set(Metadata.CREATION_DATE, createdAtD);
+
+        switch(machine) {
+          case 0x14c:
+            metadata.set(MACHINE_TYPE, MACHINE_x86_32);
+            metadata.set(ENDIAN, Endian.LITTLE.getName());
+            metadata.set(ARCHITECTURE_BITS, "32");
+            break;
+          case 0x8664:
+            // AMD64 / x86-64
+            metadata.set(MACHINE_TYPE, MACHINE_x86_64);
+            metadata.set(ENDIAN, Endian.LITTLE.getName());
+            metadata.set(ARCHITECTURE_BITS, "64");
+            break;
+          case 0x200:
+            metadata.set(MACHINE_TYPE, MACHINE_IA_64);
+            metadata.set(ENDIAN, Endian.LITTLE.getName());
+            metadata.set(ARCHITECTURE_BITS, "64");
+            break;
+
+          case 0x184:
+            metadata.set(MACHINE_TYPE, MACHINE_ALPHA);
+            metadata.set(ENDIAN, Endian.LITTLE.getName());
+            metadata.set(ARCHITECTURE_BITS, "32");
+            break;
+          case 0x284:
+            metadata.set(MACHINE_TYPE, MACHINE_ALPHA);
+            metadata.set(ENDIAN, Endian.LITTLE.getName());
+            metadata.set(ARCHITECTURE_BITS, "64");
+            break;
+
+          case 0x1c0:
+          case 0x1c4:
+            metadata.set(MACHINE_TYPE, MACHINE_ARM);
+            metadata.set(ENDIAN, Endian.LITTLE.getName());
+            metadata.set(ARCHITECTURE_BITS, "32");
+            break;
+
+          case 0x268:
+            metadata.set(MACHINE_TYPE, MACHINE_M68K);
+            metadata.set(ENDIAN, Endian.BIG.getName());
+            metadata.set(ARCHITECTURE_BITS, "32");
+            break;
+
+          case 0x266:
+          case 0x366:
+          case 0x466:
+            metadata.set(MACHINE_TYPE, MACHINE_MIPS);
+            metadata.set(ENDIAN, Endian.BIG.getName());
+            metadata.set(ARCHITECTURE_BITS, "16");
+            break;
+          case 0x162:
+          case 0x166:
+          case 0x168:
+          case 0x169:
+            metadata.set(MACHINE_TYPE, MACHINE_MIPS);
+            metadata.set(ENDIAN, Endian.LITTLE.getName());
+            metadata.set(ARCHITECTURE_BITS, "16");
+            break;
+
+          case 0x1f0:
+          case 0x1f1:
+            metadata.set(MACHINE_TYPE, MACHINE_PPC);
+            metadata.set(ENDIAN, Endian.LITTLE.getName());
+            metadata.set(ARCHITECTURE_BITS, "32");
+            break;
+
+          case 0x1a2:
+          case 0x1a3:
+            metadata.set(MACHINE_TYPE, MACHINE_SH3);
+            metadata.set(ENDIAN, Endian.BIG.getName());
+            metadata.set(ARCHITECTURE_BITS, "32");
+            break;
+          case 0x1a6:
+            metadata.set(MACHINE_TYPE, MACHINE_SH4);
+            metadata.set(ENDIAN, Endian.BIG.getName());
+            metadata.set(ARCHITECTURE_BITS, "32");
+            break;
+          case 0x1a8:
+            metadata.set(MACHINE_TYPE, MACHINE_SH3);
+            metadata.set(ENDIAN, Endian.BIG.getName());
+            metadata.set(ARCHITECTURE_BITS, "32");
+            break;
+
+          case 0x9041:
+            metadata.set(MACHINE_TYPE, MACHINE_M32R);
+            metadata.set(ENDIAN, Endian.BIG.getName());
+            metadata.set(ARCHITECTURE_BITS, "32");
+            break;
+
+          case 0xebc:
+            metadata.set(MACHINE_TYPE, MACHINE_EFI);
+            break;
+
+          default:
+            metadata.set(MACHINE_TYPE, MACHINE_UNKNOWN);
+            break;
+        }
+    }
+
+    /**
+     * Parses a Unix ELF file. The stream must be positioned directly
+     * after the four magic bytes. Byte numbers in the comments below
+     * count from 1 at the start of the file.
+     */
+    public void parseELF(XHTMLContentHandler xhtml, Metadata metadata,
+          InputStream stream, byte[] first4) throws TikaException, IOException {
+        // Byte 5 is the architecture
+        int architecture = stream.read();
+        if (architecture == 1) {
+            metadata.set(ARCHITECTURE_BITS, "32");
+        } else if (architecture == 2) {
+            metadata.set(ARCHITECTURE_BITS, "64");
+        }
+
+        // Byte 6 is the endian-ness
+        int endian = stream.read();
+        if (endian == 1) {
+            metadata.set(ENDIAN, Endian.LITTLE.getName());
+        } else if (endian == 2) {
+            metadata.set(ENDIAN, Endian.BIG.getName());
+        }
+
+        // Byte 7 is the elf version; not used, but it has to be consumed
+        stream.read();
+
+        // Byte 8 is the OS, if set (lots of compilers don't)
+        // Byte 9 is the OS (specific) ABI version
+        int os = stream.read();
+        int osVer = stream.read();
+        if (os > 0 || osVer > 0)
+        {
+            switch (os) {
+            case 0:
+                metadata.set(PLATFORM, PLATFORM_SYSV);
+                break;
+
+            case 1:
+                metadata.set(PLATFORM, PLATFORM_HPUX);
+                break;
+
+            case 2:
+                metadata.set(PLATFORM, PLATFORM_NETBSD);
+                break;
+
+            case 3:
+                metadata.set(PLATFORM, PLATFORM_LINUX);
+                break;
+
+            case 6:
+                metadata.set(PLATFORM, PLATFORM_SOLARIS);
+                break;
+
+            case 7:
+                metadata.set(PLATFORM, PLATFORM_AIX);
+                break;
+
+            case 8:
+                metadata.set(PLATFORM, PLATFORM_IRIX);
+                break;
+
+            case 9:
+                metadata.set(PLATFORM, PLATFORM_FREEBSD);
+                break;
+
+            case 10:
+                metadata.set(PLATFORM, PLATFORM_TRU64);
+                break;
+
+            // NOTE(review): 12 is reported as FreeBSD, the same as 9 - confirm
+            case 12:
+                metadata.set(PLATFORM, PLATFORM_FREEBSD);
+                break;
+
+            case 64:
+            case 97:
+                metadata.set(PLATFORM, PLATFORM_ARM);
+                break;
+
+            case 255:
+                metadata.set(PLATFORM, PLATFORM_EMBEDDED);
+                break;
+            }
+        }
+
+        // Bytes 10-16 are padding and lengths
+        byte[] padLength = new byte[7];
+        IOUtils.readFully(stream, padLength);
+
+        // Bytes 16-17 are the object type (LE/BE)
+        int type;
+        if (endian == 1) {
+            type = EndianUtils.readUShortLE(stream);
+        } else {
+            type = EndianUtils.readUShortBE(stream);
+        }
+        switch(type) {
+          case 1:
+            metadata.add(Metadata.CONTENT_TYPE, ELF_OBJECT.toString());
+            break;
+
+          case 2:
+            metadata.add(Metadata.CONTENT_TYPE, ELF_EXECUTABLE.toString());
+            break;
+
+          case 3:
+            metadata.add(Metadata.CONTENT_TYPE, ELF_SHAREDLIB.toString());
+            break;
+
+          case 4:
+            metadata.add(Metadata.CONTENT_TYPE, ELF_COREDUMP.toString());
+            break;
+
+          default:
+            metadata.add(Metadata.CONTENT_TYPE, ELF_GENERAL.toString());
+            break;
+        }
+
+        // Bytes 18-19 are the machine (EM_*)
+        int machine;
+        if (endian == 1) {
+            machine = EndianUtils.readUShortLE(stream);
+        } else {
+            machine = EndianUtils.readUShortBE(stream);
+        }
+        switch(machine) {
+          case 2:
+          case 18:
+          case 43:
+            metadata.set(MACHINE_TYPE, MACHINE_SPARC);
+            break;
+          case 3:
+            metadata.set(MACHINE_TYPE, MACHINE_x86_32);
+            break;
+          case 4:
+            metadata.set(MACHINE_TYPE, MACHINE_M68K);
+            break;
+          case 5:
+            metadata.set(MACHINE_TYPE, MACHINE_M88K);
+            break;
+          case 8:
+          case 10:
+            metadata.set(MACHINE_TYPE, MACHINE_MIPS);
+            break;
+          case 7:
+            metadata.set(MACHINE_TYPE, MACHINE_S370);
+            break;
+          case 20:
+          case 21:
+            metadata.set(MACHINE_TYPE, MACHINE_PPC);
+            break;
+          case 22:
+            metadata.set(MACHINE_TYPE, MACHINE_S390);
+            break;
+          case 40:
+            metadata.set(MACHINE_TYPE, MACHINE_ARM);
+            break;
+          case 41:
+          case 0x9026:
+            metadata.set(MACHINE_TYPE, MACHINE_ALPHA);
+            break;
+          case 50:
+            metadata.set(MACHINE_TYPE, MACHINE_IA_64);
+            break;
+          case 62:
+            metadata.set(MACHINE_TYPE, MACHINE_x86_64);
+            break;
+          case 75:
+            metadata.set(MACHINE_TYPE, MACHINE_VAX);
+            break;
+          case 88:
+            metadata.set(MACHINE_TYPE, MACHINE_M32R);
+            break;
+        }
+
+
+
+        // Bytes 20-23 are the version
+        // TODO
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/main/java/org/apache/tika/parser/executable/MachineMetadata.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/main/java/org/apache/tika/parser/executable/MachineMetadata.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/main/java/org/apache/tika/parser/executable/MachineMetadata.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/main/java/org/apache/tika/parser/executable/MachineMetadata.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.executable;
+
+import org.apache.tika.metadata.Property;
+
+/**
+ * Metadata for describing machines, such as their
+ *  architecture, type and endian-ness.
+ * <p>
+ * Every property declared here is named with the {@link #PREFIX} namespace
+ * and is created through {@code Property.internalClosedChoise} with the
+ * values listed next to it.
+ */
+public interface MachineMetadata {
+    /** Namespace prefix shared by all machine related property names. */
+    public static final String PREFIX = "machine:";
+
+    /** Word size of the target architecture, in bits. */
+    public static final Property ARCHITECTURE_BITS = Property.internalClosedChoise(PREFIX+"architectureBits",
+         "8", "16", "32", "64");
+
+    public static final String PLATFORM_SYSV    = "System V";
+    public static final String PLATFORM_HPUX    = "HP-UX";
+    public static final String PLATFORM_NETBSD  = "NetBSD";
+    public static final String PLATFORM_LINUX   = "Linux";
+    public static final String PLATFORM_SOLARIS = "Solaris";
+    public static final String PLATFORM_AIX     = "AIX";
+    public static final String PLATFORM_IRIX    = "IRIX";
+    public static final String PLATFORM_FREEBSD = "FreeBSD";
+    public static final String PLATFORM_TRU64   = "Tru64";
+    public static final String PLATFORM_ARM     = "ARM"; // ARM architecture ABI
+    public static final String PLATFORM_EMBEDDED = "Embedded"; // Stand-alone (embedded) ABI
+    public static final String PLATFORM_WINDOWS = "Windows";
+
+    /** Platform (operating system / ABI) the binary targets; one of the {@code PLATFORM_*} values. */
+    public static final Property PLATFORM = Property.internalClosedChoise(PREFIX+"platform",
+          PLATFORM_SYSV, PLATFORM_HPUX, PLATFORM_NETBSD, PLATFORM_LINUX,
+                         PLATFORM_SOLARIS, PLATFORM_AIX, PLATFORM_IRIX, PLATFORM_FREEBSD, PLATFORM_TRU64,
+                         PLATFORM_ARM, PLATFORM_EMBEDDED, PLATFORM_WINDOWS);
+
+    public static final String MACHINE_x86_32 = "x86-32";
+    public static final String MACHINE_x86_64 = "x86-64";
+    public static final String MACHINE_IA_64  = "IA-64";
+    public static final String MACHINE_SPARC  = "SPARC";
+    public static final String MACHINE_M68K   = "Motorola-68000";
+    public static final String MACHINE_M88K   = "Motorola-88000";
+    public static final String MACHINE_MIPS   = "MIPS";
+    public static final String MACHINE_PPC    = "PPC";
+    public static final String MACHINE_S370   = "S370";
+    public static final String MACHINE_S390   = "S390";
+    public static final String MACHINE_ARM    = "ARM";
+    public static final String MACHINE_VAX    = "Vax";
+    public static final String MACHINE_ALPHA  = "Alpha";
+    public static final String MACHINE_EFI    = "EFI"; // EFI ByteCode
+    public static final String MACHINE_M32R   = "M32R";
+    public static final String MACHINE_SH3    = "SH3";
+    public static final String MACHINE_SH4    = "SH4";
+    public static final String MACHINE_SH5    = "SH5";
+    public static final String MACHINE_UNKNOWN = "Unknown";
+
+    /** Processor family the binary targets; one of the {@code MACHINE_*} values. */
+    public static final Property MACHINE_TYPE = Property.internalClosedChoise(PREFIX+"machineType",
+           MACHINE_x86_32, MACHINE_x86_64, MACHINE_IA_64, MACHINE_SPARC,
+           MACHINE_M68K, MACHINE_M88K, MACHINE_MIPS, MACHINE_PPC,
+           MACHINE_S370, MACHINE_S390,
+           MACHINE_ARM, MACHINE_VAX, MACHINE_ALPHA, MACHINE_EFI, MACHINE_M32R,
+           MACHINE_SH3, MACHINE_SH4, MACHINE_SH5, MACHINE_UNKNOWN);
+
+    /**
+     * Immutable description of a byte order. The constructor is private, so
+     * {@link #LITTLE} and {@link #BIG} are the only instances.
+     */
+    public static final class Endian {
+        // final: the two shared constants must never change after construction
+        private final String name;
+        private final boolean msb;
+
+        private Endian(String name, boolean msb) { this.name = name; this.msb = msb; }
+
+        /** @return the display name, {@code "Little"} or {@code "Big"} */
+        public String getName() { return name; }
+
+        /** @return {@code true} for most-significant-byte-first (big endian) */
+        @SuppressWarnings("unused")
+        public boolean isMSB() { return msb; }
+
+        /** @return {@code "MSB"} for big endian, {@code "LSB"} for little endian */
+        @SuppressWarnings("unused")
+        public String getMSB() { return msb ? "MSB" : "LSB"; }
+
+        public static final Endian LITTLE = new Endian("Little", false);
+        public static final Endian BIG = new Endian("Big", true);
+    }
+
+    /** Byte order of the binary; either {@code Endian.LITTLE} or {@code Endian.BIG} by name. */
+    public static final Property ENDIAN = Property.internalClosedChoise(PREFIX+"endian",
+          Endian.LITTLE.getName(), Endian.BIG.getName());
+}

Added: tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added)
+++ tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Sat Jan 16 18:23:01 2016
@@ -0,0 +1,19 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+
+org.apache.tika.parser.asm.ClassParser
+org.apache.tika.parser.code.SourceCodeParser
+org.apache.tika.parser.executable.ExecutableParser

Added: tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.asm;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import org.apache.tika.Tika;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Test;
+
+/**
+ * Test case for parsing Java class files.
+ */
+public class ClassParserTest {
+
+    @Test
+    public void testClassParsing() throws Exception {
+        String path = "/test-documents/AutoDetectParser.class";
+        Metadata metadata = new Metadata();
+        String content = new Tika().parseToString(
+                ClassParserTest.class.getResourceAsStream(path), metadata);
+
+        assertEquals("AutoDetectParser", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals(
+                "AutoDetectParser.class",
+                metadata.get(Metadata.RESOURCE_NAME_KEY));
+
+        assertTrue(content.contains("package org.apache.tika.parser;"));
+        assertTrue(content.contains(
+                "class AutoDetectParser extends CompositeParser"));
+        assertTrue(content.contains(
+                "private org.apache.tika.mime.MimeTypes types"));
+        assertTrue(content.contains(
+                "public void parse("
+                + "java.io.InputStream, org.xml.sax.ContentHandler,"
+                + " org.apache.tika.metadata.Metadata) throws"
+                + " java.io.IOException, org.xml.sax.SAXException,"
+                + " org.apache.tika.exception.TikaException;"));
+        assertTrue(content.contains(
+                "private byte[] getPrefix(java.io.InputStream, int)"
+                + " throws java.io.IOException;"));
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.code;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.util.Set;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.junit.Test;
+
+/**
+ * Tests for {@link SourceCodeParser}: supported media types, XHTML and
+ * plain-text rendering, and the metadata extracted from source files.
+ */
+public class SourceCodeParserTest extends TikaTest {
+
+  private final SourceCodeParser sourceCodeParser = new SourceCodeParser();
+
+  @Test
+  public void testSupportTypes() throws Exception {
+    Set<MediaType> supportedTypes = sourceCodeParser.getSupportedTypes(new ParseContext());
+    assertTrue(supportedTypes.contains(new MediaType("text", "x-java-source")));
+    assertTrue(supportedTypes.contains(new MediaType("text", "x-groovy")));
+    assertTrue(supportedTypes.contains(new MediaType("text", "x-c++src")));
+
+    assertFalse(sourceCodeParser.getSupportedTypes(new ParseContext()).contains(new MediaType("text", "html")));
+  }
+
+  @Test
+  public void testHTMLRenderWithReturnLine() throws Exception {
+    String htmlContent = getXML(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser, createMetadata("text/x-java-source")).xml;
+    
+    // The root element must be the very first thing in the output
+    assertTrue(htmlContent.indexOf("<html:html lang=\"en\" xml:lang=\"en\"") == 0);
+    assertTrue(htmlContent.indexOf("<html:span class=\"java_keyword\">public</span><html:span class=\"java_plain\">") > 0);
+    assertTrue(htmlContent.indexOf("<html:span class=\"java_keyword\">static</span>") > 0);
+    assertTrue(htmlContent.indexOf("<html:br clear=\"none\" />") > 0);
+  }
+  
+  @Test
+  public void testTextRender() throws Exception {
+    String textContent = getText(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser, createMetadata("text/x-java-source"));
+    
+    // Plain-text output must not leak any markup from the highlighter
+    assertTrue(textContent.length() > 0);
+    assertTrue(textContent.indexOf("html") < 0);
+    
+    textContent = getText(new ByteArrayInputStream("public class HelloWorld {}".getBytes(UTF_8)), sourceCodeParser, createMetadata("text/x-java-source"));
+    assertTrue(textContent.length() > 0);
+    assertTrue(textContent.indexOf("html") < 0);
+  }
+
+  @Test
+  public void testLoC() throws Exception {
+    Metadata metadata = createMetadata("text/x-groovy");
+    getText(getResourceAsStream("/test-documents/testGROOVY.groovy"), sourceCodeParser, metadata);
+
+    // expected value first, so a failure reports "expected:<9> but was:<...>"
+    assertEquals("9", metadata.get("LoC"));
+  }
+
+  @Test
+  public void testAuthor() throws Exception {
+    Metadata metadata = createMetadata("text/x-c++src");
+    getText(getResourceAsStream("/test-documents/testCPP.cpp"), sourceCodeParser, metadata);
+
+    assertEquals("Hong-Thai Nguyen", metadata.get(TikaCoreProperties.CREATOR));
+  }
+
+  @Test
+  public void testReturnContentAsIsForTextHandler() throws Exception {
+    String strContent = getXML(getResourceAsStream("/test-documents/testJAVA.java"), new AutoDetectParser(), createMetadata("text/plain")).xml;
+
+    assertTrue(strContent.indexOf("public class HelloWorld {") > 0);
+  }
+
+  /**
+   * Builds the input metadata for a parse.
+   *
+   * @param mimeType value stored under {@code Metadata.CONTENT_TYPE}
+   * @return metadata with the resource name "testFile" and the given content type
+   */
+  private Metadata createMetadata(String mimeType) {
+    Metadata metadata = new Metadata();
+    metadata.add(Metadata.RESOURCE_NAME_KEY, "testFile");
+    metadata.add(Metadata.CONTENT_TYPE, mimeType);
+    return metadata;
+  }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/test/java/org/apache/tika/parser/executable/ExecutableParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/test/java/org/apache/tika/parser/executable/ExecutableParserTest.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/test/java/org/apache/tika/parser/executable/ExecutableParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/test/java/org/apache/tika/parser/executable/ExecutableParserTest.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.executable;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Tests for {@link ExecutableParser} against sample Windows PE and
+ * Linux ELF binaries.
+ */
+public class ExecutableParserTest {
+
+    @Test
+    public void testWin32Parser() throws Exception {
+        Metadata parsed = new Metadata();
+        String text = parseResource("/test-documents/testWindows-x86-32.exe", parsed);
+
+        assertEquals("application/x-msdownload",
+                parsed.get(Metadata.CONTENT_TYPE));
+        assertEquals("2012-05-13T13:40:11Z",
+                parsed.get(Metadata.CREATION_DATE));
+
+        assertEquals(ExecutableParser.MACHINE_x86_32,
+                parsed.get(ExecutableParser.MACHINE_TYPE));
+        assertEquals("Little",
+                parsed.get(ExecutableParser.ENDIAN));
+        assertEquals("32",
+                parsed.get(ExecutableParser.ARCHITECTURE_BITS));
+        assertEquals("Windows",
+                parsed.get(ExecutableParser.PLATFORM));
+
+        assertEquals("", text); // No text yet
+    }
+
+    @Test
+    public void testElfParser_x86_32() throws Exception {
+        Metadata parsed = new Metadata();
+        String text = parseResource("/test-documents/testLinux-x86-32", parsed);
+
+        assertEquals("application/x-executable",
+                parsed.get(Metadata.CONTENT_TYPE));
+
+        assertEquals(ExecutableParser.MACHINE_x86_32,
+                parsed.get(ExecutableParser.MACHINE_TYPE));
+        assertEquals("Little",
+                parsed.get(ExecutableParser.ENDIAN));
+        assertEquals("32",
+                parsed.get(ExecutableParser.ARCHITECTURE_BITS));
+//         assertEquals("Linux",
+//               parsed.get(ExecutableParser.PLATFORM));
+
+        assertEquals("", text); // No text yet
+    }
+
+    /**
+     * Runs a fresh {@link ExecutableParser} over a classpath resource.
+     *
+     * @param resourcePath classpath location of the binary to parse
+     * @param metadata     receives the metadata produced by the parser
+     * @return the body text collected by the content handler
+     */
+    private static String parseResource(String resourcePath, Metadata metadata) throws Exception {
+        ContentHandler bodyHandler = new BodyContentHandler();
+        try (InputStream input = ExecutableParserTest.class.getResourceAsStream(resourcePath)) {
+            new ExecutableParser().parse(input, bodyHandler, metadata, new ParseContext());
+        }
+        return bodyHandler.toString();
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-database-parser-module/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-database-parser-module/pom.xml?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-database-parser-module/pom.xml (added)
+++ tika/branches/2.x/tika-parser-modules/tika-database-parser-module/pom.xml Sat Jan 16 18:23:01 2016
@@ -0,0 +1,69 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-modules</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-database-parser-module</artifactId>
+  <name>Apache Tika Database Parser Module</name>
+  <url>http://tika.apache.org/</url>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
+    <!-- Provided dependencies -->
+    <dependency>
+      <groupId>org.xerial</groupId>
+      <artifactId>sqlite-jdbc</artifactId>
+      <version>3.8.10.1</version> 
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <version>${commons.io.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-office-parser-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+  
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
+</project>
\ No newline at end of file

Added: tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/appended-resources/META-INF/LICENSE
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/appended-resources/META-INF/LICENSE?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/appended-resources/META-INF/LICENSE (added)
+++ tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/appended-resources/META-INF/LICENSE Sat Jan 16 18:23:01 2016
@@ -0,0 +1,9 @@
+APACHE TIKA SUBCOMPONENTS
+
+Apache Tika includes a number of subcomponents with separate copyright notices
+and license terms. Your use of these subcomponents is subject to the terms and
+conditions of the following licenses.
+
+Sqlite (included in the "provided" org.xerial's sqlite-jdbc)
+    Sqlite is in the Public Domain.  For details
+    see: https://www.sqlite.org/copyright.html

Added: tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,189 @@
+package org.apache.tika.parser.jdbc;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.SQLException;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Database;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Abstract class that handles iterating through tables within a database.
+ * <p>
+ * Each table is emitted as an XHTML {@code table} element with a header row
+ * followed by one row per record. The JDBC connection opened for a parse is
+ * kept in an instance field, so a single instance must not be used for
+ * concurrent calls to {@link #parse}.
+ */
+abstract class AbstractDBParser extends AbstractParser {
+
+    private Connection connection;
+
+    /**
+     * Returns the extractor registered in the context, or a new
+     * {@link ParsingEmbeddedDocumentExtractor} when none is registered.
+     *
+     * @param context parse context to look the extractor up in
+     * @return extractor to use for embedded documents
+     */
+    protected static EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContext context) {
+        return context.get(EmbeddedDocumentExtractor.class,
+                new ParsingEmbeddedDocumentExtractor(context));
+    }
+
+    /**
+     * This base class declares no supported types and returns {@code null}.
+     * NOTE(review): concrete parsers are presumably expected to override
+     * this -- confirm against the subclasses.
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return null;
+    }
+
+    /**
+     * Opens a connection for the stream, records every table name in the
+     * metadata and writes each table to the handler as XHTML.
+     * <p>
+     * Once the connection has been obtained it is always released through
+     * {@link #close()}, whether table discovery, document start, table
+     * output or document end fails.
+     *
+     * @throws IOException   on I/O failure, or wrapping a {@link SQLException}
+     *                       raised while listing the tables
+     * @throws SAXException  if the handler rejects an event
+     * @throws TikaException if the connection cannot be set up
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
+        connection = getConnection(stream, metadata, context);
+        try {
+            List<String> tableNames;
+            try {
+                tableNames = getTableNames(connection, metadata, context);
+            } catch (SQLException e) {
+                throw new IOExceptionWithCause(e);
+            }
+            for (String tableName : tableNames) {
+                //add table names to parent metadata
+                metadata.add(Database.TABLE_NAME, tableName);
+            }
+            XHTMLContentHandler xHandler = new XHTMLContentHandler(handler, metadata);
+            xHandler.startDocument();
+            try {
+                for (String tableName : tableNames) {
+                    writeTable(getTableReader(connection, tableName, context), xHandler, context);
+                }
+            } finally {
+                // always terminate the document, even after a failure
+                xHandler.endDocument();
+            }
+        } finally {
+            // outermost finally: runs even if endDocument() itself throws
+            try {
+                close();
+            } catch (SQLException ignored) {
+                //swallow: closing is best-effort and must not mask a parse result
+            }
+        }
+    }
+
+    /** Writes one table: a header row of column names, then all data rows. */
+    private void writeTable(JDBCTableReader tableReader, XHTMLContentHandler xHandler,
+                            ParseContext context) throws IOException, SAXException {
+        xHandler.startElement("table", "name", tableReader.getTableName());
+        xHandler.startElement("thead");
+        xHandler.startElement("tr");
+        for (String header : tableReader.getHeaders()) {
+            xHandler.startElement("th");
+            xHandler.characters(header);
+            xHandler.endElement("th");
+        }
+        xHandler.endElement("tr");
+        xHandler.endElement("thead");
+        xHandler.startElement("tbody");
+        while (tableReader.nextRow(xHandler, context)) {
+            //no-op: nextRow emits the row as a side effect
+        }
+        xHandler.endElement("tbody");
+        xHandler.endElement("table");
+    }
+
+    /**
+     * Override this for any special handling of closing the connection.
+     * Does nothing when no connection has been opened.
+     *
+     * @throws java.sql.SQLException
+     * @throws java.io.IOException
+     */
+    protected void close() throws SQLException, IOException {
+        if (connection != null) {
+            connection.close();
+        }
+    }
+
+    /**
+     * Override this for special configuration of the connection, such as limiting
+     * the number of rows to be held in memory.
+     *
+     * @param stream   stream to use
+     * @param metadata metadata that could be used in parameterizing the connection
+     * @param context  parsecontext that could be used in parameterizing the connection
+     * @return connection
+     * @throws java.io.IOException if the driver refuses the connection
+     * @throws org.apache.tika.exception.TikaException if the JDBC driver class cannot be loaded
+     */
+    protected Connection getConnection(InputStream stream, Metadata metadata, ParseContext context) throws IOException, TikaException {
+        String connectionString = getConnectionString(stream, metadata, context);
+
+        try {
+            Class.forName(getJDBCClassName());
+        } catch (ClassNotFoundException e) {
+            // keep the cause so the missing driver shows up in the stack trace
+            throw new TikaException(e.getMessage(), e);
+        }
+        try {
+            return DriverManager.getConnection(connectionString);
+        } catch (SQLException e) {
+            throw new IOExceptionWithCause(e);
+        }
+    }
+
+    /**
+     * Implement for db specific connection information, e.g. "jdbc:sqlite:/docs/mydb.db"
+     * <p>
+     * Include any optimization settings, user name, password, etc.
+     *
+     * @param stream       stream for processing
+     * @param metadata     metadata might be useful in determining connection info
+     * @param parseContext context to use to help create connectionString
+     * @return connection string to be used by {@link #getConnection}.
+     * @throws java.io.IOException
+     */
+    abstract protected String getConnectionString(InputStream stream,
+                                                  Metadata metadata, ParseContext parseContext) throws IOException;
+
+    /**
+     * JDBC class name, e.g. org.sqlite.JDBC
+     *
+     * @return jdbc class name
+     */
+    abstract protected String getJDBCClassName();
+
+    /**
+     * Returns the names of the tables to process
+     *
+     * @param connection Connection to use to make the sql call(s) to get the names of the tables
+     * @param metadata   Metadata to use (potentially) in decision about which tables to extract
+     * @param context    ParseContext to use (potentially) in decision about which tables to extract
+     * @return names of the tables to extract, in the order they should be written
+     * @throws java.sql.SQLException
+     */
+    abstract protected List<String> getTableNames(Connection connection, Metadata metadata,
+                                                  ParseContext context) throws SQLException;
+
+    /**
+     * Given a connection and a table name, return the JDBCTableReader for this db.
+     *
+     * @param connection   connection to read the table through
+     * @param tableName    name of the table to read
+     * @param parseContext context of the current parse
+     * @return reader for the named table
+     */
+    abstract protected JDBCTableReader getTableReader(Connection connection, String tableName, ParseContext parseContext);
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,302 @@
+package org.apache.tika.parser.jdbc;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.sql.Blob;
+import java.sql.Clob;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.ResultSetMetaData;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.sql.Types;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Database;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * General base class to iterate through rows of a JDBC table.
+ * <p>
+ * Each row is written to a SAX {@link ContentHandler} as an XHTML
+ * <code>tr</code> element with one <code>td</code> per column.  BLOB and CLOB
+ * cells are handed to the {@link EmbeddedDocumentExtractor} as embedded
+ * documents; all other cells are written as character data.
+ * <p>
+ * The underlying <code>SELECT *</code> query is run lazily, on the first call
+ * to {@link #nextRow(ContentHandler, ParseContext)} or {@link #getHeaders()}.
+ */
+class JDBCTableReader {
+
+    private final static Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
+    private final Connection connection;
+    private final String tableName;
+    //upper bound on the number of characters read from a single CLOB
+    int maxClobLength = 1000000;
+    //result set of the current query; null until the first read
+    ResultSet results = null;
+    //number of rows written since the last reset()
+    int rows = 0;
+    private TikaConfig tikaConfig = null;
+    private Detector detector = null;
+    private MimeTypes mimeTypes = null;
+
+    /**
+     * @param connection connection used to query the table
+     * @param tableName  name of the table to read
+     * @param context    parse context; a {@link TikaConfig} is taken from it if present
+     */
+    public JDBCTableReader(Connection connection, String tableName, ParseContext context) {
+        this.connection = connection;
+        this.tableName = tableName;
+        this.tikaConfig = context.get(TikaConfig.class);
+    }
+
+    /**
+     * Writes the next row of the table to the handler.
+     *
+     * @param handler handler that receives the <code>tr</code>/<code>td</code> events
+     * @param context parse context passed on to the cell handlers
+     * @return <code>true</code> if a row was written, <code>false</code> if
+     * there are no more rows
+     * @throws IOException  if the database reports an SQLException
+     * @throws SAXException if the handler fails
+     */
+    public boolean nextRow(ContentHandler handler, ParseContext context) throws IOException, SAXException {
+        //lazy initialization
+        if (results == null) {
+            reset();
+        }
+        try {
+            if (!results.next()) {
+                return false;
+            }
+        } catch (SQLException e) {
+            throw new IOExceptionWithCause(e);
+        }
+        try {
+            ResultSetMetaData meta = results.getMetaData();
+            handler.startElement(XHTMLContentHandler.XHTML, "tr", "tr", EMPTY_ATTRIBUTES);
+            for (int i = 1; i <= meta.getColumnCount(); i++) {
+                handler.startElement(XHTMLContentHandler.XHTML, "td", "td", EMPTY_ATTRIBUTES);
+                handleCell(meta, i, handler, context);
+                handler.endElement(XHTMLContentHandler.XHTML, "td", "td");
+            }
+            handler.endElement(XHTMLContentHandler.XHTML, "tr", "tr");
+        } catch (SQLException e) {
+            throw new IOExceptionWithCause(e);
+        }
+        rows++;
+        return true;
+    }
+
+    /**
+     * Dispatches a single cell of the current row to the handler that
+     * matches its {@link Types} column type.
+     */
+    private void handleCell(ResultSetMetaData rsmd, int i, ContentHandler handler, ParseContext context) throws SQLException, IOException, SAXException {
+        switch (rsmd.getColumnType(i)) {
+            case Types.BLOB:
+                handleBlob(tableName, rsmd.getColumnName(i), rows, results, i, handler, context);
+                break;
+            case Types.CLOB:
+                handleClob(tableName, rsmd.getColumnName(i), rows, results, i, handler, context);
+                break;
+            case Types.BOOLEAN:
+                handleBoolean(results.getBoolean(i), handler);
+                break;
+            case Types.DATE:
+                handleDate(results, i, handler);
+                break;
+            case Types.TIMESTAMP:
+                handleTimeStamp(results, i, handler);
+                break;
+            case Types.INTEGER:
+                handleInteger(rsmd.getColumnTypeName(i), results, i, handler);
+                break;
+            case Types.FLOAT:
+                //this is necessary to handle rounding issues in presentation
+                //Should we just use getString(i)?
+                addAllCharacters(Float.toString(results.getFloat(i)), handler);
+                break;
+            case Types.DOUBLE:
+                addAllCharacters(Double.toString(results.getDouble(i)), handler);
+                break;
+            default:
+                addAllCharacters(results.getString(i), handler);
+                break;
+        }
+    }
+
+    /**
+     * Returns the column names of the table, in column order.
+     *
+     * @return column names
+     * @throws IOException if the database reports an SQLException
+     */
+    public List<String> getHeaders() throws IOException {
+        List<String> headers = new LinkedList<String>();
+        //lazy initialization
+        if (results == null) {
+            reset();
+        }
+        try {
+            ResultSetMetaData meta = results.getMetaData();
+            for (int i = 1; i <= meta.getColumnCount(); i++) {
+                headers.add(meta.getColumnName(i));
+            }
+        } catch (SQLException e) {
+            throw new IOExceptionWithCause(e);
+        }
+        return headers;
+    }
+
+    /**
+     * Writes an integer cell as its decimal string.
+     *
+     * @param columnTypeName type name of the column as reported by the driver
+     *                       (unused here; available to overriding classes)
+     */
+    protected void handleInteger(String columnTypeName, ResultSet rs, int columnIndex, ContentHandler handler) throws SQLException, SAXException {
+        addAllCharacters(Integer.toString(rs.getInt(columnIndex)), handler);
+    }
+
+    private void handleBoolean(boolean aBoolean, ContentHandler handler) throws SAXException {
+        addAllCharacters(Boolean.toString(aBoolean), handler);
+    }
+
+
+    /**
+     * Passes a CLOB cell to the embedded document extractor as a UTF-8
+     * text document.  At most {@link #maxClobLength} characters are read.
+     * A cell holding SQL NULL is skipped.
+     *
+     * @param tableName   name of the table, recorded in the embedded metadata
+     * @param columnName  name of the column, recorded in the embedded metadata
+     *                    and used to build the resource name
+     * @param rowNum      row number, recorded in the embedded metadata
+     * @param resultSet   result set positioned on the current row
+     * @param columnIndex 1-based index of the CLOB column
+     * @param handler     handler that receives the embedded content
+     * @param context     parse context used to look up the extractor
+     */
+    protected void handleClob(String tableName, String columnName, int rowNum,
+                              ResultSet resultSet, int columnIndex,
+                              ContentHandler handler, ParseContext context) throws SQLException, IOException, SAXException {
+        Clob clob = resultSet.getClob(columnIndex);
+        if (clob == null) {
+            //SQL NULL: there is no document to extract
+            return;
+        }
+        try {
+            long clobLength = clob.length();
+            //maxClobLength is an int, so this also covers lengths above Integer.MAX_VALUE
+            boolean truncated = clobLength > maxClobLength;
+            int readSize = truncated ? maxClobLength : (int) clobLength;
+
+            Metadata m = new Metadata();
+            m.set(Database.TABLE_NAME, tableName);
+            m.set(Database.COLUMN_NAME, columnName);
+            m.set(Database.PREFIX + "ROW_NUM", Integer.toString(rowNum));
+            m.set(Database.PREFIX + "IS_CLOB", "true");
+            m.set(Database.PREFIX + "CLOB_LENGTH", Long.toString(clobLength));
+            m.set(Database.PREFIX + "IS_CLOB_TRUNCATED", Boolean.toString(truncated));
+            m.set(Metadata.CONTENT_TYPE, "text/plain; charset=UTF-8");
+            m.set(Metadata.CONTENT_LENGTH, Integer.toString(readSize));
+            m.set(TikaMetadataKeys.RESOURCE_NAME_KEY,
+                    //just in case something screwy is going on with the column name
+                    FilenameUtils.normalize(FilenameUtils.getName(columnName + "_" + rowNum + ".txt")));
+
+            //is there a more efficient way to go from a Reader to an InputStream?
+            //Clob positions are 1-based: the first character is at position 1
+            String s = clob.getSubString(1, readSize);
+            EmbeddedDocumentExtractor ex = AbstractDBParser.getEmbeddedDocumentExtractor(context);
+            ex.parseEmbedded(new ByteArrayInputStream(s.getBytes(UTF_8)), handler, m, true);
+        } finally {
+            try {
+                clob.free();
+            } catch (SQLException e) {
+                //swallow; releasing the clob is best effort
+            }
+        }
+    }
+
+    /**
+     * Passes a BLOB cell to the embedded document extractor, wrapped in a
+     * <code>span</code> element that records the column name and row number.
+     * The media type is detected from the bytes and used to pick the file
+     * extension of the resource name.  A cell for which
+     * {@link #getInputStreamFromBlob} returns <code>null</code> is skipped.
+     *
+     * @param tableName   name of the table, recorded in the embedded metadata
+     * @param columnName  name of the column, recorded in the embedded metadata
+     *                    and used to build the resource name
+     * @param rowNum      row number, recorded in the embedded metadata
+     * @param resultSet   result set positioned on the current row
+     * @param columnIndex 1-based index of the BLOB column
+     * @param handler     handler that receives the embedded content
+     * @param context     parse context used to look up the extractor
+     */
+    protected void handleBlob(String tableName, String columnName, int rowNum, ResultSet resultSet, int columnIndex,
+                              ContentHandler handler, ParseContext context) throws SQLException, IOException, SAXException {
+        Metadata m = new Metadata();
+        m.set(Database.TABLE_NAME, tableName);
+        m.set(Database.COLUMN_NAME, columnName);
+        m.set(Database.PREFIX + "ROW_NUM", Integer.toString(rowNum));
+        m.set(Database.PREFIX + "IS_BLOB", "true");
+        InputStream is = null;
+        EmbeddedDocumentExtractor ex = AbstractDBParser.getEmbeddedDocumentExtractor(context);
+        try {
+            //no Blob is supplied here; getInputStreamFromBlob (which
+            //subclasses may override) decides how to read the cell
+            InputStream raw = getInputStreamFromBlob(resultSet, columnIndex, null, m);
+            if (raw == null) {
+                //SQL NULL: there is no document to extract
+                return;
+            }
+            is = TikaInputStream.get(raw);
+
+            AttributesImpl attrs = new AttributesImpl();
+            attrs.addAttribute("", "type", "type", "CDATA", "blob");
+            attrs.addAttribute("", "column_name", "column_name", "CDATA", columnName);
+            attrs.addAttribute("", "row_number", "row_number", "CDATA", Integer.toString(rowNum));
+            handler.startElement("", "span", "span", attrs);
+            MediaType mediaType = getDetector().detect(is, new Metadata());
+            String extension = "";
+            try {
+                MimeType mimeType = getMimeTypes().forName(mediaType.toString());
+                m.set(Metadata.CONTENT_TYPE, mimeType.toString());
+                extension = mimeType.getExtension();
+            } catch (MimeTypeException e) {
+                //swallow; the resource name simply gets no extension
+            }
+            m.set(TikaMetadataKeys.RESOURCE_NAME_KEY,
+                    //just in case something screwy is going on with the column name
+                    FilenameUtils.normalize(FilenameUtils.getName(columnName + "_" + rowNum + extension)));
+
+            ex.parseEmbedded(is, handler, m, true);
+
+        } finally {
+            IOUtils.closeQuietly(is);
+        }
+        handler.endElement("", "span", "span");
+    }
+
+    /**
+     * Opens a stream over the binary content of a cell.
+     *
+     * @param resultSet   result set positioned on the current row
+     * @param columnIndex 1-based index of the column
+     * @param blob        blob to read, or <code>null</code> to fetch it from
+     *                    the result set
+     * @param metadata    metadata passed to {@link TikaInputStream#get(Blob, Metadata)}
+     * @return stream over the cell content, or <code>null</code> if the cell
+     * holds SQL NULL
+     * @throws SQLException if the blob cannot be read
+     */
+    protected InputStream getInputStreamFromBlob(ResultSet resultSet, int columnIndex, Blob blob, Metadata metadata) throws SQLException {
+        Blob source = (blob != null) ? blob : resultSet.getBlob(columnIndex);
+        if (source == null) {
+            return null;
+        }
+        return TikaInputStream.get(source, metadata);
+    }
+
+    /**
+     * Writes a date cell using the driver's string representation.
+     */
+    protected void handleDate(ResultSet resultSet, int columnIndex, ContentHandler handler) throws SAXException, SQLException {
+        addAllCharacters(resultSet.getString(columnIndex), handler);
+    }
+
+    /**
+     * Writes a timestamp cell using the driver's string representation.
+     */
+    protected void handleTimeStamp(ResultSet resultSet, int columnIndex, ContentHandler handler) throws SAXException, SQLException {
+        addAllCharacters(resultSet.getString(columnIndex), handler);
+    }
+
+    /**
+     * Writes a string to the handler as character data.
+     *
+     * @param s       text to write; <code>null</code> (e.g. from a SQL NULL
+     *                cell) writes nothing
+     * @param handler handler that receives the characters
+     */
+    protected void addAllCharacters(String s, ContentHandler handler) throws SAXException {
+        if (s == null) {
+            return;
+        }
+        char[] chars = s.toCharArray();
+        handler.characters(chars, 0, chars.length);
+    }
+
+    /**
+     * Closes any open result set, re-runs the query for the table and
+     * resets the row counter.
+     *
+     * @throws IOException if the query fails
+     */
+    void reset() throws IOException {
+
+        closeResults();
+
+        //NOTE(review): the table name is concatenated into the statement
+        //unquoted, so names that need quoting are not handled here
+        String sql = "SELECT * from " + tableName;
+        Statement st = null;
+        try {
+            st = connection.createStatement();
+            results = st.executeQuery(sql);
+        } catch (SQLException e) {
+            //don't leak the statement if the query could not be run
+            closeQuietly(st);
+            throw new IOExceptionWithCause(e);
+        }
+        rows = 0;
+    }
+
+    /**
+     * Closes the current result set and the statement that produced it,
+     * ignoring failures, and clears {@link #results}.
+     */
+    private void closeResults() {
+        if (results == null) {
+            return;
+        }
+        Statement st = null;
+        try {
+            //must be looked up before the result set is closed
+            st = results.getStatement();
+        } catch (SQLException e) {
+            //swallow
+        }
+        try {
+            results.close();
+        } catch (SQLException e) {
+            //swallow
+        }
+        closeQuietly(st);
+        results = null;
+    }
+
+    /**
+     * Closes a statement, ignoring <code>null</code> and failures.
+     */
+    private static void closeQuietly(Statement st) {
+        if (st == null) {
+            return;
+        }
+        try {
+            st.close();
+        } catch (SQLException e) {
+            //swallow
+        }
+    }
+
+    public String getTableName() {
+        return tableName;
+    }
+
+
+    /**
+     * @return the TikaConfig taken from the parse context, or the default
+     * config if the context supplied none
+     */
+    protected TikaConfig getTikaConfig() {
+        if (tikaConfig == null) {
+            tikaConfig = TikaConfig.getDefaultConfig();
+        }
+        return tikaConfig;
+    }
+
+    /**
+     * @return the detector of {@link #getTikaConfig()}, cached after the first call
+     */
+    protected Detector getDetector() {
+        if (detector != null) return detector;
+
+        detector = getTikaConfig().getDetector();
+        return detector;
+    }
+
+    /**
+     * @return the mime repository of {@link #getTikaConfig()}, cached after the first call
+     */
+    protected MimeTypes getMimeTypes() {
+        if (mimeTypes != null) return mimeTypes;
+
+        mimeTypes = getTikaConfig().getMimeRepository();
+        return mimeTypes;
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3DBParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3DBParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3DBParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3DBParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,110 @@
+package org.apache.tika.parser.jdbc;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.sqlite.SQLiteConfig;
+
+/**
+ * This is the implementation of the db parser for SQLite.
+ * <p/>
+ * This parser is internal only; it should not be registered in the services
+ * file or configured in the TikaConfig xml file.
+ */
+class SQLite3DBParser extends AbstractDBParser {
+
+    protected static final String SQLITE_CLASS_NAME = "org.sqlite.JDBC";
+
+    /**
+     * @param context context
+     * @return null (always)
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return null;
+    }
+
+    @Override
+    protected Connection getConnection(InputStream stream, Metadata metadata, ParseContext context) throws IOException {
+        String connectionString = getConnectionString(stream, metadata, context);
+
+        Connection connection = null;
+        try {
+            Class.forName(getJDBCClassName());
+        } catch (ClassNotFoundException e) {
+            throw new IOExceptionWithCause(e);
+        }
+        try {
+            SQLiteConfig config = new SQLiteConfig();
+
+            //good habit, but effectively meaningless here
+            config.setReadOnly(true);
+            connection = config.createConnection(connectionString);
+
+        } catch (SQLException e) {
+            throw new IOException(e.getMessage());
+        }
+        return connection;
+    }
+
+    @Override
+    protected String getConnectionString(InputStream is, Metadata metadata, ParseContext context) throws IOException {
+        File dbFile = TikaInputStream.get(is).getFile();
+        return "jdbc:sqlite:" + dbFile.getAbsolutePath();
+    }
+
+    @Override
+    protected String getJDBCClassName() {
+        return SQLITE_CLASS_NAME;
+    }
+
+    @Override
+    protected List<String> getTableNames(Connection connection, Metadata metadata,
+                                         ParseContext context) throws SQLException {
+        List<String> tableNames = new LinkedList<String>();
+
+        try (Statement st = connection.createStatement()) {
+            String sql = "SELECT name FROM sqlite_master WHERE type='table'";
+            ResultSet rs = st.executeQuery(sql);
+
+            while (rs.next()) {
+                tableNames.add(rs.getString(1));
+            }
+        }
+        return tableNames;
+    }
+
+    @Override
+    public JDBCTableReader getTableReader(Connection connection, String tableName, ParseContext context) {
+        return new SQLite3TableReader(connection, tableName, context);
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3Parser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3Parser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3Parser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-database-parser-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3Parser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,80 @@
+package org.apache.tika.parser.jdbc;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * This is the main class for parsing SQLite3 files.  When {@link #parse} is called,
+ * this creates a new {@link org.apache.tika.parser.jdbc.SQLite3DBParser}.
+ * <p>
+ * Given potential conflicts of native libraries in web servers, users will
+ * need to add org.xerial's sqlite-jdbc jar to the class path for this parser
+ * to work.  For development and testing, this jar is specified in tika-parsers'
+ * pom.xml, but it is currently set to "provided."
+ * <p>
+ * Note that this family of jdbc parsers is designed to treat each CLOB and each BLOB
+ * as embedded documents.
+ */
+public class SQLite3Parser extends AbstractParser {
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -752276948656079347L;
+
+    private static final MediaType MEDIA_TYPE = MediaType.application("x-sqlite3");
+
+    //decided once per instance, in the constructor
+    private final Set<MediaType> SUPPORTED_TYPES;
+
+    /**
+     * Checks to see if class is available for org.sqlite.JDBC.
+     * <p>
+     * If not, this class will return an empty set for getSupportedTypes()
+     */
+    public SQLite3Parser() {
+        Set<MediaType> tmp;
+        try {
+            Class.forName(SQLite3DBParser.SQLITE_CLASS_NAME);
+            tmp = Collections.singleton(MEDIA_TYPE);
+        } catch (ClassNotFoundException e) {
+            //driver is not on the class path: advertise no types
+            tmp = Collections.<MediaType>emptySet();
+        }
+        SUPPORTED_TYPES = tmp;
+    }
+
+    /**
+     * @param context not used
+     * @return the sqlite3 media type if the JDBC driver class could be
+     * loaded when this parser was constructed, otherwise an empty set
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Delegates the parse to a new {@link SQLite3DBParser}.
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
+        SQLite3DBParser p = new SQLite3DBParser();
+        p.parse(stream, handler, metadata, context);
+    }
+}