You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/06 04:50:57 UTC
svn commit: r1723223 [3/32] - in /tika/branches/2.x:
tika-core/src/test/resources/META-INF/
tika-core/src/test/resources/META-INF/services/ tika-parser-modules/
tika-parser-modules/tika-advanced-module/
tika-parser-modules/tika-advanced-module/src/ tik...
Added: tika/branches/2.x/tika-parser-modules/tika-code-module/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-module/pom.xml?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-code-module/pom.xml (added)
+++ tika/branches/2.x/tika-parser-modules/tika-code-module/pom.xml Wed Jan 6 03:50:50 2016
@@ -0,0 +1,82 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-code-module</artifactId>
+ <name>Apache Tika Code Module</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <!-- Core Tika API (org.apache.tika.* classes such as Parser, Metadata, MediaType). -->
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+
+ <!-- Test classes of tika-core; test scope only. -->
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <!-- ASM class-file reader (org.objectweb.asm.*), imported by XHTMLClassVisitor. -->
+ <dependency>
+ <groupId>org.ow2.asm</groupId>
+ <artifactId>asm</artifactId>
+ <version>5.0.4</version>
+ </dependency>
+ <!-- JHighlight renderers (com.uwyn.jhighlight.*), imported by SourceCodeParser. -->
+ <dependency>
+ <groupId>org.codelibs</groupId>
+ <artifactId>jhighlight</artifactId>
+ <version>1.0.2</version>
+ </dependency>
+ <!-- TagSoup HTML parser, used by SourceCodeParser to replay the highlighted HTML as SAX events. -->
+ <dependency>
+ <groupId>org.ccil.cowan.tagsoup</groupId>
+ <artifactId>tagsoup</artifactId>
+ <version>1.2.1</version>
+ </dependency>
+ <!-- Commons IO; SourceCodeParser imports CloseShieldInputStream from it. -->
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>${commons.io.version}</version>
+ </dependency>
+ <!-- Apache POI; ExecutableParser imports its IOUtils and LittleEndian helpers. -->
+ <dependency>
+ <groupId>org.apache.poi</groupId>
+ <artifactId>poi</artifactId>
+ <version>${poi.version}</version>
+ </dependency>
+ <!-- Text module, test scope only. NOTE(review): presumably needed by this module's tests; confirm. -->
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-text-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
+</project>
\ No newline at end of file
Added: tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.asm;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for Java .class files.
+ */
+public class ClassParser extends AbstractParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = -3531388963354454357L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.singleton(MediaType.application("java-vm"));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ new XHTMLClassVisitor(handler, metadata).parse(stream);
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,323 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.asm;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.objectweb.asm.AnnotationVisitor;
+import org.objectweb.asm.Attribute;
+import org.objectweb.asm.ClassReader;
+import org.objectweb.asm.ClassVisitor;
+import org.objectweb.asm.FieldVisitor;
+import org.objectweb.asm.MethodVisitor;
+import org.objectweb.asm.Opcodes;
+import org.objectweb.asm.Type;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Class visitor that generates XHTML SAX events to describe the
+ * contents of the visited class.
+ */
+class XHTMLClassVisitor extends ClassVisitor {
+
+    /** Receives the XHTML SAX events produced while visiting the class. */
+    private final XHTMLContentHandler xhtml;
+
+    /** Receives the class title and resource name in {@link #visit}. */
+    private final Metadata metadata;
+
+    /** Type of the class being visited; assigned in {@link #visit}. */
+    private Type type;
+
+    /**
+     * Package of the visited class; stays {@code null} when the class name
+     * contains no dot (default package).
+     */
+    private String packageName;
+
+    /**
+     * Creates a visitor that writes to the given handler.
+     *
+     * @param handler  target of the generated SAX events
+     * @param metadata metadata object to populate
+     */
+    public XHTMLClassVisitor(ContentHandler handler, Metadata metadata) {
+        super(Opcodes.ASM5);
+        this.xhtml = new XHTMLContentHandler(handler, metadata);
+        this.metadata = metadata;
+    }
+
+    /**
+     * Reads a class file from the stream and visits it with this visitor,
+     * skipping stack frames and method code.
+     * <p>
+     * The visit callbacks wrap any {@link SAXException} in a
+     * {@link RuntimeException}; that wrapping is undone here. Every other
+     * runtime failure is reported as a {@link TikaException}.
+     *
+     * @param stream the class file bytes
+     * @throws TikaException if reading or visiting the class fails at runtime
+     * @throws SAXException  if the content handler rejected an event
+     * @throws IOException   if the stream cannot be read
+     */
+    public void parse(InputStream stream)
+            throws TikaException, SAXException, IOException {
+        try {
+            ClassReader reader = new ClassReader(stream);
+            reader.accept(this, ClassReader.SKIP_FRAMES | ClassReader.SKIP_CODE);
+        } catch (RuntimeException e) {
+            if (e.getCause() instanceof SAXException) {
+                throw (SAXException) e.getCause();
+            } else {
+                throw new TikaException("Failed to parse a Java class", e);
+            }
+        }
+    }
+
+    /**
+     * Visits the class header: records title and resource name in the
+     * metadata, starts the document and writes the package line and the
+     * class/interface/enum declaration up to the opening brace.
+     */
+    @Override
+    public void visit(
+            int version, int access, String name, String signature,
+            String superName, String[] interfaces) {
+        type = Type.getObjectType(name);
+
+        String className = type.getClassName();
+        int dot = className.lastIndexOf('.');
+        if (dot != -1) {
+            packageName = className.substring(0, dot);
+            className = className.substring(dot + 1);
+        }
+
+        metadata.set(TikaCoreProperties.TITLE, className);
+        metadata.set(Metadata.RESOURCE_NAME_KEY, className + ".class");
+
+        try {
+            xhtml.startDocument();
+            xhtml.startElement("pre");
+
+            if (packageName != null) {
+                writeKeyword("package");
+                xhtml.characters(" " + packageName + ";\n");
+            }
+
+            writeAccess(access);
+            if (isSet(access, Opcodes.ACC_INTERFACE)) {
+                writeKeyword("interface");
+                writeSpace();
+                writeType(type);
+                writeSpace();
+                writeInterfaces("extends", interfaces);
+            } else if (isSet(access, Opcodes.ACC_ENUM)) {
+                writeKeyword("enum");
+                writeSpace();
+                writeType(type);
+                writeSpace();
+            } else {
+                writeKeyword("class");
+                writeSpace();
+                writeType(type);
+                writeSpace();
+                if (superName != null) {
+                    Type superType = Type.getObjectType(superName);
+                    // An implicit java.lang.Object superclass is not printed.
+                    if (!superType.getClassName().equals("java.lang.Object")) {
+                        writeKeyword("extends");
+                        writeSpace();
+                        writeType(superType);
+                        writeSpace();
+                    }
+                }
+                writeInterfaces("implements", interfaces);
+            }
+            xhtml.characters("{\n");
+        } catch (SAXException e) {
+            // Visitor callbacks cannot throw checked exceptions; parse() unwraps this.
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Writes the keyword followed by a comma-separated list of the given
+     * interfaces and a trailing space; writes nothing for a null or empty list.
+     *
+     * @param keyword    {@code "extends"} or {@code "implements"}
+     * @param interfaces internal names of the interfaces, may be {@code null}
+     */
+    private void writeInterfaces(String keyword, String[] interfaces)
+            throws SAXException {
+        if (interfaces != null && interfaces.length > 0) {
+            writeKeyword(keyword);
+            String separator = " ";
+            for (String iface : interfaces) {
+                xhtml.characters(separator);
+                writeType(Type.getObjectType(iface));
+                separator = ", ";
+            }
+            writeSpace();
+        }
+    }
+
+    /**
+     * Writes the closing brace and ends the {@code pre} element and the document.
+     */
+    @Override
+    public void visitEnd() {
+        try {
+            xhtml.characters("}\n");
+            xhtml.endElement("pre");
+            xhtml.endDocument();
+        } catch (SAXException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Ignored.
+     */
+    @Override
+    public void visitOuterClass(String owner, String name, String desc) {
+    }
+
+    /**
+     * Ignored.
+     */
+    @Override
+    public void visitSource(String source, String debug) {
+    }
+
+
+    /**
+     * Ignored.
+     */
+    @Override
+    public AnnotationVisitor visitAnnotation(String desc, boolean visible) {
+        return null;
+    }
+
+    /**
+     * Ignored.
+     */
+    @Override
+    public void visitAttribute(Attribute attr) {
+    }
+
+    /**
+     * Ignored.
+     */
+    @Override
+    public void visitInnerClass(
+            String name, String outerName, String innerName, int access) {
+    }
+
+    /**
+     * Visits a field. Non-synthetic fields are written as one indented
+     * declaration line; a static field with a non-null constant value also
+     * gets an {@code = value} initializer.
+     *
+     * @return always {@code null}; the field is not visited further
+     */
+    @Override
+    public FieldVisitor visitField(
+            int access, String name, String desc, String signature,
+            Object value) {
+        if (!isSet(access, Opcodes.ACC_SYNTHETIC)) {
+            try {
+                xhtml.characters("    ");
+                writeAccess(access);
+                writeType(Type.getType(desc));
+                writeSpace();
+                writeIdentifier(name);
+
+                if (isSet(access, Opcodes.ACC_STATIC) && value != null) {
+                    xhtml.characters(" = ");
+                    xhtml.characters(value.toString());
+                }
+
+                writeSemicolon();
+                writeNewline();
+            } catch (SAXException e) {
+                throw new RuntimeException(e);
+            }
+        }
+
+        return null;
+    }
+
+    /**
+     * Visits a method. Non-synthetic methods are written as one indented
+     * line: modifiers, return type, name (the class type for
+     * {@code <init>}), argument types and any declared exceptions.
+     *
+     * @return always {@code null}; the method is not visited further
+     */
+    @Override
+    public MethodVisitor visitMethod(
+            int access, String name, String desc, String signature,
+            String[] exceptions) {
+        if (!isSet(access, Opcodes.ACC_SYNTHETIC)) {
+            try {
+                xhtml.characters("    ");
+                writeAccess(access);
+                writeType(Type.getReturnType(desc));
+                writeSpace();
+                if ("<init>".equals(name)) {
+                    writeType(type);
+                } else {
+                    writeIdentifier(name);
+                }
+
+                xhtml.characters("(");
+                String separator = "";
+                for (Type arg : Type.getArgumentTypes(desc)) {
+                    xhtml.characters(separator);
+                    writeType(arg);
+                    separator = ", ";
+                }
+                xhtml.characters(")");
+
+                if (exceptions != null && exceptions.length > 0) {
+                    writeSpace();
+                    writeKeyword("throws");
+                    separator = " ";
+                    for (String exception : exceptions) {
+                        xhtml.characters(separator);
+                        writeType(Type.getObjectType(exception));
+                        separator = ", ";
+                    }
+                }
+
+                writeSemicolon();
+                writeNewline();
+            } catch (SAXException e) {
+                throw new RuntimeException(e);
+            }
+        }
+
+        return null;
+    }
+
+    /** Writes the identifier inside a {@code span} of class {@code java-identifier}. */
+    private void writeIdentifier(String identifier) throws SAXException {
+        xhtml.startElement("span", "class", "java-identifier");
+        xhtml.characters(identifier);
+        xhtml.endElement("span");
+    }
+
+    /** Writes the keyword inside a {@code span} of class {@code java-keyword}. */
+    private void writeKeyword(String keyword) throws SAXException {
+        xhtml.startElement("span", "class", "java-keyword");
+        xhtml.characters(keyword);
+        xhtml.endElement("span");
+    }
+
+    /** Writes a semicolon. */
+    private void writeSemicolon() throws SAXException {
+        xhtml.characters(";");
+    }
+
+    /** Writes a single space. */
+    private void writeSpace() throws SAXException {
+        xhtml.characters(" ");
+    }
+
+    /** Writes a newline. */
+    private void writeNewline() throws SAXException {
+        xhtml.characters("\n");
+    }
+
+    /**
+     * Writes a keyword, each followed by a space, for every modifier flag
+     * set in the given access mask, in the fixed order listed below.
+     */
+    private void writeAccess(int access) throws SAXException {
+        writeAccess(access, Opcodes.ACC_PRIVATE, "private");
+        writeAccess(access, Opcodes.ACC_PROTECTED, "protected");
+        writeAccess(access, Opcodes.ACC_PUBLIC, "public");
+        writeAccess(access, Opcodes.ACC_STATIC, "static");
+        writeAccess(access, Opcodes.ACC_FINAL, "final");
+        writeAccess(access, Opcodes.ACC_ABSTRACT, "abstract");
+        writeAccess(access, Opcodes.ACC_SYNCHRONIZED, "synchronized");
+        writeAccess(access, Opcodes.ACC_TRANSIENT, "transient");
+        writeAccess(access, Opcodes.ACC_VOLATILE, "volatile");
+        writeAccess(access, Opcodes.ACC_NATIVE, "native");
+    }
+
+    /** Writes the keyword and a trailing space when {@code code} is set in {@code access}. */
+    private void writeAccess(int access, int code, String keyword)
+            throws SAXException {
+        if (isSet(access, code)) {
+            writeKeyword(keyword);
+            xhtml.characters(" ");
+        }
+    }
+
+    /**
+     * Writes a type name, dropping the prefix of the visited class's own
+     * package or of {@code java.lang}.
+     *
+     * @param type the type to write
+     */
+    private void writeType(Type type) throws SAXException {
+        String name = type.getClassName();
+        // packageName is null for default-package classes; without this guard the
+        // comparison would run against the literal prefix "null.".
+        if (packageName != null && name.startsWith(packageName + ".")) {
+            xhtml.characters(name.substring(packageName.length() + 1));
+        } else if (name.startsWith("java.lang.")) {
+            xhtml.characters(name.substring("java.lang.".length()));
+        } else {
+            xhtml.characters(name);
+        }
+    }
+
+    /** Returns whether any bit of {@code flag} is set in {@code value}. */
+    private static boolean isSet(int value, int flag) {
+        return (value & flag) != 0;
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.code;
+
+import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.CPP;
+import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.GROOVY;
+import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.JAVA;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringReader;
+import java.nio.charset.Charset;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.ccil.cowan.tagsoup.HTMLSchema;
+import org.ccil.cowan.tagsoup.Schema;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import com.uwyn.jhighlight.renderer.Renderer;
+import com.uwyn.jhighlight.renderer.XhtmlRendererFactory;
+/**
+ * Generic Source code parser for Java, Groovy, C++.
+ * Note: This parser uses the JHighlight library (https://github.com/codelibs/jhighlight), which is dual-licensed under CDDL/LGPL.
+ *
+ * @author Hong-Thai.Nguyen
+ * @since 1.6
+ */
+public class SourceCodeParser implements Parser {
+
+    private static final long serialVersionUID = -4543476498190054160L;
+
+    /** Finds an {@code @author} tag and captures the remainder of the line. */
+    private static final Pattern authorPattern = Pattern.compile("(?im)@author (.*) *$");
+
+    /**
+     * Maps each supported media type to the name of the JHighlight renderer
+     * used for it. Unmodifiable, so the key set handed out by
+     * {@link #getSupportedTypes(ParseContext)} cannot be altered by callers.
+     */
+    private static final Map<MediaType, String> TYPES_TO_RENDERER;
+    static {
+        Map<MediaType, String> renderers = new HashMap<MediaType, String>();
+        renderers.put(MediaType.text("x-c++src"), CPP);
+        renderers.put(MediaType.text("x-java-source"), JAVA);
+        renderers.put(MediaType.text("x-groovy"), GROOVY);
+        // Fully qualified because java.util.Collections is not among this file's imports.
+        TYPES_TO_RENDERER = java.util.Collections.unmodifiableMap(renderers);
+    }
+
+    /** Fallback service loader when the parse context supplies none. */
+    private static final ServiceLoader LOADER = new ServiceLoader(SourceCodeParser.class.getClassLoader());
+
+    /** Fallback TagSoup schema when the parse context supplies none. */
+    private static final Schema HTML_SCHEMA = new HTMLSchema();
+
+    /**
+     * Returns the media types this parser can render.
+     *
+     * @param context parse context (not consulted)
+     * @return an unmodifiable set of the supported media types
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return TYPES_TO_RENDERER.keySet();
+    }
+
+    /**
+     * Reads the source text, records encoding, line count ({@code LoC}) and
+     * any {@code @author} values in the metadata, renders the text to
+     * highlighted HTML and replays that HTML to the handler through TagSoup.
+     * <p>
+     * Nothing is rendered unless the metadata carries both a content type and
+     * a resource name. The caller's stream is not closed.
+     *
+     * @param stream   the source code to parse
+     * @param handler  receiver of the SAX events for the highlighted HTML
+     * @param metadata must provide content type and resource name; updated
+     *                 with encoding, {@code LoC} and creators
+     * @param context  may supply a {@link ServiceLoader} and a {@link Schema}
+     * @throws TikaException if the content type in the metadata cannot be parsed
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        try (AutoDetectReader reader = new AutoDetectReader(
+                new CloseShieldInputStream(stream), metadata,
+                context.get(ServiceLoader.class, LOADER))) {
+            Charset charset = reader.getCharset();
+            String mediaType = metadata.get(Metadata.CONTENT_TYPE);
+            String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+            if (mediaType != null && name != null) {
+                MediaType type = MediaType.parse(mediaType);
+                if (type == null) {
+                    // Previously this surfaced as a NullPointerException on type.toString().
+                    throw new TikaException("unparseable content type " + mediaType);
+                }
+                metadata.set(Metadata.CONTENT_TYPE, type.toString());
+                metadata.set(Metadata.CONTENT_ENCODING, charset.name());
+
+                // Looked up once; the value is the same for every line.
+                String lineSeparator = System.getProperty("line.separator");
+                StringBuilder out = new StringBuilder();
+                String line;
+                int nbLines = 0;
+                while ((line = reader.readLine()) != null) {
+                    out.append(line).append(lineSeparator);
+                    String author = parserAuthor(line);
+                    if (author != null) {
+                        metadata.add(TikaCoreProperties.CREATOR, author);
+                    }
+                    nbLines++;
+                }
+                metadata.set("LoC", String.valueOf(nbLines));
+                Renderer renderer = getRenderer(type.toString());
+
+                String codeAsHtml = renderer.highlight(name, out.toString(), charset.name(), false);
+
+                Schema schema = context.get(Schema.class, HTML_SCHEMA);
+
+                org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();
+                parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
+                parser.setContentHandler(handler);
+                parser.parse(new InputSource(new StringReader(codeAsHtml)));
+            }
+        }
+
+    }
+
+    /**
+     * Picks the renderer registered for the given content type. Parameters
+     * such as {@code charset} are ignored: only the base type is looked up.
+     *
+     * @param mimeType content type string, possibly with parameters
+     * @return the renderer for that type
+     * @throws RuntimeException if the string cannot be parsed or no renderer
+     *                          is registered for its base type
+     */
+    private Renderer getRenderer(String mimeType) {
+        MediaType mt = MediaType.parse(mimeType);
+        String type = (mt == null) ? null : TYPES_TO_RENDERER.get(mt.getBaseType());
+        if (type == null) {
+            throw new RuntimeException("unparseable content type " + mimeType);
+        }
+        return XhtmlRendererFactory.getRenderer(type);
+    }
+
+
+    /**
+     * Extracts the author named by an {@code @author} tag on the line.
+     *
+     * @param line one line of source text
+     * @return the trimmed author text, or {@code null} if the line has no tag
+     */
+    private String parserAuthor(String line) {
+        Matcher m = authorPattern.matcher(line);
+        if (m.find()) {
+            return m.group(1).trim();
+        }
+
+        return null;
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,406 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.executable;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.sql.Date;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.poi.util.IOUtils;
+import org.apache.poi.util.LittleEndian;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.EndianUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for executable files. Currently supports ELF and PE
+ */
+public class ExecutableParser extends AbstractParser implements MachineMetadata {
+    /** Serial version UID */
+    private static final long serialVersionUID = 32128791892482L;
+
+    private static final MediaType PE_EXE = MediaType.application("x-msdownload");
+    private static final MediaType ELF_GENERAL = MediaType.application("x-elf");
+    private static final MediaType ELF_OBJECT = MediaType.application("x-object");
+    private static final MediaType ELF_EXECUTABLE = MediaType.application("x-executable");
+    private static final MediaType ELF_SHAREDLIB = MediaType.application("x-sharedlib");
+    private static final MediaType ELF_COREDUMP = MediaType.application("x-coredump");
+
+    /** All media types this parser accepts; unmodifiable. */
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    PE_EXE,
+                    ELF_GENERAL,
+                    ELF_OBJECT, ELF_EXECUTABLE, ELF_SHAREDLIB, ELF_COREDUMP
+            )));
+
+    /**
+     * Returns the media types this parser accepts.
+     *
+     * @param context parse context (not consulted)
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Reads the first four bytes to tell PE ("MZ") from ELF (0x7f "ELF")
+     * and delegates to the matching header parser. Only metadata is
+     * extracted; no text content is written.
+     *
+     * @param stream   the executable to inspect
+     * @param handler  receiver of the (content-free) XHTML document
+     * @param metadata populated with platform, machine and type information
+     * @param context  parse context (not consulted)
+     */
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // We only do metadata, for now
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
+        // What kind is it?
+        byte[] first4 = new byte[4];
+        IOUtils.readFully(stream, first4);
+
+        if (first4[0] == (byte)'M' && first4[1] == (byte)'Z') {
+            parsePE(xhtml, metadata, stream, first4);
+        } else if (first4[0] == (byte)0x7f && first4[1] == (byte)'E' &&
+                first4[2] == (byte)'L' && first4[3] == (byte)'F') {
+            parseELF(xhtml, metadata, stream, first4);
+        }
+
+
+        // Finish everything
+        xhtml.endDocument();
+    }
+
+    /**
+     * Parses a DOS or Windows PE file.
+     * <p>
+     * Always records the PE content type and the Windows platform. When a
+     * PE signature is found at the offset stored at 0x3c, the creation date,
+     * machine type, endianness and architecture bits are recorded as well;
+     * otherwise the file is treated as old-style MS-DOS and nothing more is set.
+     *
+     * @param xhtml    content handler (not written to here)
+     * @param metadata metadata to populate
+     * @param stream   stream positioned just after the first four bytes
+     * @param first4   the four bytes already consumed from the stream
+     */
+    public void parsePE(XHTMLContentHandler xhtml, Metadata metadata,
+            InputStream stream, byte[] first4) throws TikaException, IOException {
+        metadata.add(Metadata.CONTENT_TYPE, PE_EXE.toString());
+        metadata.set(PLATFORM, PLATFORM_WINDOWS);
+
+        // Skip over the MS-DOS bit
+        byte[] msdosSection = new byte[0x3c-4];
+        IOUtils.readFully(stream, msdosSection);
+
+        // Grab the PE header offset
+        int peOffset = LittleEndian.readInt(stream);
+
+        // Sanity check - while it may go anywhere, it's normally in the first few kb.
+        // 0x40 bytes have been consumed by now (4 magic + 0x38 skipped + 4 offset),
+        // so a header cannot start before 0x40.
+        if (peOffset > 4096 || peOffset < 0x40) return;
+
+        // Skip the rest of the MS-DOS stub (if PE), until we reach what should
+        // be the PE header (if this is a PE executable). Read-and-discard is used
+        // because InputStream.skip() may skip fewer bytes than requested.
+        byte[] dosStub = new byte[peOffset - 0x40];
+        if (dosStub.length > 0) {
+            IOUtils.readFully(stream, dosStub);
+        }
+
+        // Read the PE header
+        byte[] pe = new byte[24];
+        IOUtils.readFully(stream, pe);
+
+        // Check it really is a PE header
+        boolean hasPeSignature =
+                pe[0] == (byte)'P' && pe[1] == (byte)'E' && pe[2]==0 && pe[3]==0;
+        if (!hasPeSignature) {
+            // Old style MS-DOS
+            return;
+        }
+
+        // Read the header values that are used. Layout after the 4-byte signature:
+        //   4: machine (ushort)        6: number of sections (ushort)
+        //   8: creation time (uint)   12: symbol table offset (uint)
+        //  16: number of symbols (uint)
+        //  20: optional header size (ushort)  22: characteristics (ushort)
+        int machine = LittleEndian.getUShort(pe, 4);
+        // Unsigned, so that timestamps beyond 2038 do not turn negative
+        long createdAt = LittleEndian.getUInt(pe, 8);
+
+        // Turn this into helpful metadata
+        Date createdAtD = new Date(createdAt*1000L);
+        metadata.set(Metadata.CREATION_DATE, createdAtD);
+
+        switch(machine) {
+            case 0x14c:
+                metadata.set(MACHINE_TYPE, MACHINE_x86_32);
+                metadata.set(ENDIAN, Endian.LITTLE.getName());
+                metadata.set(ARCHITECTURE_BITS, "32");
+                break;
+            case 0x8664:
+                // AMD64 / x86-64; was previously reported as x86-32
+                metadata.set(MACHINE_TYPE, MACHINE_x86_64);
+                metadata.set(ENDIAN, Endian.LITTLE.getName());
+                metadata.set(ARCHITECTURE_BITS, "64");
+                break;
+            case 0x200:
+                metadata.set(MACHINE_TYPE, MACHINE_IA_64);
+                metadata.set(ENDIAN, Endian.LITTLE.getName());
+                metadata.set(ARCHITECTURE_BITS, "64");
+                break;
+
+            case 0x184:
+                metadata.set(MACHINE_TYPE, MACHINE_ALPHA);
+                metadata.set(ENDIAN, Endian.LITTLE.getName());
+                metadata.set(ARCHITECTURE_BITS, "32");
+                break;
+            case 0x284:
+                metadata.set(MACHINE_TYPE, MACHINE_ALPHA);
+                metadata.set(ENDIAN, Endian.LITTLE.getName());
+                metadata.set(ARCHITECTURE_BITS, "64");
+                break;
+
+            case 0x1c0:
+            case 0x1c4:
+                metadata.set(MACHINE_TYPE, MACHINE_ARM);
+                metadata.set(ENDIAN, Endian.LITTLE.getName());
+                metadata.set(ARCHITECTURE_BITS, "32");
+                break;
+
+            case 0x268:
+                metadata.set(MACHINE_TYPE, MACHINE_M68K);
+                metadata.set(ENDIAN, Endian.BIG.getName());
+                metadata.set(ARCHITECTURE_BITS, "32");
+                break;
+
+            case 0x266:
+            case 0x366:
+            case 0x466:
+                metadata.set(MACHINE_TYPE, MACHINE_MIPS);
+                metadata.set(ENDIAN, Endian.BIG.getName());
+                metadata.set(ARCHITECTURE_BITS, "16");
+                break;
+            case 0x162:
+            case 0x166:
+            case 0x168:
+            case 0x169:
+                metadata.set(MACHINE_TYPE, MACHINE_MIPS);
+                metadata.set(ENDIAN, Endian.LITTLE.getName());
+                metadata.set(ARCHITECTURE_BITS, "16");
+                break;
+
+            case 0x1f0:
+            case 0x1f1:
+                metadata.set(MACHINE_TYPE, MACHINE_PPC);
+                metadata.set(ENDIAN, Endian.LITTLE.getName());
+                metadata.set(ARCHITECTURE_BITS, "32");
+                break;
+
+            case 0x1a2:
+            case 0x1a3:
+                metadata.set(MACHINE_TYPE, MACHINE_SH3);
+                metadata.set(ENDIAN, Endian.BIG.getName());
+                metadata.set(ARCHITECTURE_BITS, "32");
+                break;
+            case 0x1a6:
+                metadata.set(MACHINE_TYPE, MACHINE_SH4);
+                metadata.set(ENDIAN, Endian.BIG.getName());
+                metadata.set(ARCHITECTURE_BITS, "32");
+                break;
+            case 0x1a8:
+                metadata.set(MACHINE_TYPE, MACHINE_SH3);
+                metadata.set(ENDIAN, Endian.BIG.getName());
+                metadata.set(ARCHITECTURE_BITS, "32");
+                break;
+
+            case 0x9041:
+                metadata.set(MACHINE_TYPE, MACHINE_M32R);
+                metadata.set(ENDIAN, Endian.BIG.getName());
+                metadata.set(ARCHITECTURE_BITS, "32");
+                break;
+
+            case 0xebc:
+                metadata.set(MACHINE_TYPE, MACHINE_EFI);
+                break;
+
+            default:
+                metadata.set(MACHINE_TYPE, MACHINE_UNKNOWN);
+                break;
+        }
+    }
+
+    /**
+     * Parses a Unix ELF file.
+     * <p>
+     * Reads the identification bytes and the type and machine fields that
+     * follow them, recording architecture bits, endianness, platform,
+     * content type and machine type where the values are recognised.
+     *
+     * @param xhtml    content handler (not written to here)
+     * @param metadata metadata to populate
+     * @param stream   stream positioned just after the first four bytes
+     * @param first4   the four bytes already consumed from the stream
+     */
+    public void parseELF(XHTMLContentHandler xhtml, Metadata metadata,
+            InputStream stream, byte[] first4) throws TikaException, IOException {
+        // Byte 5 is the architecture
+        int architecture = stream.read();
+        if (architecture == 1) {
+            metadata.set(ARCHITECTURE_BITS, "32");
+        } else if (architecture == 2) {
+            metadata.set(ARCHITECTURE_BITS, "64");
+        }
+
+        // Byte 6 is the endian-ness
+        int endian = stream.read();
+        if (endian == 1) {
+            metadata.set(ENDIAN, Endian.LITTLE.getName());
+        } else if (endian == 2) {
+            metadata.set(ENDIAN, Endian.BIG.getName());
+        }
+
+        // Byte 7 is the elf version; not used, but it must be consumed
+        stream.read();
+
+        // Byte 8 is the OS, if set (lots of compilers don't)
+        // Byte 9 is the OS (specific) ABI version
+        int os = stream.read();
+        int osVer = stream.read();
+        if (os > 0 || osVer > 0)
+        {
+            switch (os) {
+                case 0:
+                    metadata.set(PLATFORM, PLATFORM_SYSV);
+                    break;
+
+                case 1:
+                    metadata.set(PLATFORM, PLATFORM_HPUX);
+                    break;
+
+                case 2:
+                    metadata.set(PLATFORM, PLATFORM_NETBSD);
+                    break;
+
+                case 3:
+                    metadata.set(PLATFORM, PLATFORM_LINUX);
+                    break;
+
+                case 6:
+                    metadata.set(PLATFORM, PLATFORM_SOLARIS);
+                    break;
+
+                case 7:
+                    metadata.set(PLATFORM, PLATFORM_AIX);
+                    break;
+
+                case 8:
+                    metadata.set(PLATFORM, PLATFORM_IRIX);
+                    break;
+
+                case 9:
+                    metadata.set(PLATFORM, PLATFORM_FREEBSD);
+                    break;
+
+                case 10:
+                    metadata.set(PLATFORM, PLATFORM_TRU64);
+                    break;
+
+                case 12:
+                    // NOTE(review): the ELF gABI appears to assign OSABI 12 to
+                    // OpenBSD, not FreeBSD - confirm this mapping.
+                    metadata.set(PLATFORM, PLATFORM_FREEBSD);
+                    break;
+
+                case 64:
+                case 97:
+                    metadata.set(PLATFORM, PLATFORM_ARM);
+                    break;
+
+                case 255:
+                    metadata.set(PLATFORM, PLATFORM_EMBEDDED);
+                    break;
+            }
+        }
+
+        // Bytes 10-16 are padding and lengths
+        byte[] padLength = new byte[7];
+        IOUtils.readFully(stream, padLength);
+
+        // Bytes 16-17 are the object type (LE/BE)
+        // Anything other than the little-endian marker is read as big-endian
+        int type;
+        if (endian == 1) {
+            type = EndianUtils.readUShortLE(stream);
+        } else {
+            type = EndianUtils.readUShortBE(stream);
+        }
+        switch(type) {
+            case 1:
+                metadata.add(Metadata.CONTENT_TYPE, ELF_OBJECT.toString());
+                break;
+
+            case 2:
+                metadata.add(Metadata.CONTENT_TYPE, ELF_EXECUTABLE.toString());
+                break;
+
+            case 3:
+                metadata.add(Metadata.CONTENT_TYPE, ELF_SHAREDLIB.toString());
+                break;
+
+            case 4:
+                metadata.add(Metadata.CONTENT_TYPE, ELF_COREDUMP.toString());
+                break;
+
+            default:
+                metadata.add(Metadata.CONTENT_TYPE, ELF_GENERAL.toString());
+                break;
+        }
+
+        // Bytes 18-19 are the machine (EM_*)
+        int machine;
+        if (endian == 1) {
+            machine = EndianUtils.readUShortLE(stream);
+        } else {
+            machine = EndianUtils.readUShortBE(stream);
+        }
+        switch(machine) {
+            case 2:
+            case 18:
+            case 43:
+                metadata.set(MACHINE_TYPE, MACHINE_SPARC);
+                break;
+            case 3:
+                metadata.set(MACHINE_TYPE, MACHINE_x86_32);
+                break;
+            case 4:
+                metadata.set(MACHINE_TYPE, MACHINE_M68K);
+                break;
+            case 5:
+                metadata.set(MACHINE_TYPE, MACHINE_M88K);
+                break;
+            case 8:
+            case 10:
+                metadata.set(MACHINE_TYPE, MACHINE_MIPS);
+                break;
+            case 7:
+                // NOTE(review): the ELF gABI appears to list EM_860 as 7 and
+                // EM_S370 as 9 - confirm whether this case should be 9.
+                metadata.set(MACHINE_TYPE, MACHINE_S370);
+                break;
+            case 20:
+            case 21:
+                metadata.set(MACHINE_TYPE, MACHINE_PPC);
+                break;
+            case 22:
+                metadata.set(MACHINE_TYPE, MACHINE_S390);
+                break;
+            case 40:
+                metadata.set(MACHINE_TYPE, MACHINE_ARM);
+                break;
+            case 41:
+            case 0x9026:
+                metadata.set(MACHINE_TYPE, MACHINE_ALPHA);
+                break;
+            case 50:
+                metadata.set(MACHINE_TYPE, MACHINE_IA_64);
+                break;
+            case 62:
+                metadata.set(MACHINE_TYPE, MACHINE_x86_64);
+                break;
+            case 75:
+                metadata.set(MACHINE_TYPE, MACHINE_VAX);
+                break;
+            case 88:
+                metadata.set(MACHINE_TYPE, MACHINE_M32R);
+                break;
+        }
+
+
+
+        // Bytes 20-23 are the version
+        // TODO
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/executable/MachineMetadata.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/executable/MachineMetadata.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/executable/MachineMetadata.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/executable/MachineMetadata.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.executable;
+
+import org.apache.tika.metadata.Property;
+
+/**
+ * Metadata for describing machines, such as their
+ * architecture, type and endian-ness
+ */
+public interface MachineMetadata {
+ public static final String PREFIX = "machine:";
+
+ public static Property ARCHITECTURE_BITS = Property.internalClosedChoise(PREFIX+"architectureBits",
+ "8", "16", "32", "64");
+
+ public static final String PLATFORM_SYSV = "System V";
+ public static final String PLATFORM_HPUX = "HP-UX";
+ public static final String PLATFORM_NETBSD = "NetBSD";
+ public static final String PLATFORM_LINUX = "Linux";
+ public static final String PLATFORM_SOLARIS = "Solaris";
+ public static final String PLATFORM_AIX = "AIX";
+ public static final String PLATFORM_IRIX = "IRIX";
+ public static final String PLATFORM_FREEBSD = "FreeBSD";
+ public static final String PLATFORM_TRU64 = "Tru64";
+ public static final String PLATFORM_ARM = "ARM"; // ARM architecture ABI
+ public static final String PLATFORM_EMBEDDED = "Embedded"; // Stand-alone (embedded) ABI
+ public static final String PLATFORM_WINDOWS = "Windows";
+
+ public static Property PLATFORM = Property.internalClosedChoise(PREFIX+"platform",
+ PLATFORM_SYSV, PLATFORM_HPUX, PLATFORM_NETBSD, PLATFORM_LINUX,
+ PLATFORM_SOLARIS, PLATFORM_AIX, PLATFORM_IRIX, PLATFORM_FREEBSD, PLATFORM_TRU64,
+ PLATFORM_ARM, PLATFORM_EMBEDDED, PLATFORM_WINDOWS);
+
+ public static final String MACHINE_x86_32 = "x86-32";
+ public static final String MACHINE_x86_64 = "x86-64";
+ public static final String MACHINE_IA_64 = "IA-64";
+ public static final String MACHINE_SPARC = "SPARC";
+ public static final String MACHINE_M68K = "Motorola-68000";
+ public static final String MACHINE_M88K = "Motorola-88000";
+ public static final String MACHINE_MIPS = "MIPS";
+ public static final String MACHINE_PPC = "PPC";
+ public static final String MACHINE_S370 = "S370";
+ public static final String MACHINE_S390 = "S390";
+ public static final String MACHINE_ARM = "ARM";
+ public static final String MACHINE_VAX = "Vax";
+ public static final String MACHINE_ALPHA = "Alpha";
+ public static final String MACHINE_EFI = "EFI"; // EFI ByteCode
+ public static final String MACHINE_M32R = "M32R";
+ public static final String MACHINE_SH3 = "SH3";
+ public static final String MACHINE_SH4 = "SH4";
+ public static final String MACHINE_SH5 = "SH5";
+ public static final String MACHINE_UNKNOWN = "Unknown";
+
+ public static Property MACHINE_TYPE = Property.internalClosedChoise(PREFIX+"machineType",
+ MACHINE_x86_32, MACHINE_x86_64, MACHINE_IA_64, MACHINE_SPARC,
+ MACHINE_M68K, MACHINE_M88K, MACHINE_MIPS, MACHINE_PPC,
+ MACHINE_S370, MACHINE_S390,
+ MACHINE_ARM, MACHINE_VAX, MACHINE_ALPHA, MACHINE_EFI, MACHINE_M32R,
+ MACHINE_SH3, MACHINE_SH4, MACHINE_SH5, MACHINE_UNKNOWN);
+
+ public static final class Endian {
+ private String name;
+ private boolean msb;
+ public String getName() { return name; }
+ @SuppressWarnings("unused")
+ public boolean isMSB() { return msb; }
+ @SuppressWarnings("unused")
+ public String getMSB() { if(msb) { return "MSB"; } else { return "LSB"; } }
+ private Endian(String name, boolean msb) { this.name = name; this.msb = msb; }
+
+ public static final Endian LITTLE = new Endian("Little", false);
+ public static final Endian BIG = new Endian("Big", true);
+ }
+ public static Property ENDIAN = Property.internalClosedChoise(PREFIX+"endian",
+ Endian.LITTLE.name, Endian.BIG.name);
+}
Added: tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added)
+++ tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Wed Jan 6 03:50:50 2016
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+org.apache.tika.parser.asm.ClassParser
+org.apache.tika.parser.code.SourceCodeParser
+org.apache.tika.parser.executable.ExecutableParser
Added: tika/branches/2.x/tika-parser-modules/tika-code-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-code-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-code-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.asm;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import org.apache.tika.Tika;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Test;
+
+/**
+ * Test case for parsing Java class files.
+ */
+public class ClassParserTest {
+
+ @Test
+ public void testClassParsing() throws Exception {
+ String path = "/test-documents/AutoDetectParser.class";
+ Metadata metadata = new Metadata();
+ String content = new Tika().parseToString(
+ ClassParserTest.class.getResourceAsStream(path), metadata);
+
+ assertEquals("AutoDetectParser", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals(
+ "AutoDetectParser.class",
+ metadata.get(Metadata.RESOURCE_NAME_KEY));
+
+ assertTrue(content.contains("package org.apache.tika.parser;"));
+ assertTrue(content.contains(
+ "class AutoDetectParser extends CompositeParser"));
+ assertTrue(content.contains(
+ "private org.apache.tika.mime.MimeTypes types"));
+ assertTrue(content.contains(
+ "public void parse("
+ + "java.io.InputStream, org.xml.sax.ContentHandler,"
+ + " org.apache.tika.metadata.Metadata) throws"
+ + " java.io.IOException, org.xml.sax.SAXException,"
+ + " org.apache.tika.exception.TikaException;"));
+ assertTrue(content.contains(
+ "private byte[] getPrefix(java.io.InputStream, int)"
+ + " throws java.io.IOException;"));
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.code;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.util.Set;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.junit.Test;
+
+/**
+ * Tests for {@link SourceCodeParser}: supported media types, HTML and
+ * plain-text rendering, and the metadata extracted from source files.
+ */
+public class SourceCodeParserTest extends TikaTest {
+
+    private final SourceCodeParser sourceCodeParser = new SourceCodeParser();
+
+    /** The parser advertises the source-code media types and does not claim text/html. */
+    @Test
+    public void testSupportTypes() throws Exception {
+        Set<MediaType> supportedTypes = sourceCodeParser.getSupportedTypes(new ParseContext());
+        assertTrue(supportedTypes.contains(new MediaType("text", "x-java-source")));
+        assertTrue(supportedTypes.contains(new MediaType("text", "x-groovy")));
+        assertTrue(supportedTypes.contains(new MediaType("text", "x-c++src")));
+
+        assertFalse(sourceCodeParser.getSupportedTypes(new ParseContext()).contains(new MediaType("text", "html")));
+    }
+
+    /** The XML output starts with the html element and contains keyword spans and line breaks. */
+    @Test
+    public void testHTMLRenderWithReturnLine() throws Exception {
+        String htmlContent = getXML(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser, createMetadata("text/x-java-source")).xml;
+
+        assertTrue(htmlContent.indexOf("<html:html lang=\"en\" xml:lang=\"en\"") == 0);
+        assertTrue(htmlContent.indexOf("<html:span class=\"java_keyword\">public</span><html:span class=\"java_plain\">") > 0);
+        assertTrue(htmlContent.indexOf("<html:span class=\"java_keyword\">static</span>") > 0);
+        assertTrue(htmlContent.indexOf("<html:br clear=\"none\" />") > 0);
+    }
+
+    /** Plain-text output is non-empty and contains no "html", for a file and for an in-memory snippet. */
+    @Test
+    public void testTextRender() throws Exception {
+        String textContent = getText(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser, createMetadata("text/x-java-source"));
+
+        assertTrue(textContent.length() > 0);
+        assertTrue(textContent.indexOf("html") < 0);
+
+        textContent = getText(new ByteArrayInputStream("public class HelloWorld {}".getBytes(UTF_8)), sourceCodeParser, createMetadata("text/x-java-source"));
+        assertTrue(textContent.length() > 0);
+        assertTrue(textContent.indexOf("html") < 0);
+    }
+
+    /** The "LoC" metadata entry for the Groovy sample is "9". */
+    @Test
+    public void testLoC() throws Exception {
+        Metadata metadata = createMetadata("text/x-groovy");
+        getText(getResourceAsStream("/test-documents/testGROOVY.groovy"), sourceCodeParser, metadata);
+
+        // assertEquals takes (expected, actual); this order keeps a failure
+        // message from reporting the two values the wrong way round.
+        assertEquals("9", metadata.get("LoC"));
+    }
+
+    /** The creator metadata is extracted from the C++ sample. */
+    @Test
+    public void testAuthor() throws Exception {
+        Metadata metadata = createMetadata("text/x-c++src");
+        getText(getResourceAsStream("/test-documents/testCPP.cpp"), sourceCodeParser, metadata);
+
+        assertEquals("Hong-Thai Nguyen", metadata.get(TikaCoreProperties.CREATOR));
+    }
+
+    /** With a text/plain content-type hint, AutoDetectParser output still contains the raw source line. */
+    @Test
+    public void testReturnContentAsIsForTextHandler() throws Exception {
+        String strContent = getXML(getResourceAsStream("/test-documents/testJAVA.java"), new AutoDetectParser(), createMetadata("text/plain")).xml;
+
+        assertTrue(strContent.indexOf("public class HelloWorld {") > 0);
+    }
+
+    /**
+     * Builds the metadata passed to the parser for a test document.
+     *
+     * @param mimeType value stored under {@link Metadata#CONTENT_TYPE}
+     * @return metadata with the resource name "testFile" and the given content type
+     */
+    private Metadata createMetadata(String mimeType) {
+        Metadata metadata = new Metadata();
+        metadata.add(Metadata.RESOURCE_NAME_KEY, "testFile");
+        metadata.add(Metadata.CONTENT_TYPE, mimeType);
+        return metadata;
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-code-module/src/test/java/org/apache/tika/parser/executable/ExecutableParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-module/src/test/java/org/apache/tika/parser/executable/ExecutableParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-code-module/src/test/java/org/apache/tika/parser/executable/ExecutableParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-code-module/src/test/java/org/apache/tika/parser/executable/ExecutableParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.executable;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Runs {@link ExecutableParser} over sample Windows and Linux x86-32
+ * binaries and checks the reported content type and machine metadata.
+ */
+public class ExecutableParserTest {
+
+    private static final String WINDOWS_X86_32 = "/test-documents/testWindows-x86-32.exe";
+    private static final String LINUX_X86_32 = "/test-documents/testLinux-x86-32";
+
+    @Test
+    public void testWin32Parser() throws Exception {
+        try (InputStream stream = ExecutableParserTest.class.getResourceAsStream(WINDOWS_X86_32)) {
+            Metadata extracted = new Metadata();
+            ContentHandler body = new BodyContentHandler();
+            ExecutableParser parser = new ExecutableParser();
+            parser.parse(stream, body, extracted, new ParseContext());
+
+            assertEquals("application/x-msdownload", extracted.get(Metadata.CONTENT_TYPE));
+            assertEquals("2012-05-13T13:40:11Z", extracted.get(Metadata.CREATION_DATE));
+
+            assertEquals(ExecutableParser.MACHINE_x86_32, extracted.get(ExecutableParser.MACHINE_TYPE));
+            assertEquals("Little", extracted.get(ExecutableParser.ENDIAN));
+            assertEquals("32", extracted.get(ExecutableParser.ARCHITECTURE_BITS));
+            assertEquals("Windows", extracted.get(ExecutableParser.PLATFORM));
+
+            // No text yet
+            assertEquals("", body.toString());
+        }
+    }
+
+    @Test
+    public void testElfParser_x86_32() throws Exception {
+        try (InputStream stream = ExecutableParserTest.class.getResourceAsStream(LINUX_X86_32)) {
+            Metadata extracted = new Metadata();
+            ContentHandler body = new BodyContentHandler();
+            ExecutableParser parser = new ExecutableParser();
+            parser.parse(stream, body, extracted, new ParseContext());
+
+            assertEquals("application/x-executable", extracted.get(Metadata.CONTENT_TYPE));
+
+            assertEquals(ExecutableParser.MACHINE_x86_32, extracted.get(ExecutableParser.MACHINE_TYPE));
+            assertEquals("Little", extracted.get(ExecutableParser.ENDIAN));
+            assertEquals("32", extracted.get(ExecutableParser.ARCHITECTURE_BITS));
+            // NOTE(review): the platform assertion below is disabled; presumably
+            // the platform is not reported for this file yet -- confirm before enabling.
+//          assertEquals("Linux",
+//                  extracted.get(ExecutableParser.PLATFORM));
+
+            // No text yet
+            assertEquals("", body.toString());
+        }
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-database-module/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-database-module/pom.xml?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-database-module/pom.xml (added)
+++ tika/branches/2.x/tika-parser-modules/tika-database-module/pom.xml Wed Jan 6 03:50:50 2016
@@ -0,0 +1,69 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-database-module</artifactId>
+ <name>Apache Tika Database Module</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <!-- Provided-scope dependency: applies to sqlite-jdbc only; the entries after it use the default or test scope -->
+ <dependency>
+ <groupId>org.xerial</groupId>
+ <artifactId>sqlite-jdbc</artifactId>
+ <version>3.8.10.1</version>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>${commons.io.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-office-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
+</project>
\ No newline at end of file
Added: tika/branches/2.x/tika-parser-modules/tika-database-module/src/appended-resources/META-INF/LICENSE
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-database-module/src/appended-resources/META-INF/LICENSE?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-database-module/src/appended-resources/META-INF/LICENSE (added)
+++ tika/branches/2.x/tika-parser-modules/tika-database-module/src/appended-resources/META-INF/LICENSE Wed Jan 6 03:50:50 2016
@@ -0,0 +1,9 @@
+APACHE TIKA SUBCOMPONENTS
+
+Apache Tika includes a number of subcomponents with separate copyright notices
+and license terms. Your use of these subcomponents is subject to the terms and
+conditions of the following licenses.
+
+Sqlite (included in the "provided" org.xerial's sqlite-jdbc)
+ Sqlite is in the Public Domain. For details
+ see: https://www.sqlite.org/copyright.html
Added: tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,189 @@
+package org.apache.tika.parser.jdbc;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.SQLException;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Database;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Abstract class that handles iterating through tables within a database.
+ */
+abstract class AbstractDBParser extends AbstractParser {
+
+ private final static byte[] EMPTY_BYTE_ARR = new byte[0];
+
+ private Connection connection;
+
+ protected static EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContext context) {
+ return context.get(EmbeddedDocumentExtractor.class,
+ new ParsingEmbeddedDocumentExtractor(context));
+ }
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return null;
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
+ connection = getConnection(stream, metadata, context);
+ XHTMLContentHandler xHandler = null;
+ List<String> tableNames = null;
+ try {
+ tableNames = getTableNames(connection, metadata, context);
+ } catch (SQLException e) {
+ throw new IOExceptionWithCause(e);
+ }
+ for (String tableName : tableNames) {
+ //add table names to parent metadata
+ metadata.add(Database.TABLE_NAME, tableName);
+ }
+ xHandler = new XHTMLContentHandler(handler, metadata);
+ xHandler.startDocument();
+
+ try {
+ for (String tableName : tableNames) {
+ JDBCTableReader tableReader = getTableReader(connection, tableName, context);
+ xHandler.startElement("table", "name", tableReader.getTableName());
+ xHandler.startElement("thead");
+ xHandler.startElement("tr");
+ for (String header : tableReader.getHeaders()) {
+ xHandler.startElement("th");
+ xHandler.characters(header);
+ xHandler.endElement("th");
+ }
+ xHandler.endElement("tr");
+ xHandler.endElement("thead");
+ xHandler.startElement("tbody");
+ while (tableReader.nextRow(xHandler, context)) {
+ //no-op
+ }
+ xHandler.endElement("tbody");
+ xHandler.endElement("table");
+ }
+ } finally {
+ if (xHandler != null) {
+ xHandler.endDocument();
+ }
+ try {
+ close();
+ } catch (SQLException e) {
+ //swallow
+ }
+ }
+ }
+
+ /**
+ * Override this for any special handling of closing the connection.
+ *
+ * @throws java.sql.SQLException
+ * @throws java.io.IOException
+ */
+ protected void close() throws SQLException, IOException {
+ connection.close();
+ }
+
+ /**
+ * Override this for special configuration of the connection, such as limiting
+ * the number of rows to be held in memory.
+ *
+ * @param stream stream to use
+ * @param metadata metadata that could be used in parameterizing the connection
+ * @param context parsecontext that could be used in parameterizing the connection
+ * @return connection
+ * @throws java.io.IOException
+ * @throws org.apache.tika.exception.TikaException
+ */
+ protected Connection getConnection(InputStream stream, Metadata metadata, ParseContext context) throws IOException, TikaException {
+ String connectionString = getConnectionString(stream, metadata, context);
+
+ Connection connection = null;
+ try {
+ Class.forName(getJDBCClassName());
+ } catch (ClassNotFoundException e) {
+ throw new TikaException(e.getMessage());
+ }
+ try {
+ connection = DriverManager.getConnection(connectionString);
+ } catch (SQLException e) {
+ throw new IOExceptionWithCause(e);
+ }
+ return connection;
+ }
+
+ /**
+ * Implement for db specific connection information, e.g. "jdbc:sqlite:/docs/mydb.db"
+ * <p/>
+ * Include any optimization settings, user name, password, etc.
+ * <p/>
+ *
+ * @param stream stream for processing
+ * @param metadata metadata might be useful in determining connection info
+ * @param parseContext context to use to help create connectionString
+ * @return connection string to be used by {@link #getConnection}.
+ * @throws java.io.IOException
+ */
+ abstract protected String getConnectionString(InputStream stream,
+ Metadata metadata, ParseContext parseContext) throws IOException;
+
+ /**
+ * JDBC class name, e.g. org.sqlite.JDBC
+ *
+ * @return jdbc class name
+ */
+ abstract protected String getJDBCClassName();
+
+ /**
+ * Returns the names of the tables to process
+ *
+ * @param connection Connection to use to make the sql call(s) to get the names of the tables
+ * @param metadata Metadata to use (potentially) in decision about which tables to extract
+ * @param context ParseContext to use (potentially) in decision about which tables to extract
+ * @return
+ * @throws java.sql.SQLException
+ */
+ abstract protected List<String> getTableNames(Connection connection, Metadata metadata,
+ ParseContext context) throws SQLException;
+
+ /**
+ * Given a connection and a table name, return the JDBCTableReader for this db.
+ *
+ * @param connection
+ * @param tableName
+ * @return
+ */
+ abstract protected JDBCTableReader getTableReader(Connection connection, String tableName, ParseContext parseContext);
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,302 @@
+package org.apache.tika.parser.jdbc;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.sql.Blob;
+import java.sql.Clob;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.ResultSetMetaData;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.sql.Types;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Database;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * General base class to iterate through rows of a JDBC table.
+ * <p>
+ * The table is queried lazily on the first call to {@link #nextRow} or
+ * {@link #getHeaders}. Each row is written to the handler as an XHTML
+ * {@code tr} element holding one {@code td} per column. BLOB and CLOB cells
+ * are passed to the {@link EmbeddedDocumentExtractor} as embedded documents;
+ * all other cells are written as character data.
+ */
+class JDBCTableReader {
+
+    private final static Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
+    private final Connection connection;
+    private final String tableName;
+    //maximum number of characters read from a single clob; longer clobs are truncated
+    int maxClobLength = 1000000;
+    //cursor over the table; null until the first call to reset()
+    ResultSet results = null;
+    //number of rows written so far, i.e. the zero-based index of the row being written
+    int rows = 0;
+    //statement that owns results; kept so that it can be closed when the query is re-run
+    private Statement statement = null;
+    private TikaConfig tikaConfig = null;
+    private Detector detector = null;
+    private MimeTypes mimeTypes = null;
+
+    /**
+     * @param connection connection used to query the table
+     * @param tableName  name of the table to read
+     * @param context    parse context; its TikaConfig (if any) is used for detection
+     */
+    public JDBCTableReader(Connection connection, String tableName, ParseContext context) {
+        this.connection = connection;
+        this.tableName = tableName;
+        this.tikaConfig = context.get(TikaConfig.class);
+    }
+
+    /**
+     * Writes the next row of the table to the handler as a {@code tr} element.
+     *
+     * @param handler handler that receives the row
+     * @param context parse context passed on to the blob/clob handling
+     * @return true if a row was written, false if there are no more rows
+     * @throws IOException  if the query or a column read fails (the SQLException is wrapped)
+     * @throws SAXException if the handler fails
+     */
+    public boolean nextRow(ContentHandler handler, ParseContext context) throws IOException, SAXException {
+        //lazy initialization
+        if (results == null) {
+            reset();
+        }
+        try {
+            if (!results.next()) {
+                return false;
+            }
+        } catch (SQLException e) {
+            throw new IOExceptionWithCause(e);
+        }
+        try {
+            ResultSetMetaData meta = results.getMetaData();
+            handler.startElement(XHTMLContentHandler.XHTML, "tr", "tr", EMPTY_ATTRIBUTES);
+            for (int i = 1; i <= meta.getColumnCount(); i++) {
+                handler.startElement(XHTMLContentHandler.XHTML, "td", "td", EMPTY_ATTRIBUTES);
+                handleCell(meta, i, handler, context);
+                handler.endElement(XHTMLContentHandler.XHTML, "td", "td");
+            }
+            handler.endElement(XHTMLContentHandler.XHTML, "tr", "tr");
+        } catch (SQLException e) {
+            throw new IOExceptionWithCause(e);
+        }
+        rows++;
+        return true;
+    }
+
+    /**
+     * Dispatches a single cell of the current row to the handling that
+     * matches the column's java.sql.Types value.
+     */
+    private void handleCell(ResultSetMetaData rsmd, int i, ContentHandler handler, ParseContext context) throws SQLException, IOException, SAXException {
+        switch (rsmd.getColumnType(i)) {
+            case Types.BLOB:
+                handleBlob(tableName, rsmd.getColumnName(i), rows, results, i, handler, context);
+                break;
+            case Types.CLOB:
+                handleClob(tableName, rsmd.getColumnName(i), rows, results, i, handler, context);
+                break;
+            case Types.BOOLEAN:
+                handleBoolean(results.getBoolean(i), handler);
+                break;
+            case Types.DATE:
+                handleDate(results, i, handler);
+                break;
+            case Types.TIMESTAMP:
+                handleTimeStamp(results, i, handler);
+                break;
+            case Types.INTEGER:
+                handleInteger(rsmd.getColumnTypeName(i), results, i, handler);
+                break;
+            case Types.FLOAT:
+                //this is necessary to handle rounding issues in presentation
+                //Should we just use getString(i)?
+                addAllCharacters(Float.toString(results.getFloat(i)), handler);
+                break;
+            case Types.DOUBLE:
+                addAllCharacters(Double.toString(results.getDouble(i)), handler);
+                break;
+            default:
+                addAllCharacters(results.getString(i), handler);
+                break;
+        }
+    }
+
+    /**
+     * Returns the column names of the table, in column order.
+     *
+     * @return column names as reported by the result set's metadata
+     * @throws IOException if the query or the metadata lookup fails (the SQLException is wrapped)
+     */
+    public List<String> getHeaders() throws IOException {
+        List<String> headers = new LinkedList<String>();
+        //lazy initialization
+        if (results == null) {
+            reset();
+        }
+        try {
+            ResultSetMetaData meta = results.getMetaData();
+            for (int i = 1; i <= meta.getColumnCount(); i++) {
+                headers.add(meta.getColumnName(i));
+            }
+        } catch (SQLException e) {
+            throw new IOExceptionWithCause(e);
+        }
+        return headers;
+    }
+
+    /**
+     * Writes an INTEGER cell. The column type name is not used by this
+     * implementation.
+     */
+    protected void handleInteger(String columnTypeName, ResultSet rs, int columnIndex, ContentHandler handler) throws SQLException, SAXException {
+        addAllCharacters(Integer.toString(rs.getInt(columnIndex)), handler);
+    }
+
+    private void handleBoolean(boolean aBoolean, ContentHandler handler) throws SAXException {
+        addAllCharacters(Boolean.toString(aBoolean), handler);
+    }
+
+
+    /**
+     * Passes a CLOB cell to the embedded document extractor as a UTF-8
+     * text/plain document. At most {@link #maxClobLength} characters are
+     * read; truncation is recorded in the metadata. A SQL NULL clob is
+     * skipped.
+     *
+     * @param tableName   name of the table, recorded in the metadata
+     * @param columnName  name of the column, recorded in the metadata and used for the resource name
+     * @param rowNum      row number, recorded in the metadata and used for the resource name
+     * @param resultSet   result set positioned on the current row
+     * @param columnIndex one-based index of the clob column
+     * @param handler     handler that receives the extracted content
+     * @param context     parse context used to look up the embedded document extractor
+     */
+    protected void handleClob(String tableName, String columnName, int rowNum,
+                              ResultSet resultSet, int columnIndex,
+                              ContentHandler handler, ParseContext context) throws SQLException, IOException, SAXException {
+        Clob clob = resultSet.getClob(columnIndex);
+        if (clob == null) {
+            //SQL NULL: there is nothing to extract
+            return;
+        }
+        try {
+            long clobLength = clob.length();
+            //maxClobLength is an int, so this also covers lengths beyond Integer.MAX_VALUE
+            boolean truncated = clobLength > maxClobLength;
+
+            int readSize = (clobLength < maxClobLength ? (int) clobLength : maxClobLength);
+            Metadata m = new Metadata();
+            m.set(Database.TABLE_NAME, tableName);
+            m.set(Database.COLUMN_NAME, columnName);
+            m.set(Database.PREFIX + "ROW_NUM", Integer.toString(rowNum));
+            m.set(Database.PREFIX + "IS_CLOB", "true");
+            m.set(Database.PREFIX + "CLOB_LENGTH", Long.toString(clobLength));
+            m.set(Database.PREFIX + "IS_CLOB_TRUNCATED", Boolean.toString(truncated));
+            m.set(Metadata.CONTENT_TYPE, "text/plain; charset=UTF-8");
+            m.set(Metadata.CONTENT_LENGTH, Integer.toString(readSize));
+            m.set(TikaMetadataKeys.RESOURCE_NAME_KEY,
+                    //just in case something screwy is going on with the column name
+                    FilenameUtils.normalize(FilenameUtils.getName(columnName + "_" + rowNum + ".txt")));
+
+
+            //is there a more efficient way to go from a Reader to an InputStream?
+            //Clob positions are 1-based; do not ask the driver for a zero-length substring
+            String s = (readSize == 0) ? "" : clob.getSubString(1, readSize);
+            EmbeddedDocumentExtractor ex = AbstractDBParser.getEmbeddedDocumentExtractor(context);
+            ex.parseEmbedded(new ByteArrayInputStream(s.getBytes(UTF_8)), handler, m, true);
+        } finally {
+            try {
+                clob.free();
+            } catch (SQLException e) {
+                //swallow
+            }
+        }
+    }
+
+    /**
+     * Passes a BLOB cell to the embedded document extractor, wrapped in a
+     * {@code span} element that records the column name and row number.
+     * The media type is detected from the stream and, when a matching mime
+     * type is known, its extension is appended to the resource name.
+     * If {@link #getInputStreamFromBlob} returns null nothing is written.
+     *
+     * @param tableName   name of the table, recorded in the metadata
+     * @param columnName  name of the column, recorded in the metadata and used for the resource name
+     * @param rowNum      row number, recorded in the metadata and used for the resource name
+     * @param resultSet   result set positioned on the current row
+     * @param columnIndex one-based index of the blob column
+     * @param handler     handler that receives the span and the extracted content
+     * @param context     parse context used to look up the embedded document extractor
+     */
+    protected void handleBlob(String tableName, String columnName, int rowNum, ResultSet resultSet, int columnIndex,
+                              ContentHandler handler, ParseContext context) throws SQLException, IOException, SAXException {
+        Metadata m = new Metadata();
+        m.set(Database.TABLE_NAME, tableName);
+        m.set(Database.COLUMN_NAME, columnName);
+        m.set(Database.PREFIX + "ROW_NUM", Integer.toString(rowNum));
+        m.set(Database.PREFIX + "IS_BLOB", "true");
+        InputStream is = null;
+        EmbeddedDocumentExtractor ex = AbstractDBParser.getEmbeddedDocumentExtractor(context);
+        try {
+            //no Blob is fetched here: getInputStreamFromBlob decides how to read the
+            //column, so that subclasses can avoid ResultSet.getBlob altogether
+            InputStream raw = getInputStreamFromBlob(resultSet, columnIndex, null, m);
+            if (raw == null) {
+                //SQL NULL: there is nothing to extract
+                return;
+            }
+            is = TikaInputStream.get(raw);
+
+            Attributes attrs = new AttributesImpl();
+            ((AttributesImpl) attrs).addAttribute("", "type", "type", "CDATA", "blob");
+            ((AttributesImpl) attrs).addAttribute("", "column_name", "column_name", "CDATA", columnName);
+            ((AttributesImpl) attrs).addAttribute("", "row_number", "row_number", "CDATA", Integer.toString(rowNum));
+            handler.startElement("", "span", "span", attrs);
+            MediaType mediaType = getDetector().detect(is, new Metadata());
+            String extension = "";
+            try {
+                MimeType mimeType = getMimeTypes().forName(mediaType.toString());
+                m.set(Metadata.CONTENT_TYPE, mimeType.toString());
+                extension = mimeType.getExtension();
+            } catch (MimeTypeException e) {
+                //swallow
+            }
+            m.set(TikaMetadataKeys.RESOURCE_NAME_KEY,
+                    //just in case something screwy is going on with the column name
+                    FilenameUtils.normalize(FilenameUtils.getName(columnName + "_" + rowNum + extension)));
+
+            ex.parseEmbedded(is, handler, m, true);
+
+        } finally {
+            IOUtils.closeQuietly(is);
+        }
+        handler.endElement("", "span", "span");
+    }
+
+    /**
+     * Opens a stream over the blob in the given column.
+     *
+     * @param resultSet   result set positioned on the current row
+     * @param columnIndex one-based index of the blob column
+     * @param blob        blob to read, or null to read the blob from the result set
+     * @param metadata    metadata handed to TikaInputStream along with the blob
+     * @return stream over the blob, or null if the column is SQL NULL
+     * @throws SQLException if the blob cannot be read
+     */
+    protected InputStream getInputStreamFromBlob(ResultSet resultSet, int columnIndex, Blob blob, Metadata metadata) throws SQLException {
+        Blob source = (blob != null) ? blob : resultSet.getBlob(columnIndex);
+        if (source == null) {
+            return null;
+        }
+        return TikaInputStream.get(source, metadata);
+    }
+
+    /**
+     * Writes a DATE cell using the driver's string form of the value.
+     */
+    protected void handleDate(ResultSet resultSet, int columnIndex, ContentHandler handler) throws SAXException, SQLException {
+        addAllCharacters(resultSet.getString(columnIndex), handler);
+    }
+
+    /**
+     * Writes a TIMESTAMP cell using the driver's string form of the value.
+     */
+    protected void handleTimeStamp(ResultSet resultSet, int columnIndex, ContentHandler handler) throws SAXException, SQLException {
+        addAllCharacters(resultSet.getString(columnIndex), handler);
+    }
+
+    /**
+     * Writes the string to the handler as character data.
+     *
+     * @param s       text to write; null (e.g. a SQL NULL read via getString) writes nothing
+     * @param handler handler that receives the characters
+     */
+    protected void addAllCharacters(String s, ContentHandler handler) throws SAXException {
+        if (s == null) {
+            return;
+        }
+        char[] chars = s.toCharArray();
+        handler.characters(chars, 0, chars.length);
+    }
+
+    /**
+     * (Re-)runs the query over the table and rewinds the row counter.
+     * The result set and statement of a previous run are closed first.
+     *
+     * @throws IOException if the query fails (the SQLException is wrapped)
+     */
+    void reset() throws IOException {
+
+        closeResources();
+
+        try {
+            String sql = "SELECT * from " + quotedTableName();
+            statement = connection.createStatement();
+            results = statement.executeQuery(sql);
+        } catch (SQLException e) {
+            //do not leave a half-opened statement behind
+            closeResources();
+            throw new IOExceptionWithCause(e);
+        }
+        rows = 0;
+    }
+
+    /**
+     * Closes the current result set and its statement, ignoring failures.
+     */
+    private void closeResources() {
+        if (results != null) {
+            try {
+                results.close();
+            } catch (SQLException e) {
+                //swallow
+            }
+            results = null;
+        }
+        if (statement != null) {
+            try {
+                statement.close();
+            } catch (SQLException e) {
+                //swallow
+            }
+            statement = null;
+        }
+    }
+
+    /**
+     * Returns the table name wrapped in the driver's identifier quote string,
+     * so that names containing spaces, keywords or quote characters cannot
+     * break or alter the query. The name is returned unchanged if the driver
+     * reports that identifier quoting is not supported.
+     */
+    private String quotedTableName() throws SQLException {
+        String quote = connection.getMetaData().getIdentifierQuoteString();
+        if (quote == null || quote.trim().isEmpty()) {
+            return tableName;
+        }
+        //an embedded quote string is escaped by doubling it
+        return quote + tableName.replace(quote, quote + quote) + quote;
+    }
+
+    public String getTableName() {
+        return tableName;
+    }
+
+
+    /**
+     * Returns the TikaConfig taken from the parse context, falling back to
+     * the default config if the context had none.
+     */
+    protected TikaConfig getTikaConfig() {
+        if (tikaConfig == null) {
+            tikaConfig = TikaConfig.getDefaultConfig();
+        }
+        return tikaConfig;
+    }
+
+    /**
+     * Returns the detector of the TikaConfig; looked up once and then cached.
+     */
+    protected Detector getDetector() {
+        if (detector != null) return detector;
+
+        detector = getTikaConfig().getDetector();
+        return detector;
+    }
+
+    /**
+     * Returns the mime repository of the TikaConfig; looked up once and then cached.
+     */
+    protected MimeTypes getMimeTypes() {
+        if (mimeTypes != null) return mimeTypes;
+
+        mimeTypes = getTikaConfig().getMimeRepository();
+        return mimeTypes;
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3DBParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3DBParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3DBParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3DBParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,110 @@
+package org.apache.tika.parser.jdbc;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.sqlite.SQLiteConfig;
+
+/**
+ * This is the implementation of the db parser for SQLite.
+ * <p>
+ * This parser is internal only; it should not be registered in the services
+ * file or configured in the TikaConfig xml file.
+ */
+class SQLite3DBParser extends AbstractDBParser {
+
+    protected static final String SQLITE_CLASS_NAME = "org.sqlite.JDBC";
+
+    /**
+     * @param context context
+     * @return null (always)
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return null;
+    }
+
+    /**
+     * Loads the SQLite JDBC driver and opens a read-only connection to the
+     * database behind the stream.
+     *
+     * @param stream   stream holding the database
+     * @param metadata metadata passed on to {@link #getConnectionString}
+     * @param context  parse context passed on to {@link #getConnectionString}
+     * @return an open connection
+     * @throws IOException if the driver class cannot be loaded or the connection
+     *                     cannot be created; the underlying exception is kept as the cause
+     */
+    @Override
+    protected Connection getConnection(InputStream stream, Metadata metadata, ParseContext context) throws IOException {
+        String connectionString = getConnectionString(stream, metadata, context);
+
+        try {
+            Class.forName(getJDBCClassName());
+        } catch (ClassNotFoundException e) {
+            throw new IOExceptionWithCause(e);
+        }
+        try {
+            SQLiteConfig config = new SQLiteConfig();
+
+            //good habit, but effectively meaningless here
+            config.setReadOnly(true);
+            return config.createConnection(connectionString);
+
+        } catch (SQLException e) {
+            //same message as before, but keep the SQLException as the cause
+            throw new IOException(e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Builds the "jdbc:sqlite:" connection string from the absolute path of
+     * the file that TikaInputStream provides for the stream.
+     *
+     * @param is       stream holding the database
+     * @param metadata not used by this implementation
+     * @param context  not used by this implementation
+     * @return the connection string
+     * @throws IOException if the file for the stream cannot be obtained
+     */
+    @Override
+    protected String getConnectionString(InputStream is, Metadata metadata, ParseContext context) throws IOException {
+        File dbFile = TikaInputStream.get(is).getFile();
+        return "jdbc:sqlite:" + dbFile.getAbsolutePath();
+    }
+
+    @Override
+    protected String getJDBCClassName() {
+        return SQLITE_CLASS_NAME;
+    }
+
+    /**
+     * Reads the names of all entries of type 'table' from sqlite_master.
+     *
+     * @param connection connection to query
+     * @param metadata   not used by this implementation
+     * @param context    not used by this implementation
+     * @return the table names, in the order the query returns them
+     * @throws SQLException if the query fails
+     */
+    @Override
+    protected List<String> getTableNames(Connection connection, Metadata metadata,
+                                         ParseContext context) throws SQLException {
+        List<String> tableNames = new LinkedList<String>();
+        String sql = "SELECT name FROM sqlite_master WHERE type='table'";
+
+        //close the result set explicitly rather than relying on the statement to do so
+        try (Statement st = connection.createStatement();
+             ResultSet rs = st.executeQuery(sql)) {
+            while (rs.next()) {
+                tableNames.add(rs.getString(1));
+            }
+        }
+        return tableNames;
+    }
+
+    @Override
+    public JDBCTableReader getTableReader(Connection connection, String tableName, ParseContext context) {
+        return new SQLite3TableReader(connection, tableName, context);
+    }
+}