You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by ju...@apache.org on 2008/04/11 16:37:25 UTC

svn commit: r647185 - in /jackrabbit/sandbox/jackrabbit-tika: ./ src/ src/main/ src/main/java/ src/main/java/org/ src/main/java/org/apache/ src/main/java/org/apache/jackrabbit/ src/main/java/org/apache/jackrabbit/tika/

Author: jukka
Date: Fri Apr 11 07:37:22 2008
New Revision: 647185

URL: http://svn.apache.org/viewvc?rev=647185&view=rev
Log:
jackrabbit-tika: Jackrabbit text extractor based on Apache Tika

Added:
    jackrabbit/sandbox/jackrabbit-tika/
    jackrabbit/sandbox/jackrabbit-tika/pom.xml
    jackrabbit/sandbox/jackrabbit-tika/src/
    jackrabbit/sandbox/jackrabbit-tika/src/main/
    jackrabbit/sandbox/jackrabbit-tika/src/main/java/
    jackrabbit/sandbox/jackrabbit-tika/src/main/java/org/
    jackrabbit/sandbox/jackrabbit-tika/src/main/java/org/apache/
    jackrabbit/sandbox/jackrabbit-tika/src/main/java/org/apache/jackrabbit/
    jackrabbit/sandbox/jackrabbit-tika/src/main/java/org/apache/jackrabbit/tika/
    jackrabbit/sandbox/jackrabbit-tika/src/main/java/org/apache/jackrabbit/tika/TikaTextExctractor.java

Added: jackrabbit/sandbox/jackrabbit-tika/pom.xml
URL: http://svn.apache.org/viewvc/jackrabbit/sandbox/jackrabbit-tika/pom.xml?rev=647185&view=auto
==============================================================================
--- jackrabbit/sandbox/jackrabbit-tika/pom.xml (added)
+++ jackrabbit/sandbox/jackrabbit-tika/pom.xml Fri Apr 11 07:37:22 2008
@@ -0,0 +1,57 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
+                             http://maven.apache.org/maven-v4_0_0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <!-- Coordinates of this module. No <packaging> is declared, so Maven's
+       default packaging applies. -->
+  <groupId>org.apache.jackrabbit</groupId>
+  <artifactId>jackrabbit-tika</artifactId>
+  <!-- NOTE(review): bare "SNAPSHOT" with no numeric base version; confirm
+       this is intended rather than e.g. "x.y-SNAPSHOT". -->
+  <version>SNAPSHOT</version>
+  <name>Jackrabbit Tika</name>
+  <description>Apache Tika text extractor for Apache Jackrabbit</description>
+
+  <dependencies>
+    <!-- Presumably supplies org.apache.jackrabbit.extractor.TextExtractor,
+         the interface implemented by this module; verify against the
+         artifact contents. -->
+    <dependency>
+      <groupId>org.apache.jackrabbit</groupId>
+      <artifactId>jackrabbit-text-extractors</artifactId>
+      <version>1.4</version>
+    </dependency>
+    <!-- Apache Tika parser library. This is a snapshot version, and no
+         <repositories> section is declared in this POM, so the artifact
+         must be resolvable from the local repository or from repositories
+         configured elsewhere. -->
+    <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika</artifactId>
+      <version>0.2-SNAPSHOT</version>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <!-- Compile at the Java 5 language level; the module's source uses
+           generics (Set<String>). -->
+      <plugin>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <configuration>
+          <source>1.5</source>
+          <target>1.5</target>
+        </configuration>
+      </plugin>
+    </plugins>
+  </build>
+
+</project>

Added: jackrabbit/sandbox/jackrabbit-tika/src/main/java/org/apache/jackrabbit/tika/TikaTextExctractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/sandbox/jackrabbit-tika/src/main/java/org/apache/jackrabbit/tika/TikaTextExctractor.java?rev=647185&view=auto
==============================================================================
--- jackrabbit/sandbox/jackrabbit-tika/src/main/java/org/apache/jackrabbit/tika/TikaTextExctractor.java (added)
+++ jackrabbit/sandbox/jackrabbit-tika/src/main/java/org/apache/jackrabbit/tika/TikaTextExctractor.java Fri Apr 11 07:37:22 2008
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.tika;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Set;
+
+import org.apache.jackrabbit.extractor.TextExtractor;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Jackrabbit {@link TextExtractor} that delegates all parsing to an
+ * Apache Tika {@link AutoDetectParser}.
+ * <p>
+ * The spelling of the class name is kept as committed, since it is part of
+ * the public name of this class.
+ */
+public class TikaTextExctractor implements TextExtractor {
+
+    /** Tika parser used both to list content types and to parse documents. */
+    private final AutoDetectParser parser = new AutoDetectParser();
+
+    /**
+     * Returns the content types handled by this extractor, taken from the
+     * keys of the parser map of the underlying {@link AutoDetectParser}.
+     *
+     * @return a newly allocated array of content type names
+     */
+    public String[] getContentTypes() {
+        Set<String> types = parser.getParsers().keySet();
+        return types.toArray(new String[types.size()]);
+    }
+
+    /**
+     * Parses the given document with Tika and returns its body text.
+     * <p>
+     * The document is parsed eagerly: the whole text is collected in memory
+     * before this method returns, and the given stream is always closed
+     * before returning, whether or not parsing succeeded.
+     *
+     * @param stream   the binary document; closed by this method
+     * @param type     content type of the document; passed to Tika as the
+     *                 {@code Content-Type} metadata entry unless it is
+     *                 {@code null} or blank
+     * @param encoding character encoding of the document; currently not
+     *                 used by this implementation
+     * @return a reader over the extracted text, or over an empty string if
+     *         the document could not be parsed
+     * @throws IOException if the stream cannot be read or closed
+     */
+    public Reader extractText(InputStream stream, String type, String encoding)
+            throws IOException {
+        try {
+            ContentHandler handler = new BodyContentHandler();
+            Metadata metadata = new Metadata();
+            if (type != null) {
+                String trimmed = type.trim();
+                if (trimmed.length() > 0) {
+                    metadata.set(Metadata.CONTENT_TYPE, trimmed);
+                }
+            }
+            parser.parse(stream, handler, metadata);
+            return new StringReader(handler.toString());
+        } catch (SAXException e) {
+            // Should never happen. Treated like a parse failure: the
+            // document simply contributes no text.
+            return new StringReader("");
+        } catch (TikaException e) {
+            // Deliberate best effort: a document that Tika cannot parse
+            // yields empty text instead of failing the caller.
+            return new StringReader("");
+        } finally {
+            // Nothing reads from the stream after this method returns, so
+            // it must be released here on every path to avoid a leak.
+            stream.close();
+        }
+    }
+
+}