You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by ju...@apache.org on 2008/04/11 16:37:25 UTC
svn commit: r647185 - in /jackrabbit/sandbox/jackrabbit-tika: ./ src/
src/main/ src/main/java/ src/main/java/org/ src/main/java/org/apache/
src/main/java/org/apache/jackrabbit/
src/main/java/org/apache/jackrabbit/tika/
Author: jukka
Date: Fri Apr 11 07:37:22 2008
New Revision: 647185
URL: http://svn.apache.org/viewvc?rev=647185&view=rev
Log:
jackrabbit-tika: Jackrabbit text extractor based on Apache Tika
Added:
jackrabbit/sandbox/jackrabbit-tika/
jackrabbit/sandbox/jackrabbit-tika/pom.xml
jackrabbit/sandbox/jackrabbit-tika/src/
jackrabbit/sandbox/jackrabbit-tika/src/main/
jackrabbit/sandbox/jackrabbit-tika/src/main/java/
jackrabbit/sandbox/jackrabbit-tika/src/main/java/org/
jackrabbit/sandbox/jackrabbit-tika/src/main/java/org/apache/
jackrabbit/sandbox/jackrabbit-tika/src/main/java/org/apache/jackrabbit/
jackrabbit/sandbox/jackrabbit-tika/src/main/java/org/apache/jackrabbit/tika/
jackrabbit/sandbox/jackrabbit-tika/src/main/java/org/apache/jackrabbit/tika/TikaTextExctractor.java
Added: jackrabbit/sandbox/jackrabbit-tika/pom.xml
URL: http://svn.apache.org/viewvc/jackrabbit/sandbox/jackrabbit-tika/pom.xml?rev=647185&view=auto
==============================================================================
--- jackrabbit/sandbox/jackrabbit-tika/pom.xml (added)
+++ jackrabbit/sandbox/jackrabbit-tika/pom.xml Fri Apr 11 07:37:22 2008
@@ -0,0 +1,57 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
+ http://maven.apache.org/maven-v4_0_0.xsd"> <!-- Maven build descriptor for the jackrabbit-tika sandbox module -->
+ <modelVersion>4.0.0</modelVersion>
+
+ <groupId>org.apache.jackrabbit</groupId>
+ <artifactId>jackrabbit-tika</artifactId>
+ <version>SNAPSHOT</version> <!-- NOTE(review): bare SNAPSHOT qualifier with no base version number - confirm this is intended -->
+ <name>Jackrabbit Tika</name>
+ <description>Apache Tika text extractor for Apache Jackrabbit</description>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.jackrabbit</groupId>
+ <artifactId>jackrabbit-text-extractors</artifactId> <!-- presumably supplies the TextExtractor interface implemented by TikaTextExctractor - TODO confirm -->
+ <version>1.4</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika</artifactId>
+ <version>0.2-SNAPSHOT</version> <!-- snapshot (unreleased) Tika version; the build is not reproducible until this is pinned to a release -->
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <configuration> <!-- Java 5 language level: TikaTextExctractor uses generics (Set<String>) -->
+ <source>1.5</source>
+ <target>1.5</target>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+
+</project>
Added: jackrabbit/sandbox/jackrabbit-tika/src/main/java/org/apache/jackrabbit/tika/TikaTextExctractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/sandbox/jackrabbit-tika/src/main/java/org/apache/jackrabbit/tika/TikaTextExctractor.java?rev=647185&view=auto
==============================================================================
--- jackrabbit/sandbox/jackrabbit-tika/src/main/java/org/apache/jackrabbit/tika/TikaTextExctractor.java (added)
+++ jackrabbit/sandbox/jackrabbit-tika/src/main/java/org/apache/jackrabbit/tika/TikaTextExctractor.java Fri Apr 11 07:37:22 2008
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.tika;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Set;
+
+import org.apache.jackrabbit.extractor.TextExtractor;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Jackrabbit {@link TextExtractor} that delegates content detection and
+ * parsing to an Apache Tika {@link AutoDetectParser}.
+ */
+public class TikaTextExctractor implements TextExtractor {
+
+    /** Tika parser used for every extraction performed by this instance. */
+    private final AutoDetectParser parser = new AutoDetectParser();
+
+    /**
+     * Returns the content types handled by this extractor.
+     *
+     * @return the keys of the parser map exposed by the underlying
+     *         {@link AutoDetectParser}, copied into a new array
+     */
+    public String[] getContentTypes() {
+        Set<String> types = parser.getParsers().keySet();
+        return types.toArray(new String[types.size()]);
+    }
+
+    /**
+     * Parses the given document with Tika and returns its body text.
+     * <p>
+     * The whole document is parsed before this method returns, and the
+     * given stream is always closed, whether or not parsing succeeded.
+     * If Tika fails to parse the document, an empty reader is returned
+     * instead of an exception (best-effort extraction).
+     *
+     * @param stream   binary document to parse; closed by this method
+     * @param type     content type hint passed to Tika as the
+     *                 {@code Content-Type} metadata entry; ignored when
+     *                 {@code null} or blank
+     * @param encoding character encoding of the document; currently not
+     *                 used by this implementation
+     * @return reader over the extracted text, or an empty reader if the
+     *         document could not be parsed
+     * @throws IOException if the stream can not be read or closed
+     */
+    public Reader extractText(InputStream stream, String type, String encoding)
+            throws IOException {
+        try {
+            ContentHandler handler = new BodyContentHandler();
+            Metadata metadata = new Metadata();
+            if (type != null && type.trim().length() > 0) {
+                metadata.set(Metadata.CONTENT_TYPE, type.trim());
+            }
+            parser.parse(stream, handler, metadata);
+            return new StringReader(handler.toString());
+        } catch (SAXException e) {
+            // Should never happen
+            return new StringReader("");
+        } catch (TikaException e) {
+            // Deliberate best effort: an unparseable document yields no
+            // text rather than a failure for the caller.
+            return new StringReader("");
+        } finally {
+            // Parsing is done eagerly, so nothing else will ever close the
+            // stream; release it here on both success and failure paths.
+            stream.close();
+        }
+    }
+
+}