You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by ju...@apache.org on 2009/04/08 06:11:08 UTC

svn commit: r762802 - in /jackrabbit/trunk/jackrabbit-text-extractors: pom.xml src/main/java/org/apache/jackrabbit/extractor/TikaTextExtractor.java

Author: jukka
Date: Tue Apr  7 14:57:46 2009
New Revision: 762802

URL: http://svn.apache.org/viewvc?rev=762802&view=rev
Log:
JCR-1878: Use Apache Tika for text extraction

Added a TikaTextExtractor class.

Replaced all existing direct parser dependencies with transitive dependencies from Tika.

Added:
    jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/TikaTextExtractor.java   (with props)
Modified:
    jackrabbit/trunk/jackrabbit-text-extractors/pom.xml

Modified: jackrabbit/trunk/jackrabbit-text-extractors/pom.xml
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/pom.xml?rev=762802&r1=762801&r2=762802&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/pom.xml (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/pom.xml Tue Apr  7 14:57:46 2009
@@ -62,28 +62,10 @@
 
   <dependencies>
     <dependency>
-      <groupId>org.apache.poi</groupId>
-      <artifactId>poi</artifactId>
-      <version>3.5-beta5</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.poi</groupId>
-      <artifactId>poi-ooxml</artifactId>
-      <version>3.5-beta5</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.poi</groupId>
-      <artifactId>poi-scratchpad</artifactId>
-      <version>3.5-beta5</version>
-    </dependency>
-    <dependency>
-      <groupId>pdfbox</groupId>
-      <artifactId>pdfbox</artifactId>
-    </dependency>
-    <dependency>
-      <groupId>net.sourceforge.nekohtml</groupId>
-      <artifactId>nekohtml</artifactId>
-      <version>1.9.7</version>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika</artifactId>
+      <version>0.3</version>
+      <classifier>jdk14</classifier>
     </dependency>
     <dependency>
       <groupId>org.slf4j</groupId>

Added: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/TikaTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/TikaTextExtractor.java?rev=762802&view=auto
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/TikaTextExtractor.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/TikaTextExtractor.java Tue Apr  7 14:57:46 2009
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.jackrabbit.extractor.TextExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParsingReader;
+/** {@link TextExtractor} implementation that delegates to a Tika {@link AutoDetectParser}. */
+public class TikaTextExtractor implements TextExtractor {
+    /** Parser instance created once per extractor and used by both methods below. */
+    private final AutoDetectParser parser = new AutoDetectParser();
+    /** Returns the keys of parser.getParsers() as a String array (presumably media type names). */
+    public String[] getContentTypes() {
+        Set types = parser.getParsers().keySet();
+        return (String[]) types.toArray(new String[types.size()]);
+    }
+    /** Returns a ParsingReader over stream; a non-blank type is passed as CONTENT_TYPE. */
+    public Reader extractText(InputStream stream, String type, String encoding)
+            throws IOException {
+        Metadata metadata = new Metadata();
+        if (type != null && type.trim().length() > 0) { // null or blank type: leave metadata empty
+            metadata.set(Metadata.CONTENT_TYPE, type.trim());
+        }
+        return new ParsingReader(parser, stream, metadata); // NOTE(review): encoding is never read - confirm intended
+    }
+
+}

Propchange: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/TikaTextExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native