You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by ju...@apache.org on 2009/04/08 06:11:08 UTC
svn commit: r762802 - in /jackrabbit/trunk/jackrabbit-text-extractors:
pom.xml src/main/java/org/apache/jackrabbit/extractor/TikaTextExtractor.java
Author: jukka
Date: Tue Apr 7 14:57:46 2009
New Revision: 762802
URL: http://svn.apache.org/viewvc?rev=762802&view=rev
Log:
JCR-1878: Use Apache Tika for text extraction
Added a TikaTextExtractor class.
Replaced all existing direct parser dependencies with transitive dependencies from Tika.
Added:
jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/TikaTextExtractor.java (with props)
Modified:
jackrabbit/trunk/jackrabbit-text-extractors/pom.xml
Modified: jackrabbit/trunk/jackrabbit-text-extractors/pom.xml
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/pom.xml?rev=762802&r1=762801&r2=762802&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/pom.xml (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/pom.xml Tue Apr 7 14:57:46 2009
@@ -62,28 +62,10 @@
<dependencies>
<dependency>
- <groupId>org.apache.poi</groupId>
- <artifactId>poi</artifactId>
- <version>3.5-beta5</version>
- </dependency>
- <dependency>
- <groupId>org.apache.poi</groupId>
- <artifactId>poi-ooxml</artifactId>
- <version>3.5-beta5</version>
- </dependency>
- <dependency>
- <groupId>org.apache.poi</groupId>
- <artifactId>poi-scratchpad</artifactId>
- <version>3.5-beta5</version>
- </dependency>
- <dependency>
- <groupId>pdfbox</groupId>
- <artifactId>pdfbox</artifactId>
- </dependency>
- <dependency>
- <groupId>net.sourceforge.nekohtml</groupId>
- <artifactId>nekohtml</artifactId>
- <version>1.9.7</version>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika</artifactId>
+ <version>0.3</version>
+ <classifier>jdk14</classifier>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
Added: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/TikaTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/TikaTextExtractor.java?rev=762802&view=auto
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/TikaTextExtractor.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/TikaTextExtractor.java Tue Apr 7 14:57:46 2009
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.jackrabbit.extractor.TextExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParsingReader;
+
+/**
+ * {@link TextExtractor} implementation that hands every document to an
+ * Apache Tika {@link AutoDetectParser} and exposes the parsed text as a
+ * character stream.
+ */
+public class TikaTextExtractor implements TextExtractor {
+
+    /** Tika parser used for every extraction request on this instance. */
+    private final AutoDetectParser parser = new AutoDetectParser();
+
+    /**
+     * Returns the keys of the parser map held by the underlying
+     * {@link AutoDetectParser}.
+     *
+     * @return the parser map keys as a string array (presumably media type
+     *         names; the cast below relies on every key being a String)
+     */
+    public String[] getContentTypes() {
+        // Raw Set on purpose: the Tika dependency is the jdk14 build.
+        Set types = parser.getParsers().keySet();
+        return (String[]) types.toArray(new String[types.size()]);
+    }
+
+    /**
+     * Wraps the given binary stream in a Tika {@link ParsingReader}.
+     * The content type and character encoding are passed to the parser as
+     * metadata hints; a hint that is <code>null</code> or blank is omitted.
+     *
+     * @param stream binary document stream
+     * @param type declared content type, may be <code>null</code>
+     * @param encoding declared character encoding, may be <code>null</code>
+     * @return reader over the text produced by the parser
+     * @throws IOException if the parsing reader cannot be created
+     */
+    public Reader extractText(InputStream stream, String type, String encoding)
+            throws IOException {
+        Metadata metadata = new Metadata();
+        setIfPresent(metadata, Metadata.CONTENT_TYPE, type);
+        // Previously the encoding argument was silently dropped; forward it
+        // so the parser can take the declared encoding into account.
+        setIfPresent(metadata, Metadata.CONTENT_ENCODING, encoding);
+        return new ParsingReader(parser, stream, metadata);
+    }
+
+    /**
+     * Stores the trimmed value under the given metadata name unless the
+     * value is <code>null</code> or consists only of whitespace.
+     */
+    private static void setIfPresent(
+            Metadata metadata, String name, String value) {
+        if (value != null) {
+            String trimmed = value.trim();
+            if (trimmed.length() > 0) {
+                metadata.set(name, trimmed);
+            }
+        }
+    }
+
+}
Propchange: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/TikaTextExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native