You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by ju...@apache.org on 2009/04/08 06:11:11 UTC
svn commit: r762814 - in
/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor:
DefaultTextExtractor.java MsTextExtractor.java TikaTextExtractor.java
Author: jukka
Date: Tue Apr 7 15:21:51 2009
New Revision: 762814
URL: http://svn.apache.org/viewvc?rev=762814&view=rev
Log:
JCR-1878: Use Apache Tika for text extraction
Make TikaTextExtractor into the DefaultTextExtractor.
Removed:
jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/TikaTextExtractor.java
Modified:
jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java
jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java
Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java?rev=762814&r1=762813&r2=762814&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java Tue Apr 7 15:21:51 2009
@@ -16,11 +16,50 @@
*/
package org.apache.jackrabbit.extractor;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParsingReader;
+
/**
- * Composite text extractor that by default contains the standard
- * text extractors found in this package.
- *
- * @deprecated Use {@link TikaTextExtractor} instead
+ * Default text extractor based on Apache Tika.
*/
-public class DefaultTextExtractor extends TikaTextExtractor {
+public class DefaultTextExtractor implements TextExtractor {
+
+    /**
+     * Auto-detecting parser. Static, so it is shared by all instances of
+     * this class.
+     */
+    private static final Parser PARSER;
+
+    /**
+     * Supported content types, taken from the key set of the parser map
+     * of {@link #PARSER}. This array is never handed out directly;
+     * {@link #getContentTypes()} returns a copy.
+     */
+    private static final String[] TYPES;
+
+    static {
+        AutoDetectParser parser = new AutoDetectParser();
+        PARSER = parser;
+        Set types = parser.getParsers().keySet();
+        TYPES = (String[]) types.toArray(new String[types.size()]);
+    }
+
+    /**
+     * Returns the content types supported by this extractor.
+     *
+     * @return a new array holding the supported content types; changes
+     *         made to it by the caller do not affect this class
+     */
+    public String[] getContentTypes() {
+        // Defensive copy: TYPES is a shared static array, and returning it
+        // directly would let any caller overwrite its entries.
+        return (String[]) TYPES.clone();
+    }
+
+    /**
+     * Returns a reader for the text content of the given stream, parsed
+     * with the shared auto-detecting parser.
+     *
+     * @param stream binary stream to extract text from
+     * @param type content type of the stream; when it is not
+     *             <code>null</code> and not blank, its trimmed value is
+     *             set as the {@link Metadata#CONTENT_TYPE} metadata entry
+     *             given to the parser
+     * @param encoding character encoding of the stream; not used by this
+     *                 implementation
+     * @return a {@link ParsingReader} over the given stream
+     * @throws IOException if thrown while creating the reader
+     */
+    public Reader extractText(InputStream stream, String type, String encoding)
+            throws IOException {
+        Metadata metadata = new Metadata();
+        if (type != null) {
+            String trimmed = type.trim();
+            if (trimmed.length() > 0) {
+                metadata.set(Metadata.CONTENT_TYPE, trimmed);
+            }
+        }
+        // TODO: This creates a background thread. Is that a problem?
+        return new ParsingReader(PARSER, stream, metadata);
+    }
+
}
Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java?rev=762814&r1=762813&r2=762814&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java Tue Apr 7 15:21:51 2009
@@ -20,7 +20,7 @@
/**
* Text extractor for Microsoft Word documents.
*/
-public class MsTextExtractor extends TikaTextExtractor {
+public class MsTextExtractor extends DefaultTextExtractor {
private static String[] TYPES = new String[] {
"application/vnd.ms-word",