You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by ju...@apache.org on 2009/04/08 06:11:16 UTC
svn commit: r762823 - in /jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor: MsExcelTextExtractor.java MsOutlookTextExtractor.java MsPowerPointTextExtractor.java MsWordTextExtractor.java
Author: jukka
Date: Tue Apr 7 15:54:53 2009
New Revision: 762823
URL: http://svn.apache.org/viewvc?rev=762823&view=rev
Log:
JCR-1878: Use Apache Tika for text extraction
Use POI through Tika to avoid the Java 5 compilation errors.
Modified:
jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java
jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsOutlookTextExtractor.java
jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java
jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java
Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java?rev=762823&r1=762822&r2=762823&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java Tue Apr 7 15:54:53 2009
@@ -16,61 +16,19 @@
*/
package org.apache.jackrabbit.extractor;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.poi.hssf.extractor.ExcelExtractor;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.Reader;
-import java.io.InputStream;
-import java.io.IOException;
-import java.io.StringReader;
-
/**
* Text extractor for Microsoft Excel sheets.
*/
-public class MsExcelTextExtractor extends AbstractTextExtractor {
-
- /**
- * Logger instance.
- */
- private static final Logger logger =
- LoggerFactory.getLogger(MsExcelTextExtractor.class);
+public class MsExcelTextExtractor extends DefaultTextExtractor {
- /**
- * Force loading of dependent class.
- */
- static {
- POIFSFileSystem.class.getName();
- }
+ private static String[] TYPES = new String[] {
+ "application/vnd.ms-excel",
+ "application/msexcel",
+ "application/excel"
+ };
- /**
- * Creates a new <code>MsExcelTextExtractor</code> instance.
- */
- public MsExcelTextExtractor() {
- super(new String[] {
- "application/vnd.ms-excel",
- "application/msexcel",
- "application/excel"
- });
+ public String[] getContentTypes() {
+ return TYPES;
}
- //-------------------------------------------------------< TextExtractor >
-
- /**
- * {@inheritDoc}
- */
- public Reader extractText(InputStream stream,
- String type,
- String encoding) throws IOException {
- try {
- POIFSFileSystem fs = new POIFSFileSystem(stream);
- return new StringReader(new ExcelExtractor(fs).getText());
- } catch (RuntimeException e) {
- logger.warn("Failed to extract Excel text content", e);
- return new StringReader("");
- } finally {
- stream.close();
- }
- }
}
Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsOutlookTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsOutlookTextExtractor.java?rev=762823&r1=762822&r2=762823&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsOutlookTextExtractor.java (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsOutlookTextExtractor.java Tue Apr 7 15:54:53 2009
@@ -16,64 +16,17 @@
*/
package org.apache.jackrabbit.extractor;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.Reader;
-import java.io.StringReader;
-
-import org.apache.poi.hsmf.MAPIMessage;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
/**
* Text extractor for Microsoft Outlook messages.
*/
-public class MsOutlookTextExtractor extends AbstractTextExtractor {
-
- /**
- * Logger instance.
- */
- private static final Logger logger =
- LoggerFactory.getLogger(MsOutlookTextExtractor.class);
-
- /**
- * Force loading of dependent class.
- */
- static {
- MAPIMessage.class.getName();
- }
-
- /**
- * Creates a new <code>MsOutlookTextExtractor</code> instance.
- */
- public MsOutlookTextExtractor() {
- super(new String[]{"application/vnd.ms-outlook"});
- }
+public class MsOutlookTextExtractor extends DefaultTextExtractor {
- //-------------------------------------------------------< TextExtractor >
+ private static String[] TYPES = new String[] {
+ "application/vnd.ms-outlook"
+ };
- /**
- * {@inheritDoc}
- * Returns an empty reader if an error occured extracting text from
- * the outlook message.
- */
- public Reader extractText(InputStream stream,
- String type,
- String encoding) throws IOException {
- try {
- MAPIMessage message = new MAPIMessage(stream);
- StringBuffer buffer = new StringBuffer();
- buffer.append(message.getDisplayFrom()).append('\n');
- buffer.append(message.getDisplayTo()).append('\n');
- buffer.append(message.getSubject()).append('\n');
- buffer.append(message.getTextBody());
- return new StringReader(buffer.toString());
- } catch (Exception e) {
- logger.warn("Failed to extract Message content", e);
- return new StringReader("");
- } finally {
- stream.close();
- }
+ public String[] getContentTypes() {
+ return TYPES;
}
}
Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java?rev=762823&r1=762822&r2=762823&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java Tue Apr 7 15:54:53 2009
@@ -16,64 +16,19 @@
*/
package org.apache.jackrabbit.extractor;
-import org.apache.poi.poifs.eventfilesystem.POIFSReader;
-import org.apache.poi.hslf.extractor.PowerPointExtractor;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.Reader;
-import java.io.InputStream;
-import java.io.IOException;
-import java.io.StringReader;
-
/**
* Text extractor for Microsoft PowerPoint presentations.
*/
-public class MsPowerPointTextExtractor extends AbstractTextExtractor {
-
- /**
- * Logger instance.
- */
- private static final Logger logger =
- LoggerFactory.getLogger(MsPowerPointTextExtractor.class);
+public class MsPowerPointTextExtractor extends DefaultTextExtractor {
- /**
- * Force loading of dependent class.
- */
- static {
- POIFSReader.class.getName();
- }
+ private static String[] TYPES = new String[] {
+ "application/vnd.ms-powerpoint",
+ "application/mspowerpoint",
+ "application/powerpoint"
+ };
- /**
- * Creates a new <code>MsPowerPointTextExtractor</code> instance.
- */
- public MsPowerPointTextExtractor() {
- super(new String[]{
- "application/vnd.ms-powerpoint",
- "application/mspowerpoint",
- "application/powerpoint"
- });
+ public String[] getContentTypes() {
+ return TYPES;
}
- //-------------------------------------------------------< TextExtractor >
-
- /**
- * {@inheritDoc}
- */
- public Reader extractText(InputStream stream,
- String type,
- String encoding) throws IOException {
- try {
- PowerPointExtractor extractor = new PowerPointExtractor(stream);
- return new StringReader(extractor.getText(true, true));
- } catch (RuntimeException e) {
- logger.warn("Failed to extract PowerPoint text content", e);
- return new StringReader("");
- } finally {
- try {
- stream.close();
- } catch (IOException ignored) {
- }
- }
- }
}
Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java?rev=762823&r1=762822&r2=762823&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java Tue Apr 7 15:54:53 2009
@@ -16,58 +16,18 @@
*/
package org.apache.jackrabbit.extractor;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.poi.hwpf.extractor.WordExtractor;
-
-import java.io.Reader;
-import java.io.InputStream;
-import java.io.IOException;
-import java.io.StringReader;
-
/**
* Text extractor for Microsoft Word documents.
*/
-public class MsWordTextExtractor extends AbstractTextExtractor {
-
- /**
- * Logger instance.
- */
- private static final Logger logger =
- LoggerFactory.getLogger(MsWordTextExtractor.class);
-
- /**
- * Force loading of dependent class.
- */
- static {
- WordExtractor.class.getName();
- }
-
- /**
- * Creates a new <code>MsWordTextExtractor</code> instance.
- */
- public MsWordTextExtractor() {
- super(new String[]{"application/vnd.ms-word", "application/msword"});
- }
+public class MsWordTextExtractor extends DefaultTextExtractor {
- //-------------------------------------------------------< TextExtractor >
+ private static String[] TYPES = new String[] {
+ "application/vnd.ms-word",
+ "application/msword"
+ };
- /**
- * {@inheritDoc}
- * Returns an empty reader if an error occured extracting text from
- * the word document.
- */
- public Reader extractText(InputStream stream,
- String type,
- String encoding) throws IOException {
- try {
- return new StringReader(new WordExtractor(stream).getText());
- } catch (Exception e) {
- logger.warn("Failed to extract Word text content", e);
- return new StringReader("");
- } finally {
- stream.close();
- }
+ public String[] getContentTypes() {
+ return TYPES;
}
}