You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by ju...@apache.org on 2009/04/08 06:11:10 UTC
svn commit: r762808 -
/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java
Author: jukka
Date: Tue Apr 7 15:05:15 2009
New Revision: 762808
URL: http://svn.apache.org/viewvc?rev=762808&view=rev
Log:
JCR-1887: msoffice text extractor for office 2007 files
Replace the implementation with an Apache Tika based one from TIKA-1878. This way we won't get compile errors due to the Java 5 POI libraries.
Modified:
jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java
Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java?rev=762808&r1=762807&r2=762808&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java Tue Apr 7 15:05:15 2009
@@ -16,67 +16,25 @@
*/
package org.apache.jackrabbit.extractor;
-import org.apache.poi.extractor.ExtractorFactory;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.Reader;
-import java.io.InputStream;
-import java.io.IOException;
-import java.io.StringReader;
/**
* Text extractor for Microsoft Word documents.
*/
-public class MsTextExtractor extends AbstractTextExtractor {
-
- /**
- * Logger instance.
- */
- private static final Logger logger =
- LoggerFactory.getLogger(MsTextExtractor.class);
-
- /**
- * Force loading of dependent class.
- */
- static {
- ExtractorFactory.class.getName();
- }
-
- /**
- * Creates a new <code>MsWordTextExtractor</code> instance.
- */
- public MsTextExtractor() {
- super(new String[]{"application/vnd.ms-word",
- "application/msword",
- "application/vnd.ms-powerpoint",
- "application/mspowerpoint",
- "application/vnd.ms-excel",
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
- "application/vnd.openxmlformats-officedocument.presentationml.presentation",
- "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"});
- }
+public class MsTextExtractor extends TikaTextExtractor {
- //-------------------------------------------------------< TextExtractor >
+ private static String[] TYPES = new String[] {
+ "application/vnd.ms-word",
+ "application/msword",
+ "application/vnd.ms-powerpoint",
+ "application/mspowerpoint",
+ "application/vnd.ms-excel",
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+ };
- /**
- * {@inheritDoc}
- * Returns an empty reader if an error occured extracting text from
- * the word document.
- */
- public Reader extractText(InputStream stream,
- String type,
- String encoding) throws IOException {
- try {
- String text = ExtractorFactory.createExtractor(stream).getText();
- return new StringReader(text);
- } catch (Exception e) {
- logger.warn("Failed to extract Microsoft Document text content", e);
- return new StringReader("");
- } finally {
- stream.close();
- }
+ public String[] getContentTypes() {
+ return TYPES;
}
}