You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by mr...@apache.org on 2008/04/11 13:18:19 UTC
svn commit: r647114 -
/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java
Author: mreutegg
Date: Fri Apr 11 04:18:06 2008
New Revision: 647114
URL: http://svn.apache.org/viewvc?rev=647114&view=rev
Log:
JCR-1530: MsPowerPointTextExtractor does not extract from PPTs with € sign
Modified:
jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java
Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java?rev=647114&r1=647113&r2=647114&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java Fri Apr 11 04:18:06 2008
@@ -17,20 +17,13 @@
package org.apache.jackrabbit.extractor;
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
-import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
-import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
-import org.apache.poi.poifs.filesystem.DocumentInputStream;
-import org.apache.poi.util.LittleEndian;
+import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.Reader;
import java.io.InputStream;
import java.io.IOException;
-import java.io.OutputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.InputStreamReader;
-import java.io.ByteArrayInputStream;
import java.io.StringReader;
/**
@@ -68,51 +61,15 @@
String type,
String encoding) throws IOException {
try {
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- MsPowerPointListener listener = new MsPowerPointListener(baos);
- POIFSReader reader = new POIFSReader();
- reader.registerListener(listener);
- reader.read(stream);
- return new InputStreamReader(
- new ByteArrayInputStream(baos.toByteArray()));
+ PowerPointExtractor extractor = new PowerPointExtractor(stream);
+ return new StringReader(extractor.getText(true, true));
} catch (RuntimeException e) {
logger.warn("Failed to extract PowerPoint text content", e);
return new StringReader("");
} finally {
- stream.close();
- }
- }
-
- //------------------------------------------------< MsPowerPointListener >
-
- /**
- * Reader listener.
- */
- private class MsPowerPointListener implements POIFSReaderListener {
- private OutputStream os;
-
- MsPowerPointListener(OutputStream os) {
- this.os = os;
- }
-
- public void processPOIFSReaderEvent(POIFSReaderEvent event) {
try {
- if (!event.getName().equalsIgnoreCase("PowerPoint Document")) {
- return;
- }
- DocumentInputStream input = event.getStream();
- byte[] buffer = new byte[input.available()];
- input.read(buffer, 0, input.available());
- for (int i = 0; i < buffer.length - 20; i++) {
- long type = LittleEndian.getUShort(buffer, i + 2);
- long size = LittleEndian.getUInt(buffer, i + 4);
- if (type == 4008) {
- os.write(buffer, i + 4 + 1, (int) size + 3);
- i = i + 4 + 1 + (int) size - 1;
- }
- }
- } catch (Exception e) {
-
+ stream.close();
+ } catch (IOException ignored) {
}
}
}