You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by mr...@apache.org on 2008/04/11 13:47:47 UTC

svn commit: r647128 - in /jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor: MsExcelTextExtractor.java MsWordTextExtractor.java

Author: mreutegg
Date: Fri Apr 11 04:47:42 2008
New Revision: 647128

URL: http://svn.apache.org/viewvc?rev=647128&view=rev
Log:
JCR-1534: Use POIExtractor wherever possible

Modified:
    jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java
    jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java

Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java?rev=647128&r1=647127&r2=647128&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java Fri Apr 11 04:47:42 2008
@@ -17,20 +17,14 @@
 package org.apache.jackrabbit.extractor;
 
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.poi.hssf.usermodel.HSSFWorkbook;
-import org.apache.poi.hssf.usermodel.HSSFSheet;
-import org.apache.poi.hssf.usermodel.HSSFRow;
-import org.apache.poi.hssf.usermodel.HSSFCell;
+import org.apache.poi.hssf.extractor.ExcelExtractor;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.Reader;
 import java.io.InputStream;
 import java.io.IOException;
-import java.io.CharArrayWriter;
-import java.io.CharArrayReader;
 import java.io.StringReader;
-import java.util.Iterator;
 
 /**
  * Text extractor for Microsoft Excel sheets.
@@ -65,40 +59,9 @@
     public Reader extractText(InputStream stream,
                               String type,
                               String encoding) throws IOException {
-        CharArrayWriter writer = new CharArrayWriter();
         try {
             POIFSFileSystem fs = new POIFSFileSystem(stream);
-            HSSFWorkbook workbook = new HSSFWorkbook(fs);
-
-            for (int i = 0; i < workbook.getNumberOfSheets(); i++) {
-                HSSFSheet sheet = workbook.getSheetAt(i);
-
-                Iterator rows = sheet.rowIterator();
-                while (rows.hasNext()) {
-                    HSSFRow row = (HSSFRow) rows.next();
-
-                    Iterator cells = row.cellIterator();
-                    while (cells.hasNext()) {
-                        HSSFCell cell = (HSSFCell) cells.next();
-                        switch (cell.getCellType()) {
-                        case HSSFCell.CELL_TYPE_NUMERIC:
-                            String num = Double.toString(cell.getNumericCellValue()).trim();
-                            if (num.length() > 0) {
-                                writer.write(num + " ");
-                            }
-                            break;
-                        case HSSFCell.CELL_TYPE_STRING:
-                            String text = cell.getStringCellValue().trim();
-                            if (text.length() > 0) {
-                                writer.write(text + " ");
-                            }
-                            break;
-                        }
-                    }
-                }
-            }
-
-            return new CharArrayReader(writer.toCharArray());
+            return new StringReader(new ExcelExtractor(fs).getText());
         } catch (RuntimeException e) {
             logger.warn("Failed to extract Excel text content", e);
             return new StringReader("");

Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java?rev=647128&r1=647127&r2=647128&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java Fri Apr 11 04:47:42 2008
@@ -18,7 +18,7 @@
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.extractor.WordExtractor;
 
 import java.io.Reader;
 import java.io.InputStream;
@@ -40,7 +40,7 @@
      * Force loading of dependent class.
      */
     static {
-        HWPFDocument.class.getName();
+        WordExtractor.class.getName();
     }
 
     /**
@@ -61,9 +61,7 @@
                               String type,
                               String encoding) throws IOException {
         try {
-            HWPFDocument doc = new HWPFDocument(stream);
-            String text = doc.getRange().text();
-            return new StringReader(text);
+            return new StringReader(new WordExtractor(stream).getText());
         } catch (Exception e) {
             logger.warn("Failed to extract Word text content", e);
             return new StringReader("");