You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2013/09/26 16:01:18 UTC

svn commit: r1526498 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/microsoft/ooxml/ test/java/org/apache/tika/parser/microsoft/ooxml/ test/resources/test-documents/

Author: tallison
Date: Thu Sep 26 14:01:17 2013
New Revision: 1526498

URL: http://svn.apache.org/r1526498
Log:
tika-1100 textboxes in xlsx; modified XSSFExcelExtractorDecorator and added test in OOXMLParserTest

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_textbox.xlsx   (with props)
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=1526498&r1=1526497&r2=1526498&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java Thu Sep 26 14:01:17 2013
@@ -46,6 +46,8 @@ import org.apache.poi.xssf.model.Comment
 import org.apache.poi.xssf.model.StylesTable;
 import org.apache.poi.xssf.usermodel.XSSFComment;
 import org.apache.poi.xssf.usermodel.XSSFRelation;
+import org.apache.poi.xssf.usermodel.XSSFShape;
+import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
 import org.apache.poi.xssf.usermodel.helpers.HeaderFooterHelper;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
@@ -118,6 +120,7 @@ public class XSSFExcelExtractorDecorator
        while (iter.hasNext()) {
            InputStream stream = iter.next();
            sheetParts.add(iter.getSheetPart());
+           
            SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(xhtml, iter.getSheetComments());
 
            // Start, and output the sheet name
@@ -142,7 +145,7 @@ public class XSSFExcelExtractorDecorator
            for(String footer : sheetExtractor.footers) {
               extractHeaderFooter(footer, xhtml);
            }
-           
+           processShapes(iter.getShapes(), xhtml);
            // All done with this sheet
            xhtml.endElement("div");
        }
@@ -157,6 +160,31 @@ public class XSSFExcelExtractorDecorator
         }
     }
     
+    /**
+     * Writes the text of each simple shape on a sheet as an XHTML paragraph.
+     * Shapes that are not {@link XSSFSimpleShape} instances, and shapes whose
+     * text is null or empty, produce no output.
+     *
+     * @param shapes shapes to inspect; may be null, in which case nothing is written
+     * @param xhtml handler that receives one "p" element per non-empty shape text
+     * @throws SAXException propagated from the handler while writing an element
+     */
+    private void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml) throws SAXException {
+        if (shapes == null) {
+            return;
+        }
+        for (XSSFShape candidate : shapes) {
+            if (!(candidate instanceof XSSFSimpleShape)) {
+                continue;
+            }
+            String text = ((XSSFSimpleShape) candidate).getText();
+            if (text == null || text.length() == 0) {
+                continue;
+            }
+            xhtml.element("p", text);
+        }
+    }
+    
     public void processSheet(
           SheetContentsHandler sheetContentsExtractor,
           StylesTable styles,

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1526498&r1=1526497&r2=1526498&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Thu Sep 26 14:01:17 2013
@@ -988,4 +988,24 @@ public class OOXMLParserTest extends Tik
             input.close();
         }
     }
+
+    /**
+     * TIKA-1100: text held in xlsx text boxes / autoshapes must appear in the
+     * extracted content.
+     */
+    public void testExcelTextBox() throws Exception {
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        ParseContext context = new ParseContext();
+        InputStream input = getTestDocument("testEXCEL_textbox.xlsx");
+        try {
+            parser.parse(input, handler, metadata, context);
+        } finally {
+            // Release the test document even when parsing throws.
+            input.close();
+        }
+        String content = handler.toString();
+        assertContains("some autoshape", content);
+    }
+
 }

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_textbox.xlsx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_textbox.xlsx?rev=1526498&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_textbox.xlsx
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream