You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2010/09/25 15:34:56 UTC

svn commit: r1001209 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/extractor/ tika-parsers/ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/ tika-parsers/src/mai...

Author: nick
Date: Sat Sep 25 13:34:55 2010
New Revision: 1001209

URL: http://svn.apache.org/viewvc?rev=1001209&view=rev
Log:
Apply patch from TIKA-506 - Improve the HTML generated from .doc and .docx files to include more of the document's structure.
This patch includes an upgrade to POI 3.7 beta 3.
For .docx, we now return headers where appropriate, tables, hyperlinks, non-standard styles as classes, and images in the correct place.
For .doc, we also do headers, hyperlinks and non-standard styles. Tables only work at the first level; nested tables just come out as paragraphs for now.
(Lists are not yet supported in either format; they appear as paragraphs.)

Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java
    tika/trunk/tika-parsers/pom.xml
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java?rev=1001209&r1=1001208&r2=1001209&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java Sat Sep 25 13:34:55 2010
@@ -71,15 +71,27 @@ public class EmbeddedDocumentExtractor {
         return true;
     }
 
+    /**
+     * Processes the supplied embedded resource, calling the delegating
+     *  parser with the appropriate details.
+     * @param stream The embedded resource
+     * @param handler The handler to use
+     * @param metadata The metadata for the embedded resource
+     * @param outputHtml Should we output HTML for this resource, or has the parser already done so?
+     * @throws SAXException
+     * @throws IOException
+     */
     public void parseEmbedded(
-            InputStream stream, ContentHandler handler, Metadata metadata)
+            InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml)
             throws SAXException, IOException {
-        AttributesImpl attributes = new AttributesImpl();
-        attributes.addAttribute("", "class", "class", "CDATA", "package-entry");
-        handler.startElement(XHTML, "div", "div", attributes);
+        if(outputHtml) {
+           AttributesImpl attributes = new AttributesImpl();
+           attributes.addAttribute("", "class", "class", "CDATA", "package-entry");
+           handler.startElement(XHTML, "div", "div", attributes);
+        }
 
         String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
-        if (name != null && name.length() > 0) {
+        if (name != null && name.length() > 0 && outputHtml) {
             handler.startElement(XHTML, "h1", "h1", new AttributesImpl());
             char[] chars = name.toCharArray();
             handler.characters(chars, 0, chars.length);
@@ -96,7 +108,9 @@ public class EmbeddedDocumentExtractor {
             // Could not parse the entry, just skip the content
         }
 
-        handler.endElement(XHTML, "div", "div");
+        if(outputHtml) {
+           handler.endElement(XHTML, "div", "div");
+        }
     }
 
 }

Modified: tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1001209&r1=1001208&r2=1001209&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Sat Sep 25 13:34:55 2010
@@ -35,7 +35,7 @@
   <url>http://tika.apache.org/</url>
 
   <properties>
-    <poi.version>3.7-beta2</poi.version>
+    <poi.version>3.7-beta3</poi.version>
   </properties>
 
   <repositories>

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=1001209&r1=1001208&r2=1001209&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java Sat Sep 25 13:34:55 2010
@@ -46,8 +46,8 @@ abstract class AbstractPOIFSExtractor {
         this.extractor = new EmbeddedDocumentExtractor(context);
     }
     
-    protected void handleEmbeddedResource(TikaInputStream resource,
-          String filename, String mediaType, XHTMLContentHandler xhtml)
+    protected void handleEmbeddedResource(TikaInputStream resource, String filename,
+          String mediaType, XHTMLContentHandler xhtml, boolean outputHtml)
           throws IOException, SAXException, TikaException {
        try {
            Metadata metadata = new Metadata();
@@ -60,7 +60,7 @@ abstract class AbstractPOIFSExtractor {
            }
 
            if (extractor.shouldParseEmbedded(metadata)) {
-               extractor.parseEmbedded(resource, xhtml, metadata);
+               extractor.parseEmbedded(resource, xhtml, metadata, outputHtml);
            }
        } finally {
            resource.close();
@@ -83,7 +83,7 @@ abstract class AbstractPOIFSExtractor {
           );
           ZipContainerDetector detector = new ZipContainerDetector();
           MediaType type = detector.detect(ooxmlStream, new Metadata());
-          handleEmbeddedResource(ooxmlStream, null, type.toString(), xhtml);
+          handleEmbeddedResource(ooxmlStream, null, type.toString(), xhtml, true);
           return;
        } catch(FileNotFoundException e) {
           // It's regular OLE2
@@ -109,7 +109,7 @@ abstract class AbstractPOIFSExtractor {
            TikaInputStream embedded = TikaInputStream.get(tmpFile);
            try {
                if (extractor.shouldParseEmbedded(metadata)) {
-                   extractor.parseEmbedded(embedded, xhtml, metadata);
+                   extractor.parseEmbedded(embedded, xhtml, metadata, true);
                }
            } finally {
                embedded.close();

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=1001209&r1=1001208&r2=1001209&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Sat Sep 25 13:34:55 2010
@@ -56,7 +56,6 @@ import org.apache.poi.hssf.record.SSTRec
 import org.apache.poi.hssf.record.TextObjectRecord;
 import org.apache.poi.hssf.record.chart.SeriesTextRecord;
 import org.apache.poi.hssf.record.common.UnicodeString;
-import org.apache.poi.hssf.usermodel.HSSFPictureData;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.poifs.filesystem.Entry;
@@ -564,7 +563,7 @@ public class ExcelExtractor extends Abst
                     // Handle the embeded resource
                     extractor.handleEmbeddedResource(
                           stream, null, mimeType,
-                          handler
+                          handler, true
                     );
                  }
               }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1001209&r1=1001208&r2=1001209&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Sat Sep 25 13:34:55 2010
@@ -127,7 +127,7 @@ public class OutlookExtractor extends Ab
                   handleEmbeddedResource(
                         TikaInputStream.get(attachment.attachData.getValue()),
                         filename,
-                        null, xhtml
+                        null, xhtml, true
                   );
                }
                if(attachment.attachmentDirectory != null) {

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1001209&r1=1001208&r2=1001209&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Sat Sep 25 13:34:55 2010
@@ -18,6 +18,8 @@ package org.apache.tika.parser.microsoft
 
 import java.io.FileNotFoundException;
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
 import java.util.List;
 
 import org.apache.poi.hwpf.HWPFDocument;
@@ -25,7 +27,14 @@ import org.apache.poi.hwpf.HWPFOldDocume
 import org.apache.poi.hwpf.OldWordFileFormatException;
 import org.apache.poi.hwpf.extractor.Word6Extractor;
 import org.apache.poi.hwpf.model.PicturesTable;
+import org.apache.poi.hwpf.model.StyleDescription;
+import org.apache.poi.hwpf.usermodel.CharacterRun;
+import org.apache.poi.hwpf.usermodel.Paragraph;
 import org.apache.poi.hwpf.usermodel.Picture;
+import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.hwpf.usermodel.Table;
+import org.apache.poi.hwpf.usermodel.TableCell;
+import org.apache.poi.hwpf.usermodel.TableRow;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@@ -56,10 +65,22 @@ public class WordExtractor extends Abstr
 
         addTextIfAny(xhtml, "header", wordExtractor.getHeaderText());
 
-        for (String paragraph : wordExtractor.getParagraphText()) {
-            xhtml.element("p", paragraph);
+        // Grab the list of pictures. As far as we can tell,
+        //  the pictures should be in order, and may be directly
+        //  placed or referenced from an anchor
+        PicturesTable pictureTable = document.getPicturesTable();
+        CountingIterator<Picture> pictures = new CountingIterator<Picture>(
+              pictureTable.getAllPictures().iterator()
+        );
+        
+        // Do the main paragraph text
+        Range r = document.getRange();
+        for(int i=0; i<r.numParagraphs(); i++) {
+           Paragraph p = r.getParagraph(i);
+           i += handleParagraph(p, 0, r, document, pictures, pictureTable, xhtml);
         }
 
+        // Do everything else
         for (String paragraph : wordExtractor.getFootnoteText()) {
             xhtml.element("p", paragraph);
         }
@@ -74,42 +95,12 @@ public class WordExtractor extends Abstr
 
         addTextIfAny(xhtml, "footer", wordExtractor.getFooterText());
 
-        // Handle any embeded images
-        PicturesTable pictureTable = document.getPicturesTable();
-        if(pictureTable != null) {
-           List<Picture> pictures = (List<Picture>)pictureTable.getAllPictures(); // TODO Generics fixed in newer version
-           for(Picture picture : pictures) {
-              // TODO When we have upgraded POI, we can use this code instead
-              //String mimeType = picture.getMimeType();
-              
-              // This code is cut'n'paste from a newer version of POI
-              String mimeType = "image/unknown";
-              String extension = picture.suggestFileExtension();
-              if("jpg".equals(extension)) {
-                 mimeType =  "image/jpeg";
-              }
-              if("png".equals(extension)) {
-                 mimeType =  "image/png";
-              }
-              if("gif".equals(extension)) {
-                 mimeType =  "image/gif";
-              }
-              if("bmp".equals(extension)) {
-                 mimeType =  "image/bmp";
-              }
-              if("tiff".equals(extension)) {
-                 mimeType =  "image/tiff";
-              }
-              if("wmf".equals(extension)) {
-                 mimeType =  "image/x-wmf";
-              }
-              if("emf".equals(extension)) {
-                 mimeType =  "image/x-emf";
-              }
-              
-              TikaInputStream stream = TikaInputStream.get(picture.getContent());
-              handleEmbeddedResource(stream, null, mimeType, xhtml);
-           }
+        // Handle any pictures that we haven't output yet
+        while(pictures.hasNext()) {
+           Picture p = pictures.next();
+           handlePictureCharacterRun(
+                 null, p, pictures.getCount(), xhtml
+           );
         }
         
         // Handle any embeded office documents
@@ -125,7 +116,189 @@ public class WordExtractor extends Abstr
         } catch(FileNotFoundException e) {
         }
     }
+    
+    private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document, 
+          CountingIterator<Picture> pictures, PicturesTable pictureTable, XHTMLContentHandler xhtml)
+          throws SAXException, IOException, TikaException {
+       // Note - a POI bug means we can't yet properly recurse into
+       //  nested tables, so only top-level tables are expanded here
+       if(p.isInTable() && p.getTableLevel() > parentTableLevel && parentTableLevel==0) {
+          Table t = r.getTable(p);
+          xhtml.startElement("table");
+          xhtml.startElement("tbody");
+          for(int rn=0; rn<t.numRows(); rn++) {
+             TableRow row = t.getRow(rn);
+             xhtml.startElement("tr");
+             for(int cn=0; cn<row.numCells(); cn++) {
+                TableCell cell = row.getCell(cn);
+                xhtml.startElement("td");
+
+                for(int pn=0; pn<cell.numParagraphs(); pn++) {
+                   Paragraph cellP = cell.getParagraph(pn);
+                   handleParagraph(cellP, p.getTableLevel(), cell, document, pictures, pictureTable, xhtml);
+                }
+                xhtml.endElement("td");
+             }
+             xhtml.endElement("tr");
+          }
+          return (t.numParagraphs()-1);
+       }
+
+       StyleDescription style = 
+          document.getStyleSheet().getStyleDescription(p.getStyleIndex());
+       TagAndStyle tas = buildParagraphTagAndStyle(
+             style.getName(), (parentTableLevel>0)
+       );
+
+       if(tas.getStyleClass() != null) {
+          xhtml.startElement(tas.getTag(), "class", tas.getStyleClass());
+       } else {
+          xhtml.startElement(tas.getTag());
+       }
+       
+       for(int j=0; j<p.numCharacterRuns(); j++) {
+          CharacterRun cr = p.getCharacterRun(j);
+          
+          if(cr.text().equals("\u0013")) {
+             j += handleSpecialCharacterRuns(p, j, tas.isHeading(), xhtml);
+          } else if(cr.text().equals("\u0008")) {
+             // Floating Picture
+             Picture picture = pictures.next();
+             handlePictureCharacterRun(cr, picture, pictures.getCount(), xhtml);
+          } else if(pictureTable.hasPicture(cr)) {
+             // Inline Picture
+             Picture picture = pictures.next();
+             handlePictureCharacterRun(cr, picture, pictures.getCount(), xhtml);
+          } else {
+             handleCharacterRun(cr, tas.isHeading(), xhtml);
+          }
+       }
+       
+       xhtml.endElement(tas.getTag());
+       
+       return 0;
+    }
+    
+    private void handleCharacterRun(CharacterRun cr, boolean skipStyling, XHTMLContentHandler xhtml) 
+          throws SAXException {
+       // Skip runs that consist solely of a carriage return
+       if(cr.text().equals("\r"))
+          return;
+       
+       List<String> tags = new ArrayList<String>();
+       if(!skipStyling) {
+          if(cr.isBold()) tags.add("b");
+          if(cr.isItalic()) tags.add("i");
+          if(cr.isStrikeThrough()) tags.add("s");
+          for(String tag : tags) {
+             xhtml.startElement(tag);
+          }
+       }
+       
+       // Clean up the text
+       String text = cr.text();
+       text = text.replace('\r', '\n');
+       if(text.endsWith("\u0007")) {
+          // Strip the table cell end marker
+          text = text.substring(0, text.length()-1);
+       }
+       
+       xhtml.characters(text);
+
+       for(int tn=tags.size()-1; tn>=0; tn--) {
+          xhtml.endElement(tags.get(tn));
+       }
+    }
+    /**
+     * Can be 0x13..text..0x15 or 0x13..control..0x14..text..0x15
+     *  (hex character codes). Nesting is allowed.
+     */
+    private int handleSpecialCharacterRuns(Paragraph p, int index, boolean skipStyling, XHTMLContentHandler xhtml) 
+          throws SAXException {
+       List<CharacterRun> controls = new ArrayList<CharacterRun>();
+       List<CharacterRun> texts = new ArrayList<CharacterRun>();
+       boolean has14 = false;
+       
+       // Split it into before and after the 14
+       int i;
+       for(i=index; i<p.numCharacterRuns(); i++) {
+          CharacterRun cr = p.getCharacterRun(i);
+          if(cr.text().equals("\u0013")) {
+             // Nested, oh joy...
+             int increment = handleSpecialCharacterRuns(p, i+1, skipStyling, xhtml);
+             i += increment;
+          } else if(cr.text().equals("\u0014")) {
+             has14 = true;
+          } else if(cr.text().equals("\u0015")) {
+             if(!has14) {
+                texts = controls;
+                controls = new ArrayList<CharacterRun>();
+             }
+             break;
+          } else {
+             if(has14) {
+                texts.add(cr);
+             } else {
+                controls.add(cr);
+             }
+          }
+       }
+       
+       // Do we need to do something special with this?
+       if(controls.size() > 0) {
+          String text = controls.get(0).text();
+          for(int j=1; j<controls.size(); j++) {
+             text += controls.get(j).text();
+          }
+          
+          if(text.startsWith("HYPERLINK") && text.indexOf('"') > -1) {
+             String url = text.substring(
+                   text.indexOf('"') + 1,
+                   text.lastIndexOf('"')
+             );
+             xhtml.startElement("a", "href", url);
+             for(CharacterRun cr : texts) {
+                handleCharacterRun(cr, skipStyling, xhtml);
+             }
+             xhtml.endElement("a");
+          } else {
+             // Just output the text ones
+             for(CharacterRun cr : texts) {
+                handleCharacterRun(cr, skipStyling, xhtml);
+             }
+          }
+       } else {
+          // We only had text
+          // Output as-is
+          for(CharacterRun cr : texts) {
+             handleCharacterRun(cr, skipStyling, xhtml);
+          }
+       }
+       
+       // Tell them how many to skip over
+       return i-index;
+    }
 
+    private void handlePictureCharacterRun(CharacterRun cr, Picture picture, int pictureNumber, XHTMLContentHandler xhtml) 
+          throws SAXException, IOException, TikaException {
+       String extension = picture.suggestFileExtension();
+       
+       // Make up a name for the picture
+       // There isn't one in the file, but we need to be able to reference
+       //  the picture from the img tag and the embedded resource
+       String filename = "image"+pictureNumber+(extension.length()>0 ? "."+extension : "");
+       
+       // Grab the mime type for the picture
+       String mimeType = picture.getMimeType();
+       
+       // Output the img tag
+       xhtml.startElement("img", "src", "embedded:" + filename);
+       xhtml.endElement("img");
+       
+       TikaInputStream stream = TikaInputStream.get(picture.getContent());
+       handleEmbeddedResource(stream, filename, mimeType, xhtml, false);
+    }
+    
     /**
      * Outputs a section of text if the given text is non-empty.
      *
@@ -154,4 +327,84 @@ public class WordExtractor extends Abstr
             xhtml.element("p", p);
         }
     }
+    
+    /**
+     * Given a style name, return what tag should be used, and
+     *  what style should be applied to it. 
+     */
+    public static TagAndStyle buildParagraphTagAndStyle(String styleName, boolean isTable) {
+       String tag = "p";
+       String styleClass = null;
+       
+       if(styleName.equals("Default") || styleName.equals("Normal")) {
+          // Already setup
+       } else if(styleName.equals("Table Contents") && isTable) {
+          // Already setup
+       } else if(styleName.equals("Heading")) {
+          tag = "h1";
+       } else if(styleName.startsWith("Heading ")) {
+          int num = 1;
+          try {
+             num = Integer.parseInt( 
+                   styleName.substring(styleName.length()-1)
+             );
+          } catch(NumberFormatException e) {}
+          tag = "h"+num;
+       } else if(styleName.equals("Title")) {
+          tag = "h1";
+          styleClass = "title";
+       } else if(styleName.equals("Subtitle")) {
+          tag = "h2";
+          styleClass = "subtitle";
+       } else {
+          styleClass = styleName.replace(' ', '_');
+          styleClass = styleClass.substring(0,1).toLowerCase() +
+                         styleClass.substring(1);
+       }
+       
+       return new TagAndStyle(tag,styleClass);
+    }
+    
+    public static class TagAndStyle {
+       private String tag;
+       private String styleClass;
+       public TagAndStyle(String tag, String styleClass) {
+         this.tag = tag;
+         this.styleClass = styleClass;
+       }
+       public String getTag() {
+         return tag;
+       }
+       public String getStyleClass() {
+         return styleClass;
+       }
+       public boolean isHeading() {
+          return tag.length()==2 && tag.startsWith("h");
+       }
+    }
+    
+    private static class CountingIterator<T> implements Iterator<T> {
+      private Iterator<T> parent;
+      private int count = 0;
+      private CountingIterator(Iterator<T> parent) {
+         this.parent = parent;
+      }
+
+      public boolean hasNext() {
+         return parent.hasNext();
+      }
+
+      public T next() {
+         count++;
+         return parent.next();
+      }
+      
+      public int getCount() {
+         return count;
+      }
+
+      public void remove() {
+         throw new UnsupportedOperationException();
+      }
+    }
 }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java?rev=1001209&r1=1001208&r2=1001209&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java Sat Sep 25 13:34:55 2010
@@ -136,7 +136,7 @@ public abstract class AbstractOOXMLExtra
        
        // Call the recursing handler
        Metadata metadata = new Metadata();
-       metadata.set(Metadata.TIKA_MIME_FILE, name);
+       metadata.set(Metadata.RESOURCE_NAME_KEY, name);
        metadata.set(Metadata.CONTENT_TYPE, type);
        
        Parser parser = context.get(Parser.class, EmptyParser.INSTANCE);

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java?rev=1001209&r1=1001208&r2=1001209&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java Sat Sep 25 13:34:55 2010
@@ -19,6 +19,7 @@ package org.apache.tika.parser.microsoft
 import java.io.IOException;
 
 import org.apache.poi.POIXMLDocument;
+import org.apache.poi.POIXMLTextExtractor;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java?rev=1001209&r1=1001208&r2=1001209&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java Sat Sep 25 13:34:55 2010
@@ -21,7 +21,6 @@ import java.io.InputStream;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashSet;
-import java.util.Locale;
 import java.util.Set;
 
 import org.apache.tika.exception.TikaException;

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java?rev=1001209&r1=1001208&r2=1001209&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java Sat Sep 25 13:34:55 2010
@@ -29,11 +29,7 @@ import org.apache.poi.openxml4j.opc.Targ
 import org.apache.poi.xslf.XSLFSlideShow;
 import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
 import org.apache.poi.xslf.usermodel.XMLSlideShow;
-import org.apache.poi.xslf.usermodel.XSLFRelation;
 import org.apache.poi.xslf.usermodel.XSLFSlide;
-import org.apache.poi.xssf.usermodel.XSSFRelation;
-import org.apache.poi.xssf.usermodel.XSSFSheet;
-import org.apache.poi.xssf.usermodel.XSSFWorkbook;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.xmlbeans.XmlException;

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=1001209&r1=1001208&r2=1001209&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java Sat Sep 25 13:34:55 2010
@@ -40,10 +40,10 @@ import org.apache.poi.xssf.usermodel.XSS
 import org.apache.poi.xssf.usermodel.XSSFRelation;
 import org.apache.poi.xssf.usermodel.XSSFSheet;
 import org.apache.poi.xssf.usermodel.XSSFWorkbook;
-import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaMetadataKeys;
-import org.apache.tika.exception.TikaException;
+import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.xmlbeans.XmlException;
 import org.xml.sax.SAXException;
 

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=1001209&r1=1001208&r2=1001209&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java Sat Sep 25 13:34:55 2010
@@ -18,31 +18,44 @@ package org.apache.tika.parser.microsoft
 
 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.Iterator;
 import java.util.List;
 
 import org.apache.poi.openxml4j.opc.PackagePart;
 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
 import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
 import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
-import org.apache.poi.xwpf.model.XWPFHyperlinkDecorator;
-import org.apache.poi.xwpf.model.XWPFParagraphDecorator;
+import org.apache.poi.xwpf.usermodel.BodyType;
+import org.apache.poi.xwpf.usermodel.IBody;
+import org.apache.poi.xwpf.usermodel.IBodyElement;
 import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
+import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
 import org.apache.poi.xwpf.usermodel.XWPFParagraph;
+import org.apache.poi.xwpf.usermodel.XWPFPicture;
+import org.apache.poi.xwpf.usermodel.XWPFPictureData;
+import org.apache.poi.xwpf.usermodel.XWPFRun;
+import org.apache.poi.xwpf.usermodel.XWPFStyle;
+import org.apache.poi.xwpf.usermodel.XWPFStyles;
+import org.apache.poi.xwpf.usermodel.XWPFTable;
+import org.apache.poi.xwpf.usermodel.XWPFTableCell;
+import org.apache.poi.xwpf.usermodel.XWPFTableRow;
+import org.apache.tika.parser.microsoft.WordExtractor;
+import org.apache.tika.parser.microsoft.WordExtractor.TagAndStyle;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.xmlbeans.XmlException;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark;
-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRow;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl;
-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTc;
 import org.xml.sax.SAXException;
 
 public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
+    private XWPFDocument document;
+    private XWPFStyles styles;
 
     public XWPFWordExtractorDecorator(XWPFWordExtractor extractor) {
         super(extractor, "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
+        
+        document = (XWPFDocument) extractor.getDocument();
+        styles = document.getStyles();
     }
 
     /**
@@ -51,49 +64,140 @@ public class XWPFWordExtractorDecorator 
     @Override
     protected void buildXHTML(XHTMLContentHandler xhtml)
             throws SAXException, XmlException, IOException {
-        XWPFDocument document = (XWPFDocument) extractor.getDocument();
         XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
 
         // headers
         extractHeaders(xhtml, hfPolicy);
 
-        // first all paragraphs
-        Iterator<XWPFParagraph> i = document.getParagraphsIterator();
-        while (i.hasNext()) {
-            XWPFParagraph paragraph = i.next();
-
-            CTSectPr ctSectPr = null;
-            if (paragraph.getCTP().getPPr() != null) {
-                ctSectPr = paragraph.getCTP().getPPr().getSectPr();
-            }
-
-            XWPFHeaderFooterPolicy headerFooterPolicy = null;
-
-            if (ctSectPr != null) {
-                headerFooterPolicy =
-                    new XWPFHeaderFooterPolicy(document, ctSectPr);
-                extractHeaders(xhtml, headerFooterPolicy);
-            }
-
-            XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
-                    new XWPFHyperlinkDecorator(paragraph, null, true));
-
-            for (CTBookmark bookmark : paragraph.getCTP().getBookmarkStartArray()) {
-                xhtml.element("p", bookmark.getName());
-            }
-
-            xhtml.element("p", decorator.getText());
-
-            if (ctSectPr != null) {
-                extractFooters(xhtml, headerFooterPolicy);
-            }
-        }
+        // process text in the order that it occurs in
+        extractIBodyText(document, xhtml);
 
         // then all document tables
-        extractTableContent(document, xhtml);
         extractFooters(xhtml, hfPolicy);
     }
 
+    private void extractIBodyText(IBody bodyElement, XHTMLContentHandler xhtml)
+            throws SAXException, XmlException, IOException {
+       for(IBodyElement element : bodyElement.getBodyElements()) {
+          if(element instanceof XWPFParagraph) {
+             XWPFParagraph paragraph = (XWPFParagraph)element;
+             extractParagraph(paragraph, xhtml);
+          }
+          if(element instanceof XWPFTable) {
+             XWPFTable table = (XWPFTable)element;
+             extractTable(table, xhtml);
+          }
+      }
+    }
+    
+    private void extractParagraph(XWPFParagraph paragraph, XHTMLContentHandler xhtml)
+            throws SAXException, XmlException, IOException {
+       // If this paragraph is actually a whole new section, then
+       //  it could have its own headers and footers
+       // Check and handle if so
+       XWPFHeaderFooterPolicy headerFooterPolicy = null;
+       if (paragraph.getCTP().getPPr() != null) {
+           CTSectPr ctSectPr = paragraph.getCTP().getPPr().getSectPr();
+           if(ctSectPr != null) {
+              headerFooterPolicy =
+                  new XWPFHeaderFooterPolicy(document, ctSectPr);
+              extractHeaders(xhtml, headerFooterPolicy);
+           }
+       }
+       
+       // Is this a paragraph, or a heading?
+       String tag = "p";
+       String styleClass = null;
+       if(paragraph.getStyleID() != null) {
+          XWPFStyle style = styles.getStyle(
+                paragraph.getStyleID()
+          );
+          
+          TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle(
+                style.getName(), paragraph.getPartType() == BodyType.TABLECELL
+          );
+          tag = tas.getTag();
+          styleClass = tas.getStyleClass();
+       }
+       
+       for (CTBookmark bookmark : paragraph.getCTP().getBookmarkStartList()) {
+           xhtml.element("p", bookmark.getName());
+       }
+
+       if(styleClass == null) {
+          xhtml.startElement(tag);
+       } else {
+          xhtml.startElement(tag, "class", styleClass);
+       }
+       
+       // Do the text
+       for(XWPFRun run : paragraph.getRuns()) {
+          List<String> tags = new ArrayList<String>();
+          if(run instanceof XWPFHyperlinkRun) {
+             XWPFHyperlinkRun linkRun = (XWPFHyperlinkRun)run;
+             XWPFHyperlink link = linkRun.getHyperlink(document);
+             if(link != null && link.getURL() != null) {
+                xhtml.startElement("a", "href", link.getURL());
+                tags.add("a");
+             }
+          }
+          if(run.isBold()) {
+             xhtml.startElement("b");
+             tags.add("b");
+          }
+          if(run.isItalic()) {
+             xhtml.startElement("i");
+             tags.add("i");
+          }
+          
+          xhtml.characters(run.toString());
+          
+          for(int i=tags.size()-1; i>=0; i--) {
+             xhtml.endElement(tags.get(i));
+          }
+          
+          // If we have any pictures, output them
+          for(XWPFPicture picture : run.getEmbeddedPictures()) {
+             XWPFPictureData data = picture.getPictureData();
+             if(data != null) {
+                xhtml.startElement("img", "src", "embedded:" + data.getFileName());
+                xhtml.endElement("img");
+             }
+          }
+       }
+       
+       // Now do any comments for the paragraph
+       XWPFCommentsDecorator comments = new XWPFCommentsDecorator(paragraph, null);
+       String commentText = comments.getCommentText();
+       if(commentText != null && commentText.length() > 0) {
+          xhtml.characters(commentText);
+       }
+
+       // Finish this paragraph
+       xhtml.endElement(tag);
+
+       if (headerFooterPolicy != null) {
+           extractFooters(xhtml, headerFooterPolicy);
+       }
+    }
+
+    private void extractTable(XWPFTable table, XHTMLContentHandler xhtml)
+            throws SAXException, XmlException, IOException {
+       xhtml.startElement("table");
+       xhtml.startElement("tbody");
+       for(XWPFTableRow row : table.getRows()) {
+          xhtml.startElement("tr");
+          for(XWPFTableCell cell : row.getTableCells()) {
+             xhtml.startElement("td");
+             extractIBodyText(cell, xhtml);
+             xhtml.endElement("td");
+          }
+          xhtml.endElement("tr");
+       }
+       xhtml.endElement("tbody");
+       xhtml.endElement("table");
+    }
+    
     private void extractFooters(
             XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy)
             throws SAXException {
@@ -124,59 +228,13 @@ public class XWPFWordExtractorDecorator 
     }
 
     /**
-     * Low level structured parsing of document tables.
-     */
-    private void extractTableContent(XWPFDocument doc, XHTMLContentHandler xhtml)
-            throws SAXException {
-        for (CTTbl table : doc.getDocument().getBody().getTblArray()) {
-            xhtml.startElement("table");
-            xhtml.startElement("tbody");
-            CTRow[] rows = table.getTrArray();
-            for (CTRow row : rows) {
-                xhtml.startElement("tr");
-                CTTc[] cells = row.getTcArray();
-                for (CTTc tc : cells) {
-                    xhtml.startElement("td");
-                    CTP[] content = tc.getPArray();
-                    for (CTP ctp : content) {
-                        XWPFParagraph p = new MyXWPFParagraph(ctp, doc);
-
-                        XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
-                                new XWPFHyperlinkDecorator(p, null, true));
-
-                        xhtml.element("p", decorator.getText());
-                    }
-
-                    xhtml.endElement("td");
-                }
-                xhtml.endElement("tr");
-            }
-            xhtml.endElement("tbody");
-            xhtml.endElement("table");
-        }
-    }
-
-    /**
      * Word documents are simple, they only have the one
      *  main part
      */
     @Override
     protected List<PackagePart> getMainDocumentParts() {
-       XWPFDocument document = (XWPFDocument) extractor.getDocument();
-       
        List<PackagePart> parts = new ArrayList<PackagePart>();
        parts.add( document.getPackagePart() );
        return parts;
     }
-
-
-    /**
-     * Private wrapper class that makes the protected {@link XWPFParagraph}
-     * constructor available.
-     */
-    private static class MyXWPFParagraph extends XWPFParagraph {
-        private MyXWPFParagraph(CTP ctp, XWPFDocument xwpfDocument) {
-            super(ctp, xwpfDocument);
-        }
-    }
 }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageExtractor.java?rev=1001209&r1=1001208&r2=1001209&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageExtractor.java Sat Sep 25 13:34:55 2010
@@ -123,7 +123,7 @@ class PackageExtractor {
 
             // Use the delegate parser to parse the compressed document
             if (extractor.shouldParseEmbedded(entrydata)) {
-                extractor.parseEmbedded(stream, xhtml, entrydata);
+                extractor.parseEmbedded(stream, xhtml, entrydata, true);
             }
         } finally {
             stream.close();
@@ -155,7 +155,7 @@ class PackageExtractor {
                     }
 
                     if (extractor.shouldParseEmbedded(entrydata)) {
-                        extractor.parseEmbedded(archive, xhtml, entrydata);
+                        extractor.parseEmbedded(archive, xhtml, entrydata, true);
                     }
                 }
                 entry = archive.getNextEntry();

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java?rev=1001209&r1=1001208&r2=1001209&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java Sat Sep 25 13:34:55 2010
@@ -77,7 +77,7 @@ public class POIContainerExtractionTest 
        assertEquals(1, handler.filenames.size());
        assertEquals(1, handler.mediaTypes.size());
        
-       assertEquals(null, handler.filenames.get(0));
+       assertEquals("image1.png", handler.filenames.get(0));
        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
 
        
@@ -86,9 +86,9 @@ public class POIContainerExtractionTest 
        assertEquals(3, handler.filenames.size());
        assertEquals(3, handler.mediaTypes.size());
        
-       assertEquals(null, handler.filenames.get(0));
-       assertEquals(null, handler.filenames.get(1));
-       assertEquals(null, handler.filenames.get(2));
+       assertEquals("image1.png", handler.filenames.get(0));
+       assertEquals("image2.jpg", handler.filenames.get(1));
+       assertEquals("image3.png", handler.filenames.get(2));
        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
        assertEquals(TYPE_JPG, handler.mediaTypes.get(1));
        assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
@@ -142,7 +142,7 @@ public class POIContainerExtractionTest 
        assertEquals(null, handler.filenames.get(2));
        assertEquals(null, handler.filenames.get(3));
        assertEquals(null, handler.filenames.get(4));
-       assertEquals(null, handler.filenames.get(5));
+       assertEquals("image1.png", handler.filenames.get(5));
        
        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
        assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc
@@ -154,44 +154,57 @@ public class POIContainerExtractionTest 
        
        // Word with .docx, powerpoint and excel
        handler = process("testWORD_embeded.doc", extractor, false);
-       assertEquals(8, handler.filenames.size());
-       assertEquals(8, handler.mediaTypes.size());
+       assertEquals(9, handler.filenames.size());
+       assertEquals(9, handler.mediaTypes.size());
        
-       // We don't know their filenames
-       for(String filename : handler.filenames)
-          assertEquals(null, filename);
+       // Filenames are a bit iffy...
+//       for(String filename : handler.filenames)
+//          assertEquals(null, filename);
        // But we do know their types
        assertEquals(MediaType.parse("image/unknown"), handler.mediaTypes.get(0)); // Icon of embedded office doc?
        assertEquals(MediaType.parse("image/unknown"), handler.mediaTypes.get(1)); // Icon of embedded office doc?
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
-       assertEquals(TYPE_JPG, handler.mediaTypes.get(3)); // Embedded image
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(4)); // Embedded image
-       assertEquals(TYPE_DOCX, handler.mediaTypes.get(5)); // Embedded office doc
-       assertEquals(TYPE_PPT, handler.mediaTypes.get(6)); // Embedded office doc
-       assertEquals(TYPE_XLS, handler.mediaTypes.get(7)); // Embedded office doc
+       assertEquals(MediaType.parse("image/unknown"), handler.mediaTypes.get(2)); // Icon of embedded office doc?
+       assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image
+       assertEquals(TYPE_JPG, handler.mediaTypes.get(4)); // Embedded image
+       assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
+       assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office doc
+       assertEquals(TYPE_PPT, handler.mediaTypes.get(7)); // Embedded office doc
+       assertEquals(TYPE_XLS, handler.mediaTypes.get(8)); // Embedded office doc
        
        
        // With recursion, should get their images too
        handler = process("testWORD_embeded.doc", extractor, true);
-       assertEquals(12, handler.filenames.size());
-       assertEquals(12, handler.mediaTypes.size());
+       assertEquals(13, handler.filenames.size());
+       assertEquals(13, handler.mediaTypes.size());
        
-       // We don't know their filenames
-       for(String filename : handler.filenames)
-          assertEquals(null, filename);
+       // We don't know their filenames, except for doc images + docx
+       assertEquals("image1", handler.filenames.get(0));
+       assertEquals("image2", handler.filenames.get(1));
+       assertEquals("image3", handler.filenames.get(2));
+       assertEquals("image4.png", handler.filenames.get(3));
+       assertEquals("image5.jpg", handler.filenames.get(4));
+       assertEquals("image6.png", handler.filenames.get(5));
+       assertEquals(null, handler.filenames.get(6));
+       assertEquals("image2.png", handler.filenames.get(7));
+       assertEquals("image3.jpeg", handler.filenames.get(8));
+       assertEquals("image4.png", handler.filenames.get(9));
+       for(int i=10; i<handler.filenames.size(); i++) {
+          assertNull(handler.filenames.get(i));
+       }
        // But we do know their types
        assertEquals(MediaType.parse("image/unknown"), handler.mediaTypes.get(0)); // Icon of embedded office doc?
        assertEquals(MediaType.parse("image/unknown"), handler.mediaTypes.get(1)); // Icon of embedded office doc?
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(2));  // Embedded image
-       assertEquals(TYPE_JPG, handler.mediaTypes.get(3));  // Embedded image
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(4));  // Embedded image
-       assertEquals(TYPE_DOCX, handler.mediaTypes.get(5)); // Embedded office doc
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(6));  //    PNG inside .docx
-       assertEquals(TYPE_JPG, handler.mediaTypes.get(7));  //    JPG inside .docx
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(8));  //    PNG inside .docx
-       assertEquals(TYPE_PPT, handler.mediaTypes.get(9));  // Embedded office doc
-       assertEquals(TYPE_XLS, handler.mediaTypes.get(10)); // Embedded office doc
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(11)); //    PNG inside .xls
+       assertEquals(MediaType.parse("image/unknown"), handler.mediaTypes.get(2)); // Icon of embedded office doc?
+       assertEquals(TYPE_PNG, handler.mediaTypes.get(3));  // Embedded image
+       assertEquals(TYPE_JPG, handler.mediaTypes.get(4));  // Embedded image
+       assertEquals(TYPE_PNG, handler.mediaTypes.get(5));  // Embedded image
+       assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office doc
+       assertEquals(TYPE_PNG, handler.mediaTypes.get(7));  //    PNG inside .docx
+       assertEquals(TYPE_JPG, handler.mediaTypes.get(8));  //    JPG inside .docx
+       assertEquals(TYPE_PNG, handler.mediaTypes.get(9));  //    PNG inside .docx
+       assertEquals(TYPE_PPT, handler.mediaTypes.get(10)); // Embedded office doc
+       assertEquals(TYPE_XLS, handler.mediaTypes.get(11)); // Embedded office doc
+       assertEquals(TYPE_PNG, handler.mediaTypes.get(12)); //    PNG inside .xls
        
        
        // PowerPoint with excel and word

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1001209&r1=1001208&r2=1001209&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Sat Sep 25 13:34:55 2010
@@ -17,9 +17,17 @@
 package org.apache.tika.parser.microsoft;
 
 import java.io.InputStream;
+import java.io.StringWriter;
 
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
+
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLParserTest;
 import org.apache.tika.sax.BodyContentHandler;
 import org.xml.sax.ContentHandler;
 
@@ -45,6 +53,74 @@ public class WordParserTest extends Test
             input.close();
         }
     }
+    
+    /**
+     * Test that the word converter is able to generate the
+     *  correct HTML for the document
+     */
+    public void testWordHTML() throws Exception {
+        InputStream input = null;
+        Metadata metadata = new Metadata();
+        
+        StringWriter sw = new StringWriter();
+        SAXTransformerFactory factory = (SAXTransformerFactory)
+                 SAXTransformerFactory.newInstance();
+        TransformerHandler handler = factory.newTransformerHandler();
+        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+        handler.setResult(new StreamResult(sw));
+
+        // Try with a document containing various tables and formattings
+        input = OOXMLParserTest.class.getResourceAsStream("/test-documents/testWORD.doc");
+        try {
+            new OfficeParser().parse(input, handler, metadata, new ParseContext());
+            String xml = sw.toString();
+            assertEquals(
+                  "application/msword",
+                  metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
+            assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+            assertTrue(xml.contains("Sample Word Document"));
+
+            // Check that custom headings came through
+            assertTrue(xml.contains("<h1 class=\"title\">"));
+            // Regular headings
+            assertTrue(xml.contains("<h1>Heading Level 1</h1>"));
+            assertTrue(xml.contains("<h3>Heading Level 3</h3>"));
+            // Bold and italic
+            assertTrue(xml.contains("<b>BOLD</b>"));
+            assertTrue(xml.contains("<i>ITALIC</i>"));
+            // Table
+            assertTrue(xml.contains("<table>"));
+            assertTrue(xml.contains("<td>"));
+            // TODO - Check for the nested table
+            // Links
+            assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));
+            // Paragraphs with other styles
+            assertTrue(xml.contains("<p class=\"signature\">This one"));
+        } finally {
+            input.close();
+        }
+        
+        // Try with a document that contains images
+        sw = new StringWriter();
+        handler.setResult(new StreamResult(sw));
+        input = OOXMLParserTest.class.getResourceAsStream("/test-documents/testWORD_3imgs.doc");
+        try {
+            new OfficeParser().parse(TikaInputStream.get(input), handler, metadata, new ParseContext());
+            String xml = sw.toString();
+            
+            // Images 1-3
+            assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image1.png\"/>"));
+            assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image2.jpg\"/>"));
+            assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image3.png\"/>"));
+            
+            // Text too
+            assertTrue(xml.contains("<p>The end!"));
+        } finally {
+            input.close();
+        }
+    }
 
     public void testWord6Parser() throws Exception {
         InputStream input = WordParserTest.class.getResourceAsStream(

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java?rev=1001209&r1=1001208&r2=1001209&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java Sat Sep 25 13:34:55 2010
@@ -74,7 +74,7 @@ public class OOXMLContainerExtractionTes
        assertEquals(1, handler.filenames.size());
        assertEquals(1, handler.mediaTypes.size());
        
-       assertEquals(null, handler.filenames.get(0));
+       assertEquals("image1.png", handler.filenames.get(0));
        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
 
        
@@ -84,9 +84,9 @@ public class OOXMLContainerExtractionTes
        assertEquals(3, handler.filenames.size());
        assertEquals(3, handler.mediaTypes.size());
        
-       assertEquals(null, handler.filenames.get(0));
-       assertEquals(null, handler.filenames.get(1));
-       assertEquals(null, handler.filenames.get(2));
+       assertEquals("image1.png", handler.filenames.get(0));
+       assertEquals("image2.gif", handler.filenames.get(1));
+       assertEquals("image3.png", handler.filenames.get(2));
        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
        assertEquals(TYPE_GIF, handler.mediaTypes.get(1)); // icon of sound
        assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
@@ -97,7 +97,7 @@ public class OOXMLContainerExtractionTes
        assertEquals(1, handler.filenames.size());
        assertEquals(1, handler.mediaTypes.size());
        
-       assertEquals(null, handler.filenames.get(0));
+       assertEquals("image1.png", handler.filenames.get(0));
        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
 
        
@@ -106,9 +106,9 @@ public class OOXMLContainerExtractionTes
        assertEquals(3, handler.filenames.size());
        assertEquals(3, handler.mediaTypes.size());
        
-       assertEquals(null, handler.filenames.get(0));
-       assertEquals(null, handler.filenames.get(1));
-       assertEquals(null, handler.filenames.get(2));
+       assertEquals("image2.png", handler.filenames.get(0));
+       assertEquals("image3.jpeg", handler.filenames.get(1));
+       assertEquals("image4.png", handler.filenames.get(2));
        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
        assertEquals(TYPE_JPG, handler.mediaTypes.get(1));
        assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
@@ -137,9 +137,14 @@ public class OOXMLContainerExtractionTes
        assertEquals(7, handler.filenames.size());
        assertEquals(7, handler.mediaTypes.size());
        
-       // We don't know their filenames
-       for(String filename : handler.filenames)
-          assertEquals(null, filename);
+       // We know the rough filenames
+       assertEquals("Microsoft_Office_PowerPoint_Presentation1.pptx", handler.filenames.get(0));
+       assertEquals("Microsoft_Office_Word_97_-_2003_Document1.doc", handler.filenames.get(1));
+       assertEquals("Microsoft_Office_Word_Document2.docx", handler.filenames.get(2));
+       assertEquals("image1.png", handler.filenames.get(3));
+       assertEquals("image2.emf", handler.filenames.get(4));
+       assertEquals("image3.emf", handler.filenames.get(5));
+       assertEquals("image4.emf", handler.filenames.get(6));
        // But we do know their types
        assertEquals(TYPE_PPTX, handler.mediaTypes.get(0)); // Embedded office doc
        assertEquals(TYPE_DOC, handler.mediaTypes.get(1));  // Embedded office doc
@@ -155,9 +160,6 @@ public class OOXMLContainerExtractionTes
        assertEquals(23, handler.filenames.size());
        assertEquals(23, handler.mediaTypes.size());
        
-       for(String filename : handler.filenames)
-          assertEquals(null, filename);
-       
        assertEquals(TYPE_PPTX, handler.mediaTypes.get(0)); // Embedded office doc
         assertEquals(TYPE_PNG, handler.mediaTypes.get(1));  //   PNG inside .pptx
         assertEquals(TYPE_GIF, handler.mediaTypes.get(2));  //   PNG inside .pptx
@@ -188,9 +190,16 @@ public class OOXMLContainerExtractionTes
        assertEquals(9, handler.filenames.size());
        assertEquals(9, handler.mediaTypes.size());
        
-       // We don't know their filenames
-       for(String filename : handler.filenames)
-          assertEquals(null, filename);
+       // We know their rough filenames
+       assertEquals("Microsoft_Office_PowerPoint_Presentation2.pptx", handler.filenames.get(0));
+       assertEquals("image6.emf", handler.filenames.get(1));
+       assertEquals("Microsoft_Office_Word_97_-_2003_Document1.doc", handler.filenames.get(2));
+       assertEquals("image1.png", handler.filenames.get(3));
+       assertEquals("image2.jpeg", handler.filenames.get(4));
+       assertEquals("image3.png", handler.filenames.get(5));
+       assertEquals("image4.emf", handler.filenames.get(6));
+       assertEquals("Microsoft_Office_Excel_Worksheet1.xlsx", handler.filenames.get(7));
+       assertEquals("image5.emf", handler.filenames.get(8));
        // But we do know their types
        assertEquals(TYPE_PPTX, handler.mediaTypes.get(0)); // Embedded office doc
        assertEquals(TYPE_EMF, handler.mediaTypes.get(1));  // Icon of embedded office doc
@@ -208,9 +217,6 @@ public class OOXMLContainerExtractionTes
        assertEquals(14, handler.filenames.size());
        assertEquals(14, handler.mediaTypes.size());
        
-       // We don't know their filenames
-       for(String filename : handler.filenames)
-          assertEquals(null, filename);
        // But we do know their types
        assertEquals(TYPE_PPTX, handler.mediaTypes.get(0)); // Embedded office doc
         assertEquals(TYPE_PNG, handler.mediaTypes.get(1));  //   PNG inside .pptx
@@ -233,9 +239,16 @@ public class OOXMLContainerExtractionTes
        assertEquals(9, handler.filenames.size());
        assertEquals(9, handler.mediaTypes.size());
        
-       // We don't know their filenames
-       for(String filename : handler.filenames)
-          assertEquals(null, filename);
+       // We know their rough filenames
+       assertEquals("image4.png", handler.filenames.get(0));
+       assertEquals("image5.gif", handler.filenames.get(1));
+       assertEquals("image6.png", handler.filenames.get(2));
+       assertEquals("Microsoft_Office_Excel_Worksheet1.xlsx", handler.filenames.get(3));
+       assertEquals("Microsoft_Office_Word_Document2.docx", handler.filenames.get(4));
+       assertEquals("Microsoft_Office_Word_97_-_2003_Document1.doc", handler.filenames.get(5));
+       assertEquals("image1.emf", handler.filenames.get(6));
+       assertEquals("image2.emf", handler.filenames.get(7));
+       assertEquals("image3.emf", handler.filenames.get(8));
        // But we do know their types
        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));  // Embedded image
        assertEquals(TYPE_GIF, handler.mediaTypes.get(1));  // Embedded image

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1001209&r1=1001208&r2=1001209&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Sat Sep 25 13:34:55 2010
@@ -17,8 +17,14 @@
 package org.apache.tika.parser.microsoft.ooxml;
 
 import java.io.InputStream;
+import java.io.StringWriter;
 import java.util.Locale;
 
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
+
 import junit.framework.TestCase;
 
 import org.apache.tika.config.TikaConfig;
@@ -26,13 +32,12 @@ import org.apache.tika.detect.ContainerA
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.xml.sax.ContentHandler;
 
-import org.apache.tika.parser.AutoDetectParser;
-
 public class OOXMLParserTest extends TestCase {
     private Parser parser;
    
@@ -214,6 +219,10 @@ public class OOXMLParserTest extends Tes
 	}
     }
     
+    /**
+     * Test the plain text output of the Word converter
+     * @throws Exception
+     */
     public void testWord() throws Exception {
         InputStream input = OOXMLParserTest.class
                 .getResourceAsStream("/test-documents/testWORD.docx");
@@ -224,7 +233,6 @@ public class OOXMLParserTest extends Tes
 
         try {
             parser.parse(TikaInputStream.get(input), handler, metadata, context);
-
             assertEquals(
                     "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                     metadata.get(Metadata.CONTENT_TYPE));
@@ -237,6 +245,74 @@ public class OOXMLParserTest extends Tes
     }
 
     /**
+     * Parses two .docx test files into serialised XML via a TransformerHandler and
+     *  checks that the expected markup (headings, styles, tables, links, text) is present
+     */
+    public void testWordHTML() throws Exception {
+        InputStream input = null;
+        Metadata metadata = new Metadata();
+        ParseContext context = new ParseContext();
+        
+        StringWriter sw = new StringWriter();
+        SAXTransformerFactory factory = (SAXTransformerFactory)
+                 SAXTransformerFactory.newInstance();
+        TransformerHandler handler = factory.newTransformerHandler();
+        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+        handler.setResult(new StreamResult(sw));
+
+        // First document: testWORD.docx, checked for metadata, headings, tables and formatting
+        input = OOXMLParserTest.class.getResourceAsStream("/test-documents/testWORD.docx");
+        try {
+            parser.parse(TikaInputStream.get(input), handler, metadata, context);
+            String xml = sw.toString();
+            assertEquals(
+                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
+            assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+            assertTrue(xml.contains("Sample Word Document"));
+            
+            // Check that custom headings came through (style name emitted as a class attribute)
+            assertTrue(xml.contains("<h1 class=\"title\">"));
+            // Regular headings
+            assertTrue(xml.contains("<h1>Heading Level 1</h1>"));
+            assertTrue(xml.contains("<h3>Heading Level 3</h3>"));
+            // Bold and italic
+            assertTrue(xml.contains("<b>BOLD</b>"));
+            assertTrue(xml.contains("<i>ITALIC</i>"));
+            // Table (only the presence of the opening tags is checked)
+            assertTrue(xml.contains("<table>"));
+            assertTrue(xml.contains("<td>"));
+            // Links
+            assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));
+            // Paragraphs with other styles (style name emitted as a class attribute)
+            assertTrue(xml.contains("<p class=\"signature\">This one"));
+        } finally {
+            input.close();
+        }
+        
+        // Second document: contains images. NOTE(review): handler and metadata are reused from the first parse - confirm that is safe
+        sw = new StringWriter();
+        handler.setResult(new StreamResult(sw));
+        input = OOXMLParserTest.class.getResourceAsStream("/test-documents/testWORD_3imgs.docx");
+        try {
+            parser.parse(TikaInputStream.get(input), handler, metadata, context);
+            String xml = sw.toString();
+            
+            // Images 2-4 (there is no 1!) - these three checks are currently commented out
+//            assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image2.png\"/>"));
+//            assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image3.jpeg\"/>"));
+//            assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image4.png\"/>"));
+            
+            // Text too
+            assertTrue(xml.contains("<p>The end!</p>"));
+        } finally {
+            input.close();
+        }
+    }
+
+    /**
      * Documents with some sheets are protected, but not all. 
      * See TIKA-364.
      */