You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/08/21 15:26:34 UTC

svn commit: r1159980 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/microsoft/ test/java/org/apache/tika/parser/microsoft/ test/resources/test-documents/

Author: jukka
Date: Sun Aug 21 13:26:34 2011
New Revision: 1159980

URL: http://svn.apache.org/viewvc?rev=1159980&view=rev
Log:
TIKA-692: TikaCLI -x or -h on a Word doc sometimes adds newline after </b> tag

Patch by Michael McCandless

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs.doc   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs2.doc   (with props)
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1159980&r1=1159979&r2=1159980&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Sun Aug 21 13:26:34 2011
@@ -54,6 +54,11 @@ public class WordExtractor extends Abstr
         super(context);
     }
 
+    // True if we are currently in the named style tag:
+    private boolean curStrikeThrough;
+    private boolean curBold;
+    private boolean curItalic;
+
     protected void parse(
             NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
             throws IOException, SAXException, TikaException {
@@ -187,6 +192,20 @@ public class WordExtractor extends Abstr
           }
        }
        
+       // Close any still open style tags
+       if (curStrikeThrough) {
+         xhtml.endElement("s");
+         curStrikeThrough = false;
+       }
+       if (curItalic) {
+         xhtml.endElement("i");
+         curItalic = false;
+       }
+       if (curBold) {
+         xhtml.endElement("b");
+         curBold = false;
+       }
+
        xhtml.endElement(tas.getTag());
        
        return 0;
@@ -198,14 +217,47 @@ public class WordExtractor extends Abstr
        if(cr.text().equals("\r"))
           return;
        
-       List<String> tags = new ArrayList<String>();
        if(!skipStyling) {
-          if(cr.isBold()) tags.add("b");
-          if(cr.isItalic()) tags.add("i");
-          if(cr.isStrikeThrough()) tags.add("s");
-          for(String tag : tags) {
-             xhtml.startElement(tag);
-          }
+         if (cr.isBold() != curBold) {
+           // Enforce nesting -- must close s and i tags
+           if (curStrikeThrough) {
+             xhtml.endElement("s");
+             curStrikeThrough = false;
+           }
+           if (curItalic) {
+             xhtml.endElement("i");
+             curItalic = false;
+           }
+           if (cr.isBold()) {
+             xhtml.startElement("b");
+           } else {
+             xhtml.endElement("b");
+           }
+           curBold = cr.isBold();
+         }
+
+         if (cr.isItalic() != curItalic) {
+           // Enforce nesting -- must close s tag
+           if (curStrikeThrough) {
+             xhtml.endElement("s");
+             curStrikeThrough = false;
+           }
+           if (cr.isItalic()) {
+             xhtml.startElement("i");
+           } else {
+             xhtml.endElement("i");
+           }
+           curItalic = cr.isItalic();
+         }
+
+         if (cr.isStrikeThrough() != curStrikeThrough) {
+           if (cr.isStrikeThrough()) {
+             xhtml.startElement("s");
+           } else {
+             xhtml.endElement("s");
+           }
+           curStrikeThrough = cr.isStrikeThrough();
+         }
        }
        
        // Clean up the text
@@ -217,10 +269,6 @@ public class WordExtractor extends Abstr
        }
        
        xhtml.characters(text);
-
-       for(int tn=tags.size()-1; tn>=0; tn--) {
-          xhtml.endElement(tags.get(tn));
-       }
     }
     /**
      * Can be \13..text..\15 or \13..control..\14..text..\15 .

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1159980&r1=1159979&r2=1159980&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Sun Aug 21 13:26:34 2011
@@ -68,11 +68,17 @@ public class WordParserTest extends Test
         }
     }
 
-    /**
-     * Test that the word converter is able to generate the
-     *  correct HTML for the document
-     */
-    public void testWordHTML() throws Exception {
+    private static class XMLResult {
+        public final String xml;
+        public final Metadata metadata;
+
+        public XMLResult(String xml, Metadata metadata) {
+            this.xml = xml;
+            this.metadata = metadata;
+      }
+    }
+
+    private XMLResult getXML(String filePath) throws Exception {
         InputStream input = null;
         Metadata metadata = new Metadata();
         
@@ -81,59 +87,83 @@ public class WordParserTest extends Test
                  SAXTransformerFactory.newInstance();
         TransformerHandler handler = factory.newTransformerHandler();
         handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
-        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
         handler.setResult(new StreamResult(sw));
 
         // Try with a document containing various tables and formattings
-        input = OOXMLParserTest.class.getResourceAsStream("/test-documents/testWORD.doc");
+        input = OOXMLParserTest.class.getResourceAsStream(filePath);
         try {
             new OfficeParser().parse(input, handler, metadata, new ParseContext());
-            String xml = sw.toString();
-            assertEquals(
-                  "application/msword",
-                  metadata.get(Metadata.CONTENT_TYPE));
-            assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
-            assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
-            assertTrue(xml.contains("Sample Word Document"));
-
-            // Check that custom headings came through
-            assertTrue(xml.contains("<h1 class=\"title\">"));
-            // Regular headings
-            assertTrue(xml.contains("<h1>Heading Level 1</h1>"));
-            assertTrue(xml.contains("<h3>Heading Level 3</h3>"));
-            // Bold and italic
-            assertTrue(xml.contains("<b>BOLD</b>"));
-            assertTrue(xml.contains("<i>ITALIC</i>"));
-            // Table
-            assertTrue(xml.contains("<table>"));
-            assertTrue(xml.contains("<td>"));
-            // TODO - Check for the nested table
-            // Links
-            assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));
-            // Paragraphs with other styles
-            assertTrue(xml.contains("<p class=\"signature\">This one"));
+            return new XMLResult(sw.toString(), metadata);
         } finally {
             input.close();
         }
+    }
+
+    /**
+     * Test that the word converter is able to generate the
+     *  correct HTML for the document
+     */
+    public void testWordHTML() throws Exception {
+
+        // Try with a document containing various tables and
+        // formattings
+        XMLResult result = getXML("/test-documents/testWORD.doc");
+        String xml = result.xml;
+        Metadata metadata = result.metadata;
+
+        assertEquals(
+                     "application/msword",
+                     metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
+        assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+        assertTrue(xml.contains("Sample Word Document"));
+
+        // Check that custom headings came through
+        assertTrue(xml.contains("<h1 class=\"title\">"));
+        // Regular headings
+        assertTrue(xml.contains("<h1>Heading Level 1</h1>"));
+        assertTrue(xml.contains("<h3>Heading Level 3</h3>"));
+        // Bold and italic
+        assertTrue(xml.contains("<b>BOLD</b>"));
+        assertTrue(xml.contains("<i>ITALIC</i>"));
+        // Table
+        assertTrue(xml.contains("<table>"));
+        assertTrue(xml.contains("<td>"));
+        // TODO - Check for the nested table
+        // Links
+        assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));
+        // Paragraphs with other styles
+        assertTrue(xml.contains("<p class=\"signature\">This one"));
         
         // Try with a document that contains images
-        sw = new StringWriter();
-        handler.setResult(new StreamResult(sw));
-        input = OOXMLParserTest.class.getResourceAsStream("/test-documents/testWORD_3imgs.doc");
-        try {
-            new OfficeParser().parse(TikaInputStream.get(input), handler, metadata, new ParseContext());
-            String xml = sw.toString();
+        xml = getXML("/test-documents/testWORD_3imgs.doc").xml;
+
+        // Images 1-3
+        assertTrue("Image not found in:\n"+xml, xml.contains("src=\"embedded:image1.png\""));
+        assertTrue("Image not found in:\n"+xml, xml.contains("src=\"embedded:image2.jpg\""));
+        assertTrue("Image not found in:\n"+xml, xml.contains("src=\"embedded:image3.png\""));
             
-            // Images 1-3
-            assertTrue("Image not found in:\n"+xml, xml.contains("src=\"embedded:image1.png\""));
-            assertTrue("Image not found in:\n"+xml, xml.contains("src=\"embedded:image2.jpg\""));
-            assertTrue("Image not found in:\n"+xml, xml.contains("src=\"embedded:image3.png\""));
+        // Text too
+        assertTrue(xml.contains("<p>The end!"));
+
+        // TIKA-692: test document containing multiple
+        // character runs within a bold tag:
+        xml = getXML("/test-documents/testWORD_bold_character_runs.doc").xml;
+
+        // Make sure the bold text arrived as a single
+        // contiguous string even though the Word parser
+        // handled it as 3 character runs
+        assertTrue("Bold text wasn't contiguous: "+xml, xml.contains("F<b>oob</b>a<b>r</b>"));
+
+        // TIKA-692: second test document containing multiple
+        // character runs within a bold tag:
+        xml = getXML("/test-documents/testWORD_bold_character_runs2.doc").xml;
             
-            // Text too
-            assertTrue(xml.contains("<p>The end!"));
-        } finally {
-            input.close();
-        }
+        // Make sure the bold text arrived as a single
+        // contiguous string even though the Word parser
+        // handled it as 3 character runs
+        assertTrue("Bold text wasn't contiguous: "+xml, xml.contains("F<b>oob</b>a<b>r</b>"));
     }
 
     public void testWord6Parser() throws Exception {

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs.doc?rev=1159980&view=auto
==============================================================================
Files tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs.doc (added) and tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs.doc Sun Aug 21 13:26:34 2011 differ

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs.doc
------------------------------------------------------------------------------
    svn:executable = *

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs2.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs2.doc?rev=1159980&view=auto
==============================================================================
Files tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs2.doc (added) and tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs2.doc Sun Aug 21 13:26:34 2011 differ

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs2.doc
------------------------------------------------------------------------------
    svn:executable = *