You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/09/17 11:20:53 UTC

svn commit: r1171929 - in /tika/trunk: tika-app/src/main/java/org/apache/tika/cli/ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/ tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/ tika-parsers/src/test/resources/test-documents/

Author: jukka
Date: Sat Sep 17 09:20:53 2011
New Revision: 1171929

URL: http://svn.apache.org/viewvc?rev=1171929&view=rev
Log:
TIKA-692: TikaCLI -x or -h on a Word doc sometimes adds newline after </b> tag

Patch by Michael McCandless

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs.docx   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs2.docx   (with props)
Modified:
    tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java

Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1171929&r1=1171928&r2=1171929&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Sat Sep 17 09:20:53 2011
@@ -146,7 +146,7 @@ public class TikaCLI {
         @Override
         protected ContentHandler getContentHandler(OutputStream output)
                 throws Exception {
-            return getTransformerHandler(output, "xml", encoding);
+            return getTransformerHandler(output, "xml", encoding, prettyPrint);
         }
     };
 
@@ -154,7 +154,7 @@ public class TikaCLI {
         @Override
         protected ContentHandler getContentHandler(OutputStream output)
                 throws Exception {
-            return getTransformerHandler(output, "html", encoding);
+            return getTransformerHandler(output, "html", encoding, prettyPrint);
         }
     };
 
@@ -267,6 +267,8 @@ public class TikaCLI {
     private boolean fork = false;
 
     private String profileName = null;
+
+    private boolean prettyPrint;
     
     public TikaCLI() throws Exception {
         context = new ParseContext();
@@ -324,6 +326,8 @@ public class TikaCLI {
         } else if (arg.equals("-z") || arg.equals("--extract")) {
             type = NO_OUTPUT;
             context.set(EmbeddedDocumentExtractor.class, new FileEmbeddedDocumentExtractor());
+        } else if (arg.equals("-r") || arg.equals("--pretty-print")) {
+            prettyPrint = true;
         } else if (arg.equals("-p") || arg.equals("--port")
                 || arg.equals("-s") || arg.equals("--server")) {
             serverMode = true;
@@ -374,22 +378,24 @@ public class TikaCLI {
         out.println("usage: java -jar tika-app.jar [option...] [file|port...]");
         out.println();
         out.println("Options:");
-        out.println("    -?  or --help        Print this usage message");
-        out.println("    -v  or --verbose     Print debug level messages");
+        out.println("    -?  or --help          Print this usage message");
+        out.println("    -v  or --verbose       Print debug level messages");
         out.println();
-        out.println("    -g  or --gui         Start the Apache Tika GUI");
-        out.println("    -s  or --server      Start the Apache Tika server");
+        out.println("    -g  or --gui           Start the Apache Tika GUI");
+        out.println("    -s  or --server        Start the Apache Tika server");
         out.println();
-        out.println("    -x  or --xml         Output XHTML content (default)");
-        out.println("    -h  or --html        Output HTML content");
-        out.println("    -j  or --json        Output JSON content");
-        out.println("    -t  or --text        Output plain text content");
-        out.println("    -T  or --text-main   Output plain text content (main content only)");
-        out.println("    -m  or --metadata    Output only metadata");
-        out.println("    -l  or --language    Output only language");
-        out.println("    -d  or --detect      Detect document type");
-        out.println("    -eX or --encoding=X  Use output encoding X");
-        out.println("    -z  or --extract     Extract all attachements into current directory");        
+        out.println("    -x  or --xml           Output XHTML content (default)");
+        out.println("    -h  or --html          Output HTML content");
+        out.println("    -j  or --json          Output JSON content");
+        out.println("    -t  or --text          Output plain text content");
+        out.println("    -T  or --text-main     Output plain text content (main content only)");
+        out.println("    -m  or --metadata      Output only metadata");
+        out.println("    -l  or --language      Output only language");
+        out.println("    -d  or --detect        Detect document type");
+        out.println("    -eX or --encoding=X    Use output encoding X");
+        out.println("    -z  or --extract       Extract all attachements into current directory");        
+        out.println("    -r  or --pretty-print  For XML and XHTML outputs, adds newlines and");
+        out.println("                           whitespace, for better readability");
         out.println();
         out.println("    --create-profile=X");
         out.println("         Create NGram profile, where X is a profile name");
@@ -576,13 +582,13 @@ public class TikaCLI {
      *         if the transformer can not be created
      */
     private static TransformerHandler getTransformerHandler(
-            OutputStream output, String method, String encoding)
+            OutputStream output, String method, String encoding, boolean prettyPrint)
             throws TransformerConfigurationException {
         SAXTransformerFactory factory = (SAXTransformerFactory)
                 SAXTransformerFactory.newInstance();
         TransformerHandler handler = factory.newTransformerHandler();
         handler.getTransformer().setOutputProperty(OutputKeys.METHOD, method);
-        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
+        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, prettyPrint ? "yes" : "no");
         if (encoding != null) {
             handler.getTransformer().setOutputProperty(
                     OutputKeys.ENCODING, encoding);

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=1171929&r1=1171928&r2=1171929&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java Sat Sep 17 09:20:53 2011
@@ -126,35 +126,51 @@ public class XWPFWordExtractorDecorator 
           xhtml.endElement("a");
        }
        
+       // True if we are currently in the named style tag:
+       boolean curBold = false;
+       boolean curItalic = false;
+
        // Do the text
        for(XWPFRun run : paragraph.getRuns()) {
-          List<String> tags = new ArrayList<String>();
+         System.out.println("RUN: " + run.toString());
+
+          if (run.isBold() != curBold) {
+            if (curItalic) {
+              xhtml.endElement("i");
+              curItalic = false;
+            }
+            if (run.isBold()) {
+              xhtml.startElement("b");
+            } else {
+              xhtml.endElement("b");
+            }
+            curBold = run.isBold();
+          }
+
+          if (run.isItalic() != curItalic) {
+            if (run.isItalic()) {
+              xhtml.startElement("i");
+            } else {
+              xhtml.endElement("i");
+            }
+            curItalic = run.isItalic();
+          }
+
+          boolean addedHREF = false;
           if(run instanceof XWPFHyperlinkRun) {
              XWPFHyperlinkRun linkRun = (XWPFHyperlinkRun)run;
              XWPFHyperlink link = linkRun.getHyperlink(document);
              if(link != null && link.getURL() != null) {
                 xhtml.startElement("a", "href", link.getURL());
-                tags.add("a");
+                addedHREF = true;
              } else if(linkRun.getAnchor() != null && linkRun.getAnchor().length() > 0) {
                 xhtml.startElement("a", "href", "#" + linkRun.getAnchor());
-                tags.add("a");
+                addedHREF = true;
              }
           }
-          if(run.isBold()) {
-             xhtml.startElement("b");
-             tags.add("b");
-          }
-          if(run.isItalic()) {
-             xhtml.startElement("i");
-             tags.add("i");
-          }
-          
+
           xhtml.characters(run.toString());
           
-          for(int i=tags.size()-1; i>=0; i--) {
-             xhtml.endElement(tags.get(i));
-          }
-          
           // If we have any pictures, output them
           for(XWPFPicture picture : run.getEmbeddedPictures()) {
              if(paragraph.getDocument() != null) {
@@ -170,6 +186,20 @@ public class XWPFWordExtractorDecorator 
                 }
              }
           }
+
+          if (addedHREF) {
+            xhtml.endElement("a");
+          }
+       }
+       
+       // Close any still open style tags
+       if (curItalic) {
+         xhtml.endElement("i");
+         curItalic = false;
+       }
+       if (curBold) {
+         xhtml.endElement("b");
+         curBold = false;
        }
        
        // Now do any comments for the paragraph

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1171929&r1=1171928&r2=1171929&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Sat Sep 17 09:20:53 2011
@@ -278,14 +278,19 @@ public class OOXMLParserTest extends Tik
         }
     }
 
-    /**
-     * Test that the word converter is able to generate the
-     *  correct HTML for the document
-     */
-    public void testWordHTML() throws Exception {
+    private static class XMLResult {
+        public final String xml;
+        public final Metadata metadata;
+
+        public XMLResult(String xml, Metadata metadata) {
+            this.xml = xml;
+            this.metadata = metadata;
+      }
+    }
+
+    private XMLResult getXML(String filePath) throws Exception {
         InputStream input = null;
         Metadata metadata = new Metadata();
-        ParseContext context = new ParseContext();
         
         StringWriter sw = new StringWriter();
         SAXTransformerFactory factory = (SAXTransformerFactory)
@@ -296,58 +301,79 @@ public class OOXMLParserTest extends Tik
         handler.setResult(new StreamResult(sw));
 
         // Try with a document containing various tables and formattings
-        input = OOXMLParserTest.class.getResourceAsStream("/test-documents/testWORD.docx");
+        input = OOXMLParserTest.class.getResourceAsStream(filePath);
         try {
-            parser.parse(TikaInputStream.get(input), handler, metadata, context);
-            String xml = sw.toString();
-            assertEquals(
-                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-                    metadata.get(Metadata.CONTENT_TYPE));
-            assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
-            assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
-            assertTrue(xml.contains("Sample Word Document"));
-            
-            // Check that custom headings came through
-            assertTrue(xml.contains("<h1 class=\"title\">"));
-            // Regular headings
-            assertTrue(xml.contains("<h1>Heading Level 1</h1>"));
-            assertTrue(xml.contains("<h2>Heading Level 2</h2>"));
-            // Headings with anchor tags in them
-            assertTrue(xml.replaceAll("\r?\n", "").contains("<h3><a name=\"OnLevel3\"/>Heading Level 3</h3>"));
-            // Bold and italic
-            assertTrue(xml.contains("<b>BOLD</b>"));
-            assertTrue(xml.contains("<i>ITALIC</i>"));
-            // Table
-            assertTrue(xml.contains("<table>"));
-            assertTrue(xml.contains("<td>"));
-            // Links
-            assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));
-            // Anchor links
-            assertTrue(xml.contains("<a href=\"#OnMainHeading\">The Main Heading Bookmark</a>"));
-            // Paragraphs with other styles
-            assertTrue(xml.contains("<p class=\"signature\">This one"));
+            parser.parse(TikaInputStream.get(input), handler, metadata, new ParseContext());
+            return new XMLResult(sw.toString(), metadata);
         } finally {
             input.close();
         }
-        
-        // Try with a document that contains images
-        sw = new StringWriter();
-        handler.setResult(new StreamResult(sw));
-        input = OOXMLParserTest.class.getResourceAsStream("/test-documents/testWORD_3imgs.docx");
-        try {
-            parser.parse(TikaInputStream.get(input), handler, metadata, context);
-            String xml = sw.toString();
+    }
+
+    /**
+     * Test that the word converter is able to generate the
+     *  correct HTML for the document
+     */
+    public void testWordHTML() throws Exception {
+
+      XMLResult result = getXML("/test-documents/testWORD.docx");
+      String xml = result.xml;
+      Metadata metadata = result.metadata;
+      assertEquals(
+                   "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                   metadata.get(Metadata.CONTENT_TYPE));
+      assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
+      assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+      assertTrue(xml.contains("Sample Word Document"));
             
-            // Images 2-4 (there is no 1!)
-//            assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image2.png\"/>"));
-//            assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image3.jpeg\"/>"));
-//            assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image4.png\"/>"));
+      // Check that custom headings came through
+      assertTrue(xml.contains("<h1 class=\"title\">"));
+      // Regular headings
+      assertTrue(xml.contains("<h1>Heading Level 1</h1>"));
+      assertTrue(xml.contains("<h2>Heading Level 2</h2>"));
+      // Headings with anchor tags in them
+      assertTrue(xml.replaceAll("\r?\n", "").contains("<h3><a name=\"OnLevel3\"/>Heading Level 3</h3>"));
+      // Bold and italic
+      assertTrue(xml.contains("<b>BOLD</b>"));
+      assertTrue(xml.contains("<i>ITALIC</i>"));
+      // Table
+      assertTrue(xml.contains("<table>"));
+      assertTrue(xml.contains("<td>"));
+      // Links
+      assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));
+      // Anchor links
+      assertTrue(xml.contains("<a href=\"#OnMainHeading\">The Main Heading Bookmark</a>"));
+      // Paragraphs with other styles
+      assertTrue(xml.contains("<p class=\"signature\">This one"));
+
+      result = getXML("/test-documents/testWORD_3imgs.docx");
+      xml = result.xml;
+
+      // Images 2-4 (there is no 1!)
+      assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image2.png\" alt=\"A description...\"/>"));
+      assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image3.jpeg\" alt=\"A description...\"/>"));
+      assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image4.png\" alt=\"A description...\"/>"));
             
-            // Text too
-            assertTrue(xml.contains("<p>The end!</p>"));
-        } finally {
-            input.close();
-        }
+      // Text too
+      assertTrue(xml.contains("<p>The end!</p>"));
+
+      // TIKA-692: test document containing multiple
+      // character runs within a bold tag:
+      xml = getXML("/test-documents/testWORD_bold_character_runs.docx").xml;
+
+      // Make sure bold text arrived as single
+      // contiguous string even though Word parser
+      // handled this as 3 character runs
+      assertTrue("Bold text wasn't contiguous: "+xml, xml.contains("F<b>oob</b>a<b>r</b>"));
+
+      // TIKA-692: test document containing multiple
+      // character runs within a bold tag:
+      xml = getXML("/test-documents/testWORD_bold_character_runs2.docx").xml;
+            
+      // Make sure bold text arrived as single
+      // contiguous string even though Word parser
+      // handled this as 3 character runs
+      assertTrue("Bold text wasn't contiguous: "+xml, xml.contains("F<b>oob</b>a<b>r</b>"));
     }
 
     /**

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs.docx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs.docx?rev=1171929&view=auto
==============================================================================
Files tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs.docx (added) and tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs.docx Sat Sep 17 09:20:53 2011 differ

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs.docx
------------------------------------------------------------------------------
    svn:executable = *

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs2.docx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs2.docx?rev=1171929&view=auto
==============================================================================
Files tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs2.docx (added) and tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs2.docx Sat Sep 17 09:20:53 2011 differ

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs2.docx
------------------------------------------------------------------------------
    svn:executable = *