You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2010/11/12 17:40:43 UTC

svn commit: r1034463 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/microsoft/ main/java/org/apache/tika/parser/microsoft/ooxml/ test/java/org/apache/tika/parser/microsoft/ooxml/ test/resources/test-documents/

Author: nick
Date: Fri Nov 12 16:40:43 2010
New Revision: 1034463

URL: http://svn.apache.org/viewvc?rev=1034463&view=rev
Log:
TIKA-552 - Handle Word styles like "heading 4" just like "Heading 4", and in .docx files insert bookmarks as anchor tags, along with relative hyperlinks for the text that references them.
(Updates the .doc test file to include bookmarks, but there's no .doc handling of them yet)

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
    tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD.doc
    tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD.docx

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1034463&r1=1034462&r2=1034463&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Fri Nov 12 16:40:43 2010
@@ -367,9 +367,10 @@ public class WordExtractor extends Abstr
           // Already setup
        } else if(styleName.equals("Table Contents") && isTable) {
           // Already setup
-       } else if(styleName.equals("Heading")) {
+       } else if(styleName.equals("heading") || styleName.equals("Heading")) {
           tag = "h1";
-       } else if(styleName.startsWith("Heading ")) {
+       } else if(styleName.startsWith("heading") || styleName.startsWith("Heading")) {
+          // "Heading 3" or "Heading2" or "heading 4"
           int num = 1;
           try {
              num = Integer.parseInt( 
@@ -390,7 +391,7 @@ public class WordExtractor extends Abstr
           styleClass = styleClass.substring(0,1).toLowerCase() +
                          styleClass.substring(1);
        }
-       
+
        return new TagAndStyle(tag,styleClass);
     }
     

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=1034463&r1=1034462&r2=1034463&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java Fri Nov 12 16:40:43 2010
@@ -120,16 +120,20 @@ public class XWPFWordExtractorDecorator 
           styleClass = tas.getStyleClass();
        }
        
-       for (CTBookmark bookmark : paragraph.getCTP().getBookmarkStartList()) {
-           xhtml.element("p", bookmark.getName());
-       }
-
        if(styleClass == null) {
           xhtml.startElement(tag);
        } else {
           xhtml.startElement(tag, "class", styleClass);
        }
        
+       // Attach bookmarks for the paragraph
+       // (In future, we might put them in the right place, for now
+       //  we just put them in the correct paragraph)
+       for (CTBookmark bookmark : paragraph.getCTP().getBookmarkStartList()) {
+          xhtml.startElement("a", "name", bookmark.getName());
+          xhtml.endElement("a");
+       }
+       
        // Do the text
        for(XWPFRun run : paragraph.getRuns()) {
           List<String> tags = new ArrayList<String>();
@@ -139,6 +143,9 @@ public class XWPFWordExtractorDecorator 
              if(link != null && link.getURL() != null) {
                 xhtml.startElement("a", "href", link.getURL());
                 tags.add("a");
+             } else if(linkRun.getAnchor() != null && linkRun.getAnchor().length() > 0) {
+                xhtml.startElement("a", "href", "#" + linkRun.getAnchor());
+                tags.add("a");
              }
           }
           if(run.isBold()) {

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1034463&r1=1034462&r2=1034463&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Fri Nov 12 16:40:43 2010
@@ -300,7 +300,9 @@ public class OOXMLParserTest extends Tes
             assertTrue(xml.contains("<h1 class=\"title\">"));
             // Regular headings
             assertTrue(xml.contains("<h1>Heading Level 1</h1>"));
-            assertTrue(xml.contains("<h3>Heading Level 3</h3>"));
+            assertTrue(xml.contains("<h2>Heading Level 2</h2>"));
+            // Headings with anchor tags in them
+            assertTrue(xml.replaceAll("\r?\n", "").contains("<h3><a name=\"OnLevel3\"/>Heading Level 3</h3>"));
             // Bold and italic
             assertTrue(xml.contains("<b>BOLD</b>"));
             assertTrue(xml.contains("<i>ITALIC</i>"));
@@ -309,6 +311,8 @@ public class OOXMLParserTest extends Tes
             assertTrue(xml.contains("<td>"));
             // Links
             assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));
+            // Anchor links
+            assertTrue(xml.contains("<a href=\"#OnMainHeading\">The Main Heading Bookmark</a>"));
             // Paragraphs with other styles
             assertTrue(xml.contains("<p class=\"signature\">This one"));
         } finally {

Modified: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD.doc?rev=1034463&r1=1034462&r2=1034463&view=diff
==============================================================================
Binary files - no diff available.

Modified: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD.docx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD.docx?rev=1034463&r1=1034462&r2=1034463&view=diff
==============================================================================
Binary files - no diff available.