You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2010/11/12 17:40:43 UTC
svn commit: r1034463 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/microsoft/
main/java/org/apache/tika/parser/microsoft/ooxml/
test/java/org/apache/tika/parser/microsoft/ooxml/
test/resources/test-documents/
Author: nick
Date: Fri Nov 12 16:40:43 2010
New Revision: 1034463
URL: http://svn.apache.org/viewvc?rev=1034463&view=rev
Log:
TIKA-552 - Handle Word styles like "heading 4" just like "Heading 4". In .docx files, insert bookmarks as anchor tags, along with relative hyperlinks for the text that references them.
(Updates the .doc test file to include bookmarks, but there's no .doc handling of them yet)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD.doc
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD.docx
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1034463&r1=1034462&r2=1034463&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Fri Nov 12 16:40:43 2010
@@ -367,9 +367,10 @@ public class WordExtractor extends Abstr
// Already setup
} else if(styleName.equals("Table Contents") && isTable) {
// Already setup
- } else if(styleName.equals("Heading")) {
+ } else if(styleName.equals("heading") || styleName.equals("Heading")) {
tag = "h1";
- } else if(styleName.startsWith("Heading ")) {
+ } else if(styleName.startsWith("heading") || styleName.startsWith("Heading")) {
+ // "Heading 3" or "Heading2" or "heading 4"
int num = 1;
try {
num = Integer.parseInt(
@@ -390,7 +391,7 @@ public class WordExtractor extends Abstr
styleClass = styleClass.substring(0,1).toLowerCase() +
styleClass.substring(1);
}
-
+
return new TagAndStyle(tag,styleClass);
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=1034463&r1=1034462&r2=1034463&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java Fri Nov 12 16:40:43 2010
@@ -120,16 +120,20 @@ public class XWPFWordExtractorDecorator
styleClass = tas.getStyleClass();
}
- for (CTBookmark bookmark : paragraph.getCTP().getBookmarkStartList()) {
- xhtml.element("p", bookmark.getName());
- }
-
if(styleClass == null) {
xhtml.startElement(tag);
} else {
xhtml.startElement(tag, "class", styleClass);
}
+ // Attach bookmarks for the paragraph
+ // (In future, we might put them in the right place, for now
+ // we just put them in the correct paragraph)
+ for (CTBookmark bookmark : paragraph.getCTP().getBookmarkStartList()) {
+ xhtml.startElement("a", "name", bookmark.getName());
+ xhtml.endElement("a");
+ }
+
// Do the text
for(XWPFRun run : paragraph.getRuns()) {
List<String> tags = new ArrayList<String>();
@@ -139,6 +143,9 @@ public class XWPFWordExtractorDecorator
if(link != null && link.getURL() != null) {
xhtml.startElement("a", "href", link.getURL());
tags.add("a");
+ } else if(linkRun.getAnchor() != null && linkRun.getAnchor().length() > 0) {
+ xhtml.startElement("a", "href", "#" + linkRun.getAnchor());
+ tags.add("a");
}
}
if(run.isBold()) {
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1034463&r1=1034462&r2=1034463&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Fri Nov 12 16:40:43 2010
@@ -300,7 +300,9 @@ public class OOXMLParserTest extends Tes
assertTrue(xml.contains("<h1 class=\"title\">"));
// Regular headings
assertTrue(xml.contains("<h1>Heading Level 1</h1>"));
- assertTrue(xml.contains("<h3>Heading Level 3</h3>"));
+ assertTrue(xml.contains("<h2>Heading Level 2</h2>"));
+ // Headings with anchor tags in them
+ assertTrue(xml.replaceAll("\r?\n", "").contains("<h3><a name=\"OnLevel3\"/>Heading Level 3</h3>"));
// Bold and italic
assertTrue(xml.contains("<b>BOLD</b>"));
assertTrue(xml.contains("<i>ITALIC</i>"));
@@ -309,6 +311,8 @@ public class OOXMLParserTest extends Tes
assertTrue(xml.contains("<td>"));
// Links
assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));
+ // Anchor links
+ assertTrue(xml.contains("<a href=\"#OnMainHeading\">The Main Heading Bookmark</a>"));
// Paragraphs with other styles
assertTrue(xml.contains("<p class=\"signature\">This one"));
} finally {
Modified: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD.doc?rev=1034463&r1=1034462&r2=1034463&view=diff
==============================================================================
Binary files - no diff available.
Modified: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD.docx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD.docx?rev=1034463&r1=1034462&r2=1034463&view=diff
==============================================================================
Binary files - no diff available.