You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/09/17 11:20:53 UTC
svn commit: r1171929 - in /tika/trunk:
tika-app/src/main/java/org/apache/tika/cli/
tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/
tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/
tika-parsers/src/test/resources/test-documents/
Author: jukka
Date: Sat Sep 17 09:20:53 2011
New Revision: 1171929
URL: http://svn.apache.org/viewvc?rev=1171929&view=rev
Log:
TIKA-692: TikaCLI -x or -h on a Word doc sometimes adds newline after </b> tag
Patch by Michael McCandless
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs.docx (with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs2.docx (with props)
Modified:
tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1171929&r1=1171928&r2=1171929&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Sat Sep 17 09:20:53 2011
@@ -146,7 +146,7 @@ public class TikaCLI {
@Override
protected ContentHandler getContentHandler(OutputStream output)
throws Exception {
- return getTransformerHandler(output, "xml", encoding);
+ return getTransformerHandler(output, "xml", encoding, prettyPrint);
}
};
@@ -154,7 +154,7 @@ public class TikaCLI {
@Override
protected ContentHandler getContentHandler(OutputStream output)
throws Exception {
- return getTransformerHandler(output, "html", encoding);
+ return getTransformerHandler(output, "html", encoding, prettyPrint);
}
};
@@ -267,6 +267,8 @@ public class TikaCLI {
private boolean fork = false;
private String profileName = null;
+
+ private boolean prettyPrint;
public TikaCLI() throws Exception {
context = new ParseContext();
@@ -324,6 +326,8 @@ public class TikaCLI {
} else if (arg.equals("-z") || arg.equals("--extract")) {
type = NO_OUTPUT;
context.set(EmbeddedDocumentExtractor.class, new FileEmbeddedDocumentExtractor());
+ } else if (arg.equals("-r") || arg.equals("--pretty-print")) {
+ prettyPrint = true;
} else if (arg.equals("-p") || arg.equals("--port")
|| arg.equals("-s") || arg.equals("--server")) {
serverMode = true;
@@ -374,22 +378,24 @@ public class TikaCLI {
out.println("usage: java -jar tika-app.jar [option...] [file|port...]");
out.println();
out.println("Options:");
- out.println(" -? or --help Print this usage message");
- out.println(" -v or --verbose Print debug level messages");
+ out.println(" -? or --help Print this usage message");
+ out.println(" -v or --verbose Print debug level messages");
out.println();
- out.println(" -g or --gui Start the Apache Tika GUI");
- out.println(" -s or --server Start the Apache Tika server");
+ out.println(" -g or --gui Start the Apache Tika GUI");
+ out.println(" -s or --server Start the Apache Tika server");
out.println();
- out.println(" -x or --xml Output XHTML content (default)");
- out.println(" -h or --html Output HTML content");
- out.println(" -j or --json Output JSON content");
- out.println(" -t or --text Output plain text content");
- out.println(" -T or --text-main Output plain text content (main content only)");
- out.println(" -m or --metadata Output only metadata");
- out.println(" -l or --language Output only language");
- out.println(" -d or --detect Detect document type");
- out.println(" -eX or --encoding=X Use output encoding X");
- out.println(" -z or --extract Extract all attachements into current directory");
+ out.println(" -x or --xml Output XHTML content (default)");
+ out.println(" -h or --html Output HTML content");
+ out.println(" -j or --json Output JSON content");
+ out.println(" -t or --text Output plain text content");
+ out.println(" -T or --text-main Output plain text content (main content only)");
+ out.println(" -m or --metadata Output only metadata");
+ out.println(" -l or --language Output only language");
+ out.println(" -d or --detect Detect document type");
+ out.println(" -eX or --encoding=X Use output encoding X");
+ out.println(" -z or --extract Extract all attachements into current directory");
+ out.println(" -r or --pretty-print For XML and XHTML outputs, adds newlines and");
+ out.println(" whitespace, for better readability");
out.println();
out.println(" --create-profile=X");
out.println(" Create NGram profile, where X is a profile name");
@@ -576,13 +582,13 @@ public class TikaCLI {
* if the transformer can not be created
*/
private static TransformerHandler getTransformerHandler(
- OutputStream output, String method, String encoding)
+ OutputStream output, String method, String encoding, boolean prettyPrint)
throws TransformerConfigurationException {
SAXTransformerFactory factory = (SAXTransformerFactory)
SAXTransformerFactory.newInstance();
TransformerHandler handler = factory.newTransformerHandler();
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, method);
- handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
+ handler.getTransformer().setOutputProperty(OutputKeys.INDENT, prettyPrint ? "yes" : "no");
if (encoding != null) {
handler.getTransformer().setOutputProperty(
OutputKeys.ENCODING, encoding);
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=1171929&r1=1171928&r2=1171929&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java Sat Sep 17 09:20:53 2011
@@ -126,35 +126,51 @@ public class XWPFWordExtractorDecorator
xhtml.endElement("a");
}
+ // True if we are currently in the named style tag:
+ boolean curBold = false;
+ boolean curItalic = false;
+
// Do the text
for(XWPFRun run : paragraph.getRuns()) {
- List<String> tags = new ArrayList<String>();
+ System.out.println("RUN: " + run.toString());
+
+ if (run.isBold() != curBold) {
+ if (curItalic) {
+ xhtml.endElement("i");
+ curItalic = false;
+ }
+ if (run.isBold()) {
+ xhtml.startElement("b");
+ } else {
+ xhtml.endElement("b");
+ }
+ curBold = run.isBold();
+ }
+
+ if (run.isItalic() != curItalic) {
+ if (run.isItalic()) {
+ xhtml.startElement("i");
+ } else {
+ xhtml.endElement("i");
+ }
+ curItalic = run.isItalic();
+ }
+
+ boolean addedHREF = false;
if(run instanceof XWPFHyperlinkRun) {
XWPFHyperlinkRun linkRun = (XWPFHyperlinkRun)run;
XWPFHyperlink link = linkRun.getHyperlink(document);
if(link != null && link.getURL() != null) {
xhtml.startElement("a", "href", link.getURL());
- tags.add("a");
+ addedHREF = true;
} else if(linkRun.getAnchor() != null && linkRun.getAnchor().length() > 0) {
xhtml.startElement("a", "href", "#" + linkRun.getAnchor());
- tags.add("a");
+ addedHREF = true;
}
}
- if(run.isBold()) {
- xhtml.startElement("b");
- tags.add("b");
- }
- if(run.isItalic()) {
- xhtml.startElement("i");
- tags.add("i");
- }
-
+
xhtml.characters(run.toString());
- for(int i=tags.size()-1; i>=0; i--) {
- xhtml.endElement(tags.get(i));
- }
-
// If we have any pictures, output them
for(XWPFPicture picture : run.getEmbeddedPictures()) {
if(paragraph.getDocument() != null) {
@@ -170,6 +186,20 @@ public class XWPFWordExtractorDecorator
}
}
}
+
+ if (addedHREF) {
+ xhtml.endElement("a");
+ }
+ }
+
+ // Close any still open style tags
+ if (curItalic) {
+ xhtml.endElement("i");
+ curItalic = false;
+ }
+ if (curBold) {
+ xhtml.endElement("b");
+ curBold = false;
}
// Now do any comments for the paragraph
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1171929&r1=1171928&r2=1171929&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Sat Sep 17 09:20:53 2011
@@ -278,14 +278,19 @@ public class OOXMLParserTest extends Tik
}
}
- /**
- * Test that the word converter is able to generate the
- * correct HTML for the document
- */
- public void testWordHTML() throws Exception {
+ private static class XMLResult {
+ public final String xml;
+ public final Metadata metadata;
+
+ public XMLResult(String xml, Metadata metadata) {
+ this.xml = xml;
+ this.metadata = metadata;
+ }
+ }
+
+ private XMLResult getXML(String filePath) throws Exception {
InputStream input = null;
Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
StringWriter sw = new StringWriter();
SAXTransformerFactory factory = (SAXTransformerFactory)
@@ -296,58 +301,79 @@ public class OOXMLParserTest extends Tik
handler.setResult(new StreamResult(sw));
// Try with a document containing various tables and formattings
- input = OOXMLParserTest.class.getResourceAsStream("/test-documents/testWORD.docx");
+ input = OOXMLParserTest.class.getResourceAsStream(filePath);
try {
- parser.parse(TikaInputStream.get(input), handler, metadata, context);
- String xml = sw.toString();
- assertEquals(
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
- assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
- assertTrue(xml.contains("Sample Word Document"));
-
- // Check that custom headings came through
- assertTrue(xml.contains("<h1 class=\"title\">"));
- // Regular headings
- assertTrue(xml.contains("<h1>Heading Level 1</h1>"));
- assertTrue(xml.contains("<h2>Heading Level 2</h2>"));
- // Headings with anchor tags in them
- assertTrue(xml.replaceAll("\r?\n", "").contains("<h3><a name=\"OnLevel3\"/>Heading Level 3</h3>"));
- // Bold and italic
- assertTrue(xml.contains("<b>BOLD</b>"));
- assertTrue(xml.contains("<i>ITALIC</i>"));
- // Table
- assertTrue(xml.contains("<table>"));
- assertTrue(xml.contains("<td>"));
- // Links
- assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));
- // Anchor links
- assertTrue(xml.contains("<a href=\"#OnMainHeading\">The Main Heading Bookmark</a>"));
- // Paragraphs with other styles
- assertTrue(xml.contains("<p class=\"signature\">This one"));
+ parser.parse(TikaInputStream.get(input), handler, metadata, new ParseContext());
+ return new XMLResult(sw.toString(), metadata);
} finally {
input.close();
}
-
- // Try with a document that contains images
- sw = new StringWriter();
- handler.setResult(new StreamResult(sw));
- input = OOXMLParserTest.class.getResourceAsStream("/test-documents/testWORD_3imgs.docx");
- try {
- parser.parse(TikaInputStream.get(input), handler, metadata, context);
- String xml = sw.toString();
+ }
+
+ /**
+ * Test that the word converter is able to generate the
+ * correct HTML for the document
+ */
+ public void testWordHTML() throws Exception {
+
+ XMLResult result = getXML("/test-documents/testWORD.docx");
+ String xml = result.xml;
+ Metadata metadata = result.metadata;
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
+ assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+ assertTrue(xml.contains("Sample Word Document"));
- // Images 2-4 (there is no 1!)
-// assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image2.png\"/>"));
-// assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image3.jpeg\"/>"));
-// assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image4.png\"/>"));
+ // Check that custom headings came through
+ assertTrue(xml.contains("<h1 class=\"title\">"));
+ // Regular headings
+ assertTrue(xml.contains("<h1>Heading Level 1</h1>"));
+ assertTrue(xml.contains("<h2>Heading Level 2</h2>"));
+ // Headings with anchor tags in them
+ assertTrue(xml.replaceAll("\r?\n", "").contains("<h3><a name=\"OnLevel3\"/>Heading Level 3</h3>"));
+ // Bold and italic
+ assertTrue(xml.contains("<b>BOLD</b>"));
+ assertTrue(xml.contains("<i>ITALIC</i>"));
+ // Table
+ assertTrue(xml.contains("<table>"));
+ assertTrue(xml.contains("<td>"));
+ // Links
+ assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));
+ // Anchor links
+ assertTrue(xml.contains("<a href=\"#OnMainHeading\">The Main Heading Bookmark</a>"));
+ // Paragraphs with other styles
+ assertTrue(xml.contains("<p class=\"signature\">This one"));
+
+ result = getXML("/test-documents/testWORD_3imgs.docx");
+ xml = result.xml;
+
+ // Images 2-4 (there is no 1!)
+ assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image2.png\" alt=\"A description...\"/>"));
+ assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image3.jpeg\" alt=\"A description...\"/>"));
+ assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image4.png\" alt=\"A description...\"/>"));
- // Text too
- assertTrue(xml.contains("<p>The end!</p>"));
- } finally {
- input.close();
- }
+ // Text too
+ assertTrue(xml.contains("<p>The end!</p>"));
+
+ // TIKA-692: test document containing multiple
+ // character runs within a bold tag:
+ xml = getXML("/test-documents/testWORD_bold_character_runs.docx").xml;
+
+ // Make sure bold text arrived as single
+ // contiguous string even though Word parser
+ // handled this as 3 character runs
+ assertTrue("Bold text wasn't contiguous: "+xml, xml.contains("F<b>oob</b>a<b>r</b>"));
+
+ // TIKA-692: test document containing multiple
+ // character runs within a bold tag:
+ xml = getXML("/test-documents/testWORD_bold_character_runs2.docx").xml;
+
+ // Make sure bold text arrived as single
+ // contiguous string even though Word parser
+ // handled this as 3 character runs
+ assertTrue("Bold text wasn't contiguous: "+xml, xml.contains("F<b>oob</b>a<b>r</b>"));
}
/**
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs.docx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs.docx?rev=1171929&view=auto
==============================================================================
Files tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs.docx (added) and tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs.docx Sat Sep 17 09:20:53 2011 differ
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs.docx
------------------------------------------------------------------------------
svn:executable = *
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs2.docx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs2.docx?rev=1171929&view=auto
==============================================================================
Files tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs2.docx (added) and tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs2.docx Sat Sep 17 09:20:53 2011 differ
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs2.docx
------------------------------------------------------------------------------
svn:executable = *