You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/24 13:58:32 UTC
[1/2] tika git commit: TIKA-2019 -- parsers for 2003 MS xml files fail to add spaces/tabs correctly when using the ToTextHandler
Repository: tika
Updated Branches:
refs/heads/master 52ea9ba7c -> 81279a1e0
TIKA-2019 -- parsers for 2003 MS xml files fail to add spaces/tabs correctly when using the ToTextHandler
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/7ae760e2
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/7ae760e2
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/7ae760e2
Branch: refs/heads/master
Commit: 7ae760e29ad3ed5874f7f50c27c6f850ab1d8025
Parents: d6981ad
Author: tballison <ta...@mitre.org>
Authored: Fri Jun 24 09:58:00 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Fri Jun 24 09:58:00 2016 -0400
----------------------------------------------------------------------
.../microsoft/xml/AbstractXML2003Parser.java | 56 +++++++++++++++++---
.../microsoft/xml/SpreadsheetMLParser.java | 43 ++++++++++-----
.../tika/parser/microsoft/xml/WordMLParser.java | 56 ++++++++++++++++----
.../parser/microsoft/xml/XML2003ParserTest.java | 33 ++++++++++--
4 files changed, 152 insertions(+), 36 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/7ae760e2/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
index 277ac43..637b4d6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
@@ -16,22 +16,32 @@
*/
package org.apache.tika.parser.microsoft.xml;
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.xml.ElementMetadataHandler;
-import org.apache.tika.parser.xml.XMLParser;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.TaggedContentHandler;
import org.apache.tika.sax.TeeContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
+import java.io.IOException;
+import java.io.InputStream;
-public abstract class AbstractXML2003Parser extends XMLParser {
+
+public abstract class AbstractXML2003Parser extends AbstractParser {
protected final static String MS_OFFICE_PROPERTIES_URN = "urn:schemas-microsoft-com:office:office";
protected final static String MS_DOC_PROPERTIES_URN = "urn:schemas-microsoft-com:office:office";
@@ -44,9 +54,14 @@ public abstract class AbstractXML2003Parser extends XMLParser {
protected final static String BIN_DATA = "binData";
protected final static String A = "a";
- protected final static String IMG = "img";
- protected final static String HREF = "href";
+ protected final static String BODY = "body";
protected final static String CDATA = "cdata";
+ protected final static String DIV = "div";
+ protected final static String HREF = "href";
+ protected final static String IMG = "img";
+ protected final static String P = "p";
+ protected final static String TD = "td";
+ protected final static String TR = "tr";
protected final static String TABLE = "table";
protected final static String TBODY = "tbody";
@@ -62,9 +77,34 @@ public abstract class AbstractXML2003Parser extends XMLParser {
metadata, property);
}
- @Override
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ setContentType(metadata);
+
+ final XHTMLContentHandler xhtml =
+ new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+ TaggedContentHandler tagged = new TaggedContentHandler(xhtml);
+ try {
+ context.getSAXParser().parse(
+ new CloseShieldInputStream(stream),
+ new OfflineContentHandler(new EmbeddedContentHandler(
+ getContentHandler(tagged, metadata, context))));
+ } catch (SAXException e) {
+ tagged.throwIfCauseOf(e);
+ throw new TikaException("XML parse error", e);
+ } finally {
+ xhtml.endDocument();
+ }
+ }
+
protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context) {
- ch = new TeeContentHandler(
+ //ContentHandler is not currently used, but leave that as an option for
+ //potential future additions
+ return new TeeContentHandler(
getMSPropertiesHandler(md, TikaCoreProperties.TITLE, "Title"),
getMSPropertiesHandler(md, TikaCoreProperties.CREATOR, "Author"),
getMSPropertiesHandler(md, Office.LAST_AUTHOR, "LastAuthor"),
@@ -80,7 +120,7 @@ public abstract class AbstractXML2003Parser extends XMLParser {
getMSPropertiesHandler(md, Office.LINE_COUNT, "Lines"),
getMSPropertiesHandler(md, Office.PARAGRAPH_COUNT, "Paragraphs"),
getMSPropertiesHandler(md, OfficeOpenXMLCore.VERSION, "Version"));
-
- return ch;
}
+
+ abstract protected void setContentType(Metadata contentType);
}
http://git-wip-us.apache.org/repos/asf/tika/blob/7ae760e2/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/SpreadsheetMLParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/SpreadsheetMLParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/SpreadsheetMLParser.java
index 73b817d..0cf7520 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/SpreadsheetMLParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/SpreadsheetMLParser.java
@@ -20,6 +20,7 @@ package org.apache.tika.parser.microsoft.xml;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
+import java.util.Locale;
import java.util.Set;
import org.apache.tika.metadata.Metadata;
@@ -41,14 +42,17 @@ import org.xml.sax.helpers.DefaultHandler;
*/
public class SpreadsheetMLParser extends AbstractXML2003Parser {
- final static String CELL = "Cell";
- final static String DATA = "Data";
- final static String ROW = "Row";
- final static String WORKSHEET = "Worksheet";
+ final static String CELL = "cell";
+ final static String DATA = "data";
+ final static String ROW = "row";
+ final static String WORKSHEET = "worksheet";
- protected static final Set<MediaType> SUPPORTED_TYPES =
+ private static final MediaType MEDIA_TYPE = MediaType.application("vnd.ms-spreadsheetml");
+ private static final Set<MediaType> SUPPORTED_TYPES =
Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
- MediaType.application("vnd.ms-spreadsheetml"))));
+ MEDIA_TYPE)));
+
+ private boolean inBody = false;
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
@@ -64,6 +68,11 @@ public class SpreadsheetMLParser extends AbstractXML2003Parser {
new ExcelMLHandler(ch));
}
+ @Override
+ public void setContentType(Metadata metadata) {
+ metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString());
+ }
+
private class ExcelMLHandler extends DefaultHandler {
final ContentHandler handler;
StringBuilder buffer = new StringBuilder();
@@ -77,9 +86,12 @@ public class SpreadsheetMLParser extends AbstractXML2003Parser {
@Override
public void startElement(String uri, String localName, String qName, Attributes attrs)
throws SAXException {
+ localName = localName.toLowerCase(Locale.US);
if (MS_SPREADSHEET_URN.equals(uri)) {
- if ("Table".equals(localName)) {
+ if (BODY.equals(localName)) {
+ inBody = true;
+ } else if (TABLE.equals(localName)) {
handler.startElement(XHTMLContentHandler.XHTML, TABLE, TABLE, EMPTY_ATTRS);
handler.startElement(XHTMLContentHandler.XHTML, TBODY, TBODY, EMPTY_ATTRS);
} else if (WORKSHEET.equals(localName)) {
@@ -91,12 +103,12 @@ public class SpreadsheetMLParser extends AbstractXML2003Parser {
NAME_ATTR,
CDATA, worksheetName);
}
- handler.startElement(XHTMLContentHandler.XHTML, "div", "div", xhtmlAttrs);
+ handler.startElement(XHTMLContentHandler.XHTML, DIV, DIV, xhtmlAttrs);
} else if (ROW.equals(localName)) {
- handler.startElement(XHTMLContentHandler.XHTML, "tr", "tr", EMPTY_ATTRS);
+ handler.startElement(XHTMLContentHandler.XHTML, TR, TR, EMPTY_ATTRS);
} else if (CELL.equals(localName)) {
href = attrs.getValue(MS_SPREADSHEET_URN, "HRef");
- handler.startElement(XHTMLContentHandler.XHTML, "td", "td", EMPTY_ATTRS);
+ handler.startElement(XHTMLContentHandler.XHTML, TD, TD, EMPTY_ATTRS);
} else if (DATA.equals(localName)) {
inData = true;
}
@@ -107,30 +119,33 @@ public class SpreadsheetMLParser extends AbstractXML2003Parser {
public void characters(char[] str, int offset, int len) throws SAXException {
if (inData) {
buffer.append(str, offset, len);
+ } else if (inBody) {
+ handler.characters(str, offset, len);
}
}
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
+ localName = localName.toLowerCase(Locale.US);
if (MS_SPREADSHEET_URN.equals(uri)) {
- if ("Table".equals(localName)) {
+ if (TABLE.equals(localName)) {
handler.endElement(XHTMLContentHandler.XHTML, TBODY, TBODY);
handler.endElement(XHTMLContentHandler.XHTML, TABLE, TABLE);
} else if (WORKSHEET.equals(localName)) {
handler.endElement(
XHTMLContentHandler.XHTML,
- "div", "div"
+ DIV, DIV
);
} else if (ROW.equals(localName)) {
handler.endElement(
XHTMLContentHandler.XHTML,
- "tr", "tr"
+ TR, TR
);
} else if (CELL.equals(localName)) {
handler.endElement(
XHTMLContentHandler.XHTML,
- "td", "td"
+ TD, TD
);
} else if (DATA.equals(localName)) {
if (href != null) {
http://git-wip-us.apache.org/repos/asf/tika/blob/7ae760e2/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
index b5358c7..6bd51da 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
@@ -21,6 +21,7 @@ import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
+import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
@@ -57,14 +58,17 @@ public class WordMLParser extends AbstractXML2003Parser {
private final static Set<QName> IGNORE_CHARACTERS =
Collections.newSetFromMap(new ConcurrentHashMap<QName, Boolean>());
- protected static final Set<MediaType> SUPPORTED_TYPES =
+ private static final MediaType MEDIA_TYPE = MediaType.application("vnd.ms-wordml");
+ private static final Set<MediaType> SUPPORTED_TYPES =
Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
- MediaType.application("vnd.ms-wordml"))));
+ MEDIA_TYPE)));
+ private boolean inBody = false;
+
static {
- WORDML_TO_XHTML.put("p", "p");
+ WORDML_TO_XHTML.put(P, P);
WORDML_TO_XHTML.put("tbl", TABLE);
- WORDML_TO_XHTML.put("tr", "tr");
- WORDML_TO_XHTML.put("tc", "td");
+ WORDML_TO_XHTML.put(TR, TR);
+ WORDML_TO_XHTML.put("tc", TD);//not a typo -- table cell -> tc
IGNORE_CHARACTERS.add(new QName(WORD_ML_URL, HLINK));
IGNORE_CHARACTERS.add(new QName(WORD_ML_URL, PICT));
@@ -87,7 +91,6 @@ public class WordMLParser extends AbstractXML2003Parser {
ex = new ParsingEmbeddedDocumentExtractor(context);
}
-
return new TeeContentHandler(
super.getContentHandler(ch, metadata, context),
new WordMLHandler(ch),
@@ -96,10 +99,20 @@ public class WordMLParser extends AbstractXML2003Parser {
new PictHandler(ch, ex));
}
+ @Override
+ public void setContentType(Metadata metadata) {
+ metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString());
+ }
+
private class WordMLHandler extends DefaultHandler {
private final ContentHandler handler;
private boolean ignoreCharacters;
+ //use inP to keep track of whether the handler is
+ //in a paragraph or not. <p><p></p></p> was allowed
+ //in wordml. Use this boolean to prevent <p> within <p>
+ private boolean inP;
+
public WordMLHandler(ContentHandler handler) {
this.handler = handler;
}
@@ -107,14 +120,27 @@ public class WordMLParser extends AbstractXML2003Parser {
@Override
public void startElement(String uri, String localName, String qName, Attributes attrs)
throws SAXException {
+ localName = localName.toLowerCase(Locale.US);
if (WORD_ML_URL.equals(uri)) {
+ if (BODY.equals(localName)) {
+ inBody = true;
+ return;
+ }
String html = WORDML_TO_XHTML.get(localName);
if (html != null) {
+ if ("p".equals(localName)) {
+ //close p if already in a p to prevent nested <p>
+ if (inP) {
+ handler.endElement(XHTMLContentHandler.XHTML, P, P);
+ }
+ inP = true;
+ }
handler.startElement(XHTMLContentHandler.XHTML, html, html, EMPTY_ATTRS);
if (html.equals(TABLE)) {
handler.startElement(XHTMLContentHandler.XHTML, TBODY, TBODY, EMPTY_ATTRS);
}
}
+
}
if (IGNORE_CHARACTERS.contains(new QName(uri, localName))) {
ignoreCharacters = true;
@@ -123,21 +149,31 @@ public class WordMLParser extends AbstractXML2003Parser {
@Override
public void characters(char[] str , int offset, int len) throws SAXException {
- if (!ignoreCharacters) {
+ if (!ignoreCharacters && inBody) {
handler.characters(str, offset, len);
}
}
-
-
+
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
if (WORD_ML_URL.equals(uri)) {
+ //for now, don't bother checking for end of body...if there's any text
+ //after the close of body, we should extract it
+ localName = localName.toLowerCase(Locale.US);
String html = WORDML_TO_XHTML.get(localName);
if (html != null) {
if (html.equals(TABLE)) {
handler.endElement(XHTMLContentHandler.XHTML, TBODY, TBODY);
}
+ if ("p".equals(html) && !inP) {
+ //start p if not already in one to prevent non-matching <p>
+ handler.startElement(XHTMLContentHandler.XHTML, P, P, EMPTY_ATTRS);
+ }
handler.endElement(XHTMLContentHandler.XHTML, html, html);
+
+ if ("p".equals(html)) {
+ inP = false;
+ }
}
}
if (IGNORE_CHARACTERS.contains(new QName(uri, localName))) {
@@ -147,7 +183,6 @@ public class WordMLParser extends AbstractXML2003Parser {
}
}
-
private class PictHandler extends DefaultHandler {
final StringBuilder buffer = new StringBuilder();
final ContentHandler handler;
@@ -225,5 +260,4 @@ public class WordMLParser extends AbstractXML2003Parser {
}
}
}
-
}
http://git-wip-us.apache.org/repos/asf/tika/blob/7ae760e2/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
index 92f4359..a4ded7d 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
@@ -20,7 +20,12 @@ import java.util.Arrays;
import java.util.List;
import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.*;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.junit.Test;
@@ -35,9 +40,14 @@ public class XML2003ParserTest extends TikaTest {
Metadata m = list.get(0);//container doc
String xml = m.get(RecursiveParserWrapper.TIKA_CONTENT);
+ xml = xml.replaceAll("\\s+", " ");
+ //make sure that metadata gets dumped to xml
+ assertContains("<meta name=\"meta:character-count-with-spaces\" content=\"256\"", xml);
+ //do not allow nested <p> elements
+ assertContains("<p /> <img href=\"02000003.jpg\" /><p /> <p><img href=\"02000004.jpg\" /></p>", xml);
assertContains("<table><tbody>", xml);
assertContains("</tbody></table>", xml);
- assertContains("<td><p>R1 c1</p></td>", xml);
+ assertContains("<td><p>R1 c1</p> </td>", xml);
assertContains("<a href=\"https://tika.apache.org/\">tika</a>", xml);
assertContains("footnote", xml);
assertContains("Mycomment", xml);
@@ -61,6 +71,14 @@ public class XML2003ParserTest extends TikaTest {
//make sure embedded docs were properly processed
assertContains("moscow-birds",
Arrays.asList(list.get(7).getValues(TikaCoreProperties.KEYWORDS)));
+
+ //check that text is extracted with breaks between elements
+ String txt = getText(getResourceAsStream("/test-documents/testWORD2003.xml"), new AutoDetectParser());
+ txt = txt.replaceAll("\\s+", " ");
+ assertNotContained("beforeR1", txt);
+ assertContains("R1 c1 R1 c2", txt);
+ assertNotContained("footnoteFigure", txt);
+ assertContains("footnote Figure", txt);
}
@Test
@@ -72,10 +90,19 @@ public class XML2003ParserTest extends TikaTest {
assertEquals("application/vnd.ms-spreadsheetml", m.get(Metadata.CONTENT_TYPE));
String xml = r.xml;
- assertContains("<tr><td>Col1</td><td>Col2</td>", xml);
+ xml = xml.replaceAll("\\s+", " ");
+ //confirm metadata was dumped to xml
+ assertContains("<meta name=\"cp:version\" content=\"16.00\" />", xml);
+ assertContains("<tr> <td>Col1</td> <td>Col2</td>", xml);
assertContains("<td>2016-04-27T00:00:00.000</td>", xml);
assertContains("<a href=\"https://tika.apache.org/\">tika_hyperlink</a>", xml);
assertContains("<td>5.5</td>", xml);
+
+ //check that text is extracted with breaks between elements
+ String txt = getText(getResourceAsStream("/test-documents/testEXCEL2003.xml"), new AutoDetectParser());
+ txt = txt.replaceAll("\\s+", " ");
+ assertContains("Col1 Col2 Col3 Col4 string 1 1.10", txt);
+
}
}
[2/2] tika git commit: Merge remote-tracking branch 'origin/master'
Posted by ta...@apache.org.
Merge remote-tracking branch 'origin/master'
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/81279a1e
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/81279a1e
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/81279a1e
Branch: refs/heads/master
Commit: 81279a1e08df3ecd430e979f35460630cdc1bc4f
Parents: 7ae760e 52ea9ba
Author: tballison <ta...@mitre.org>
Authored: Fri Jun 24 09:58:27 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Fri Jun 24 09:58:27 2016 -0400
----------------------------------------------------------------------
.../org/apache/tika/mime/tika-mimetypes.xml | 3 ++-
.../java/org/apache/tika/mime/TestMimeTypes.java | 5 +++++
.../resources/test-documents/testEXCEL_poi.xlsx | Bin 0 -> 3360 bytes
3 files changed, 7 insertions(+), 1 deletion(-)
----------------------------------------------------------------------