You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/11/30 20:25:15 UTC

[4/6] tika git commit: TIKA-1321 -- add SAX based docx parser and integrate it with the recent 2006ml parser work -- initial commit

http://git-wip-us.apache.org/repos/asf/tika/blob/d19e4725/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParserTest.java
deleted file mode 100644
index 607e6ef..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParserTest.java
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-import static org.junit.Assert.assertEquals;
-
-import java.util.List;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLCore;
-import org.apache.tika.metadata.OfficeOpenXMLExtended;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.RecursiveParserWrapper;
-import org.apache.tika.parser.microsoft.MSOfficeParserConfig;
-import org.junit.Test;
-
-
-public class Word2006MLParserTest extends TikaTest {
-
-    @Test
-    public void basicTest() throws Exception {
-
-
-
-        List<Metadata> metadataList = getRecursiveMetadata("testWORD_2006ml.xml");
-
-        assertEquals(5, metadataList.size());
-
-        Metadata m = metadataList.get(0);
-
-        assertEquals("2016-11-23T12:07:00Z", m.get(TikaCoreProperties.CREATED));
-        assertEquals("2016-11-23T12:07:00Z", m.get(TikaCoreProperties.MODIFIED));
-        assertEquals("My Document Title", m.get(TikaCoreProperties.TITLE));
-        assertEquals("This is the Author", m.get(TikaCoreProperties.CREATOR));
-        assertEquals("2", m.get(OfficeOpenXMLCore.REVISION));
-        assertEquals("Allison, Timothy B.", m.get(OfficeOpenXMLCore.LAST_MODIFIED_BY));
-        assertEquals("0", m.get(OfficeOpenXMLExtended.DOC_SECURITY));
-        assertEquals("225", m.get(Office.WORD_COUNT));
-        assertEquals("3", m.get(Office.PARAGRAPH_COUNT));
-        assertEquals("1506", m.get(Office.CHARACTER_COUNT_WITH_SPACES));
-        assertEquals("10", m.get(Office.LINE_COUNT));
-        assertEquals("16.0000", m.get(OfficeOpenXMLExtended.APP_VERSION));
-
-
-        String content = m.get(RecursiveParserWrapper.TIKA_CONTENT);
-
-
-        assertContainsCountTimes("engaging title page", content, 1);
-        assertContainsCountTimes("<p>This is the Author</p>", content, 1);
-        assertContainsCountTimes("<p>This is an engaging title page</p>", content, 1);
-
-        assertContains("<p>My Document Title</p>", content);
-        assertContains("<p>My Document Subtitle</p>", content);
-
-        assertContains("<p>\tHeading1\t3</p>", content);
-
-
-        //TODO: integrate numbering
-        assertContains("Really basic 2.", content);
-
-        assertContainsCountTimes("This is a text box", content, 1);
-
-        assertContains("<p>This is a hyperlink: <a href=\"http://tika.apache.org\">tika</a></p>", content);
-
-        assertContains("<p>This is a link to a local file: <a href=\"file:///C:\\data\\test.png\">test.png</a></p>", content);
-
-        assertContains("<p>This is          10 spaces</p>", content);
-
-        //caption
-        assertContains("<p>Table 1: Table1 Caption</p>", content);
-
-        //embedded table
-        //TODO: figure out how to handle embedded tables in html
-        assertContains("<p>Embedded table r1c1</p>", content);
-
-        //shape
-        assertContainsCountTimes("<p>This is text within a shape", content, 1);
-
-        //sdt rich text
-        assertContains("<p>Rich text content control", content);
-
-        //sdt simple text
-        assertContains("<p>Simple text content control", content);
-
-        //sdt repeating
-        assertContains("Repeating content", content);
-
-        //sdt dropdown
-        //TODO: get options for dropdown
-        assertContains("Drop down1", content);
-
-        //sdt date
-        assertContains("<p>11/16/2016</p>", content);
-
-        //test that <tab/> works
-        assertContains("tab\ttab", content);
-
-        assertContainsCountTimes("serious word art", content, 1);
-        assertContainsCountTimes("Wordartr1c1", content, 1);
-
-        //glossary document contents
-        assertContains("Click or tap to enter a date", content);
-
-        //basic formatting
-        assertContains("<p>The <i>quick</i> brown <b>fox </b>j<i>um</i><b><i>ped</i></b> over",
-                content);
-
-        //TODO: add chart parsing
-//        assertContains("This is the chart", content);
-
-        assertContains("This is a comment", content);
-
-        assertContains("This is an endnote", content);
-
-        assertContains("this is the footnote", content);
-
-        assertContains("First page header", content);
-
-        assertContains("Even page header", content);
-
-        assertContains("Odd page header", content);
-
-        assertContains("First page footer", content);
-
-        assertContains("Even page footer", content);
-
-        assertContains("Odd page footer", content);
-
-        //test default includes deleted
-        assertContains("frog", content);
-
-        assertContains("Mattmann", content);
-
-        //TODO: extract this...Note that it is in "Backup" not "Choice"!!!
-//        assertContains("This is the chart title", content);
-
-
-
-    }
-
-    private void assertContainsCountTimes(String needle, String haystack, int expectedCount) {
-        int i = haystack.indexOf("engaging title page");
-        int cnt = 0;
-        while (i > -1) {
-            cnt++;
-            i = haystack.indexOf("engaging title page", i+1);
-        }
-        assertEquals("found needle >"+ needle+"<"+cnt+" times instead of expected: "+expectedCount,
-                expectedCount, cnt);
-
-    }
-
-    @Test
-    public void testSkipDeleted() throws Exception {
-        ParseContext pc = new ParseContext();
-        MSOfficeParserConfig msOfficeParserConfig = new MSOfficeParserConfig();
-        msOfficeParserConfig.setIncludeDeletedContent(false);
-        pc.set(MSOfficeParserConfig.class, msOfficeParserConfig);
-
-        XMLResult r = getXML("testWORD_2006ml.xml", pc);
-        assertNotContained("frog", r.xml);
-    }
-
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/d19e4725/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParserTest.java
new file mode 100644
index 0000000..79f1890
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParserTest.java
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.List;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
+import org.junit.Test;
+
+
+public class Word2006MLParserTest extends TikaTest {
+
+    @Test
+    public void basicTest() throws Exception {
+
+        List<Metadata> metadataList = getRecursiveMetadata("testWORD_2006ml.xml");
+
+        assertEquals(9, metadataList.size());
+
+        Metadata m = metadataList.get(0);
+
+        assertEquals("2016-11-29T17:54:00Z", m.get(TikaCoreProperties.CREATED));
+        assertEquals("2016-11-29T17:54:00Z", m.get(TikaCoreProperties.MODIFIED));
+        assertEquals("My Document Title", m.get(TikaCoreProperties.TITLE));
+        assertEquals("This is the Author", m.get(TikaCoreProperties.CREATOR));
+        assertEquals("2", m.get(OfficeOpenXMLCore.REVISION));
+        assertEquals("Allison, Timothy B.", m.get(TikaCoreProperties.MODIFIER));
+        assertEquals("0", m.get(OfficeOpenXMLExtended.DOC_SECURITY));
+        assertEquals("260", m.get(Office.WORD_COUNT));
+        assertEquals("3", m.get(Office.PARAGRAPH_COUNT));
+        assertEquals("1742", m.get(Office.CHARACTER_COUNT_WITH_SPACES));
+        assertEquals("12", m.get(Office.LINE_COUNT));
+        assertEquals("16.0000", m.get(OfficeOpenXMLExtended.APP_VERSION));
+
+
+        String content = m.get(RecursiveParserWrapper.TIKA_CONTENT);
+
+        assertContainsCount("This is the Author", content, 1);
+        assertContainsCount("This is an engaging title page", content, 1);
+
+        assertContains("My Document Title", content);
+        assertContains("My Document Subtitle", content);
+
+        assertContains("<p>\tHeading1\t3</p>", content);
+
+
+        //TODO: integrate numbering
+        assertContains("Really basic 2.", content);
+
+        assertContainsCount("This is a text box", content, 1);
+
+//        assertContains("<p>This is a hyperlink: <a href=\"http://tika.apache.org\">tika</a></p>", content);
+
+//        assertContains("<p>This is a link to a local file: <a href=\"file:///C:\\data\\test.png\">test.png</a></p>", content);
+
+        assertContains("<p>This is          10 spaces</p>", content);
+
+        //caption
+        assertContains("<p>Table 1: Table1 Caption</p>", content);
+
+        //embedded table
+        //TODO: figure out how to handle embedded tables in html
+        assertContains("<p>Embedded table r1c1</p>", content);
+
+        //shape
+        assertContainsCount("<p>This is text within a shape", content, 1);
+
+        //sdt rich text
+        assertContains("<p>Rich text content control", content);
+
+        //sdt simple text
+        assertContains("<p>Simple text content control", content);
+
+        //sdt repeating
+        assertContains("Repeating content", content);
+
+        //sdt dropdown
+        //TODO: get options for dropdown
+        assertContains("Drop down1", content);
+
+        //sdt date
+        assertContains("<p>11/16/2016</p>", content);
+
+        //test that <tab/> works
+        assertContains("tab\ttab", content);
+
+        assertContainsCount("serious word art", content, 1);
+        assertContainsCount("Wordartr1c1", content, 1);
+
+        //glossary document contents
+        assertContains("Click or tap to enter a date", content);
+
+        //basic formatting
+        assertContains("<p>The <i>quick</i> brown <b>fox </b>j<i>um</i><b><i>ped</i></b> over",
+                content);
+
+        //TODO: add chart parsing
+//        assertContains("This is the chart", content);
+
+        assertContains("This is a comment", content);
+
+        assertContains("This is an endnote", content);
+
+        assertContains("this is the footnote", content);
+
+        assertContains("First page header", content);
+
+        assertContains("Even page header", content);
+
+        assertContains("Odd page header", content);
+
+        assertContains("First page footer", content);
+
+        assertContains("Even page footer", content);
+
+        assertContains("Odd page footer", content);
+
+        //test default includes deleted
+        assertContains("frog", content);
+
+        assertContains("Mattmann", content);
+
+        //test default -- do not include moveFrom
+        assertContainsCount("Second paragraph", content, 1);
+
+        //TODO: figure out how to get this
+        //assertContains("This is the chart title", content);
+
+    }
+
+    @Test
+    public void testSkipDeletedAndMoveFrom() throws Exception {
+        ParseContext pc = new ParseContext();
+        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+        officeParserConfig.setIncludeDeletedContent(false);
+        officeParserConfig.setIncludeMoveFromContent(true);
+        pc.set(OfficeParserConfig.class, officeParserConfig);
+
+        XMLResult r = getXML("testWORD_2006ml.xml", pc);
+        assertNotContained("frog", r.xml);
+        assertContainsCount("Second paragraph", r.xml, 2);
+
+    }
+
+
+}