You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2015/05/28 19:28:41 UTC
svn commit: r1682287 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/microsoft/
main/java/org/apache/tika/parser/microsoft/ooxml/
test/java/org/apache/tika/parser/microsoft/
test/java/org/apache/tika/parser/microsoft/ooxml/ test/res...
Author: tallison
Date: Thu May 28 17:28:40 2015
New Revision: 1682287
URL: http://svn.apache.org/r1682287
Log:
TIKA-1315 -- basic list support for WordExtractor; still need to add in override behavior once we add a class to ooxml via POI
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java (with props)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java (with props)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java (with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_numbered_list.doc (with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_numbered_list.docx (with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_override_list_numbering.doc (with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_override_list_numbering.docx (with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java?rev=1682287&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java Thu May 28 17:28:40 2015
@@ -0,0 +1,266 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.poi.hwpf.converter.NumberFormatter;
+
+public abstract class AbstractListManager {
+ private final static String BULLET = "\u00b7";
+
+ protected Map<Integer, ParagraphLevelCounter> listLevelMap = new HashMap<Integer, ParagraphLevelCounter>();
+ protected Map<Integer, LevelTuple[]> overrideTupleMap = new HashMap<Integer, LevelTuple[]>();
+
+ //helper class that is docx/doc format agnostic
+ protected class ParagraphLevelCounter {
+
+ Pattern LEVEL_INTERPOLATOR = Pattern.compile("%(\\d+)");
+
+ //counts can == 0 if the format is decimal, make sure
+ //that flag values are < 0
+ private final Integer NOT_SEEN_YET = -1;
+ private final Integer FIRST_SKIPPED = -2;
+ private List<Integer> counts = new ArrayList<Integer>();
+ private final LevelTuple[] levelTuples;
+
+ private int lastLevel = -1;
+
+ /**
+ * @param levelTuples definitions for each level of this list; the array is stored as-is, not copied
+ */
+ public ParagraphLevelCounter(LevelTuple[] levelTuples) {
+ this.levelTuples = levelTuples;
+ }
+
+ /**
+ * @return the number of level definitions this counter was constructed with
+ */
+ public int getNumberOfLevels() {
+ return levelTuples.length;
+ }
+ /**
+ * Apply this to every numbered paragraph in order.
+ * <p>
+ * Advances the counter for {@code levelNumber}, remembers it as the
+ * last level seen, and returns the formatted number text.
+ *
+ * @param levelNumber level number that is being incremented
+ * @param overrideLevelTuples per-level overrides, or {@code null} when there are none
+ * @return the new formatted number string for this level
+ */
+ public String incrementLevel(int levelNumber, LevelTuple[] overrideLevelTuples) {
+
+ //give every level between the last one seen and this one a start
+ //value if it has no count yet
+ for (int i = lastLevel+1; i < levelNumber; i++) {
+ if (i >= counts.size()){
+ int val = getStart(i, overrideLevelTuples);
+ counts.add(i, val);
+ } else {
+ int count = counts.get(i);
+ if (count == NOT_SEEN_YET) {
+ count = getStart(i, overrideLevelTuples);
+ counts.set(i, count);
+ }
+ }
+ }
+
+ //this level already has a slot: reset the deeper levels, then
+ //either start this level or bump it by one
+ if (levelNumber < counts.size()) {
+ resetAfter(levelNumber, overrideLevelTuples);
+ int count = counts.get(levelNumber);
+ if (count == NOT_SEEN_YET) {
+ count = getStart(levelNumber, overrideLevelTuples);
+ } else {
+ count++;
+ }
+ counts.set(levelNumber, count);
+ lastLevel = levelNumber;
+ return format(levelNumber, overrideLevelTuples);
+ }
+
+ //no slot for this level yet: append its start value
+ counts.add(levelNumber, getStart(levelNumber, overrideLevelTuples));
+ lastLevel = levelNumber;
+ return format(levelNumber, overrideLevelTuples);
+ }
+
+ /**
+ * Builds the visible label for a paragraph at the given level by replacing
+ * each %N placeholder of the level text with the formatted count of level N-1.
+ *
+ * @param level which level to format
+ * @param overrideLevelTuples per-level overrides, or {@code null} when there are none
+ * @return the string that represents the number and the surrounding text for this paragraph;
+ * empty if the level is not defined for this list
+ */
+ private String format(int level, LevelTuple[] overrideLevelTuples) {
+     if (level < 0 || level >= levelTuples.length) {
+         //log?
+         return "";
+     }
+     boolean isLegal;
+     if (overrideLevelTuples != null) {
+         isLegal = overrideLevelTuples[level].isLegal;
+     } else {
+         isLegal = levelTuples[level].isLegal;
+     }
+     //short circuit bullet
+     if ("bullet".equals(getNumFormat(level, isLegal, overrideLevelTuples))) {
+         return BULLET+" ";
+     }
+
+     //an override text wins over the base text when it is present
+     String lvlText = levelTuples[level].lvlText;
+     if (overrideLevelTuples != null && overrideLevelTuples[level].lvlText != null) {
+         lvlText = overrideLevelTuples[level].lvlText;
+     }
+
+     StringBuilder label = new StringBuilder();
+     Matcher placeholders = LEVEL_INTERPOLATOR.matcher(lvlText);
+     int copiedUpTo = 0;
+     while (placeholders.find()) {
+         label.append(lvlText, copiedUpTo, placeholders.start());
+         int placeholderLevel;
+         try {
+             placeholderLevel = Integer.parseInt(placeholders.group(1));
+         } catch (NumberFormatException e) {
+             //swallow
+             placeholderLevel = -1;
+         }
+         //need to subtract 1 because, e.g. %1 is the format
+         //for the number at array offset 0
+         label.append(formatNum(placeholderLevel - 1, isLegal, overrideLevelTuples));
+         copiedUpTo = placeholders.end();
+     }
+     label.append(lvlText, copiedUpTo, lvlText.length());
+     if (label.length() > 0) {
+         //TODO: add in character after number
+         label.append(" ");
+     }
+     return label.toString();
+ }
+
+ /**
+ * Formats the current count of one level in that level's number format.
+ *
+ * @param lvlNum zero-based level whose count should be formatted (the actual level number)
+ * @param isLegal if true, the level is formatted as decimal regardless of its own format
+ * @param overrideLevelTuples per-level overrides, or {@code null} when there are none
+ * @return the formatted count; empty for the "bullet" and "none" formats
+ */
+ private String formatNum(int lvlNum, boolean isLegal, LevelTuple[] overrideLevelTuples) {
+     String numFmt = getNumFormat(lvlNum, isLegal, overrideLevelTuples);
+
+     int count = getCount(lvlNum);
+     if (count < 0) {
+         //negative values are flags, not counts; show 1 instead
+         count = 1;
+     }
+
+     if ("bullet".equals(numFmt) || "none".equals(numFmt)) {
+         return "";
+     }
+     //not yet handled by NumberFormatter...TODO: add to NumberFormatter?
+     if ("ordinal".equals(numFmt)) {
+         return ordinalize(count);
+     }
+     if ("decimalZero".equals(numFmt)) {
+         //only single digits get the leading zero: 01..09, then 10, 11, ...
+         String decimal = NumberFormatter.getNumber(count, 0);
+         return (count < 10) ? "0"+decimal : decimal;
+     }
+
+     //style codes understood by NumberFormatter; 0 (decimal) is also
+     //the fallback for formats that are not recognized
+     int numFmtStyle = 0;
+     if ("lowerLetter".equals(numFmt)) {
+         numFmtStyle = 4;
+     } else if ("lowerRoman".equals(numFmt)) {
+         numFmtStyle = 2;
+     } else if ("upperLetter".equals(numFmt)) {
+         numFmtStyle = 3;
+     } else if ("upperRoman".equals(numFmt)) {
+         numFmtStyle = 1;
+     }
+     return NumberFormatter.getNumber(count, numFmtStyle);
+ }
+
+ /**
+ * Appends the English ordinal suffix to a number, e.g. 1st, 2nd, 3rd, 4th, 11th, 21st.
+ *
+ * @param count number to format
+ * @return the number followed by "st", "nd", "rd" or "th"
+ */
+ private String ordinalize(int count) {
+     //this is only good for locale == English
+     String countString = Integer.toString(count);
+     //11, 12 and 13 (and 111, 212, ...) take "th" even though they end in 1, 2 or 3
+     int lastTwoDigits = Math.abs(count % 100);
+     if (lastTwoDigits >= 11 && lastTwoDigits <= 13) {
+         return countString+"th";
+     }
+     if (countString.endsWith("1")) {
+         return countString+"st";
+     } else if (countString.endsWith("2")) {
+         return countString+"nd";
+     } else if (countString.endsWith("3")) {
+         return countString+"rd";
+     }
+     return countString+"th";
+ }
+
+ private String getNumFormat(int lvlNum, boolean isLegal, LevelTuple[] overrideLevelTuples) {
+ if (lvlNum < 0 || lvlNum >= levelTuples.length) {
+ //log?
+ return "decimal";
+ }
+ if (isLegal) {
+ //return decimal no matter the level if isLegal is true
+ return "decimal";
+ }
+ return (overrideLevelTuples == null || overrideLevelTuples[lvlNum].numFmt == null) ?
+ levelTuples[lvlNum].numFmt : overrideLevelTuples[lvlNum].numFmt;
+ }
+
+ /**
+ * @param lvlNum zero-based level
+ * @return the stored count for the level, or 1 if no count is stored for it
+ */
+ private int getCount(int lvlNum) {
+     boolean hasSlot = 0 <= lvlNum && lvlNum < counts.size();
+     //log a missing slot?
+     return hasSlot ? counts.get(lvlNum) : 1;
+ }
+
+ /**
+ * Marks the counts of levels deeper than {@code startlevelNumber} as
+ * {@code NOT_SEEN_YET}, subject to each level's restart setting.
+ *
+ * @param startlevelNumber level that is being incremented; only levels after it are touched
+ * @param overrideLevelTuples per-level overrides, or {@code null} when there are none
+ */
+ private void resetAfter(int startlevelNumber, LevelTuple[] overrideLevelTuples) {
+ for (int levelNumber = startlevelNumber + 1; levelNumber < counts.size(); levelNumber++) {
+ int cnt = counts.get(levelNumber);
+ if (cnt == NOT_SEEN_YET) {
+ //do nothing
+ } else if (cnt == FIRST_SKIPPED) {
+ //do nothing
+ } else if (levelTuples.length > levelNumber) {
+ //never reset if restarts == 0
+ //a non-negative override restart wins over the base level's restart
+ int restart = (overrideLevelTuples == null || overrideLevelTuples[levelNumber].restart < 0) ?
+ levelTuples[levelNumber].restart : overrideLevelTuples[levelNumber].restart;
+ if (restart == 0) {
+ //NOTE(review): this leaves the whole method, so levels deeper than
+ //this one are not examined either -- confirm 'continue' was not intended
+ return;
+ } else if (restart == -1 ||
+ startlevelNumber <= restart - 1 ) {
+ //presumably restart is a 1-based level number, hence the -1 -- TODO confirm
+ counts.set(levelNumber, NOT_SEEN_YET);
+ } else {
+ //do nothing/don't reset
+ }
+ } else {
+ //reset!
+ counts.set(levelNumber, NOT_SEEN_YET);
+ }
+ }
+ }
+
+ /**
+ * @param levelNumber zero-based level
+ * @param overrideLevelTuples per-level overrides, or {@code null} when there are none
+ * @return 1 for a level beyond the defined ones; otherwise the override's start
+ * when it is non-negative, else the base level's start
+ */
+ private int getStart(int levelNumber, LevelTuple[] overrideLevelTuples) {
+     if (levelNumber >= levelTuples.length) {
+         return 1;
+     }
+     if (overrideLevelTuples != null && overrideLevelTuples[levelNumber].start >= 0) {
+         return overrideLevelTuples[levelNumber].start;
+     }
+     return levelTuples[levelNumber].start;
+ }
+ }
+
+ /**
+ * Immutable description of one list level: where counting starts, when it
+ * restarts, the level text with its %N placeholders, the number format
+ * name and the legal-numbering flag.
+ */
+ protected class LevelTuple {
+     private final int start;
+     //0 = never reset, -1 = always reset (see resetAfter)
+     private final int restart;
+     private final String lvlText;
+     private final String numFmt;
+     private final boolean isLegal;
+
+     /**
+      * Creates a decimal, non-legal level that starts at 1 and has restart -1.
+      *
+      * @param lvlText level text for this level
+      */
+     public LevelTuple(String lvlText) {
+         this(1, -1, lvlText, "decimal", false);
+     }
+
+     public LevelTuple(int start, int restart, String lvlText, String numFmt, boolean isLegal) {
+         this.isLegal = isLegal;
+         this.numFmt = numFmt;
+         this.lvlText = lvlText;
+         this.restart = restart;
+         this.start = start;
+     }
+ }
+}
Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java
------------------------------------------------------------------------------
svn:eol-style = LF
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java?rev=1682287&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java Thu May 28 17:28:40 2015
@@ -0,0 +1,177 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.model.ListData;
+import org.apache.poi.hwpf.model.ListFormatOverrideLevel;
+import org.apache.poi.hwpf.model.ListLevel;
+import org.apache.poi.hwpf.model.ListTables;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+
+/**
+ * Computes the number text which goes at the beginning of each list paragraph
+ * <p/>
+ * <p><em>Note:</em> This class only handles the raw number text and does not apply any further formatting as described in [MS-DOC], v20140721, 2.4.6.3, Part 3 to it.</p>
+ * <p><em>Note 2:</em> The {@code tplc}, a visual override for the appearance of list levels, as defined in [MS-DOC], v20140721, 2.9.328 is not taken care of in this class.</p>
+ * <p>Further, this class does not yet handle overrides</p>
+ */
+public class ListManager extends AbstractListManager {
+
+ private static final Log logger = LogFactory.getLog(ListManager.class);
+ private final ListTables listTables;
+
+ /**
+ * Ordinary constructor for a new list reader
+ *
+ * @param document Document to process
+ */
+ public ListManager(final HWPFDocument document) {
+ this.listTables = document.getListTables();
+ }
+
+ /**
+ * Computes the list number text for one list paragraph.
+ * <p/>
+ * <p><em>Note:</em> The level counters advance on every call, so the result is only correct when this is invoked in order for <em>all</em> list paragraphs of a valid selection (main document, text field, ...).</p>
+ *
+ * @param paragraph list paragraph to process
+ * @return String which represents the numbering of this list paragraph; never {@code null}
+ * @throws IllegalArgumentException If the given paragraph is {@code null} or is not part of a list
+ * @throws IllegalStateException If problems with the document are encountered (not thrown directly by this method)
+ */
+ public String getFormattedNumber(final Paragraph paragraph) {
+     if (paragraph == null) {
+         throw new IllegalArgumentException("Given paragraph cannot be null.");
+     }
+     if (!paragraph.isInList()) {
+         throw new IllegalArgumentException("Can only process list paragraphs.");
+     }
+     //lsid is equivalent to docx's abnum
+     //ilfo is equivalent to docx's num
+     final int lsid = paragraph.getList().getLsid();
+     final int ilfo = paragraph.getIlfo();
+
+     //one counter per lsid, built from the list's level definitions on first use
+     ParagraphLevelCounter counter = listLevelMap.get(lsid);
+     if (counter == null) {
+         ListLevel[] levels = listTables.getListData(lsid).getLevels();
+         LevelTuple[] tuples = new LevelTuple[levels.length];
+         for (int level = 0; level < tuples.length; level++) {
+             tuples[level] = buildTuple(level, levels[level]);
+         }
+         counter = new ParagraphLevelCounter(tuples);
+     }
+
+     LevelTuple[] overrides = overrideTupleMap.get(ilfo);
+     if (overrides == null) {
+         overrides = buildOverrideTuples(paragraph, counter.getNumberOfLevels());
+     }
+
+     String formatted = counter.incrementLevel(paragraph.getIlvl(), overrides);
+
+     listLevelMap.put(lsid, counter);
+     overrideTupleMap.put(ilfo, overrides);
+     return formatted;
+ }
+
+ private LevelTuple buildTuple(int i, ListLevel listLevel) {
+ boolean isLegal = false;
+ int start = 1;
+ int restart = -1;
+ String lvlText = "%"+i+".";
+ String numFmt = "decimal";
+
+ start = listLevel.getStartAt();
+ restart = listLevel.getRestart();
+ isLegal = listLevel.isLegalNumbering();
+ numFmt = convertToNewNumFormat(listLevel.getNumberFormat());
+ lvlText = convertToNewNumberText(listLevel.getNumberText(), listLevel.getLevelNumberingPlaceholderOffsets());
+ return new LevelTuple(start, restart, lvlText, numFmt, isLegal);
+ }
+
+ /**
+ * Builds override tuples from the paragraph's list format override.
+ * Only the first override entry is read, and its level is used for every
+ * slot of the returned array.
+ *
+ * @param par paragraph whose ilfo selects the override data
+ * @param length length of the array to build
+ * @return the override tuples, or {@code null} if there is no usable override
+ */
+ private LevelTuple[] buildOverrideTuples(Paragraph par, int length) {
+     ListFormatOverrideLevel[] overrides = listTables.getLfoData(par.getIlfo()).getRgLfoLvl();
+     if (overrides.length == 0 || overrides[0] == null) {
+         return null;
+     }
+     ListLevel overridingLevel = overrides[0].getLevel();
+     if (overridingLevel == null) {
+         return null;
+     }
+     LevelTuple[] tuples = new LevelTuple[length];
+     for (int level = 0; level < length; level++) {
+         tuples[level] = buildTuple(level, overridingLevel);
+     }
+     return tuples;
+ }
+
+ /**
+ * Rewrites a .doc number text into the .docx style level text, replacing
+ * each placeholder character with "%N".
+ * <p>
+ * Offsets that do not fit the text (negative, out of order or past the end)
+ * end the conversion instead of raising an exception; the rest of the text
+ * is then copied unchanged.
+ *
+ * @param numberText number text in which a placeholder is a single character whose
+ * value is the level's array index
+ * @param numberOffsets one-based positions of the placeholders in {@code numberText};
+ * a 0 ends the list
+ * @return the level text with "%N" placeholders; empty if {@code numberText} is {@code null}
+ */
+ private String convertToNewNumberText(String numberText, byte[] numberOffsets) {
+     if (numberText == null) {
+         return "";
+     }
+     if (numberOffsets == null) {
+         return numberText;
+     }
+
+     StringBuilder sb = new StringBuilder();
+     int last = 0;
+     for (int i = 0; i < numberOffsets.length; i++) {
+         int offset = (int)numberOffsets[i];
+
+         if (offset == 0) {
+             break;
+         }
+         if (offset - 1 < last || offset > numberText.length()) {
+             //the offsets come straight from the file; stop quietly on a
+             //value that does not fit rather than failing the whole parse
+             break;
+         }
+         sb.append(numberText, last, offset - 1);
+         //need to add one because newer format
+         //adds one. In .doc, this was the array index;
+         //but in .docx, this is the level number
+         int lvlNum = (int)numberText.charAt(offset - 1) + 1;
+         sb.append('%').append(lvlNum);
+         last = offset;
+     }
+     if (last < numberText.length()) {
+         sb.append(numberText.substring(last));
+     }
+     return sb.toString();
+ }
+
+ /**
+ * Maps a .doc number format code to the .docx number format name.
+ *
+ * @param numberFormat number format code of a list level
+ * @return the matching format name; "decimal" for codes that are not covered here
+ */
+ private String convertToNewNumFormat(int numberFormat) {
+     switch (numberFormat) {
+         case -1 :
+             return "none";
+         case 0 :
+             return "decimal";
+         case 1 :
+             return "upperRoman";
+         case 2 :
+             return "lowerRoman";
+         case 3 :
+             return "upperLetter";
+         case 4 :
+             return "lowerLetter";
+         case 5 :
+             return "ordinal";
+         case 22 :
+             return "decimalZero";
+         case 23 :
+             return "bullet";
+         case 47 :
+             return "none";
+         default :
+             //an uncovered format must not abort extraction of the whole
+             //document; fall back to plain decimal numbering
+             return "decimal";
+     }
+ }
+}
Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java
------------------------------------------------------------------------------
svn:eol-style = LF
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1682287&r1=1682286&r2=1682287&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Thu May 28 17:28:40 2015
@@ -58,6 +58,8 @@ public class WordExtractor extends Abstr
private static final char UNICODECHAR_NONBREAKING_HYPHEN = '\u2011';
private static final char UNICODECHAR_ZERO_WIDTH_SPACE = '\u200b';
+ // could be improved by using the real delimiter in xchFollow [MS-DOC], v20140721, 2.4.6.3, Part 3, Step 3
+ private static final String LIST_DELIMITER = " ";
public WordExtractor(ParseContext context) {
super(context);
@@ -101,9 +103,10 @@ public class WordExtractor extends Abstr
// Do the main paragraph text
Range r = document.getRange();
+ ListManager listManager = new ListManager(document);
for(int i=0; i<r.numParagraphs(); i++) {
Paragraph p = r.getParagraph(i);
- i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, xhtml);
+ i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, listManager, xhtml);
}
// Do everything else
@@ -162,13 +165,14 @@ public class WordExtractor extends Abstr
throws SAXException, IOException, TikaException {
if (countParagraphs(ranges) > 0) {
xhtml.startElement("div", "class", type);
+ ListManager listManager = new ListManager(document);
for (Range r : ranges) {
if (r != null) {
for(int i=0; i<r.numParagraphs(); i++) {
Paragraph p = r.getParagraph(i);
i += handleParagraph(p, 0, r, document,
- FieldsDocumentPart.HEADER, pictures, pictureTable, xhtml);
+ FieldsDocumentPart.HEADER, pictures, pictureTable, listManager, xhtml);
}
}
}
@@ -177,7 +181,7 @@ public class WordExtractor extends Abstr
}
private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document,
- FieldsDocumentPart docPart, PicturesSource pictures, PicturesTable pictureTable,
+ FieldsDocumentPart docPart, PicturesSource pictures, PicturesTable pictureTable, ListManager listManager,
XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
// Note - a poi bug means we can't currently properly recurse
// into nested tables, so currently we don't
@@ -194,7 +198,7 @@ public class WordExtractor extends Abstr
for(int pn=0; pn<cell.numParagraphs(); pn++) {
Paragraph cellP = cell.getParagraph(pn);
- handleParagraph(cellP, p.getTableLevel(), cell, document, docPart, pictures, pictureTable, xhtml);
+ handleParagraph(cellP, p.getTableLevel(), cell, document, docPart, pictures, pictureTable, listManager, xhtml);
}
xhtml.endElement("td");
}
@@ -212,11 +216,15 @@ public class WordExtractor extends Abstr
}
TagAndStyle tas;
+ String numbering = null;
if (document.getStyleSheet().numStyles()>p.getStyleIndex()) {
StyleDescription style =
document.getStyleSheet().getStyleDescription(p.getStyleIndex());
if (style != null && style.getName() != null && style.getName().length() > 0) {
+ if (p.isInList()) {
+ numbering = listManager.getFormattedNumber(p);
+ }
tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel>0));
} else {
tas = new TagAndStyle("p", null);
@@ -231,6 +239,10 @@ public class WordExtractor extends Abstr
xhtml.startElement(tas.getTag());
}
+ if (numbering != null) {
+ xhtml.characters(numbering);
+ }
+
for(int j=0; j<p.numCharacterRuns(); j++) {
CharacterRun cr = p.getCharacterRun(j);
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java?rev=1682287&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java Thu May 28 17:28:40 2015
@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+
+import org.apache.poi.xwpf.usermodel.XWPFAbstractNum;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFNumbering;
+import org.apache.poi.xwpf.usermodel.XWPFParagraph;
+import org.apache.tika.parser.microsoft.AbstractListManager;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTAbstractNum;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDecimalNumber;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTLvl;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNum;
+
+
+public class XWPFListManager extends AbstractListManager {
+ private final static boolean OVERRIDE_AVAILABLE;
+ private final static String SKIP_FORMAT = Character.toString((char)61623);//if this shows up as the lvlText, don't show a number
+
+ private final XWPFNumbering numbering;
+ //map of numId (which paragraph series is this a member of?), levelcounts
+ public XWPFListManager(XWPFDocument document) {
+ numbering = document.getNumbering();
+ }
+ static {
+ //probe for CTNumLvl, the schema class the disabled override code relies on
+ boolean b = false;
+ try {
+ Class.forName("org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNumLvl");
+ b = true;
+ } catch (ClassNotFoundException e) {
+ //class is not on the classpath; b stays false
+ }
+ //NOTE(review): the probe result is discarded here -- OVERRIDE_AVAILABLE is
+ //always false whether or not the class was found
+ b = OVERRIDE_AVAILABLE = false;
+
+ }
+ /**
+ * Computes the list number text for one paragraph. The level counters
+ * advance on every call, so call this once, in document order, for every
+ * numbered paragraph.
+ *
+ * @param paragraph paragraph to number
+ * @return the formatted number; an empty string when the document has no
+ * numbering part, the paragraph carries no numId or ilvl, or the numId
+ * does not resolve to a numbering definition
+ */
+ public String getFormattedNumber(final XWPFParagraph paragraph) {
+     //types are fully qualified so that the import block stays untouched
+     final java.math.BigInteger numId = paragraph.getNumID();
+     final java.math.BigInteger ilvl = paragraph.getNumIlvl();
+     if (numbering == null || numId == null || ilvl == null) {
+         return "";
+     }
+     final org.apache.poi.xwpf.usermodel.XWPFNum num = numbering.getNum(numId);
+     if (num == null) {
+         //dangling numId: nothing to number with
+         return "";
+     }
+     CTNum ctNum = num.getCTNum();
+     CTDecimalNumber abNum = ctNum.getAbstractNumId();
+     if (abNum == null || abNum.getVal() == null) {
+         return "";
+     }
+     int currNumId = numId.intValue();
+     int currAbNumId = abNum.getVal().intValue();
+
+     ParagraphLevelCounter lc = listLevelMap.get(currAbNumId);
+     LevelTuple[] overrideTuples = overrideTupleMap.get(currNumId);
+     if (lc == null) {
+         lc = loadLevelTuples(abNum);
+     }
+     if (overrideTuples == null) {
+         overrideTuples = loadOverrideTuples(ctNum, lc.getNumberOfLevels());
+     }
+
+     String formattedString = lc.incrementLevel(ilvl.intValue(), overrideTuples);
+
+     listLevelMap.put(currAbNumId, lc);
+     overrideTupleMap.put(currNumId, overrideTuples);
+
+     return formattedString;
+ }
+
+ /**
+ * WARNING: currently always returns null.
+ * TODO: Once CTNumLvl is available to Tika,
+ * we can turn this back on.
+ * @param ctNum number on which to build the overrides
+ * @param length length of intended array
+ * @return null or an array of override tuples of length {@code length}
+ */
+ private LevelTuple[] loadOverrideTuples(CTNum ctNum, int length) {
+ return null;
+/* LevelTuple[] levelTuples = new LevelTuple[length];
+ int overrideLength = ctNum.sizeOfLvlOverrideArray();
+ if (overrideLength == 0) {
+ return null;
+ }
+ for (int i = 0; i < length; i++) {
+ LevelTuple tuple;
+ if (i >= overrideLength) {
+ tuple = new LevelTuple("%"+i+".");
+ } else {
+ CTNumLvl ctNumLvl = ctNum.getLvlOverrideArray(i);
+ if (ctNumLvl != null) {
+ tuple = buildTuple(i, ctNumLvl.getLvl());
+ } else {
+ tuple = new LevelTuple("%"+i+".");
+ }
+ }
+ levelTuples[i] = tuple;
+ }
+ return levelTuples;*/
+ }
+
+
+ /**
+ * Builds the level counter for an abstract numbering definition.
+ *
+ * @param abNum reference to the abstract numbering definition
+ * @return a counter over the definition's levels; a counter without any
+ * levels if the definition cannot be found
+ */
+ private ParagraphLevelCounter loadLevelTuples(CTDecimalNumber abNum) {
+     //Unfortunately, we need to go this far into the underlying structure
+     //to get the abstract num information for the edge case where
+     //someone skips a level and the format is not context-free, e.g. "1.B.i".
+     XWPFAbstractNum abstractNum = numbering.getAbstractNum(abNum.getVal());
+     CTAbstractNum ctAbstractNum = (abstractNum == null) ? null : abstractNum.getCTAbstractNum();
+     if (ctAbstractNum == null) {
+         //dangling reference: a counter with no levels formats every
+         //paragraph as an empty string instead of failing the parse
+         return new ParagraphLevelCounter(new LevelTuple[0]);
+     }
+
+     LevelTuple[] levels = new LevelTuple[ctAbstractNum.sizeOfLvlArray()];
+     for (int i = 0; i < levels.length; i++) {
+         levels[i] = buildTuple(i, ctAbstractNum.getLvlArray(i));
+     }
+     return new ParagraphLevelCounter(levels);
+ }
+
+ /**
+ * Converts a .docx level definition into the format-agnostic tuple,
+ * substituting defaults for everything the definition leaves out.
+ *
+ * @param level index of the level, used only for the default level text
+ * @param ctLvl level definition; may be {@code null}, in which case only defaults are used
+ * @return tuple describing the level
+ */
+ private LevelTuple buildTuple(int level, CTLvl ctLvl) {
+     boolean isLegal = false;
+     int restart = -1;
+     String numFmt = "decimal";
+     String lvlText = "%"+level+".";
+     Integer explicitStart = null;
+
+     if (ctLvl != null) {
+         //the mere presence of the element turns legal numbering on
+         isLegal = ctLvl.getIsLgl() != null;
+         if (ctLvl.getNumFmt() != null && ctLvl.getNumFmt().getVal() != null) {
+             numFmt = ctLvl.getNumFmt().getVal().toString();
+         }
+         if (ctLvl.getLvlRestart() != null && ctLvl.getLvlRestart().getVal() != null) {
+             restart = ctLvl.getLvlRestart().getVal().intValue();
+         }
+         if (ctLvl.getStart() != null && ctLvl.getStart().getVal() != null) {
+             explicitStart = ctLvl.getStart().getVal().intValue();
+         }
+         if (ctLvl.getLvlText() != null && ctLvl.getLvlText().getVal() != null) {
+             lvlText = ctLvl.getLvlText().getVal();
+         }
+     }
+
+     int start;
+     if (explicitStart != null) {
+         start = explicitStart;
+     } else {
+         //this is a hack. Currently, this gets the lowest possible
+         //start for a given numFmt. We should probably try to grab the
+         //restartNumberingAfterBreak value in
+         //e.g. <w:abstractNum w:abstractNumId="12" w15:restartNumberingAfterBreak="0">???
+         boolean zeroBased = "decimal".equals(numFmt) || "ordinal".equals(numFmt)
+                 || "decimalZero".equals(numFmt);
+         start = zeroBased ? 0 : 1;
+     }
+     return new LevelTuple(start, restart, lvlText, numFmt, isLegal);
+ }
+
+}
Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
------------------------------------------------------------------------------
svn:eol-style = LF
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=1682287&r1=1682286&r2=1682287&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java Thu May 28 17:28:40 2015
@@ -61,6 +61,11 @@ import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
+
+ // could be improved by using the real delimiter in xchFollow [MS-DOC], v20140721, 2.4.6.3, Part 3, Step 3
+ private static final String LIST_DELIMITER = " ";
+
+
private XWPFDocument document;
private XWPFStyles styles;
@@ -78,31 +83,32 @@ public class XWPFWordExtractorDecorator
protected void buildXHTML(XHTMLContentHandler xhtml)
throws SAXException, XmlException, IOException {
XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
-
+ XWPFListManager listManager = new XWPFListManager(document);
// headers
if (hfPolicy!=null) {
- extractHeaders(xhtml, hfPolicy);
+ extractHeaders(xhtml, hfPolicy, listManager);
}
// process text in the order that it occurs in
- extractIBodyText(document, xhtml);
+ extractIBodyText(document, listManager, xhtml);
// then all document tables
if (hfPolicy!=null) {
- extractFooters(xhtml, hfPolicy);
+ extractFooters(xhtml, hfPolicy, listManager);
}
}
- private void extractIBodyText(IBody bodyElement, XHTMLContentHandler xhtml)
+ private void extractIBodyText(IBody bodyElement, XWPFListManager listManager,
+ XHTMLContentHandler xhtml)
throws SAXException, XmlException, IOException {
for(IBodyElement element : bodyElement.getBodyElements()) {
if(element instanceof XWPFParagraph) {
XWPFParagraph paragraph = (XWPFParagraph)element;
- extractParagraph(paragraph, xhtml);
+ extractParagraph(paragraph, listManager, xhtml);
}
if(element instanceof XWPFTable) {
XWPFTable table = (XWPFTable)element;
- extractTable(table, xhtml);
+ extractTable(table, listManager, xhtml);
}
if (element instanceof XWPFSDT){
extractSDT((XWPFSDT) element, xhtml);
@@ -120,7 +126,8 @@ public class XWPFWordExtractorDecorator
xhtml.endElement(tag);
}
- private void extractParagraph(XWPFParagraph paragraph, XHTMLContentHandler xhtml)
+ private void extractParagraph(XWPFParagraph paragraph, XWPFListManager listManager,
+ XHTMLContentHandler xhtml)
throws SAXException, XmlException, IOException {
// If this paragraph is actually a whole new section, then
// it could have its own headers and footers
@@ -131,7 +138,7 @@ public class XWPFWordExtractorDecorator
if(ctSectPr != null) {
headerFooterPolicy =
new XWPFHeaderFooterPolicy(document, ctSectPr);
- extractHeaders(xhtml, headerFooterPolicy);
+ extractHeaders(xhtml, headerFooterPolicy, listManager);
}
}
@@ -158,6 +165,7 @@ public class XWPFWordExtractorDecorator
xhtml.startElement(tag, "class", styleClass);
}
+ writeParagraphNumber(paragraph, listManager, xhtml);
// Output placeholder for any embedded docs:
// TODO: replace w/ XPath/XQuery:
@@ -234,17 +242,30 @@ public class XWPFWordExtractorDecorator
// Also extract any paragraphs embedded in text boxes:
for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent/w:p")) {
- extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), paragraph.getBody()), xhtml);
+ extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), paragraph.getBody()), listManager, xhtml);
}
// Finish this paragraph
xhtml.endElement(tag);
if (headerFooterPolicy != null) {
- extractFooters(xhtml, headerFooterPolicy);
+ extractFooters(xhtml, headerFooterPolicy, listManager);
}
}
+ private void writeParagraphNumber(XWPFParagraph paragraph,
+ XWPFListManager listManager,
+ XHTMLContentHandler xhtml) throws SAXException {
+ if (paragraph.getNumIlvl() == null) {
+ return;
+ }
+ String number = listManager.getFormattedNumber(paragraph);
+ if (number != null) {
+ xhtml.characters(number);
+ }
+
+ }
+
private TmpFormatting closeStyleTags(XHTMLContentHandler xhtml,
TmpFormatting fmtg) throws SAXException {
// Close any still open style tags
@@ -328,7 +349,8 @@ public class XWPFWordExtractorDecorator
xhtml.characters(run.getContent().getText());
}
- private void extractTable(XWPFTable table, XHTMLContentHandler xhtml)
+ private void extractTable(XWPFTable table, XWPFListManager listManager,
+ XHTMLContentHandler xhtml)
throws SAXException, XmlException, IOException {
xhtml.startElement("table");
xhtml.startElement("tbody");
@@ -337,7 +359,7 @@ public class XWPFWordExtractorDecorator
for(ICell cell : row.getTableICells()){
xhtml.startElement("td");
if (cell instanceof XWPFTableCell) {
- extractIBodyText((XWPFTableCell)cell, xhtml);
+ extractIBodyText((XWPFTableCell)cell, listManager, xhtml);
} else if (cell instanceof XWPFSDTCell) {
xhtml.characters(((XWPFSDTCell)cell).getContent().getText());
}
@@ -350,45 +372,46 @@ public class XWPFWordExtractorDecorator
}
private void extractFooters(
- XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy)
+ XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy,
+ XWPFListManager listManager)
throws SAXException, XmlException, IOException {
// footers
if (hfPolicy.getFirstPageFooter() != null) {
- extractHeaderText(xhtml, hfPolicy.getFirstPageFooter());
+ extractHeaderText(xhtml, hfPolicy.getFirstPageFooter(), listManager);
}
if (hfPolicy.getEvenPageFooter() != null) {
- extractHeaderText(xhtml, hfPolicy.getEvenPageFooter());
+ extractHeaderText(xhtml, hfPolicy.getEvenPageFooter(), listManager);
}
if (hfPolicy.getDefaultFooter() != null) {
- extractHeaderText(xhtml, hfPolicy.getDefaultFooter());
+ extractHeaderText(xhtml, hfPolicy.getDefaultFooter(), listManager);
}
}
private void extractHeaders(
- XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy)
+ XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy, XWPFListManager listManager)
throws SAXException, XmlException, IOException {
if (hfPolicy == null) return;
if (hfPolicy.getFirstPageHeader() != null) {
- extractHeaderText(xhtml, hfPolicy.getFirstPageHeader());
+ extractHeaderText(xhtml, hfPolicy.getFirstPageHeader(), listManager);
}
if (hfPolicy.getEvenPageHeader() != null) {
- extractHeaderText(xhtml, hfPolicy.getEvenPageHeader());
+ extractHeaderText(xhtml, hfPolicy.getEvenPageHeader(), listManager);
}
if (hfPolicy.getDefaultHeader() != null) {
- extractHeaderText(xhtml, hfPolicy.getDefaultHeader());
+ extractHeaderText(xhtml, hfPolicy.getDefaultHeader(), listManager);
}
}
- private void extractHeaderText(XHTMLContentHandler xhtml, XWPFHeaderFooter header) throws SAXException, XmlException, IOException {
+ private void extractHeaderText(XHTMLContentHandler xhtml, XWPFHeaderFooter header, XWPFListManager listManager) throws SAXException, XmlException, IOException {
for (IBodyElement e : header.getBodyElements()){
if (e instanceof XWPFParagraph){
- extractParagraph((XWPFParagraph)e, xhtml);
+ extractParagraph((XWPFParagraph)e, listManager, xhtml);
} else if (e instanceof XWPFTable){
- extractTable((XWPFTable)e, xhtml);
+ extractTable((XWPFTable)e, listManager, xhtml);
} else if (e instanceof XWPFSDT){
extractSDT((XWPFSDT)e, xhtml);
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1682287&r1=1682286&r2=1682287&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Thu May 28 17:28:40 2015
@@ -424,4 +424,78 @@ public class WordParserTest extends Tika
XMLResult result = getXML("testWORD_hyperLinkCarriageReturn.doc");
assertContains("href=\"http://www.nib.org", result.xml);
}
+
+ @Test
+ public void testDOCParagraphNumbering() throws Exception {
+ String xml = getXML("testWORD_numbered_list.doc").xml;
+ assertContains("1) This", xml);
+ assertContains("a) Is", xml);
+ assertContains("i) A multi", xml);
+ assertContains("ii) Level", xml);
+ assertContains("1. Within cell 1", xml);
+ assertContains("b. Cell b", xml);
+ assertContains("iii) List", xml);
+ assertContains("2) foo", xml);
+ assertContains("ii) baz", xml);
+ assertContains("ii) foo", xml);
+ assertContains("II. bar", xml);
+ assertContains("6. six", xml);
+ assertContains("7. seven", xml);
+ assertContains("a. seven a", xml);
+ assertContains("e. seven e", xml);
+ assertContains("2. A ii 2", xml);
+ assertContains("3. page break list 3", xml);
+ assertContains("Some-1-CrazyFormat Greek numbering with crazy format - alpha", xml);
+ assertContains("1.1.1. 1.1.1", xml);
+ assertContains("1.1. 1.2->1.1 //set the value", xml);
+
+ assertContains("add a list here", xml);
+ //TODO: not currently pulling numbers out of comments
+ assertContains(">comment list 1", xml);
+
+ }
+
+ @Test
+ public void testDOCOverrideParagraphNumbering() throws Exception {
+ String xml = getXML("testWORD_override_list_numbering.doc").xml;
+
+ //Test 1
+ assertContains("1.1.1.1...1 1.1.1.1...1", xml);
+ assertContains("1st.2.3someText 1st.2.3someText", xml);
+ assertContains("1st.2.2someOtherText.1 1st.2.2someOtherText.1", xml);
+ assertContains("5th 5th", xml);
+
+
+ //Test 2
+ assertContains("1.a.I 1.a.I", xml);
+ //test no reset because level 2 is not sufficient to reset
+ assertContains("1.b.III 1.b.III", xml);
+ //test restarted because of level 0's increment to 2
+ assertContains("2.a.I 2.a.I", xml);
+ //test handling of skipped level
+ assertContains("2.b 2.b", xml);
+
+ //Test 3
+ assertContains("(1)) (1))", xml);
+ //tests start level 1 at 17 and
+ assertContains("2.17 2.17", xml);
+ //tests that isLegal turns everything into decimal
+ assertContains("2.18.2.1 2.18.2.1", xml);
+ assertContains(">2 2", xml);
+
+ //Test4
+ assertContains(">1 1", xml);
+ assertContains(">A A", xml);
+ assertContains(">B B", xml);
+ assertContains(">C C", xml);
+ assertContains(">4 4", xml);
+
+ //Test5
+ assertContains(">00 00", xml);
+ assertContains(">01 01", xml);
+ assertContains(">01. 01.", xml);
+ assertContains(">01..1 01..1", xml);
+ assertContains(">02 02", xml);
+ }
}
+
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1682287&r1=1682286&r2=1682287&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Thu May 28 17:28:40 2015
@@ -1178,5 +1178,78 @@ public class OOXMLParserTest extends Tik
}
}
+
+ @Test
+ public void testDOCXParagraphNumbering() throws Exception {
+ String xml = getXML("testWORD_numbered_list.docx").xml;
+ assertContains("1) This", xml);
+ assertContains("a) Is", xml);
+ assertContains("i) A multi", xml);
+ assertContains("ii) Level", xml);
+ assertContains("1. Within cell 1", xml);
+ assertContains("b. Cell b", xml);
+ assertContains("iii) List", xml);
+ assertContains("2) foo", xml);
+ assertContains("ii) baz", xml);
+ assertContains("ii) foo", xml);
+ assertContains("II. bar", xml);
+ assertContains("6. six", xml);
+ assertContains("7. seven", xml);
+ assertContains("a. seven a", xml);
+ assertContains("e. seven e", xml);
+ assertContains("2. A ii 2", xml);
+ assertContains("3. page break list 3", xml);
+ assertContains("Some-1-CrazyFormat Greek numbering with crazy format - alpha", xml);
+ assertContains("1.1.1. 1.1.1", xml);
+ assertContains("1.1. 1.2->1.1 //set the value", xml);
+
+ //TODO: comment is not being extracted!
+ //assertContains("add a list here", xml);
+ }
+
+ @Test
+ public void testDOCXOverrideParagraphNumbering() throws Exception {
+ String xml = getXML("testWORD_override_list_numbering.docx").xml;
+
+ //Test 1
+ assertContains("<p>1.1.1.1...1 1.1.1.1...1</p>", xml);
+ assertContains("1st.2.3someText 1st.2.3someText", xml);
+ assertContains("1st.2.2someOtherText.1 1st.2.2someOtherText.1", xml);
+ assertContains("5th 5th", xml);
+
+
+ //Test 2
+ assertContains("1.a.I 1.a.I", xml);
+ //test no reset because level 2 is not sufficient to reset
+ assertContains("<p>1.b.III 1.b.III</p>", xml);
+ //test restarted because of level 0's increment to 2
+ assertContains("2.a.I 2.a.I", xml);
+ //test handling of skipped level
+ assertContains("<p>2.b 2.b</p>", xml);
+
+ //Test 3
+ assertContains("(1)) (1))", xml);
+ //tests start level 1 at 17 and
+ assertContains("2.17 2.17", xml);
+ //tests that isLegal turns everything into decimal
+ assertContains("2.18.2.1 2.18.2.1", xml);
+ assertContains("<p>2 2</p>", xml);
+
+ //Test4
+ assertContains("<p>1 1</p>", xml);
+ assertContains("<p>A A</p>", xml);
+ assertContains("<p>B B</p>", xml);
+ //TODO: add this back in once overrides are available via CTNumLvl
+ //assertContains("<p>C C</p>", xml);
+ assertContains("<p>4 4</p>", xml);
+
+ //Test5
+ assertContains(">00 00", xml);
+ assertContains(">01 01", xml);
+ assertContains(">01. 01.", xml);
+ assertContains(">01..1 01..1", xml);
+ assertContains(">02 02", xml);
+ }
}
+
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_numbered_list.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_numbered_list.doc?rev=1682287&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_numbered_list.doc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_numbered_list.docx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_numbered_list.docx?rev=1682287&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_numbered_list.docx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_override_list_numbering.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_override_list_numbering.doc?rev=1682287&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_override_list_numbering.doc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_override_list_numbering.docx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_override_list_numbering.docx?rev=1682287&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_override_list_numbering.docx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream