You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2015/05/28 19:28:41 UTC

svn commit: r1682287 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/microsoft/ main/java/org/apache/tika/parser/microsoft/ooxml/ test/java/org/apache/tika/parser/microsoft/ test/java/org/apache/tika/parser/microsoft/ooxml/ test/res...

Author: tallison
Date: Thu May 28 17:28:40 2015
New Revision: 1682287

URL: http://svn.apache.org/r1682287
Log:
TIKA-1315 -- basic list support for WordExtractor; still need to add in override behavior once we add a class to ooxml via POI

Added:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java   (with props)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java   (with props)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_numbered_list.doc   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_numbered_list.docx   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_override_list_numbering.doc   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_override_list_numbering.docx   (with props)
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java?rev=1682287&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java Thu May 28 17:28:40 2015
@@ -0,0 +1,266 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.poi.hwpf.converter.NumberFormatter;
+
+public abstract class AbstractListManager {
+    private final static String BULLET = "\u00b7";
+
+    protected Map<Integer, ParagraphLevelCounter> listLevelMap = new HashMap<Integer, ParagraphLevelCounter>();
+    protected Map<Integer, LevelTuple[]> overrideTupleMap = new HashMap<Integer, LevelTuple[]>();
+
+    //helper class that is docx/doc format agnostic
+    protected class ParagraphLevelCounter {
+
+        Pattern LEVEL_INTERPOLATOR = Pattern.compile("%(\\d+)");
+
+        //counts can == 0 if the format is decimal, make sure
+        //that flag values are < 0
+        private final Integer NOT_SEEN_YET = -1;
+        private final Integer FIRST_SKIPPED = -2;
+        private List<Integer> counts = new ArrayList<Integer>();
+        private final LevelTuple[] levelTuples;
+
+        private int lastLevel = -1;
+
+        public ParagraphLevelCounter(LevelTuple[] levelTuples) {
+            this.levelTuples = levelTuples;
+        }
+
+        public int getNumberOfLevels() {
+            return levelTuples.length;
+        }
+        /**
+         * Apply this to every numbered paragraph in order.
+         *
+         * @param levelNumber     level number that is being incremented
+         * @return the new formatted number string for this level
+         */
+        public String incrementLevel(int levelNumber, LevelTuple[] overrideLevelTuples) {
+
+            for (int i = lastLevel+1; i < levelNumber; i++) {
+                if (i >= counts.size()){
+                    int val = getStart(i, overrideLevelTuples);
+                    counts.add(i, val);
+                } else {
+                    int count = counts.get(i);
+                    if (count == NOT_SEEN_YET) {
+                        count = getStart(i, overrideLevelTuples);
+                        counts.set(i, count);
+                    }
+                }
+            }
+
+            if (levelNumber < counts.size()) {
+                resetAfter(levelNumber, overrideLevelTuples);
+                int count = counts.get(levelNumber);
+                if (count == NOT_SEEN_YET) {
+                    count = getStart(levelNumber, overrideLevelTuples);
+                } else {
+                    count++;
+                }
+                counts.set(levelNumber, count);
+                lastLevel = levelNumber;
+                return format(levelNumber, overrideLevelTuples);
+            }
+
+            counts.add(levelNumber, getStart(levelNumber, overrideLevelTuples));
+            lastLevel = levelNumber;
+            return format(levelNumber, overrideLevelTuples);
+        }
+
+        /**
+         * @param level which level to format
+         * @return the string that represents the number and the surrounding text for this paragraph
+         */
+        private String format(int level, LevelTuple[] overrideLevelTuples) {
+            if (level < 0 || level >= levelTuples.length) {
+                //log?
+                return "";
+            }
+            boolean isLegal = (overrideLevelTuples != null) ? overrideLevelTuples[level].isLegal : levelTuples[level].isLegal;
+            //short circuit bullet
+            String numFmt = getNumFormat(level, isLegal, overrideLevelTuples);
+            if ("bullet".equals(numFmt)) {
+                return BULLET+" ";
+            }
+
+            String lvlText = (overrideLevelTuples == null || overrideLevelTuples[level].lvlText == null) ?
+                    levelTuples[level].lvlText : overrideLevelTuples[level].lvlText;
+            StringBuilder sb = new StringBuilder();
+            Matcher m = LEVEL_INTERPOLATOR.matcher(lvlText);
+            int last = 0;
+            while (m.find()) {
+                sb.append(lvlText.substring(last, m.start()));
+                String lvlString = m.group(1);
+                int lvlNum = -1;
+                try {
+                    lvlNum = Integer.parseInt(lvlString);
+                } catch (NumberFormatException e) {
+                    //swallow
+                }
+                String numString = "";
+                //need to subtract 1 because, e.g. %1 is the format
+                //for the number at array offset 0
+                numString = formatNum(lvlNum - 1, isLegal, overrideLevelTuples);
+
+                sb.append(numString);
+                last = m.end();
+            }
+            sb.append(lvlText.substring(last));
+            if (sb.length() > 0) {
+                //TODO: add in character after number
+                sb.append(" ");
+            }
+            return sb.toString();
+        }
+
+        //Renders the current count of the level at array offset lvlNum in that level's numFmt
+        private String formatNum(int lvlNum, boolean isLegal, LevelTuple[] overrideLevelTuples) {
+            //style code handed to NumberFormatter.getNumber(); 0 is the code used for "decimal"
+            int numFmtStyle = 0;
+            String numFmt = getNumFormat(lvlNum, isLegal, overrideLevelTuples);
+
+            int count = getCount(lvlNum);
+            if (count < 0) {
+                count = 1;
+            }
+            if ("lowerLetter".equals(numFmt)) {
+                numFmtStyle = 4;
+            } else if ("lowerRoman".equals(numFmt)) {
+                numFmtStyle = 2;
+            } else if ("decimal".equals(numFmt)) {
+                numFmtStyle = 0;
+            } else if ("upperLetter".equals(numFmt)) {
+                numFmtStyle = 3;
+            } else if ("upperRoman".equals(numFmt)) {
+                numFmtStyle = 1;
+            } else if ("bullet".equals(numFmt)) {
+                return "";
+                //not handled by NumberFormatter: ordinal, and decimalZero (zero-padded below 10 only)
+            } else if ("ordinal".equals(numFmt)) {
+                return ordinalize(count);
+            } else if ("decimalZero".equals(numFmt)) {
+                return (count < 10 ? "0" : "")+NumberFormatter.getNumber(count, 0);
+            } else if ("none".equals(numFmt)) {
+                return "";
+            }
+            return NumberFormatter.getNumber(count, numFmtStyle);
+        }
+
+        private String ordinalize(int count) {
+            //English-only ordinal suffix: 11, 12 and 13 (also 111, 212, ...) take "th"
+            int lastTwo = Math.abs(count % 100);
+            if (lastTwo % 10 == 1 && lastTwo != 11) {
+                return count+"st";
+            } else if (lastTwo % 10 == 2 && lastTwo != 12) {
+                return count+"nd";
+            } else if (lastTwo % 10 == 3 && lastTwo != 13) {
+                return count+"rd";
+            }
+            return count+"th";
+        }
+
+        private String getNumFormat(int lvlNum, boolean isLegal, LevelTuple[] overrideLevelTuples) {
+            if (lvlNum < 0 || lvlNum >= levelTuples.length) {
+                //log?
+                return "decimal";
+            }
+            if (isLegal) {
+                //return decimal no matter the level if isLegal is true
+                return "decimal";
+            }
+            return (overrideLevelTuples == null || overrideLevelTuples[lvlNum].numFmt == null) ?
+                    levelTuples[lvlNum].numFmt : overrideLevelTuples[lvlNum].numFmt;
+        }
+
+        private int getCount(int lvlNum) {
+            if (lvlNum < 0 || lvlNum >= counts.size()) {
+                //log?
+                return 1;
+            }
+            return counts.get(lvlNum);
+        }
+
+        private void resetAfter(int startlevelNumber, LevelTuple[] overrideLevelTuples) {
+            for (int levelNumber = startlevelNumber + 1; levelNumber < counts.size(); levelNumber++) {
+                int cnt = counts.get(levelNumber);
+                if (cnt == NOT_SEEN_YET) {
+                    //do nothing
+                } else if (cnt == FIRST_SKIPPED) {
+                    //do nothing
+                } else if (levelTuples.length > levelNumber) {
+                    //never reset if restarts == 0
+                    int restart = (overrideLevelTuples == null || overrideLevelTuples[levelNumber].restart < 0) ?
+                            levelTuples[levelNumber].restart : overrideLevelTuples[levelNumber].restart;
+                    if (restart == 0) {
+                        return;
+                    } else if (restart == -1 ||
+                            startlevelNumber <= restart - 1 ) {
+                        counts.set(levelNumber, NOT_SEEN_YET);
+                    } else {
+                        //do nothing/don't reset
+                    }
+                } else {
+                    //reset!
+                    counts.set(levelNumber, NOT_SEEN_YET);
+                }
+            }
+        }
+
+        private int getStart(int levelNumber, LevelTuple[] overrideLevelTuples) {
+            if (levelNumber >= levelTuples.length) {
+                return 1;
+            } else {
+                return (overrideLevelTuples == null || overrideLevelTuples[levelNumber].start < 0) ?
+                        levelTuples[levelNumber].start : overrideLevelTuples[levelNumber].start;
+            }
+        }
+    }
+
+    protected class LevelTuple {
+        private final int start;
+        private final int restart;
+        private final String lvlText;
+        private final String numFmt;
+        private final boolean isLegal;
+
+        public LevelTuple(String lvlText) {
+            this.lvlText = lvlText;
+            start = 1;
+            restart = -1;
+            numFmt = "decimal";
+            isLegal = false;
+        }
+
+        public LevelTuple(int start, int restart, String lvlText, String numFmt, boolean isLegal) {
+            this.start = start;
+            this.restart = restart;
+            this.lvlText = lvlText;
+            this.numFmt = numFmt;
+            this.isLegal = isLegal;
+        }
+    }
+}

Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java
------------------------------------------------------------------------------
    svn:eol-style = LF

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java?rev=1682287&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java Thu May 28 17:28:40 2015
@@ -0,0 +1,177 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.model.ListData;
+import org.apache.poi.hwpf.model.ListFormatOverrideLevel;
+import org.apache.poi.hwpf.model.ListLevel;
+import org.apache.poi.hwpf.model.ListTables;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+
+/**
+ * Computes the number text which goes at the beginning of each list paragraph
+ * <p/>
+ * <p><em>Note:</em> This class only handles the raw number text and does not apply any further formatting as described in [MS-DOC], v20140721, 2.4.6.3, Part 3 to it.</p>
+ * <p><em>Note 2:</em> The {@code tplc}, a visual override for the appearance of list levels, as defined in [MS-DOC], v20140721, 2.9.328 is not taken care of in this class.</p>
+ * <p>Further, this class does not yet handle overrides</p>
+ */
+public class ListManager extends AbstractListManager {
+
+    private static final Log logger = LogFactory.getLog(ListManager.class);
+    private final ListTables listTables;
+
+    /**
+     * Ordinary constructor for a new list reader
+     *
+     * @param document Document to process
+     */
+    public ListManager(final HWPFDocument document) {
+        this.listTables = document.getListTables();
+    }
+
+    /**
+     * Get the formatted number for a given paragraph
+     * <p/>
+     * <p><em>Note:</em> This only works correctly if called in order for <em>all</em> paragraphs in a valid selection (main document, text field, ...) which are part of a list.</p>
+     *
+     * @param paragraph list paragraph to process
+     * @return String which represents the numbering of this list paragraph; never {@code null}
+     * @throws IllegalArgumentException If the given paragraph is {@code null} or is not part of a list
+     * @throws IllegalStateException    If problems with the document are encountered
+     */
+    public String getFormattedNumber(final Paragraph paragraph) {
+        if (paragraph == null) throw new IllegalArgumentException("Given paragraph cannot be null.");
+        if (!paragraph.isInList()) throw new IllegalArgumentException("Can only process list paragraphs.");
+        //lsid is equivalent to docx's abnum
+        //ilfo is equivalent to docx's num
+        int currAbNumId = paragraph.getList().getLsid();
+        int currNumId = paragraph.getIlfo();
+        ParagraphLevelCounter lc = listLevelMap.get(currAbNumId);
+        LevelTuple[] overrideTuples = overrideTupleMap.get(currNumId);
+
+        if (lc == null) {
+            ListData listData = listTables.getListData(paragraph.getList().getLsid());
+            LevelTuple[] levelTuples = new LevelTuple[listData.getLevels().length];
+            for (int i = 0; i < listData.getLevels().length; i++) {
+                levelTuples[i] = buildTuple(i,listData.getLevels()[i]);
+            }
+            lc = new ParagraphLevelCounter(levelTuples);
+        }
+        if (overrideTuples == null) {
+            overrideTuples = buildOverrideTuples(paragraph, lc.getNumberOfLevels());
+        }
+        String formattedString = lc.incrementLevel(paragraph.getIlvl(), overrideTuples);
+
+        listLevelMap.put(currAbNumId, lc);
+        overrideTupleMap.put(currNumId, overrideTuples);
+        return formattedString;
+    }
+
+    private LevelTuple buildTuple(int i, ListLevel listLevel) {
+        boolean isLegal = false;
+        int start = 1;
+        int restart = -1;
+        String lvlText = "%"+i+".";
+        String numFmt = "decimal";
+
+        start = listLevel.getStartAt();
+        restart = listLevel.getRestart();
+        isLegal = listLevel.isLegalNumbering();
+        numFmt = convertToNewNumFormat(listLevel.getNumberFormat());
+        lvlText = convertToNewNumberText(listLevel.getNumberText(), listLevel.getLevelNumberingPlaceholderOffsets());
+        return new LevelTuple(start, restart, lvlText, numFmt, isLegal);
+    }
+
+    private LevelTuple[] buildOverrideTuples(Paragraph par, int length) {
+        ListFormatOverrideLevel overrideLevel;
+        // find the override for this level
+        if (listTables.getLfoData(par.getIlfo()).getRgLfoLvl().length == 0) {
+            return null;
+        }
+        overrideLevel = listTables.getLfoData(par.getIlfo()).getRgLfoLvl()[0];
+        if (overrideLevel == null) {
+            return null;
+        }
+        LevelTuple[] levelTuples = new LevelTuple[length];
+        ListLevel listLevel = overrideLevel.getLevel();
+        if (listLevel == null) {
+            return null;
+        }
+        for (int i = 0; i < length; i++) {
+            levelTuples[i] = buildTuple(i, listLevel);
+        }
+
+        return levelTuples;
+
+    }
+
+    private String convertToNewNumberText(String numberText, byte[] numberOffsets) {
+
+        StringBuilder sb = new StringBuilder();
+        int last = 0;
+        for (int i = 0; i < numberOffsets.length;i++) {
+            int offset = (int)numberOffsets[i];
+
+            if (offset == 0){
+                break;
+            }
+            sb.append(numberText.substring(last, offset-1));
+            //need to add one because newer format
+            //adds one.  In .doc, this was the array index;
+            //but in .docx, this is the level number
+            int lvlNum = (int)numberText.charAt(offset-1)+1;
+            sb.append("%"+lvlNum);
+            last = offset;
+        }
+        if (last < numberText.length()) {
+            sb.append(numberText.substring(last));
+        }
+        return sb.toString();
+    }
+
+    //Maps a .doc number-format code to the docx numFmt name; unknown codes are logged and treated as "decimal"
+    private String convertToNewNumFormat(int numberFormat) {
+        switch (numberFormat) {
+            case -1 :
+            case 47 :
+                return "none";
+            case 0 :
+                return "decimal";
+            case 1 :
+                return "upperRoman";
+            case 2 :
+                return "lowerRoman";
+            case 3 :
+                return "upperLetter";
+            case 4 :
+                return "lowerLetter";
+            case 5 :
+                return "ordinal";
+            case 22 :
+                return "decimalZero";
+            case 23 :
+                return "bullet";
+            default :
+                logger.warn("NOT COVERED: "+numberFormat+"; falling back to decimal");
+                return "decimal";
+        }
+    }
+}

Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java
------------------------------------------------------------------------------
    svn:eol-style = LF

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1682287&r1=1682286&r2=1682287&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Thu May 28 17:28:40 2015
@@ -58,6 +58,8 @@ public class WordExtractor extends Abstr
 
     private static final char UNICODECHAR_NONBREAKING_HYPHEN = '\u2011';
     private static final char UNICODECHAR_ZERO_WIDTH_SPACE = '\u200b';
+    // could be improved by using the real delimiter in xchFollow [MS-DOC], v20140721, 2.4.6.3, Part 3, Step 3
+    private static final String LIST_DELIMITER = " "; 
 
     public WordExtractor(ParseContext context) {
         super(context);
@@ -101,9 +103,10 @@ public class WordExtractor extends Abstr
 
         // Do the main paragraph text
         Range r = document.getRange();
+        ListManager listManager = new ListManager(document);
         for(int i=0; i<r.numParagraphs(); i++) {
            Paragraph p = r.getParagraph(i);
-           i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, xhtml);
+           i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, listManager, xhtml);
         }
 
         // Do everything else
@@ -162,13 +165,14 @@ public class WordExtractor extends Abstr
           throws SAXException, IOException, TikaException {
         if (countParagraphs(ranges) > 0) {
             xhtml.startElement("div", "class", type);
+            ListManager listManager = new ListManager(document);
             for (Range r : ranges) {
                 if (r != null) {
                     for(int i=0; i<r.numParagraphs(); i++) {
                         Paragraph p = r.getParagraph(i);
 
                         i += handleParagraph(p, 0, r, document,
-                                FieldsDocumentPart.HEADER, pictures, pictureTable, xhtml);
+                                FieldsDocumentPart.HEADER, pictures, pictureTable, listManager, xhtml);
                      }
                 }
             }
@@ -177,7 +181,7 @@ public class WordExtractor extends Abstr
     }
 
     private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document,
-          FieldsDocumentPart docPart, PicturesSource pictures, PicturesTable pictureTable,
+          FieldsDocumentPart docPart, PicturesSource pictures, PicturesTable pictureTable, ListManager listManager,
           XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
        // Note - a poi bug means we can't currently properly recurse
        //  into nested tables, so currently we don't
@@ -194,7 +198,7 @@ public class WordExtractor extends Abstr
 
                 for(int pn=0; pn<cell.numParagraphs(); pn++) {
                    Paragraph cellP = cell.getParagraph(pn);
-                   handleParagraph(cellP, p.getTableLevel(), cell, document, docPart, pictures, pictureTable, xhtml);
+                   handleParagraph(cellP, p.getTableLevel(), cell, document, docPart, pictures, pictureTable, listManager, xhtml);
                 }
                 xhtml.endElement("td");
              }
@@ -212,11 +216,15 @@ public class WordExtractor extends Abstr
        }
 
        TagAndStyle tas;
+       String numbering = null;
 
        if (document.getStyleSheet().numStyles()>p.getStyleIndex()) {
            StyleDescription style =
               document.getStyleSheet().getStyleDescription(p.getStyleIndex());
            if (style != null && style.getName() != null && style.getName().length() > 0) {
+               if (p.isInList()) {
+                   numbering = listManager.getFormattedNumber(p);
+               }
                tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel>0));
            } else {
                tas = new TagAndStyle("p", null);
@@ -231,6 +239,10 @@ public class WordExtractor extends Abstr
            xhtml.startElement(tas.getTag());
        }
 
+       if (numbering != null) {
+           xhtml.characters(numbering);
+       }
+
        for(int j=0; j<p.numCharacterRuns(); j++) {
           CharacterRun cr = p.getCharacterRun(j);
 

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java?rev=1682287&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java Thu May 28 17:28:40 2015
@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+
+import org.apache.poi.xwpf.usermodel.XWPFAbstractNum;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFNumbering;
+import org.apache.poi.xwpf.usermodel.XWPFParagraph;
+import org.apache.tika.parser.microsoft.AbstractListManager;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTAbstractNum;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDecimalNumber;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTLvl;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNum;
+
+
+public class XWPFListManager extends AbstractListManager {
+    private final static boolean OVERRIDE_AVAILABLE;
+    private final static String SKIP_FORMAT = Character.toString((char)61623);//if this shows up as the lvlText, don't show a number
+
+    private final XWPFNumbering numbering;
+    //map of numId (which paragraph series is this a member of?), levelcounts
+    public XWPFListManager(XWPFDocument document) {
+        numbering = document.getNumbering();
+    }
+    static {
+        boolean b = false;
+        try {
+            Class.forName("org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNumLvl");
+            b = true;
+        } catch (ClassNotFoundException e) {
+        }
+        b = OVERRIDE_AVAILABLE = false;
+
+    }
+    public String getFormattedNumber(final XWPFParagraph paragraph) {
+        int currNumId = paragraph.getNumID().intValue();
+        CTNum ctNum = numbering.getNum(paragraph.getNumID()).getCTNum();
+        CTDecimalNumber abNum = ctNum.getAbstractNumId();
+        int currAbNumId = abNum.getVal().intValue();
+
+        ParagraphLevelCounter lc = listLevelMap.get(currAbNumId);
+        LevelTuple[] overrideTuples = overrideTupleMap.get(currNumId);
+        if (lc == null) {
+            lc = loadLevelTuples(abNum);
+        }
+        if (overrideTuples == null) {
+            overrideTuples = loadOverrideTuples(ctNum, lc.getNumberOfLevels());
+        }
+
+        String formattedString = lc.incrementLevel(paragraph.getNumIlvl().intValue(), overrideTuples);
+
+        listLevelMap.put(currAbNumId, lc);
+        overrideTupleMap.put(currNumId, overrideTuples);
+
+        return formattedString;
+    }
+
+    /**
+     * WARNING: currently always returns null.
+     * TODO: Once CTNumLvl is available to Tika,
+     * we can turn this back on.
+     * @param ctNum number on which to build the overrides
+     * @param length length of intended array
+     * @return null or an array of override tuples of length {@code length}
+     */
+    private LevelTuple[] loadOverrideTuples(CTNum ctNum, int length) {
+        return null;
+/*        LevelTuple[] levelTuples = new LevelTuple[length];
+        int overrideLength = ctNum.sizeOfLvlOverrideArray();
+        if (overrideLength == 0) {
+            return null;
+        }
+        for (int i = 0; i < length; i++) {
+            LevelTuple tuple;
+            if (i >= overrideLength) {
+                tuple = new LevelTuple("%"+i+".");
+            } else {
+                CTNumLvl ctNumLvl = ctNum.getLvlOverrideArray(i);
+                if (ctNumLvl != null) {
+                    tuple = buildTuple(i, ctNumLvl.getLvl());
+                } else {
+                    tuple = new LevelTuple("%"+i+".");
+                }
+            }
+            levelTuples[i] = tuple;
+        }
+        return levelTuples;*/
+    }
+
+
+    private ParagraphLevelCounter loadLevelTuples(CTDecimalNumber abNum) {
+        //Unfortunately, we need to go this far into the underlying structure
+        //to get the abstract num information for the edge case where
+        //someone skips a level and the format is not context-free, e.g. "1.B.i".
+        XWPFAbstractNum abstractNum = numbering.getAbstractNum(abNum.getVal());
+        CTAbstractNum ctAbstractNum = abstractNum.getCTAbstractNum();
+
+        LevelTuple[] levels = new LevelTuple[ctAbstractNum.sizeOfLvlArray()];
+        for (int i = 0; i < levels.length; i++) {
+            levels[i] = buildTuple(i, ctAbstractNum.getLvlArray(i));
+        }
+        return new ParagraphLevelCounter(levels);
+    }
+
+    private LevelTuple buildTuple(int level, CTLvl ctLvl) {
+        boolean isLegal = false;
+        int start = 1;
+        int restart = -1;
+        String lvlText = "%"+level+".";
+        String numFmt = "decimal";
+
+
+        if (ctLvl != null && ctLvl.getIsLgl() != null) {
+            isLegal = true;
+        }
+
+        if (ctLvl != null && ctLvl.getNumFmt() != null &&
+                ctLvl.getNumFmt().getVal() != null) {
+            numFmt = ctLvl.getNumFmt().getVal().toString();
+        }
+        if (ctLvl != null && ctLvl.getLvlRestart() != null &&
+                ctLvl.getLvlRestart().getVal() != null) {
+            restart = ctLvl.getLvlRestart().getVal().intValue();
+        }
+        if (ctLvl != null && ctLvl.getStart() != null &&
+                ctLvl.getStart().getVal() != null) {
+            start = ctLvl.getStart().getVal().intValue();
+        } else {
+
+            //this is a hack. Currently, this gets the lowest possible
+            //start for a given numFmt.  We should probably try to grab the
+            //restartNumberingAfterBreak value in
+            //e.g. <w:abstractNum w:abstractNumId="12" w15:restartNumberingAfterBreak="0">???
+            if ("decimal".equals(numFmt) || "ordinal".equals(numFmt) || "decimalZero".equals(numFmt)) {
+                start = 0;
+            } else {
+                start = 1;
+            }
+        }
+        if (ctLvl != null && ctLvl.getLvlText() != null && ctLvl.getLvlText().getVal() != null) {
+            lvlText = ctLvl.getLvlText().getVal();
+        }
+        return new LevelTuple(start, restart, lvlText, numFmt, isLegal);
+    }
+
+}

Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
------------------------------------------------------------------------------
    svn:eol-style = LF

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=1682287&r1=1682286&r2=1682287&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java Thu May 28 17:28:40 2015
@@ -61,6 +61,11 @@ import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
 public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
+
+    // could be improved by using the real delimiter in xchFollow [MS-DOC], v20140721, 2.4.6.3, Part 3, Step 3
+    private static final String LIST_DELIMITER = " "; 
+
+
     private XWPFDocument document;
     private XWPFStyles styles;
 
@@ -78,31 +83,32 @@ public class XWPFWordExtractorDecorator
     protected void buildXHTML(XHTMLContentHandler xhtml)
             throws SAXException, XmlException, IOException {
         XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
-
+        XWPFListManager listManager = new XWPFListManager(document);
         // headers
         if (hfPolicy!=null) {
-            extractHeaders(xhtml, hfPolicy);
+            extractHeaders(xhtml, hfPolicy, listManager);
         }
 
         // process text in the order that it occurs in
-        extractIBodyText(document, xhtml);
+        extractIBodyText(document, listManager, xhtml);
 
         // then all document tables
         if (hfPolicy!=null) {
-            extractFooters(xhtml, hfPolicy);
+            extractFooters(xhtml, hfPolicy, listManager);
         }
     }
 
-    private void extractIBodyText(IBody bodyElement, XHTMLContentHandler xhtml)
+    private void extractIBodyText(IBody bodyElement, XWPFListManager listManager,
+            XHTMLContentHandler xhtml)
             throws SAXException, XmlException, IOException {
        for(IBodyElement element : bodyElement.getBodyElements()) {
           if(element instanceof XWPFParagraph) {
              XWPFParagraph paragraph = (XWPFParagraph)element;
-             extractParagraph(paragraph, xhtml);
+             extractParagraph(paragraph, listManager, xhtml);
           }
           if(element instanceof XWPFTable) {
              XWPFTable table = (XWPFTable)element;
-             extractTable(table, xhtml);
+             extractTable(table, listManager, xhtml);
           }
           if (element instanceof XWPFSDT){
              extractSDT((XWPFSDT) element, xhtml);
@@ -120,7 +126,8 @@ public class XWPFWordExtractorDecorator
        xhtml.endElement(tag);
     }
     
-    private void extractParagraph(XWPFParagraph paragraph, XHTMLContentHandler xhtml)
+    private void extractParagraph(XWPFParagraph paragraph, XWPFListManager listManager,
+            XHTMLContentHandler xhtml)
             throws SAXException, XmlException, IOException {
        // If this paragraph is actually a whole new section, then
        //  it could have its own headers and footers
@@ -131,7 +138,7 @@ public class XWPFWordExtractorDecorator
            if(ctSectPr != null) {
               headerFooterPolicy =
                   new XWPFHeaderFooterPolicy(document, ctSectPr);
-              extractHeaders(xhtml, headerFooterPolicy);
+              extractHeaders(xhtml, headerFooterPolicy, listManager);
            }
        }
        
@@ -158,6 +165,7 @@ public class XWPFWordExtractorDecorator
           xhtml.startElement(tag, "class", styleClass);
        }
 
+        writeParagraphNumber(paragraph, listManager, xhtml);
        // Output placeholder for any embedded docs:
 
        // TODO: replace w/ XPath/XQuery:
@@ -234,17 +242,30 @@ public class XWPFWordExtractorDecorator
 
        // Also extract any paragraphs embedded in text boxes:
        for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent/w:p")) {
-           extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), paragraph.getBody()), xhtml);
+           extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), paragraph.getBody()), listManager, xhtml);
        }
 
        // Finish this paragraph
        xhtml.endElement(tag);
 
        if (headerFooterPolicy != null) {
-           extractFooters(xhtml, headerFooterPolicy);
+           extractFooters(xhtml, headerFooterPolicy, listManager);
        }
     }
 
+    private void writeParagraphNumber(XWPFParagraph paragraph,
+                                      XWPFListManager listManager,
+                                      XHTMLContentHandler xhtml) throws SAXException {
+        if (paragraph.getNumIlvl() == null) {
+            return;
+        }
+        String number = listManager.getFormattedNumber(paragraph);
+        if (number != null) {
+            xhtml.characters(number);
+        }
+
+    }
+
     private TmpFormatting closeStyleTags(XHTMLContentHandler xhtml,
           TmpFormatting fmtg) throws SAXException {
        // Close any still open style tags
@@ -328,7 +349,8 @@ public class XWPFWordExtractorDecorator
        xhtml.characters(run.getContent().getText());
     }
 
-    private void extractTable(XWPFTable table, XHTMLContentHandler xhtml)
+    private void extractTable(XWPFTable table, XWPFListManager listManager, 
+            XHTMLContentHandler xhtml)
             throws SAXException, XmlException, IOException {
        xhtml.startElement("table");
        xhtml.startElement("tbody");
@@ -337,7 +359,7 @@ public class XWPFWordExtractorDecorator
           for(ICell cell : row.getTableICells()){
               xhtml.startElement("td");
               if (cell instanceof XWPFTableCell) {
-                  extractIBodyText((XWPFTableCell)cell, xhtml);
+                  extractIBodyText((XWPFTableCell)cell, listManager, xhtml);
               } else if (cell instanceof XWPFSDTCell) {
                   xhtml.characters(((XWPFSDTCell)cell).getContent().getText());
               }
@@ -350,45 +372,46 @@ public class XWPFWordExtractorDecorator
     }
     
     private void extractFooters(
-            XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy)
+            XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy,
+            XWPFListManager listManager)
             throws SAXException, XmlException, IOException {
         // footers
         if (hfPolicy.getFirstPageFooter() != null) {
-            extractHeaderText(xhtml, hfPolicy.getFirstPageFooter());
+            extractHeaderText(xhtml, hfPolicy.getFirstPageFooter(), listManager);
         }
         if (hfPolicy.getEvenPageFooter() != null) {
-            extractHeaderText(xhtml, hfPolicy.getEvenPageFooter());
+            extractHeaderText(xhtml, hfPolicy.getEvenPageFooter(), listManager);
         }
         if (hfPolicy.getDefaultFooter() != null) {
-            extractHeaderText(xhtml, hfPolicy.getDefaultFooter());
+            extractHeaderText(xhtml, hfPolicy.getDefaultFooter(), listManager);
         }
     }
 
     private void extractHeaders(
-            XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy)
+            XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy, XWPFListManager listManager)
             throws SAXException, XmlException, IOException {
         if (hfPolicy == null) return;
        
         if (hfPolicy.getFirstPageHeader() != null) {
-            extractHeaderText(xhtml, hfPolicy.getFirstPageHeader());
+            extractHeaderText(xhtml, hfPolicy.getFirstPageHeader(), listManager);
         }
 
         if (hfPolicy.getEvenPageHeader() != null) {
-            extractHeaderText(xhtml, hfPolicy.getEvenPageHeader());
+            extractHeaderText(xhtml, hfPolicy.getEvenPageHeader(), listManager);
         }
 
         if (hfPolicy.getDefaultHeader() != null) {
-            extractHeaderText(xhtml, hfPolicy.getDefaultHeader());
+            extractHeaderText(xhtml, hfPolicy.getDefaultHeader(), listManager);
         }
     }
 
-    private void extractHeaderText(XHTMLContentHandler xhtml, XWPFHeaderFooter header) throws SAXException, XmlException, IOException {
+    private void extractHeaderText(XHTMLContentHandler xhtml, XWPFHeaderFooter header, XWPFListManager listManager) throws SAXException, XmlException, IOException {
 
         for (IBodyElement e : header.getBodyElements()){
            if (e instanceof XWPFParagraph){
-              extractParagraph((XWPFParagraph)e, xhtml);
+              extractParagraph((XWPFParagraph)e, listManager, xhtml);
            } else if (e instanceof XWPFTable){
-              extractTable((XWPFTable)e, xhtml);
+              extractTable((XWPFTable)e, listManager, xhtml);
            } else if (e instanceof XWPFSDT){
               extractSDT((XWPFSDT)e, xhtml);
            }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1682287&r1=1682286&r2=1682287&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Thu May 28 17:28:40 2015
@@ -424,4 +424,78 @@ public class WordParserTest extends Tika
         XMLResult result = getXML("testWORD_hyperLinkCarriageReturn.doc");
         assertContains("href=\"http://www.nib.org", result.xml);
     }
+
+    @Test
+    public void testDOCParagraphNumbering() throws Exception {
+        String xml = getXML("testWORD_numbered_list.doc").xml;
+        assertContains("1) This", xml);
+        assertContains("a) Is", xml);
+        assertContains("i) A multi", xml);
+        assertContains("ii) Level", xml);
+        assertContains("1. Within cell 1", xml);
+        assertContains("b. Cell b", xml);
+        assertContains("iii) List", xml);
+        assertContains("2) foo", xml);
+        assertContains("ii) baz", xml);
+        assertContains("ii) foo", xml);
+        assertContains("II. bar", xml);
+        assertContains("6. six", xml);
+        assertContains("7. seven", xml);
+        assertContains("a. seven a", xml);
+        assertContains("e. seven e", xml);
+        assertContains("2. A ii 2", xml);
+        assertContains("3. page break list 3", xml);
+        assertContains("Some-1-CrazyFormat Greek numbering with crazy format - alpha", xml);
+        assertContains("1.1.1. 1.1.1", xml);
+        assertContains("1.1. 1.2-&gt;1.1  //set the value", xml);
+
+        assertContains("add a list here", xml);
+        //TODO: not currently pulling numbers out of comments
+        assertContains(">comment list 1", xml);
+
+    }
+
+    @Test
+    public void testDOCOverrideParagraphNumbering() throws Exception {
+        String xml = getXML("testWORD_override_list_numbering.doc").xml;
+
+        //Test 1
+        assertContains("1.1.1.1...1 1.1.1.1...1", xml);
+        assertContains("1st.2.3someText 1st.2.3someText", xml);
+        assertContains("1st.2.2someOtherText.1 1st.2.2someOtherText.1", xml);
+        assertContains("5th 5th", xml);
+
+
+        //Test 2
+        assertContains("1.a.I 1.a.I", xml);
+        //test no reset because level 2 is not sufficient to reset
+        assertContains("1.b.III 1.b.III", xml);
+        //test restarted because of level 0's increment to 2
+        assertContains("2.a.I 2.a.I", xml);
+        //test handling of skipped level
+        assertContains("2.b 2.b", xml);
+
+        //Test 3
+        assertContains("(1)) (1))", xml);
+        //tests that level 1 starts at 17
+        assertContains("2.17 2.17", xml);
+        //tests that isLegal turns everything into decimal
+        assertContains("2.18.2.1 2.18.2.1", xml);
+        assertContains(">2 2", xml);
+
+        //Test4
+        assertContains(">1 1", xml);
+        assertContains(">A A", xml);
+        assertContains(">B B", xml);
+        assertContains(">C C", xml);
+        assertContains(">4 4", xml);
+
+        //Test5
+        assertContains(">00 00", xml);
+        assertContains(">01 01", xml);
+        assertContains(">01. 01.", xml);
+        assertContains(">01..1 01..1", xml);
+        assertContains(">02 02", xml);
+    }
 }
+

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1682287&r1=1682286&r2=1682287&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Thu May 28 17:28:40 2015
@@ -1178,5 +1178,78 @@ public class OOXMLParserTest extends Tik
         }
 
     }
+
+    @Test
+    public void testDOCXParagraphNumbering() throws Exception {
+        String xml = getXML("testWORD_numbered_list.docx").xml;
+        assertContains("1) This", xml);
+        assertContains("a) Is", xml);
+        assertContains("i) A multi", xml);
+        assertContains("ii) Level", xml);
+        assertContains("1. Within cell 1", xml);
+        assertContains("b. Cell b", xml);
+        assertContains("iii) List", xml);
+        assertContains("2) foo", xml);
+        assertContains("ii) baz", xml);
+        assertContains("ii) foo", xml);
+        assertContains("II. bar", xml);
+        assertContains("6. six", xml);
+        assertContains("7. seven", xml);
+        assertContains("a. seven a", xml);
+        assertContains("e. seven e", xml);
+        assertContains("2. A ii 2", xml);
+        assertContains("3. page break list 3", xml);
+        assertContains("Some-1-CrazyFormat Greek numbering with crazy format - alpha", xml);
+        assertContains("1.1.1. 1.1.1", xml);
+        assertContains("1.1. 1.2-&gt;1.1  //set the value", xml);
+
+        //TODO: comment is not being extracted!
+        //assertContains("add a list here", xml);
+    }
+
+    @Test
+    public void testDOCXOverrideParagraphNumbering() throws Exception {
+        String xml = getXML("testWORD_override_list_numbering.docx").xml;
+
+        //Test 1
+        assertContains("<p>1.1.1.1...1 1.1.1.1...1</p>", xml);
+        assertContains("1st.2.3someText 1st.2.3someText", xml);
+        assertContains("1st.2.2someOtherText.1 1st.2.2someOtherText.1", xml);
+        assertContains("5th 5th", xml);
+
+
+        //Test 2
+        assertContains("1.a.I 1.a.I", xml);
+        //test no reset because level 2 is not sufficient to reset
+        assertContains("<p>1.b.III 1.b.III</p>", xml);
+        //test restarted because of level 0's increment to 2
+        assertContains("2.a.I 2.a.I", xml);
+        //test handling of skipped level
+        assertContains("<p>2.b 2.b</p>", xml);
+
+        //Test 3
+        assertContains("(1)) (1))", xml);
+        //tests that level 1 starts at 17
+        assertContains("2.17 2.17", xml);
+        //tests that isLegal turns everything into decimal
+        assertContains("2.18.2.1 2.18.2.1", xml);
+        assertContains("<p>2 2</p>", xml);
+
+        //Test4
+        assertContains("<p>1 1</p>", xml);
+        assertContains("<p>A A</p>", xml);
+        assertContains("<p>B B</p>", xml);
+        //TODO: add this back in once overrides are available via CTNumLvl
+        //assertContains("<p>C C</p>", xml);
+        assertContains("<p>4 4</p>", xml);
+
+        //Test5
+        assertContains(">00 00", xml);
+        assertContains(">01 01", xml);
+        assertContains(">01. 01.", xml);
+        assertContains(">01..1 01..1", xml);
+        assertContains(">02 02", xml);
+    }
 }
 
+

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_numbered_list.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_numbered_list.doc?rev=1682287&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_numbered_list.doc
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_numbered_list.docx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_numbered_list.docx?rev=1682287&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_numbered_list.docx
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_override_list_numbering.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_override_list_numbering.doc?rev=1682287&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_override_list_numbering.doc
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_override_list_numbering.docx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_override_list_numbering.docx?rev=1682287&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_override_list_numbering.docx
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream