You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 11:11:26 UTC
[20/39] tika git commit: Convert new lines from windows to unix

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
index cf92406..6c86765 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
@@ -1,1423 +1,1423 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.rtf;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.PushbackInputStream;
-import java.nio.ByteBuffer;
-import java.nio.CharBuffer;
-import java.nio.charset.Charset;
-import java.nio.charset.CharsetDecoder;
-import java.nio.charset.CoderResult;
-import java.nio.charset.CodingErrorAction;
-import java.util.Calendar;
-import java.util.HashMap;
-import java.util.LinkedList;
-import java.util.Locale;
-import java.util.Map;
-import java.util.TimeZone;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLCore;
-import org.apache.tika.metadata.OfficeOpenXMLExtended;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.apache.tika.utils.CharsetUtils;
-import org.xml.sax.SAXException;
-
-/* Tokenizes and performs a "shallow" parse of the RTF
- * document, just enough to properly decode the text.
- *
- * TODO: we should cutover to a "real" tokenizer (eg JFlex);
- * it should give better perf, by replacing the excessive
- * "else if" string compares with FSA traversal. */
-
-final class TextExtractor {
-
-    // Charsets referenced by the two lookup tables below.  Every constant
-    // except ASCII is resolved through getCharset(String), which returns
-    // ASCII when the lookup throws; static initializers run in textual
-    // order, so ASCII has to stay the first declaration.
-    // NOTE(review): several constants differ only in the spelling/case of
-    // the name passed in (e.g. CP1258 / CP12582, CP850 / CP8502) and
-    // presumably resolve to the same charset -- confirm before merging them.
-    private static final Charset ASCII = Charset.forName("US-ASCII");
-    private static final Charset WINDOWS_1252 = getCharset("WINDOWS-1252");
-    private static final Charset MAC_ROMAN = getCharset("MacRoman");
-    private static final Charset SHIFT_JIS = getCharset("Shift_JIS");
-    private static final Charset WINDOWS_57011 = getCharset("windows-57011");
-    private static final Charset WINDOWS_57010 = getCharset("windows-57010");
-    private static final Charset WINDOWS_57009 = getCharset("windows-57009");
-    private static final Charset WINDOWS_57008 = getCharset("windows-57008");
-    private static final Charset WINDOWS_57007 = getCharset("windows-57007");
-    private static final Charset WINDOWS_57006 = getCharset("windows-57006");
-    private static final Charset WINDOWS_57005 = getCharset("windows-57005");
-    private static final Charset WINDOWS_57004 = getCharset("windows-57004");
-    private static final Charset WINDOWS_57003 = getCharset("windows-57003");
-    private static final Charset X_ISCII91 = getCharset("x-ISCII91");
-    private static final Charset X_MAC_CENTRAL_EUROPE = getCharset("x-MacCentralEurope");
-    private static final Charset MAC_CYRILLIC = getCharset("MacCyrillic");
-    private static final Charset X_JOHAB = getCharset("x-Johab");
-    private static final Charset CP12582 = getCharset("CP1258");
-    private static final Charset CP12572 = getCharset("CP1257");
-    private static final Charset CP12562 = getCharset("CP1256");
-    private static final Charset CP12552 = getCharset("CP1255");
-    private static final Charset CP12542 = getCharset("CP1254");
-    private static final Charset CP12532 = getCharset("CP1253");
-    private static final Charset CP1252 = getCharset("CP1252");
-    private static final Charset CP12512 = getCharset("CP1251");
-    private static final Charset CP12502 = getCharset("CP1250");
-    private static final Charset CP950 = getCharset("CP950");
-    private static final Charset CP949 = getCharset("CP949");
-    private static final Charset MS9362 = getCharset("MS936");
-    private static final Charset MS8742 = getCharset("MS874");
-    private static final Charset CP866 = getCharset("CP866");
-    private static final Charset CP865 = getCharset("CP865");
-    private static final Charset CP864 = getCharset("CP864");
-    private static final Charset CP863 = getCharset("CP863");
-    private static final Charset CP862 = getCharset("CP862");
-    private static final Charset CP860 = getCharset("CP860");
-    private static final Charset CP852 = getCharset("CP852");
-    private static final Charset CP8502 = getCharset("CP850");
-    private static final Charset CP819 = getCharset("CP819");
-    private static final Charset WINDOWS_720 = getCharset("windows-720");
-    private static final Charset WINDOWS_711 = getCharset("windows-711");
-    private static final Charset WINDOWS_710 = getCharset("windows-710");
-    private static final Charset WINDOWS_709 = getCharset("windows-709");
-    private static final Charset ISO_8859_6 = getCharset("ISO-8859-6");
-    private static final Charset CP4372 = getCharset("CP437");
-    private static final Charset CP850 = getCharset("cp850");
-    private static final Charset CP437 = getCharset("cp437");
-    private static final Charset MS874 = getCharset("ms874");
-    private static final Charset CP1257 = getCharset("cp1257");
-    private static final Charset CP1256 = getCharset("cp1256");
-    private static final Charset CP1255 = getCharset("cp1255");
-    private static final Charset CP1258 = getCharset("cp1258");
-    private static final Charset CP1254 = getCharset("cp1254");
-    private static final Charset CP1253 = getCharset("cp1253");
-    private static final Charset MS950 = getCharset("ms950");
-    private static final Charset MS936 = getCharset("ms936");
-    private static final Charset MS1361 = getCharset("ms1361");
-    private static final Charset MS932 = getCharset("MS932");
-    private static final Charset CP1251 = getCharset("cp1251");
-    private static final Charset CP1250 = getCharset("cp1250");
-    private static final Charset MAC_THAI = getCharset("MacThai");
-    private static final Charset MAC_TURKISH = getCharset("MacTurkish");
-    private static final Charset MAC_GREEK = getCharset("MacGreek");
-    private static final Charset MAC_ARABIC = getCharset("MacArabic");
-    private static final Charset MAC_HEBREW = getCharset("MacHebrew");
-    private static final Charset JOHAB = getCharset("johab");
-    private static final Charset BIG5 = getCharset("Big5");
-    private static final Charset GB2312 = getCharset("GB2312");
-    private static final Charset MS949 = getCharset("ms949");
-    // The RTF doc has a "font table" that assigns ords
-    // (f0, f1, f2, etc.) to fonts and charsets, using the
-    // \fcharsetN control word.  This mapping maps from the
-    // N to corresponding Java charset:
-    private static final Map<Integer, Charset> FCHARSET_MAP =
-            new HashMap<Integer, Charset>();
-    // The RTF may specify the \ansicpgN charset in the
-    // header; this maps the N to the corresponding Java
-    // character set:
-    private static final Map<Integer, Charset> ANSICPG_MAP =
-            new HashMap<Integer, Charset>();
-
-    // Fills FCHARSET_MAP: key is the N of a \fcharsetN control word, value is
-    // the charset constant used to decode text in fonts that declare it.
-    // Values 1, 2 and 179-181 are intentionally left unmapped (see the
-    // inline notes).
-    static {
-        FCHARSET_MAP.put(0, WINDOWS_1252); // ANSI
-        // charset 1 is Default
-        // charset 2 is Symbol
-
-        FCHARSET_MAP.put(77, MAC_ROMAN); // Mac Roman
-        FCHARSET_MAP.put(78, SHIFT_JIS); // Mac Shift Jis
-        FCHARSET_MAP.put(79, MS949); // Mac Hangul
-        FCHARSET_MAP.put(80, GB2312); // Mac GB2312
-        FCHARSET_MAP.put(81, BIG5); // Mac Big5
-        FCHARSET_MAP.put(82, JOHAB); // Mac Johab (old)
-        FCHARSET_MAP.put(83, MAC_HEBREW); // Mac Hebrew
-        FCHARSET_MAP.put(84, MAC_ARABIC); // Mac Arabic
-        FCHARSET_MAP.put(85, MAC_GREEK); // Mac Greek
-        FCHARSET_MAP.put(86, MAC_TURKISH); // Mac Turkish
-        FCHARSET_MAP.put(87, MAC_THAI); // Mac Thai
-        // NOTE(review): 88 and 89 are labelled as Mac charsets but map to the
-        // Windows code pages cp1250/cp1251 rather than X_MAC_CENTRAL_EUROPE /
-        // MAC_CYRILLIC -- confirm whether that is intended.
-        FCHARSET_MAP.put(88, CP1250); // Mac East Europe
-        FCHARSET_MAP.put(89, CP1251); // Mac Russian
-
-        FCHARSET_MAP.put(128, MS932); // Shift JIS
-        FCHARSET_MAP.put(129, MS949); // Hangul
-        FCHARSET_MAP.put(130, MS1361); // Johab
-        FCHARSET_MAP.put(134, MS936); // GB2312
-        FCHARSET_MAP.put(136, MS950); // Big5
-        FCHARSET_MAP.put(161, CP1253); // Greek
-        FCHARSET_MAP.put(162, CP1254); // Turkish
-        FCHARSET_MAP.put(163, CP1258); // Vietnamese
-        FCHARSET_MAP.put(177, CP1255); // Hebrew
-        FCHARSET_MAP.put(178, CP1256); // Arabic
-        // FCHARSET_MAP.put( 179, "" ); // Arabic Traditional
-        // FCHARSET_MAP.put( 180, "" ); // Arabic user
-        // FCHARSET_MAP.put( 181, "" ); // Hebrew user
-        FCHARSET_MAP.put(186, CP1257); // Baltic
-
-        FCHARSET_MAP.put(204, CP1251); // Russian
-        FCHARSET_MAP.put(222, MS874); // Thai
-        FCHARSET_MAP.put(238, CP1250); // Eastern European
-        FCHARSET_MAP.put(254, CP437); // PC 437
-        FCHARSET_MAP.put(255, CP850); // OEM
-    }
-
-    // Fills ANSICPG_MAP: key is the N of the header's \ansicpgN control word
-    // (a code page number), value is the charset constant used as the
-    // document-wide default.  Each code page is registered exactly once
-    // under its own number.
-    static {
-        ANSICPG_MAP.put(437, CP4372);   // US IBM
-        ANSICPG_MAP.put(708, ISO_8859_6);   // Arabic (ASMO 708)
-
-        ANSICPG_MAP.put(709, WINDOWS_709);  // Arabic (ASMO 449+, BCON V4)
-        ANSICPG_MAP.put(710, WINDOWS_710);  // Arabic (transparent Arabic)
-        // 711 and 720 were previously registered under key 710, which both
-        // overwrote the 710 entry and left 711/720 unmapped.
-        ANSICPG_MAP.put(711, WINDOWS_711);  // Arabic (Nafitha Enhanced)
-        ANSICPG_MAP.put(720, WINDOWS_720);  // Arabic (transparent ASMO)
-        ANSICPG_MAP.put(819, CP819);  // Windows 3.1 (US & Western Europe)
-        ANSICPG_MAP.put(850, CP8502);  // IBM Multilingual
-        ANSICPG_MAP.put(852, CP852);  // Eastern European
-        ANSICPG_MAP.put(860, CP860);  // Portuguese
-        ANSICPG_MAP.put(862, CP862);  // Hebrew
-        ANSICPG_MAP.put(863, CP863);  // French Canadian
-        ANSICPG_MAP.put(864, CP864);  // Arabic
-        ANSICPG_MAP.put(865, CP865);  // Norwegian
-        ANSICPG_MAP.put(866, CP866);  // Soviet Union
-        ANSICPG_MAP.put(874, MS8742);  // Thai
-        ANSICPG_MAP.put(932, MS932);  // Japanese
-        ANSICPG_MAP.put(936, MS9362);  // Simplified Chinese
-        ANSICPG_MAP.put(949, CP949);  // Korean
-        ANSICPG_MAP.put(950, CP950);  // Traditional Chinese
-        ANSICPG_MAP.put(1250, CP12502);  // Eastern European
-        ANSICPG_MAP.put(1251, CP12512);  // Cyrillic
-        ANSICPG_MAP.put(1252, CP1252);  // Western European
-        ANSICPG_MAP.put(1253, CP12532);  // Greek
-        ANSICPG_MAP.put(1254, CP12542);  // Turkish
-        ANSICPG_MAP.put(1255, CP12552);  // Hebrew
-        ANSICPG_MAP.put(1256, CP12562);  // Arabic
-        ANSICPG_MAP.put(1257, CP12572);  // Baltic
-        ANSICPG_MAP.put(1258, CP12582);  // Vietnamese
-        ANSICPG_MAP.put(1361, X_JOHAB);  // Johab
-        ANSICPG_MAP.put(10000, MAC_ROMAN);  // Mac Roman
-        ANSICPG_MAP.put(10001, SHIFT_JIS);  // Mac Japan
-        ANSICPG_MAP.put(10004, MAC_ARABIC);  // Mac Arabic
-        ANSICPG_MAP.put(10005, MAC_HEBREW);  // Mac Hebrew
-        ANSICPG_MAP.put(10006, MAC_GREEK);  // Mac Greek
-        ANSICPG_MAP.put(10007, MAC_CYRILLIC);  // Mac Cyrillic
-        ANSICPG_MAP.put(10029, X_MAC_CENTRAL_EUROPE);  // MAC Latin2
-        ANSICPG_MAP.put(10081, MAC_TURKISH);  // Mac Turkish
-        ANSICPG_MAP.put(57002, X_ISCII91);   // Devanagari
-
-        // TODO: in theory these other charsets are simple
-        // shifts off of Devanagari, so we could impl that
-        // here:
-        ANSICPG_MAP.put(57003, WINDOWS_57003);   // Bengali
-        ANSICPG_MAP.put(57004, WINDOWS_57004);   // Tamil
-        ANSICPG_MAP.put(57005, WINDOWS_57005);   // Telugu
-        ANSICPG_MAP.put(57006, WINDOWS_57006);   // Assamese
-        ANSICPG_MAP.put(57007, WINDOWS_57007);   // Oriya
-        ANSICPG_MAP.put(57008, WINDOWS_57008);   // Kannada
-        ANSICPG_MAP.put(57009, WINDOWS_57009);   // Malayalam
-        ANSICPG_MAP.put(57010, WINDOWS_57010);   // Gujarati
-        ANSICPG_MAP.put(57011, WINDOWS_57011);   // Punjabi
-    }
-
-    // Used when we decode bytes -> chars using CharsetDecoder:
-    private final char[] outputArray = new char[128];
-    private final CharBuffer outputBuffer = CharBuffer.wrap(outputArray);
-    // Holds the font table from this RTF doc, mapping
-    // the font number (from \fN control word) to the
-    // corresponding charset:
-    private final Map<Integer, Charset> fontToCharset =
-            new HashMap<Integer, Charset>();
-    // Group stack: when we open a new group, we push
-    // the previous group state onto the stack; when we
-    // close the group, we restore it
-    private final LinkedList<GroupState> groupStates = new LinkedList<GroupState>();
-    // Receives decoded text instead of the content handler while inHeader
-    // is true or fieldState == 1 (see addOutputChar and pushBytes):
-    private final StringBuilder pendingBuffer = new StringBuilder();
-    // Destination for the XHTML events and characters produced here:
-    private final XHTMLContentHandler out;
-    private final Metadata metadata;
-    // Receives hex data and metadata chars of embedded objects/pictures:
-    private final RTFEmbObjHandler embObjHandler;
-    // How many next ansi chars we should skip; this
-    // is 0 except when we are still in the "ansi
-    // shadow" after seeing a unicode escape, at which
-    // point it's set to the last ucN skip we had seen:
-    // NOTE(review): package-private, unlike the neighbouring fields.
-    int ansiSkip = 0;
-    // NOTE(review): not referenced in the portion of the file visible
-    // here -- confirm it is still used.
-    private int written = 0;
-    // Hold pending bytes (encoded in the current charset)
-    // for text output:
-    private byte[] pendingBytes = new byte[16];
-    private int pendingByteCount;
-    // View over pendingBytes handed to the decoder; re-wrapped whenever
-    // pendingBytes is reallocated:
-    private ByteBuffer pendingByteBuffer = ByteBuffer.wrap(pendingBytes);
-    // Holds pending chars for text output
-    private char[] pendingChars = new char[10];
-    private int pendingCharCount;
-    // Holds chars for a still-being-tokenized control word
-    private byte[] pendingControl = new byte[10];
-    private int pendingControlCount;
-    // Reused when possible:
-    private CharsetDecoder decoder;
-    // Charset that 'decoder' was created for (see getDecoder):
-    private Charset lastCharset;
-    // Fallback charset when neither the current font nor the default
-    // font supplies one (see getCharset()):
-    private Charset globalCharset = WINDOWS_1252;
-    // Font number looked up in fontToCharset once we are past the
-    // header; -1 means "not set":
-    private int globalDefaultFont = -1;
-    private int curFontID = -1;
-    // Current group state; in theory this initial
-    // GroupState is unused because the RTF doc should
-    // immediately open the top group (start with {):
-    private GroupState groupState = new GroupState();
-    private boolean inHeader = true;
-    private int fontTableState;
-    private int fontTableDepth;
-    // Non null if we are processing metadata (title,
-    // keywords, etc.) inside the info group:
-    private Property nextMetaData;
-    // True while a <p> or <li> element is open on 'out':
-    private boolean inParagraph;
-    // Non-zero if we are processing inside a field destination:
-    private int fieldState;
-    // Non-zero list index
-    private int pendingListEnd;
-    private Map<Integer, ListDescriptor> listTable = new HashMap<Integer, ListDescriptor>();
-    private Map<Integer, ListDescriptor> listOverrideTable = new HashMap<Integer, ListDescriptor>();
-    private Map<Integer, ListDescriptor> currentListTable;
-    private ListDescriptor currentList;
-    private int listTableLevel = -1;
-    // Exposed through isIgnoringLists() / setIgnoreLists(boolean):
-    private boolean ignoreLists;
-    // Non-null if we've seen the url for a HYPERLINK but not yet
-    // its text:
-    private String pendingURL;
-    // Used to process the sub-groups inside the upr
-    // group:
-    private int uprState = -1;
-    // Used when extracting CREATION date:
-    private int year, month, day, hour, minute;
-
-    // Creates an extractor that emits XHTML to 'out', has access to
-    // 'metadata', and hands embedded-object data to 'embObjHandler'.
-    // The three references are stored as-is.
-    public TextExtractor(XHTMLContentHandler out, Metadata metadata,
-                         RTFEmbObjHandler embObjHandler) {
-        this.out = out;
-        this.embObjHandler = embObjHandler;
-        this.metadata = metadata;
-    }
-
-    // Resolves a charset by name via CharsetUtils.forName; any exception
-    // raised by the lookup is swallowed on purpose and US-ASCII is returned
-    // instead, so class initialization never fails on an unknown charset.
-    private static Charset getCharset(String name) {
-        Charset resolved = ASCII;
-        try {
-            resolved = CharsetUtils.forName(name);
-        } catch (Exception ignored) {
-            // keep the ASCII fallback
-        }
-        return resolved;
-    }
-
-    // Returns true iff ch is an ASCII hexadecimal digit (0-9, a-f, A-F).
-    protected static boolean isHexChar(int ch) {
-        final boolean decimalDigit = ch >= '0' && ch <= '9';
-        final boolean lowerHex = ch >= 'a' && ch <= 'f';
-        final boolean upperHex = ch >= 'A' && ch <= 'F';
-        return decimalDigit || lowerHex || upperHex;
-    }
-
-    // Returns true iff ch is an ASCII letter (a-z or A-Z).
-    private static boolean isAlpha(int ch) {
-        if (ch >= 'a' && ch <= 'z') {
-            return true;
-        }
-        return ch >= 'A' && ch <= 'Z';
-    }
-
-    // Returns true iff ch is an ASCII decimal digit (0-9).
-    private static boolean isDigit(int ch) {
-        return '0' <= ch && ch <= '9';
-    }
-
-    // Returns the numeric value (0-15) of the ASCII hex digit ch.
-    // The caller must pass a character accepted by isHexChar(int); the
-    // letter ranges are limited to a-f / A-F so that they agree with
-    // isHexChar and the assertion actually catches non-hex letters.
-    protected static int hexValue(int ch) {
-        if (ch >= '0' && ch <= '9') {
-            return ch - '0';
-        } else if (ch >= 'a' && ch <= 'f') {
-            return 10 + (ch - 'a');
-        } else {
-            assert ch >= 'A' && ch <= 'F' : "not a hex digit: " + ch;
-            return 10 + (ch - 'A');
-        }
-    }
-
-    // Returns the current value of the ignoreLists flag.
-    public boolean isIgnoringLists() {
-        return this.ignoreLists;
-    }
-
-    // Sets the ignoreLists flag to the given value.
-    public void setIgnoreLists(boolean ignore) {
-        ignoreLists = ignore;
-    }
-
-    // Push pending bytes or pending chars:
-    private void pushText() throws IOException, SAXException, TikaException {
-        // No buffered bytes: flush whatever chars are pending.
-        if (pendingByteCount == 0) {
-            pushChars();
-            return;
-        }
-        // Bytes and chars are never buffered at the same time.
-        assert pendingCharCount == 0;
-        pushBytes();
-    }
-
-    // Buffers the byte (unit in the current charset) for
-    // output:
-    private void addOutputByte(int b) throws IOException, SAXException, TikaException {
-        assert b >= 0 && b < 256 : "byte value out of range: " + b;
-
-        // Chars buffered earlier must go out before any byte is queued.
-        if (pendingCharCount != 0) {
-            pushChars();
-        }
-
-        // Inside a picture group the byte goes to the embedded-object handler.
-        if (groupState.pictDepth > 0) {
-            embObjHandler.writeMetadataChar((char) b);
-            return;
-        }
-
-        final int capacity = pendingBytes.length;
-        if (pendingByteCount == capacity) {
-            // Buffer is full: grow by 25% and re-wrap the ByteBuffer view.
-            final byte[] grown = new byte[(int) (capacity * 1.25)];
-            System.arraycopy(pendingBytes, 0, grown, 0, capacity);
-            pendingBytes = grown;
-            pendingByteBuffer = ByteBuffer.wrap(grown);
-        }
-        pendingBytes[pendingByteCount] = (byte) b;
-        pendingByteCount++;
-    }
-
-    // Buffers a byte as part of a control word:
-    private void addControl(int b) {
-        assert isAlpha(b);
-        final int capacity = pendingControl.length;
-        if (pendingControlCount == capacity) {
-            // Buffer is full: grow by 25% before appending.
-            final byte[] grown = new byte[(int) (capacity * 1.25)];
-            System.arraycopy(pendingControl, 0, grown, 0, capacity);
-            pendingControl = grown;
-        }
-        pendingControl[pendingControlCount] = (byte) b;
-        pendingControlCount++;
-    }
-
-    // Buffers a UTF16 code unit for output
-    private void addOutputChar(char ch) throws IOException, SAXException, TikaException {
-        // Bytes buffered earlier must be decoded and emitted first.
-        if (pendingByteCount != 0) {
-            pushBytes();
-        }
-
-        // Header text and field-state-1 text are collected in pendingBuffer.
-        if (inHeader || fieldState == 1) {
-            pendingBuffer.append(ch);
-            return;
-        }
-
-        // Inside an sn/sv group the char goes to the embedded-object handler.
-        if (groupState.sn || groupState.sv) {
-            embObjHandler.writeMetadataChar(ch);
-            return;
-        }
-
-        final int capacity = pendingChars.length;
-        if (pendingCharCount == capacity) {
-            // Buffer is full: grow by 25% before appending.
-            final char[] grown = new char[(int) (capacity * 1.25)];
-            System.arraycopy(pendingChars, 0, grown, 0, capacity);
-            pendingChars = grown;
-        }
-        pendingChars[pendingCharCount] = ch;
-        pendingCharCount++;
-    }
-
-    // Shallow parses the entire doc, writing output to
-    // this.out and this.metadata
-    public void extract(InputStream in) throws IOException, SAXException, TikaException {
-        // The tokenizer needs to un-read bytes, so wrap the stream in a
-        // PushbackInputStream with a pushback capacity of 2.
-        final PushbackInputStream pushback = new PushbackInputStream(in, 2);
-        extract(pushback);
-    }
-
-    // Main tokenizer loop.  Reads the stream one byte at a time and
-    // dispatches on it until end of stream or until the outermost group is
-    // closed, then closes the last paragraph and ends the document.
-    // The order of the branches below matters: control tokens and group
-    // delimiters are recognised before object/picture data, which in turn
-    // is recognised before plain text.
-    private void extract(PushbackInputStream in) throws IOException, SAXException, TikaException {
-        out.startDocument();
-
-        while (true) {
-            final int b = in.read();
-            if (b == -1) {
-                // End of stream
-                break;
-            } else if (b == '\\') {
-                // Start of a control word, control symbol or escape
-                parseControlToken(in);
-            } else if (b == '{') {
-                // Flush buffered text before the group state changes
-                pushText();
-                processGroupStart(in);
-            } else if (b == '}') {
-                // Flush buffered text before the group state changes
-                pushText();
-                processGroupEnd();
-                if (groupStates.isEmpty()) {
-                    // parsed document closing brace
-                    break;
-                }
-            } else if (groupState.objdata == true ||
-                    groupState.pictDepth == 1) {
-                // Raw data of an embedded object or picture: forwarded
-                // byte-by-byte to the embedded-object handler
-                embObjHandler.writeHexChar(b);
-            } else if (b != '\r' && b != '\n'
-                    && (!groupState.ignore || nextMetaData != null ||
-                    groupState.sn == true || groupState.sv == true)) {
-                // Linefeed and carriage return are not
-                // significant
-                if (ansiSkip != 0) {
-                    // Still skipping the ansi fallback of a unicode escape
-                    ansiSkip--;
-                } else {
-                    addOutputByte(b);
-                }
-            }
-        }
-
-        endParagraph(false);
-        out.endDocument();
-    }
-
-    // Called right after a backslash has been consumed: reads the next byte
-    // and dispatches to the hex-escape, control-word, escaped-char or
-    // control-symbol handling.  Does nothing at end of stream.
-    private void parseControlToken(PushbackInputStream in) throws IOException, SAXException, TikaException {
-        final int next = in.read();
-        if (next == -1) {
-            return;
-        }
-
-        switch (next) {
-            case '\'':
-                // escaped hex char
-                parseHexChar(in);
-                break;
-            case '{':
-            case '}':
-            case '\\':
-            case '\r':
-            case '\n':
-                // escaped char
-                addOutputByte(next);
-                break;
-            default:
-                if (isAlpha(next)) {
-                    // control word
-                    parseControlWord((char) next, in);
-                } else {
-                    // control symbol, eg \* or \~
-                    processControlSymbol((char) next);
-                }
-                break;
-        }
-    }
-
-    // Parses the two hex digits of a \'hh escape (the backslash and the
-    // apostrophe have already been consumed) and buffers the resulting byte
-    // for output, unless it falls into the ansi shadow of a unicode escape.
-    // A malformed escape is ignored; the offending byte is pushed back so
-    // that the main loop sees it again.
-    private void parseHexChar(PushbackInputStream in) throws IOException, SAXException, TikaException {
-        int hex1 = in.read();
-        if (!isHexChar(hex1)) {
-            // DOC ERROR (malformed hex escape): ignore.
-            // Never push back the end-of-stream marker: unread(-1) would
-            // insert a 0xFF byte that is later emitted as text.
-            if (hex1 != -1) {
-                in.unread(hex1);
-            }
-            return;
-        }
-
-        int hex2 = in.read();
-        if (!isHexChar(hex2)) {
-            // TODO: log a warning here, somehow?
-            // DOC ERROR (malformed hex escape):
-            // ignore
-            if (hex2 != -1) {
-                in.unread(hex2);
-            }
-            return;
-        }
-
-        if (ansiSkip != 0) {
-            // Skip this ansi char since we are
-            // still in the shadow of a unicode
-            // escape:
-            ansiSkip--;
-        } else {
-            // Unescape:
-            addOutputByte(16 * hexValue(hex1) + hexValue(hex2));
-        }
-    }
-
-    // Tokenizes a control word whose first letter (firstChar) has already
-    // been read: collects the remaining letters into pendingControl, parses
-    // an optional (possibly negative) decimal parameter, consumes a single
-    // trailing space delimiter, and then dispatches to the matching
-    // processControlWord overload.  pendingControlCount is reset afterwards.
-    private void parseControlWord(int firstChar, PushbackInputStream in) throws IOException, SAXException, TikaException {
-        addControl(firstChar);
-
-        int b = in.read();
-        while (isAlpha(b)) {
-            addControl(b);
-            b = in.read();
-        }
-
-        boolean hasParam = false;
-        boolean negParam = false;
-        if (b == '-') {
-            negParam = true;
-            hasParam = true;
-            b = in.read();
-        }
-
-        int param = 0;
-        while (isDigit(b)) {
-            param *= 10;
-            param += (b - '0');
-            hasParam = true;
-            b = in.read();
-        }
-
-        // space is consumed as part of the
-        // control word, but is not added to the
-        // control word.  Any other byte belongs to the
-        // following token and is pushed back -- except the
-        // end-of-stream marker: unread(-1) would insert a
-        // 0xFF byte that is later emitted as text.
-        if (b != ' ' && b != -1) {
-            in.unread(b);
-        }
-
-        if (hasParam) {
-            if (negParam) {
-                param = -param;
-            }
-            processControlWord(param, in);
-        } else {
-            processControlWord();
-        }
-
-        pendingControlCount = 0;
-    }
-
-    // Opens a paragraph on demand.  Does nothing when inParagraph is already
-    // true; otherwise it closes a pending list that differs from the current
-    // group's list, opens the current list if needed, starts an <li> (inside
-    // a list) or a <p>, re-opens the bold/italic styles of the current group
-    // and sets inParagraph.
-    private void lazyStartParagraph() throws IOException, SAXException, TikaException {
-        if (!inParagraph) {
-            // Ensure </i></b> order
-            if (groupState.italic) {
-                end("i");
-            }
-            if (groupState.bold) {
-                end("b");
-            }
-            // A list that was left open but is not the current group's list
-            // is closed now
-            if (pendingListEnd != 0 && groupState.list != pendingListEnd) {
-                endList(pendingListEnd);
-                pendingListEnd = 0;
-            }
-            // Start the current list unless it is the one still pending
-            if (inList() && pendingListEnd != groupState.list) {
-                startList(groupState.list);
-            }
-            if (inList()) {
-                out.startElement("li");
-            } else {
-                out.startElement("p");
-            }
-
-            // Ensure <b><i> order
-            if (groupState.bold) {
-                start("b");
-            }
-            if (groupState.italic) {
-                start("i");
-            }
-            inParagraph = true;
-        }
-    }
-
-    // Flushes pending text and closes the current paragraph (<li> inside a
-    // list, <p> otherwise), opening one first if none is open so that empty
-    // paragraphs are still emitted.
-    //
-    // preserveStyles == true: the bold/italic flags of the current group are
-    //   kept, and if either is set a new <p> with those styles is opened
-    //   immediately.
-    // preserveStyles == false: the flags that were set are cleared, no new
-    //   paragraph is opened, and a pending list (if any) is closed.
-    private void endParagraph(boolean preserveStyles) throws IOException, SAXException, TikaException {
-        pushText();
-        //maintain consecutive new lines
-        if (!inParagraph) {
-            lazyStartParagraph();
-        }
-        if (inParagraph) {
-            // Close inline styles in </i></b> order before the block element
-            if (groupState.italic) {
-                end("i");
-                groupState.italic = preserveStyles;
-            }
-            if (groupState.bold) {
-                end("b");
-                groupState.bold = preserveStyles;
-            }
-            if (inList()) {
-                out.endElement("li");
-            } else {
-                out.endElement("p");
-            }
-
-            if (preserveStyles && (groupState.bold || groupState.italic)) {
-                // Carry the styles over into a freshly opened paragraph
-                start("p");
-                if (groupState.bold) {
-                    start("b");
-                }
-                if (groupState.italic) {
-                    start("i");
-                }
-                inParagraph = true;
-            } else {
-                inParagraph = false;
-            }
-        }
-
-        // Ensure closing the list at document end
-        if (!preserveStyles && pendingListEnd != 0) {
-            endList(pendingListEnd);
-            pendingListEnd = 0;
-        }
-    }
-
-    // Push pending UTF16 units to out ContentHandler
-    private void pushChars() throws IOException, SAXException, TikaException {
-        // Nothing buffered: nothing to emit.
-        if (pendingCharCount == 0) {
-            return;
-        }
-        lazyStartParagraph();
-        out.characters(pendingChars, 0, pendingCharCount);
-        pendingCharCount = 0;
-    }
-
-    // Decodes the buffered bytes in pendingBytes
-    // into UTF16 code units, and sends the characters
-    // to the out ContentHandler, if we are in the body,
-    // else appends the characters to the pendingBuffer
-    private void pushBytes() throws IOException, SAXException, TikaException {
-        final boolean wanted = !groupState.ignore || nextMetaData != null;
-        if (pendingByteCount > 0 && wanted) {
-
-            final CharsetDecoder activeDecoder = getDecoder();
-            pendingByteBuffer.limit(pendingByteCount);
-            assert pendingByteBuffer.position() == 0;
-            assert outputBuffer.position() == 0;
-
-            // endOfInput is true because, when we are called, we should
-            // have seen a complete sequence of characters for this charset.
-            // Keep decoding until the decoder reports UNDERFLOW, draining
-            // the fixed-size output buffer after every round.
-            CoderResult result;
-            do {
-                result = activeDecoder.decode(pendingByteBuffer, outputBuffer, true);
-                drainOutputBuffer();
-            } while (result != CoderResult.UNDERFLOW);
-
-            // Same pattern for whatever the decoder still holds internally.
-            do {
-                result = activeDecoder.flush(outputBuffer);
-                drainOutputBuffer();
-            } while (result != CoderResult.UNDERFLOW);
-
-            // Reset for next decode
-            activeDecoder.reset();
-            pendingByteBuffer.position(0);
-        }
-
-        // The buffered bytes are discarded even when they were not decoded.
-        pendingByteCount = 0;
-    }
-
-    // Emits the chars currently held in outputBuffer (to pendingBuffer while
-    // in the header or when fieldState == 1, otherwise to the content
-    // handler) and rewinds the buffer.  No-op when the buffer is empty.
-    private void drainOutputBuffer() throws IOException, SAXException, TikaException {
-        final int decodedCount = outputBuffer.position();
-        if (decodedCount == 0) {
-            return;
-        }
-        if (inHeader || fieldState == 1) {
-            pendingBuffer.append(outputArray, 0, decodedCount);
-        } else {
-            lazyStartParagraph();
-            out.characters(outputArray, 0, decodedCount);
-        }
-        outputBuffer.position(0);
-    }
-
-    // NOTE: s must be ascii alpha only
-    private boolean equals(String s) {
-        // Compares the control word buffered in pendingControl with s,
-        // byte by byte.
-        final int length = s.length();
-        if (length != pendingControlCount) {
-            return false;
-        }
-        int i = 0;
-        while (i < length) {
-            final char expected = s.charAt(i);
-            assert isAlpha(expected);
-            if (pendingControl[i] != (byte) expected) {
-                return false;
-            }
-            i++;
-        }
-        return true;
-    }
-
-    // Handles a one-character control symbol (the char following a
-    // backslash).  Three symbols map to a unicode char; everything else,
-    // including \*, produces no output here.
-    private void processControlSymbol(char ch) throws IOException, SAXException, TikaException {
-        if (ch == '~') {
-            // Non-breaking space -> unicode NON-BREAKING SPACE
-            addOutputChar('\u00a0');
-        } else if (ch == '-') {
-            // Optional hyphen -> unicode SOFT HYPHEN
-            addOutputChar('\u00ad');
-        } else if (ch == '_') {
-            // Non-breaking hyphen -> unicode NON-BREAKING HYPHEN
-            addOutputChar('\u2011');
-        }
-        // '*' marks an ignorable destination (control words defined after
-        // the 1987 RTF spec); these are already handled by
-        // processGroupStart().  Any other symbol is ignored as well.
-    }
-
-    // Returns a decoder for the charset currently in use.  The decoder from
-    // the previous call is reused when the charset has not changed;
-    // otherwise a new one is created that replaces malformed and unmappable
-    // input instead of failing.
-    private CharsetDecoder getDecoder() throws TikaException {
-        final Charset current = getCharset();
-
-        // Common case: same charset as last time, reuse the cached decoder.
-        // (equals(null) is false, so the very first call falls through.)
-        if (current.equals(lastCharset)) {
-            return decoder;
-        }
-
-        final CharsetDecoder fresh = current.newDecoder();
-        fresh.onMalformedInput(CodingErrorAction.REPLACE);
-        fresh.onUnmappableCharacter(CodingErrorAction.REPLACE);
-        decoder = fresh;
-        lastCharset = current;
-        return decoder;
-    }
-
-    // Return current charset in-use
-    private Charset getCharset() throws TikaException {
-        // If a specific font (fN) was set, use its charset
-        if (groupState.fontCharset != null) {
-            return groupState.fontCharset;
-        }
-
-        // Else, if global default font (defN) was set, use that one
-        if (globalDefaultFont != -1 && !inHeader) {
-            Charset cs = fontToCharset.get(globalDefaultFont);
-            if (cs != null) {
-                return cs;
-            }
-        }
-
-        // Else, use the global charset
-        if (globalCharset == null) {
-            throw new TikaException("unable to determine charset");
-        }
-
-        return globalCharset;
-    }
-
-    // Handle control word that takes a parameter:
-    private void processControlWord(int param, PushbackInputStream in) throws IOException, SAXException, TikaException {
-
-        // TODO: afN?  (associated font number)
-
-        // TODO: do these alter text output...?
-        /*
-            } else if (equals("stshfdbch")) {
-                // font to be used by default in
-                // style sheet for East Asian chars
-                // arg N is font table entry
-            } else if (equals("stshfloch")) {
-                // font to be used by default in
-                // style sheet for ASCII chars
-                // arg N is font table entry
-            } else if (equals("stshfhich")) {
-                // font to be used by default in
-                // style sheet for High Ansi chars
-                // arg N is font table entry
-            } else if (equals("stshfbi")) {
-                // style sheet for Complex Scripts (BIDI) chars
-                // arg N is font table entry
-                */
-
-        // TODO: inefficient that we check equals N times;
-        // we'd get better perf w/ real lexer (eg
-        // JFlex), which uses single-pass FSM to do cmp:
-        if (inHeader) {
-            if (equals("ansicpg")) {
-                // ANSI codepage
-                Charset cs = ANSICPG_MAP.get(param);
-                if (cs != null) {
-                    globalCharset = cs;
-                }
-            } else if (equals("deff")) {
-                // Default font
-                globalDefaultFont = param;
-            } else if (equals("nofpages")) {
-                metadata.add(Office.PAGE_COUNT, Integer.toString(param));
-            } else if (equals("nofwords")) {
-                metadata.add(Office.WORD_COUNT, Integer.toString(param));
-            } else if (equals("nofchars")) {
-                metadata.add(Office.CHARACTER_COUNT, Integer.toString(param));
-            } else if (equals("yr")) {
-                year = param;
-            } else if (equals("mo")) {
-                month = param;
-            } else if (equals("dy")) {
-                day = param;
-            } else if (equals("hr")) {
-                hour = param;
-            } else if (equals("min")) {
-                minute = param;
-            }
-
-            if (fontTableState == 1) {
-                // Still inside font table -- record the
-                // mappings of fN to the fcharset:
-                if (groupState.depth < fontTableDepth) {
-                    fontTableState = 2;
-                } else {
-                    if (equals("f")) {
-                        // Start new font definition
-                        curFontID = param;
-                    } else if (equals("fcharset")) {
-                        Charset cs = FCHARSET_MAP.get(param);
-                        if (cs != null) {
-                            fontToCharset.put(curFontID, cs);
-                        }
-                    }
-                }
-            }
-
-            if (currentList != null) {
-                if (equals("listid")) {
-                    currentList.id = param;
-                    currentListTable.put(currentList.id, currentList);
-                } else if (equals("listtemplateid")) {
-                    currentList.templateID = param;
-                } else if (equals("levelnfc") || equals("levelnfcn")) {
-                    //sanity check to make sure list information isn't corrupt
-                    if (listTableLevel > -1 &&
-                            listTableLevel < currentList.numberType.length) {
-                        currentList.numberType[listTableLevel] = param;
-                    }
-                }
-            }
-        } else {
-            // In document
-            if (equals("b")) {
-                // b0
-                assert param == 0;
-                if (groupState.bold) {
-                    pushText();
-                    if (groupState.italic) {
-                        end("i");
-                    }
-                    end("b");
-                    if (groupState.italic) {
-                        start("i");
-                    }
-                    groupState.bold = false;
-                }
-            } else if (equals("i")) {
-                // i0
-                assert param == 0;
-                if (groupState.italic) {
-                    pushText();
-                    end("i");
-                    groupState.italic = false;
-                }
-            } else if (equals("f")) {
-                // Change current font
-                Charset fontCharset = fontToCharset.get(param);
-
-                // Push any buffered text before changing
-                // font:
-                pushText();
-
-                if (fontCharset != null) {
-                    groupState.fontCharset = fontCharset;
-                } else {
-                    // DOC ERROR: font change referenced a
-                    // non-table'd font number
-                    // TODO: log a warning?  Throw an exc?
-                    groupState.fontCharset = null;
-                }
-            } else if (equals("ls")) {
-                groupState.list = param;
-            } else if (equals("lslvl")) {
-                groupState.listLevel = param;
-            }
-        }
-
-        // Process unicode escape. This can appear in doc
-        // or in header, since the metadata (info) fields
-        // in the header can be unicode escaped as well:
-        if (equals("u")) {
-            // Unicode escape
-            if (!groupState.ignore || groupState.sv || groupState.sn) {
-                final char utf16CodeUnit = (char) (param & 0xffff);
-                addOutputChar(utf16CodeUnit);
-            }
-
-            // After seeing a unicode escape we must
-            // skip the next ucSkip ansi chars (the
-            // "unicode shadow")
-            ansiSkip = groupState.ucSkip;
-        } else if (equals("uc")) {
-            // Change unicode shadow length
-            groupState.ucSkip = param;
-        } else if (equals("bin")) {
-            if (param >= 0) {
-                if (groupState.pictDepth == 1) {
-                    try {
-                        embObjHandler.writeBytes(in, param);
-                    } catch (IOException e) {
-                        //param was out of bounds or something went wrong during writing.
-                        //skip this obj and move on
-                        //TODO: log.warn
-                        embObjHandler.reset();
-                    }
-                } else {
-                    IOUtils.skipFully(in, param);
-                }
-            } else {
-                // log some warning?
-            }
-        }
-    }
-
-    private boolean inList() {
-        return !ignoreLists && groupState.list != 0;
-    }
-
-    /**
-     * Marks the current list as pending to end. This is done to be able to merge list items of
-     * the same list within the same enclosing list tag (ie. either <code>"ul"</code>, or
-     * <code>"ol"</code>).
-     */
-    private void pendingListEnd() {
-        pendingListEnd = groupState.list;
-        groupState.list = 0;
-    }
-
-    /**
-     * Emits the end tag of a list. Uses {@link #isUnorderedList(int)} to determine the list
-     * type for the given <code>listID</code>.
-     *
-     * @param listID The ID of the list.
-     * @throws IOException
-     * @throws SAXException
-     * @throws TikaException
-     */
-    private void endList(int listID) throws IOException, SAXException, TikaException {
-        if (!ignoreLists) {
-            out.endElement(isUnorderedList(listID) ? "ul" : "ol");
-        }
-    }
-
-    /**
-     * Emits the start tag of a list. Uses {@link #isUnorderedList(int)} to determine the list
-     * type for the given <code>listID</code>.
-     *
-     * @param listID The ID of the list.
-     * @throws IOException
-     * @throws SAXException
-     * @throws TikaException
-     */
-    private void startList(int listID) throws IOException, SAXException, TikaException {
-        if (!ignoreLists) {
-            out.startElement(isUnorderedList(listID) ? "ul" : "ol");
-        }
-    }
-
-    private boolean isUnorderedList(int listID) {
-        ListDescriptor list = listTable.get(listID);
-        if (list != null) {
-            return list.isUnordered(groupState.listLevel);
-        }
-        return true;
-    }
-
-    private void end(String tag) throws IOException, SAXException, TikaException {
-        out.endElement(tag);
-    }
-
-    private void start(String tag) throws IOException, SAXException, TikaException {
-        out.startElement(tag);
-    }
-
-    // Handle non-parameter control word:
-    private void processControlWord() throws IOException, SAXException, TikaException {
-        if (inHeader) {
-            if (equals("ansi")) {
-                globalCharset = WINDOWS_1252;
-            } else if (equals("pca")) {
-                globalCharset = CP850;
-            } else if (equals("pc")) {
-                globalCharset = CP437;
-            } else if (equals("mac")) {
-                globalCharset = MAC_ROMAN;
-            }
-
-            if (equals("colortbl") || equals("stylesheet") || equals("fonttbl")) {
-                groupState.ignore = true;
-            } else if (equals("listtable")) {
-                currentListTable = listTable;
-            } else if (equals("listoverridetable")) {
-                currentListTable = listOverrideTable;
-            }
-
-            if (uprState == -1) {
-                // TODO: we can also parse \creatim, \revtim,
-                // \printim, \version, etc.
-                if (equals("author")) {
-                    nextMetaData = TikaCoreProperties.CREATOR;
-                } else if (equals("title")) {
-                    nextMetaData = TikaCoreProperties.TITLE;
-                } else if (equals("subject")) {
-                    // TODO: Move to OO subject in Tika 2.0
-                    nextMetaData = TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT;
-                } else if (equals("keywords")) {
-                    nextMetaData = TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT;
-                } else if (equals("category")) {
-                    nextMetaData = OfficeOpenXMLCore.CATEGORY;
-                } else if (equals("comment")) {
-                    nextMetaData = TikaCoreProperties.COMMENTS;
-                } else if (equals("company")) {
-                    nextMetaData = OfficeOpenXMLExtended.COMPANY;
-                } else if (equals("manager")) {
-                    nextMetaData = OfficeOpenXMLExtended.MANAGER;
-                } else if (equals("template")) {
-                    nextMetaData = OfficeOpenXMLExtended.TEMPLATE;
-                } else if (equals("creatim")) {
-                    nextMetaData = TikaCoreProperties.CREATED;
-                }
-            }
-
-            if (fontTableState == 0) {
-                // Didn't see font table yet
-                if (equals("fonttbl")) {
-                    fontTableState = 1;
-                    fontTableDepth = groupState.depth;
-                }
-            } else if (fontTableState == 1) {
-                // Inside font table
-                if (groupState.depth < fontTableDepth) {
-                    fontTableState = 2;
-                }
-            }
-
-            // List table handling
-            if (currentListTable != null) {
-                if (equals("list") || equals("listoverride")) {
-                    currentList = new ListDescriptor();
-                    listTableLevel = -1;
-                } else if (currentList != null) {
-                    if (equals("liststylename")) {
-                        currentList.isStyle = true;
-                    } else if (equals("listlevel")) {
-                        listTableLevel++;
-                    }
-                }
-            }
-
-            if (!groupState.ignore && (equals("par") || equals("pard") || equals("sect") || equals("sectd") || equals("plain") || equals("ltrch") || equals("rtlch"))) {
-                inHeader = false;
-            }
-        } else {
-            if (equals("b")) {
-                if (!groupState.bold) {
-                    pushText();
-                    lazyStartParagraph();
-                    if (groupState.italic) {
-                        // Make sure nesting is always <b><i>
-                        end("i");
-                    }
-                    groupState.bold = true;
-                    start("b");
-                    if (groupState.italic) {
-                        start("i");
-                    }
-                }
-            } else if (equals("i")) {
-                if (!groupState.italic) {
-                    pushText();
-                    lazyStartParagraph();
-                    groupState.italic = true;
-                    start("i");
-                }
-            }
-        }
-
-        final boolean ignored = groupState.ignore;
-
-        if (equals("pard")) {
-            // Reset styles
-            pushText();
-            if (groupState.italic) {
-                end("i");
-                groupState.italic = false;
-            }
-            if (groupState.bold) {
-                end("b");
-                groupState.bold = false;
-            }
-            if (inList()) { // && (groupStates.size() == 1 || groupStates.peekLast().list < 0))
-                pendingListEnd();
-            }
-        } else if (equals("par")) {
-            if (!ignored) {
-                endParagraph(true);
-            }
-        } else if (equals("shptxt")) {
-            pushText();
-            // Text inside a shape
-            groupState.ignore = false;
-        } else if (equals("atnid")) {
-            pushText();
-            // Annotation ID
-            groupState.ignore = false;
-        } else if (equals("atnauthor")) {
-            pushText();
-            // Annotation author
-            groupState.ignore = false;
-        } else if (equals("annotation")) {
-            pushText();
-            // Annotation
-            groupState.ignore = false;
-        } else if (equals("listtext")) {
-            groupState.ignore = true;
-        } else if (equals("cell")) {
-            // TODO: we should produce a table output here?
-            //addOutputChar(' ');
-            endParagraph(true);
-        } else if (equals("sp")) {
-            groupState.sp = true;
-        } else if (equals("sn")) {
-            embObjHandler.startSN();
-            groupState.sn = true;
-        } else if (equals("sv")) {
-            embObjHandler.startSV();
-            groupState.sv = true;
-        } else if (equals("object")) {
-            pushText();
-            embObjHandler.setInObject(true);
-            groupState.object = true;
-        } else if (equals("objdata")) {
-            groupState.objdata = true;
-            embObjHandler.startObjData();
-        } else if (equals("pict")) {
-            pushText();
-            // TODO: create img tag?  but can that support
-            // embedded image data?
-            groupState.pictDepth = 1;
-            embObjHandler.startPict();
-        } else if (equals("line")) {
-            if (!ignored) {
-                addOutputChar('\n');
-            }
-        } else if (equals("column")) {
-            if (!ignored) {
-                addOutputChar(' ');
-            }
-        } else if (equals("page")) {
-            if (!ignored) {
-                addOutputChar('\n');
-            }
-        } else if (equals("softline")) {
-            if (!ignored) {
-                addOutputChar('\n');
-            }
-        } else if (equals("softcolumn")) {
-            if (!ignored) {
-                addOutputChar(' ');
-            }
-        } else if (equals("softpage")) {
-            if (!ignored) {
-                addOutputChar('\n');
-            }
-        } else if (equals("tab")) {
-            if (!ignored) {
-                addOutputChar('\t');
-            }
-        } else if (equals("upr")) {
-            uprState = 0;
-        } else if (equals("ud") && uprState == 1) {
-            uprState = -1;
-            // 2nd group inside the upr destination, which
-            // contains the unicode encoding of the text, so
-            // we want to keep that:
-            groupState.ignore = false;
-        } else if (equals("bullet")) {
-            if (!ignored) {
-                // unicode BULLET
-                addOutputChar('\u2022');
-            }
-        } else if (equals("endash")) {
-            if (!ignored) {
-                // unicode EN DASH
-                addOutputChar('\u2013');
-            }
-        } else if (equals("emdash")) {
-            if (!ignored) {
-                // unicode EM DASH
-                addOutputChar('\u2014');
-            }
-        } else if (equals("enspace")) {
-            if (!ignored) {
-                // unicode EN SPACE
-                addOutputChar('\u2002');
-            }
-        } else if (equals("qmspace")) {
-            if (!ignored) {
-                // quarter em space -> unicode FOUR-PER-EM SPACE
-                addOutputChar('\u2005');
-            }
-        } else if (equals("emspace")) {
-            if (!ignored) {
-                // unicode EM SPACE
-                addOutputChar('\u2003');
-            }
-        } else if (equals("lquote")) {
-            if (!ignored) {
-                // unicode LEFT SINGLE QUOTATION MARK
-                addOutputChar('\u2018');
-            }
-        } else if (equals("rquote")) {
-            if (!ignored) {
-                // unicode RIGHT SINGLE QUOTATION MARK
-                addOutputChar('\u2019');
-            }
-        } else if (equals("ldblquote")) {
-            if (!ignored) {
-                // unicode LEFT DOUBLE QUOTATION MARK
-                addOutputChar('\u201C');
-            }
-        } else if (equals("rdblquote")) {
-            if (!ignored) {
-                // unicode RIGHT DOUBLE QUOTATION MARK
-                addOutputChar('\u201D');
-            }
-        } else if (equals("fldinst")) {
-            fieldState = 1;
-            groupState.ignore = false;
-        } else if (equals("fldrslt") && fieldState == 2) {
-            assert pendingURL != null;
-            lazyStartParagraph();
-            out.startElement("a", "href", pendingURL);
-            pendingURL = null;
-            fieldState = 3;
-            groupState.ignore = false;
-        }
-    }
-
-    // Push new GroupState
-    private void processGroupStart(PushbackInputStream in) throws IOException {
-        ansiSkip = 0;
-        // Push current groupState onto the stack
-        groupStates.add(groupState);
-
-        // Make new GroupState
-        groupState = new GroupState(groupState);
-        assert groupStates.size() == groupState.depth : "size=" + groupStates.size() + " depth=" + groupState.depth;
-
-        if (uprState == 0) {
-            uprState = 1;
-            groupState.ignore = true;
-        }
-
-        // Check for ignorable groups. Note that
-        // sometimes we un-ignore within this group, eg
-        // when handling upr escape.
-        int b2 = in.read();
-        if (b2 == '\\') {
-            int b3 = in.read();
-            if (b3 == '*') {
-                groupState.ignore = true;
-            }
-            in.unread(b3);
-        }
-        in.unread(b2);
-    }
-
-    // Pop current GroupState
-    private void processGroupEnd() throws IOException, SAXException, TikaException {
-        if (inHeader) {
-            if (nextMetaData != null) {
-                if (nextMetaData == TikaCoreProperties.CREATED) {
-                    Calendar cal = Calendar.getInstance(TimeZone.getDefault(), Locale.ROOT);
-                    cal.set(year, month - 1, day, hour, minute, 0);
-                    metadata.set(nextMetaData, cal.getTime());
-                } else if (nextMetaData.isMultiValuePermitted()) {
-                    metadata.add(nextMetaData, pendingBuffer.toString());
-                } else {
-                    metadata.set(nextMetaData, pendingBuffer.toString());
-                }
-                nextMetaData = null;
-            }
-            pendingBuffer.setLength(0);
-        }
-
-        assert groupState.depth > 0;
-        ansiSkip = 0;
-
-        if (groupState.objdata == true) {
-            embObjHandler.handleCompletedObject();
-            groupState.objdata = false;
-        } else if (groupState.pictDepth > 0) {
-            if (groupState.sn == true) {
-                embObjHandler.endSN();
-            } else if (groupState.sv == true) {
-                embObjHandler.endSV();
-            } else if (groupState.sp == true) {
-                embObjHandler.endSP();
-            } else if (groupState.pictDepth == 1) {
-                embObjHandler.handleCompletedObject();
-            }
-        }
-
-        if (groupState.object == true) {
-            embObjHandler.setInObject(false);
-        }
-
-        // Be robust if RTF doc is corrupt (has too many
-        // closing }s):
-        // TODO: log a warning?
-        if (groupStates.size() > 0) {
-            // Restore group state:
-            final GroupState outerGroupState = groupStates.removeLast();
-
-            // Close italic, if outer does not have italic or
-            // bold changed:
-            if (groupState.italic) {
-                if (!outerGroupState.italic ||
-                        groupState.bold != outerGroupState.bold) {
-                    end("i");
-                    groupState.italic = false;
-                }
-            }
-
-            // Close bold
-            if (groupState.bold && !outerGroupState.bold) {
-                end("b");
-            }
-
-            // Open bold
-            if (!groupState.bold && outerGroupState.bold) {
-                start("b");
-            }
-
-            // Open italic
-            if (!groupState.italic && outerGroupState.italic) {
-                start("i");
-            }
-            groupState = outerGroupState;
-        }
-        assert groupStates.size() == groupState.depth;
-
-        if (fieldState == 1) {
-            String s = pendingBuffer.toString().trim();
-            pendingBuffer.setLength(0);
-            if (s.startsWith("HYPERLINK")) {
-                s = s.substring(9).trim();
-                // TODO: what other instructions can be in a
-                // HYPERLINK destination?
-                final boolean isLocalLink = s.contains("\\l ");
-                int idx = s.indexOf('"');
-                if (idx != -1) {
-                    int idx2 = s.indexOf('"', 1 + idx);
-                    if (idx2 != -1) {
-                        s = s.substring(1 + idx, idx2);
-                    }
-                }
-                pendingURL = (isLocalLink ? "#" : "") + s;
-                fieldState = 2;
-            } else {
-                fieldState = 0;
-            }
-
-            // TODO: we could process the other known field
-            // types.  Right now, we will extract their text
-            // inlined, but fail to record them in metadata
-            // as a field value.
-        } else if (fieldState == 3) {
-            out.endElement("a");
-            fieldState = 0;
-        }
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.rtf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.CodingErrorAction;
+import java.util.Calendar;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Locale;
+import java.util.Map;
+import java.util.TimeZone;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.CharsetUtils;
+import org.xml.sax.SAXException;
+
+/* Tokenizes and performs a "shallow" parse of the RTF
+ * document, just enough to properly decode the text.
+ *
+ * TODO: we should cutover to a "real" tokenizer (eg JFlex);
+ * it should give better perf, by replacing the excessive
+ * "else if" string compares with FSA traversal. */
+
+final class TextExtractor {
+
+    private static final Charset ASCII = Charset.forName("US-ASCII");
+    private static final Charset WINDOWS_1252 = getCharset("WINDOWS-1252");
+    private static final Charset MAC_ROMAN = getCharset("MacRoman");
+    private static final Charset SHIFT_JIS = getCharset("Shift_JIS");
+    private static final Charset WINDOWS_57011 = getCharset("windows-57011");
+    private static final Charset WINDOWS_57010 = getCharset("windows-57010");
+    private static final Charset WINDOWS_57009 = getCharset("windows-57009");
+    private static final Charset WINDOWS_57008 = getCharset("windows-57008");
+    private static final Charset WINDOWS_57007 = getCharset("windows-57007");
+    private static final Charset WINDOWS_57006 = getCharset("windows-57006");
+    private static final Charset WINDOWS_57005 = getCharset("windows-57005");
+    private static final Charset WINDOWS_57004 = getCharset("windows-57004");
+    private static final Charset WINDOWS_57003 = getCharset("windows-57003");
+    private static final Charset X_ISCII91 = getCharset("x-ISCII91");
+    private static final Charset X_MAC_CENTRAL_EUROPE = getCharset("x-MacCentralEurope");
+    private static final Charset MAC_CYRILLIC = getCharset("MacCyrillic");
+    private static final Charset X_JOHAB = getCharset("x-Johab");
+    private static final Charset CP12582 = getCharset("CP1258");
+    private static final Charset CP12572 = getCharset("CP1257");
+    private static final Charset CP12562 = getCharset("CP1256");
+    private static final Charset CP12552 = getCharset("CP1255");
+    private static final Charset CP12542 = getCharset("CP1254");
+    private static final Charset CP12532 = getCharset("CP1253");
+    private static final Charset CP1252 = getCharset("CP1252");
+    private static final Charset CP12512 = getCharset("CP1251");
+    private static final Charset CP12502 = getCharset("CP1250");
+    private static final Charset CP950 = getCharset("CP950");
+    private static final Charset CP949 = getCharset("CP949");
+    private static final Charset MS9362 = getCharset("MS936");
+    private static final Charset MS8742 = getCharset("MS874");
+    private static final Charset CP866 = getCharset("CP866");
+    private static final Charset CP865 = getCharset("CP865");
+    private static final Charset CP864 = getCharset("CP864");
+    private static final Charset CP863 = getCharset("CP863");
+    private static final Charset CP862 = getCharset("CP862");
+    private static final Charset CP860 = getCharset("CP860");
+    private static final Charset CP852 = getCharset("CP852");
+    private static final Charset CP8502 = getCharset("CP850");
+    private static final Charset CP819 = getCharset("CP819");
+    private static final Charset WINDOWS_720 = getCharset("windows-720");
+    private static final Charset WINDOWS_711 = getCharset("windows-711");
+    private static final Charset WINDOWS_710 = getCharset("windows-710");
+    private static final Charset WINDOWS_709 = getCharset("windows-709");
+    private static final Charset ISO_8859_6 = getCharset("ISO-8859-6");
+    private static final Charset CP4372 = getCharset("CP437");
+    private static final Charset CP850 = getCharset("cp850");
+    private static final Charset CP437 = getCharset("cp437");
+    private static final Charset MS874 = getCharset("ms874");
+    private static final Charset CP1257 = getCharset("cp1257");
+    private static final Charset CP1256 = getCharset("cp1256");
+    private static final Charset CP1255 = getCharset("cp1255");
+    private static final Charset CP1258 = getCharset("cp1258");
+    private static final Charset CP1254 = getCharset("cp1254");
+    private static final Charset CP1253 = getCharset("cp1253");
+    private static final Charset MS950 = getCharset("ms950");
+    private static final Charset MS936 = getCharset("ms936");
+    private static final Charset MS1361 = getCharset("ms1361");
+    private static final Charset MS932 = getCharset("MS932");
+    private static final Charset CP1251 = getCharset("cp1251");
+    private static final Charset CP1250 = getCharset("cp1250");
+    private static final Charset MAC_THAI = getCharset("MacThai");
+    private static final Charset MAC_TURKISH = getCharset("MacTurkish");
+    private static final Charset MAC_GREEK = getCharset("MacGreek");
+    private static final Charset MAC_ARABIC = getCharset("MacArabic");
+    private static final Charset MAC_HEBREW = getCharset("MacHebrew");
+    private static final Charset JOHAB = getCharset("johab");
+    private static final Charset BIG5 = getCharset("Big5");
+    private static final Charset GB2312 = getCharset("GB2312");
+    private static final Charset MS949 = getCharset("ms949");
+    // The RTF doc has a "font table" that assigns ords
+    // (f0, f1, f2, etc.) to fonts and charsets, using the
+    // \fcharsetN control word.  This mapping maps from the
+    // N to corresponding Java charset:
+    private static final Map<Integer, Charset> FCHARSET_MAP =
+            new HashMap<Integer, Charset>();
+    // The RTF may specify the \ansicpgN charset in the
+    // header; this maps the N to the corresponding Java
+    // character set:
+    private static final Map<Integer, Charset> ANSICPG_MAP =
+            new HashMap<Integer, Charset>();
+
+    // Fills FCHARSET_MAP: key is the N of \fcharsetN, value is the
+    // Java charset used for fonts declaring that N.  Values with no
+    // entry here (see the notes and commented-out lines below) are
+    // simply absent from the map.
+    static {
+        FCHARSET_MAP.put(0, WINDOWS_1252); // ANSI
+        // charset 1 is Default
+        // charset 2 is Symbol
+
+        // NOTE(review): 88 and 89 are labelled as Mac charsets but map to
+        // Windows code pages, while ANSICPG_MAP uses X_MAC_CENTRAL_EUROPE
+        // and MAC_CYRILLIC for the Mac code pages -- confirm this is intended.
+        FCHARSET_MAP.put(77, MAC_ROMAN); // Mac Roman
+        FCHARSET_MAP.put(78, SHIFT_JIS); // Mac Shift Jis
+        FCHARSET_MAP.put(79, MS949); // Mac Hangul
+        FCHARSET_MAP.put(80, GB2312); // Mac GB2312
+        FCHARSET_MAP.put(81, BIG5); // Mac Big5
+        FCHARSET_MAP.put(82, JOHAB); // Mac Johab (old)
+        FCHARSET_MAP.put(83, MAC_HEBREW); // Mac Hebrew
+        FCHARSET_MAP.put(84, MAC_ARABIC); // Mac Arabic
+        FCHARSET_MAP.put(85, MAC_GREEK); // Mac Greek
+        FCHARSET_MAP.put(86, MAC_TURKISH); // Mac Turkish
+        FCHARSET_MAP.put(87, MAC_THAI); // Mac Thai
+        FCHARSET_MAP.put(88, CP1250); // Mac East Europe
+        FCHARSET_MAP.put(89, CP1251); // Mac Russian
+
+        FCHARSET_MAP.put(128, MS932); // Shift JIS
+        FCHARSET_MAP.put(129, MS949); // Hangul
+        FCHARSET_MAP.put(130, MS1361); // Johab
+        FCHARSET_MAP.put(134, MS936); // GB2312
+        FCHARSET_MAP.put(136, MS950); // Big5
+        FCHARSET_MAP.put(161, CP1253); // Greek
+        FCHARSET_MAP.put(162, CP1254); // Turkish
+        FCHARSET_MAP.put(163, CP1258); // Vietnamese
+        FCHARSET_MAP.put(177, CP1255); // Hebrew
+        FCHARSET_MAP.put(178, CP1256); // Arabic
+        // No charset is registered for the following three values:
+        // FCHARSET_MAP.put( 179, "" ); // Arabic Traditional
+        // FCHARSET_MAP.put( 180, "" ); // Arabic user
+        // FCHARSET_MAP.put( 181, "" ); // Hebrew user
+        FCHARSET_MAP.put(186, CP1257); // Baltic
+
+        FCHARSET_MAP.put(204, CP1251); // Russian
+        FCHARSET_MAP.put(222, MS874); // Thai
+        FCHARSET_MAP.put(238, CP1250); // Eastern European
+        FCHARSET_MAP.put(254, CP437); // PC 437
+        FCHARSET_MAP.put(255, CP850); // OEM
+    }
+
+    // Fills ANSICPG_MAP: key is the N of the \ansicpgN header control
+    // word (a code page number), value is the Java charset used for it.
+    // Every code page must be stored under its own number; a repeated
+    // key silently replaces the earlier entry.
+    static {
+        ANSICPG_MAP.put(437, CP4372);   // US IBM
+        ANSICPG_MAP.put(708, ISO_8859_6);   // Arabic (ASMO 708)
+
+        ANSICPG_MAP.put(709, WINDOWS_709);  // Arabic (ASMO 449+, BCON V4)
+        ANSICPG_MAP.put(710, WINDOWS_710);  // Arabic (transparent Arabic)
+        ANSICPG_MAP.put(711, WINDOWS_711);  // Arabic (Nafitha Enhanced)
+        ANSICPG_MAP.put(720, WINDOWS_720);  // Arabic (transparent ASMO)
+        ANSICPG_MAP.put(819, CP819);  // Windows 3.1 (US & Western Europe)
+
+        ANSICPG_MAP.put(850, CP8502);  // IBM Multilingual
+        ANSICPG_MAP.put(852, CP852);  // Eastern European
+        ANSICPG_MAP.put(860, CP860);  // Portuguese
+        ANSICPG_MAP.put(862, CP862);  // Hebrew
+        ANSICPG_MAP.put(863, CP863);  // French Canadian
+        ANSICPG_MAP.put(864, CP864);  // Arabic
+        ANSICPG_MAP.put(865, CP865);  // Norwegian
+        ANSICPG_MAP.put(866, CP866);  // Soviet Union
+        ANSICPG_MAP.put(874, MS8742);  // Thai
+        ANSICPG_MAP.put(932, MS932);  // Japanese
+        ANSICPG_MAP.put(936, MS9362);  // Simplified Chinese
+        ANSICPG_MAP.put(949, CP949);  // Korean
+        ANSICPG_MAP.put(950, CP950);  // Traditional Chinese
+        ANSICPG_MAP.put(1250, CP12502);  // Eastern European
+        ANSICPG_MAP.put(1251, CP12512);  // Cyrillic
+        ANSICPG_MAP.put(1252, CP1252);  // Western European
+        ANSICPG_MAP.put(1253, CP12532);  // Greek
+        ANSICPG_MAP.put(1254, CP12542);  // Turkish
+        ANSICPG_MAP.put(1255, CP12552);  // Hebrew
+        ANSICPG_MAP.put(1256, CP12562);  // Arabic
+        ANSICPG_MAP.put(1257, CP12572);  // Baltic
+        ANSICPG_MAP.put(1258, CP12582);  // Vietnamese
+        ANSICPG_MAP.put(1361, X_JOHAB);  // Johab
+        ANSICPG_MAP.put(10000, MAC_ROMAN);  // Mac Roman
+        ANSICPG_MAP.put(10001, SHIFT_JIS);  // Mac Japan
+        ANSICPG_MAP.put(10004, MAC_ARABIC);  // Mac Arabic
+        ANSICPG_MAP.put(10005, MAC_HEBREW);  // Mac Hebrew
+        ANSICPG_MAP.put(10006, MAC_GREEK);  // Mac Greek
+        ANSICPG_MAP.put(10007, MAC_CYRILLIC);  // Mac Cyrillic
+        ANSICPG_MAP.put(10029, X_MAC_CENTRAL_EUROPE);  // MAC Latin2
+        ANSICPG_MAP.put(10081, MAC_TURKISH);  // Mac Turkish
+        ANSICPG_MAP.put(57002, X_ISCII91);   // Devanagari
+
+        // TODO: in theory these other charsets are simple
+        // shifts off of Devanagari, so we could impl that
+        // here:
+        ANSICPG_MAP.put(57003, WINDOWS_57003);   // Bengali
+        ANSICPG_MAP.put(57004, WINDOWS_57004);   // Tamil
+        ANSICPG_MAP.put(57005, WINDOWS_57005);   // Telugu
+        ANSICPG_MAP.put(57006, WINDOWS_57006);   // Assamese
+        ANSICPG_MAP.put(57007, WINDOWS_57007);   // Oriya
+        ANSICPG_MAP.put(57008, WINDOWS_57008);   // Kannada
+        ANSICPG_MAP.put(57009, WINDOWS_57009);   // Malayalam
+        ANSICPG_MAP.put(57010, WINDOWS_57010);   // Gujarati
+        ANSICPG_MAP.put(57011, WINDOWS_57011);   // Punjabi
+    }
+
+    // Used when we decode bytes -> chars using CharsetDecoder;
+    // outputBuffer is a CharBuffer wrapped around outputArray, so
+    // decoded chars can be read straight out of the array:
+    private final char[] outputArray = new char[128];
+    private final CharBuffer outputBuffer = CharBuffer.wrap(outputArray);
+    // Holds the font table from this RTF doc, mapping
+    // the font number (from \fN control word) to the
+    // corresponding charset:
+    private final Map<Integer, Charset> fontToCharset =
+            new HashMap<Integer, Charset>();
+    // Group stack: when we open a new group, we push
+    // the previous group state onto the stack; when we
+    // close the group, we restore it
+    private final LinkedList<GroupState> groupStates = new LinkedList<GroupState>();
+    // Receives text instead of the content handler while we are
+    // in the header or while fieldState == 1:
+    private final StringBuilder pendingBuffer = new StringBuilder();
+    // Destination for the extracted XHTML events:
+    private final XHTMLContentHandler out;
+    private final Metadata metadata;
+    // Receives hex data of objdata/pict groups and metadata chars
+    // of sn/sv groups:
+    private final RTFEmbObjHandler embObjHandler;
+    // How many next ansi chars we should skip; this
+    // is 0 except when we are still in the "ansi
+    // shadow" after seeing a unicode escape, at which
+    // point it's set to the last ucN skip we had seen:
+    int ansiSkip = 0;
+    private int written = 0;
+    // Hold pending bytes (encoded in the current charset)
+    // for text output; pendingByteBuffer wraps pendingBytes
+    // and is re-created whenever the array is reallocated:
+    private byte[] pendingBytes = new byte[16];
+    private int pendingByteCount;
+    private ByteBuffer pendingByteBuffer = ByteBuffer.wrap(pendingBytes);
+    // Holds pending chars for text output
+    private char[] pendingChars = new char[10];
+    private int pendingCharCount;
+    // Holds chars for a still-being-tokenized control word
+    private byte[] pendingControl = new byte[10];
+    private int pendingControlCount;
+    // Reused when possible: the decoder is kept as long as the
+    // current charset equals lastCharset
+    private CharsetDecoder decoder;
+    private Charset lastCharset;
+    private Charset globalCharset = WINDOWS_1252;
+    // -1 means "not set":
+    private int globalDefaultFont = -1;
+    private int curFontID = -1;
+    // Current group state; in theory this initial
+    // GroupState is unused because the RTF doc should
+    // immediately open the top group (start with {):
+    private GroupState groupState = new GroupState();
+    private boolean inHeader = true;
+    private int fontTableState;
+    private int fontTableDepth;
+    // Non null if we are processing metadata (title,
+    // keywords, etc.) inside the info group:
+    private Property nextMetaData;
+    // True while a <p> or <li> element is open on the output:
+    private boolean inParagraph;
+    // Non-zero if we are processing inside a field destination:
+    private int fieldState;
+    // Non-zero: id of a list whose closing is still pending;
+    // 0 when there is no such list
+    private int pendingListEnd;
+    private Map<Integer, ListDescriptor> listTable = new HashMap<Integer, ListDescriptor>();
+    private Map<Integer, ListDescriptor> listOverrideTable = new HashMap<Integer, ListDescriptor>();
+    private Map<Integer, ListDescriptor> currentListTable;
+    private ListDescriptor currentList;
+    private int listTableLevel = -1;
+    // Exposed through isIgnoringLists() / setIgnoreLists(boolean):
+    private boolean ignoreLists;
+    // Non-null if we've seen the url for a HYPERLINK but not yet
+    // its text:
+    private String pendingURL;
+    // Used to process the sub-groups inside the upr
+    // group:
+    private int uprState = -1;
+    // Used when extracting CREATION date:
+    private int year, month, day, hour, minute;
+
+    /**
+     * Creates an extractor bound to the given output sinks.
+     *
+     * @param out           content handler that receives the extracted text
+     * @param metadata      metadata object associated with the document
+     * @param embObjHandler handler that receives embedded object / picture data
+     */
+    public TextExtractor(XHTMLContentHandler out, Metadata metadata,
+                         RTFEmbObjHandler embObjHandler) {
+        this.out = out;
+        this.embObjHandler = embObjHandler;
+        this.metadata = metadata;
+    }
+
+    /**
+     * Looks up a charset by name via {@code CharsetUtils.forName}.
+     *
+     * @param name charset name to resolve
+     * @return the resolved charset, or {@code ASCII} when the lookup throws
+     */
+    private static Charset getCharset(String name) {
+        Charset resolved;
+        try {
+            resolved = CharsetUtils.forName(name);
+        } catch (Exception ignored) {
+            // Best effort: any lookup failure falls back to ASCII
+            resolved = ASCII;
+        }
+        return resolved;
+    }
+
+    /**
+     * Tells whether the value is an ASCII hexadecimal digit.
+     *
+     * @param ch value to test (may be -1 for end of stream)
+     * @return true for '0'-'9', 'a'-'f' and 'A'-'F', false otherwise
+     */
+    protected static boolean isHexChar(int ch) {
+        if (ch >= '0' && ch <= '9') {
+            return true;
+        }
+        if (ch >= 'a' && ch <= 'f') {
+            return true;
+        }
+        return ch >= 'A' && ch <= 'F';
+    }
+
+    private static boolean isAlpha(int ch) {
+        return (ch >= 'a' && ch <= 'z') ||
+                (ch >= 'A' && ch <= 'Z');
+    }
+
+    // True only for the ASCII digits 0-9
+    private static boolean isDigit(int ch) {
+        return !(ch < '0' || ch > '9');
+    }
+
+    /**
+     * Converts one ASCII hexadecimal digit to its numeric value.
+     * The accepted letters are the same ones isHexChar accepts
+     * ('a'-'f' and 'A'-'F'); callers are expected to have validated
+     * the input with isHexChar first.
+     *
+     * @param ch a hex digit: '0'-'9', 'a'-'f' or 'A'-'F'
+     * @return the digit's value, 0 through 15
+     */
+    protected static int hexValue(int ch) {
+        if (ch >= '0' && ch <= '9') {
+            return ch - '0';
+        } else if (ch >= 'a' && ch <= 'f') {
+            return 10 + (ch - 'a');
+        } else {
+            // Anything left must be an upper-case hex letter
+            assert ch >= 'A' && ch <= 'F' : "not a hex digit: " + ch;
+            return 10 + (ch - 'A');
+        }
+    }
+
+    /**
+     * @return the current value of the ignore-lists flag
+     */
+    public boolean isIgnoringLists() {
+        return this.ignoreLists;
+    }
+
+    /**
+     * Sets the ignore-lists flag.
+     *
+     * @param ignore new value of the flag
+     */
+    public void setIgnoreLists(boolean ignore) {
+        ignoreLists = ignore;
+    }
+
+    // Flushes whichever kind of pending text is buffered: pending
+    // bytes if there are any, otherwise pending chars.
+    private void pushText() throws IOException, SAXException, TikaException {
+        if (pendingByteCount == 0) {
+            pushChars();
+            return;
+        }
+        // Bytes and chars are never pending at the same time
+        assert pendingCharCount == 0;
+        pushBytes();
+    }
+
+    // Queues one byte (a unit in the current charset) for output.
+    // Pending chars are flushed first.  Inside a pict group the byte
+    // goes to the embedded-object handler as a metadata char instead
+    // of into the pending byte buffer.
+    private void addOutputByte(int b) throws IOException, SAXException, TikaException {
+        assert b >= 0 && b < 256 : "byte value out of range: " + b;
+
+        if (pendingCharCount != 0) {
+            pushChars();
+        }
+
+        if (groupState.pictDepth > 0) {
+            embObjHandler.writeMetadataChar((char) b);
+            return;
+        }
+
+        if (pendingByteCount == pendingBytes.length) {
+            // Buffer is full: grow it by 25% and re-wrap the new array
+            final int grownLength = (int) (pendingBytes.length * 1.25);
+            final byte[] grown = new byte[grownLength];
+            System.arraycopy(pendingBytes, 0, grown, 0, pendingBytes.length);
+            pendingBytes = grown;
+            pendingByteBuffer = ByteBuffer.wrap(pendingBytes);
+        }
+        pendingBytes[pendingByteCount++] = (byte) b;
+    }
+
+    // Appends one letter to the control word currently being tokenized,
+    // growing the backing array by 25% when it is full.
+    private void addControl(int b) {
+        assert isAlpha(b);
+        final int capacity = pendingControl.length;
+        if (pendingControlCount == capacity) {
+            final byte[] grown = new byte[(int) (capacity * 1.25)];
+            System.arraycopy(pendingControl, 0, grown, 0, capacity);
+            pendingControl = grown;
+        }
+        pendingControl[pendingControlCount++] = (byte) b;
+    }
+
+    // Queues one UTF16 code unit for output.  Pending bytes are flushed
+    // first.  The char goes to pendingBuffer while in the header or when
+    // fieldState == 1, to the embedded-object handler inside an sn/sv
+    // group, and into the pending char buffer otherwise.
+    private void addOutputChar(char ch) throws IOException, SAXException, TikaException {
+        if (pendingByteCount != 0) {
+            pushBytes();
+        }
+
+        if (inHeader || fieldState == 1) {
+            pendingBuffer.append(ch);
+            return;
+        }
+
+        if (groupState.sn || groupState.sv) {
+            embObjHandler.writeMetadataChar(ch);
+            return;
+        }
+
+        final int capacity = pendingChars.length;
+        if (pendingCharCount == capacity) {
+            // Buffer is full: grow it by 25%
+            final char[] grown = new char[(int) (capacity * 1.25)];
+            System.arraycopy(pendingChars, 0, grown, 0, capacity);
+            pendingChars = grown;
+        }
+        pendingChars[pendingCharCount++] = ch;
+    }
+
+    /**
+     * Shallow parses the entire doc, writing output to
+     * this.out and this.metadata.  The stream is wrapped in a
+     * PushbackInputStream (pushback size 2) before parsing.
+     *
+     * @param in stream positioned at the start of the RTF document
+     */
+    public void extract(InputStream in) throws IOException, SAXException, TikaException {
+        // Debugging aid, disabled: echoes every byte read to System.out.
+//        in = new FilterInputStream(in) {
+//            public int read() throws IOException {
+//                int r = super.read();
+//                System.out.write(r);
+//                System.out.flush();
+//                return r;
+//            }
+//            public int read(byte b[], int off, int len) throws IOException {
+//                int r = super.read(b, off, len);
+//                System.out.write(b, off, r);
+//                System.out.flush();
+//                return r;
+//            }
+//        };
+        final PushbackInputStream pushback = new PushbackInputStream(in, 2);
+        extract(pushback);
+    }
+
+    // Main parse loop: reads the stream one byte at a time and dispatches
+    // on it, between out.startDocument() and out.endDocument().  The loop
+    // ends at end of stream, or as soon as a '}' leaves the group stack
+    // empty.
+    private void extract(PushbackInputStream in) throws IOException, SAXException, TikaException {
+        out.startDocument();
+
+        while (true) {
+            final int b = in.read();
+            if (b == -1) {
+                // End of stream
+                break;
+            } else if (b == '\\') {
+                // Control word, control symbol or escaped char
+                parseControlToken(in);
+            } else if (b == '{') {
+                // Flush pending text before the group state changes
+                pushText();
+                processGroupStart(in);
+            } else if (b == '}') {
+                pushText();
+                processGroupEnd();
+                if (groupStates.isEmpty()) {
+                    // parsed document closing brace
+                    break;
+                }
+            } else if (groupState.objdata == true ||
+                    groupState.pictDepth == 1) {
+                // Raw data of an embedded object or picture goes to the
+                // embedded-object handler rather than to the text output
+                embObjHandler.writeHexChar(b);
+            } else if (b != '\r' && b != '\n'
+                    && (!groupState.ignore || nextMetaData != null ||
+                    groupState.sn == true || groupState.sv == true)) {
+                // Linefeed and carriage return are not
+                // significant
+                if (ansiSkip != 0) {
+                    // Still in the shadow of a unicode escape: drop this char
+                    ansiSkip--;
+                } else {
+                    addOutputByte(b);
+                }
+            }
+        }
+
+        // Close whatever paragraph is still open without preserving styles
+        endParagraph(false);
+        out.endDocument();
+    }
+
+    private void parseControlToken(PushbackInputStream in) throws IOException, SAXException, TikaException {
+        int b = in.read();
+        if (b == '\'') {
+            // escaped hex char
+            parseHexChar(in);
+        } else if (isAlpha(b)) {
+            // control word
+            parseControlWord((char) b, in);
+        } else if (b == '{' || b == '}' || b == '\\' || b == '\r' || b == '\n') {
+            // escaped char
+            addOutputByte(b);
+        } else if (b != -1) {
+            // control symbol, eg \* or \~
+            processControlSymbol((char) b);
+        }
+    }
+
+    // Parses the two hex digits of a \'hh escape and buffers the byte
+    // they encode, unless that byte falls in the ansi shadow of a
+    // unicode escape.  A malformed escape is ignored: the offending
+    // char is pushed back so it is parsed as ordinary input.
+    private void parseHexChar(PushbackInputStream in) throws IOException, SAXException, TikaException {
+        int hex1 = in.read();
+        if (!isHexChar(hex1)) {
+            // DOC ERROR (malformed hex escape): ignore
+            // Never push back EOF: unread() keeps only the low 8 bits,
+            // so -1 would come back from read() as byte 0xFF.
+            if (hex1 != -1) {
+                in.unread(hex1);
+            }
+            return;
+        }
+
+        int hex2 = in.read();
+        if (!isHexChar(hex2)) {
+            // TODO: log a warning here, somehow?
+            // DOC ERROR (malformed hex escape):
+            // ignore
+            if (hex2 != -1) {
+                in.unread(hex2);
+            }
+            return;
+        }
+
+        if (ansiSkip != 0) {
+            // Skip this ansi char since we are
+            // still in the shadow of a unicode
+            // escape:
+            ansiSkip--;
+        } else {
+            // Unescape:
+            addOutputByte(16 * hexValue(hex1) + hexValue(hex2));
+        }
+    }
+
+    // Tokenizes one control word: the letters (starting with firstChar,
+    // which the caller has already read), an optional '-' sign and an
+    // optional decimal parameter.  The word is then dispatched to
+    // processControlWord, with or without the parameter, and the
+    // control-word buffer is cleared.
+    private void parseControlWord(int firstChar, PushbackInputStream in) throws IOException, SAXException, TikaException {
+        addControl(firstChar);
+
+        int b = in.read();
+        while (isAlpha(b)) {
+            addControl(b);
+            b = in.read();
+        }
+
+        boolean hasParam = false;
+        boolean negParam = false;
+        if (b == '-') {
+            negParam = true;
+            hasParam = true;
+            b = in.read();
+        }
+
+        int param = 0;
+        while (isDigit(b)) {
+            param *= 10;
+            param += (b - '0');
+            hasParam = true;
+            b = in.read();
+        }
+
+        // space is consumed as part of the
+        // control word, but is not added to the
+        // control word.  Any other byte belongs to the next
+        // token and is pushed back.  EOF must not be pushed
+        // back: unread() keeps only the low 8 bits, so -1
+        // would come back from read() as byte 0xFF.
+        if (b != ' ' && b != -1) {
+            in.unread(b);
+        }
+
+        if (hasParam) {
+            if (negParam) {
+                param = -param;
+            }
+            processControlWord(param, in);
+        } else {
+            processControlWord();
+        }
+
+        pendingControlCount = 0;
+    }
+
+    // Opens a paragraph on the output if none is open yet: closes a
+    // pending list or starts a new one as needed, emits <li> (inside a
+    // list) or <p>, re-opens bold/italic, and sets inParagraph.  Does
+    // nothing when a paragraph is already open.
+    private void lazyStartParagraph() throws IOException, SAXException, TikaException {
+        if (!inParagraph) {
+            // Ensure </i></b> order
+            // NOTE(review): these end(...) calls run while no paragraph is
+            // open; presumably they balance style tags tracked elsewhere --
+            // confirm against start()/end().
+            if (groupState.italic) {
+                end("i");
+            }
+            if (groupState.bold) {
+                end("b");
+            }
+            // A list whose end was deferred is closed now if the current
+            // group belongs to a different list (or to none)
+            if (pendingListEnd != 0 && groupState.list != pendingListEnd) {
+                endList(pendingListEnd);
+                pendingListEnd = 0;
+            }
+            // Start a list unless we are continuing the pending one
+            if (inList() && pendingListEnd != groupState.list) {
+                startList(groupState.list);
+            }
+            if (inList()) {
+                out.startElement("li");
+            } else {
+                out.startElement("p");
+            }
+
+            // Ensure <b><i> order
+            if (groupState.bold) {
+                start("b");
+            }
+            if (groupState.italic) {
+                start("i");
+            }
+            inParagraph = true;
+        }
+    }
+
+    // Flushes pending text and closes the current paragraph (<li> inside
+    // a list, <p> otherwise), opening an empty one first if none is open.
+    // With preserveStyles == true the bold/italic flags are kept and, if
+    // either is set, a new <p> with those styles is opened right away.
+    // With preserveStyles == false the flags that were set are cleared
+    // and a pending list end is emitted.
+    private void endParagraph(boolean preserveStyles) throws IOException, SAXException, TikaException {
+        pushText();
+        //maintain consecutive new lines
+        if (!inParagraph) {
+            lazyStartParagraph();
+        }
+        if (inParagraph) {
+            // Close style elements in </i></b> order; each flag that was
+            // set is overwritten with preserveStyles
+            if (groupState.italic) {
+                end("i");
+                groupState.italic = preserveStyles;
+            }
+            if (groupState.bold) {
+                end("b");
+                groupState.bold = preserveStyles;
+            }
+            if (inList()) {
+                out.endElement("li");
+            } else {
+                out.endElement("p");
+            }
+
+            if (preserveStyles && (groupState.bold || groupState.italic)) {
+                // Carry the styles over into a freshly opened paragraph
+                start("p");
+                if (groupState.bold) {
+                    start("b");
+                }
+                if (groupState.italic) {
+                    start("i");
+                }
+                inParagraph = true;
+            } else {
+                inParagraph = false;
+            }
+        }
+
+        // Ensure closing the list at document end
+        if (!preserveStyles && pendingListEnd != 0) {
+            endList(pendingListEnd);
+            pendingListEnd = 0;
+        }
+    }
+
+    // Sends the buffered UTF16 units to the out ContentHandler, opening
+    // a paragraph first if needed, then empties the buffer.  No-op when
+    // nothing is buffered.
+    private void pushChars() throws IOException, SAXException, TikaException {
+        if (pendingCharCount == 0) {
+            return;
+        }
+        lazyStartParagraph();
+        out.characters(pendingChars, 0, pendingCharCount);
+        pendingCharCount = 0;
+    }
+
+    // Decodes the bytes buffered in pendingBytes into UTF16 code units
+    // using the current decoder and hands them on (see
+    // emitDecodedChars).  Bytes are dropped without decoding when the
+    // group is ignored and no metadata property is pending.  The byte
+    // buffer is always empty afterwards.
+    private void pushBytes() throws IOException, SAXException, TikaException {
+        if (pendingByteCount > 0 && (!groupState.ignore || nextMetaData != null)) {
+
+            final CharsetDecoder activeDecoder = getDecoder();
+            pendingByteBuffer.limit(pendingByteCount);
+            assert pendingByteBuffer.position() == 0;
+            assert outputBuffer.position() == 0;
+
+            // endOfInput is true because, when we are called, we
+            // should have seen a complete sequence of characters
+            // for this charset.  Keep decoding until the decoder
+            // reports it has consumed all input:
+            CoderResult status;
+            do {
+                status = activeDecoder.decode(pendingByteBuffer, outputBuffer, true);
+                emitDecodedChars();
+            } while (status != CoderResult.UNDERFLOW);
+
+            // Then drain whatever the decoder still holds internally:
+            do {
+                status = activeDecoder.flush(outputBuffer);
+                emitDecodedChars();
+            } while (status != CoderResult.UNDERFLOW);
+
+            // Reset for next decode
+            activeDecoder.reset();
+            pendingByteBuffer.position(0);
+        }
+
+        pendingByteCount = 0;
+    }
+
+    // Hands the chars currently in outputBuffer to pendingBuffer (while
+    // in the header or when fieldState == 1) or to the out
+    // ContentHandler, then rewinds outputBuffer.  No-op when it is empty.
+    private void emitDecodedChars() throws IOException, SAXException, TikaException {
+        final int decodedCount = outputBuffer.position();
+        if (decodedCount <= 0) {
+            return;
+        }
+        if (inHeader || fieldState == 1) {
+            pendingBuffer.append(outputArray, 0, decodedCount);
+        } else {
+            lazyStartParagraph();
+            out.characters(outputArray, 0, decodedCount);
+        }
+        outputBuffer.position(0);
+    }
+
+    // NOTE: s must be ascii alpha only
+    private boolean equals(String s) {
+        if (pendingControlCount != s.length()) {
+            return false;
+        }
+        for (int idx = 0; idx < pendingControlCount; idx++) {
+            assert isAlpha(s.charAt(idx));
+            if (((byte) s.charAt(idx)) != pendingControl[idx]) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    private void processControlSymbol(char ch) throws IOException, SAXException, TikaException {
+        switch (ch) {
+            case '~':
+                // Non-breaking space -> unicode NON-BREAKING SPACE
+                addOutputChar('\u00a0');
+                break;
+            case '*':
+                // Ignorable destination (control words defined after
+                // the 1987 RTF spec). These are already handled by
+                // processGroupStart()
+                break;
+            case '-':
+                // Optional hyphen -> unicode SOFT HYPHEN
+                addOutputChar('\u00ad');
+                break;
+            case '_':
+                // Non-breaking hyphen -> unicode NON-BREAKING HYPHEN
+                addOutputChar('\u2011');
+                break;
+            default:
+                break;
+        }
+    }
+
+    private CharsetDecoder getDecoder() throws TikaException {
+        Charset charset = getCharset();
+
+        // Common case: charset is same as last time, so
+        // just reuse it:
+        if (lastCharset == null || !charset.equals(lastCharset)) {
+            decoder = charset.newDecoder();
+            decoder.onMalformedInput(CodingErrorAction.REPLACE);
+            decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
+            lastCharset = charset;
+        }
+
+        return decoder;
+    }
+
+    // Return current charset in-use
+    private Charset getCharset() throws TikaException {
+        // If a specific font (fN) was set, use its charset
+        if (groupState.fontCharset != null) {
+            return groupState.fontCharset;
+        }
+
+        // Else, if global default font (defN) was set, use that one
+        if (globalDefaultFont != -1 && !inHeader) {
+            Charset cs = fontToCharset.get(globalDefaultFont);
+            if (cs != null) {
+                return cs;
+            }
+        }
+
+        /

<TRUNCATED>