You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 11:11:26 UTC
[20/39] tika git commit: Convert new lines from windows to unix
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
index cf92406..6c86765 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
@@ -1,1423 +1,1423 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.rtf;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.PushbackInputStream;
-import java.nio.ByteBuffer;
-import java.nio.CharBuffer;
-import java.nio.charset.Charset;
-import java.nio.charset.CharsetDecoder;
-import java.nio.charset.CoderResult;
-import java.nio.charset.CodingErrorAction;
-import java.util.Calendar;
-import java.util.HashMap;
-import java.util.LinkedList;
-import java.util.Locale;
-import java.util.Map;
-import java.util.TimeZone;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLCore;
-import org.apache.tika.metadata.OfficeOpenXMLExtended;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.apache.tika.utils.CharsetUtils;
-import org.xml.sax.SAXException;
-
-/* Tokenizes and performs a "shallow" parse of the RTF
- * document, just enough to properly decode the text.
- *
- * TODO: we should cutover to a "real" tokenizer (eg JFlex);
- * it should give better perf, by replacing the excessive
- * "else if" string compares with FSA traversal. */
-
-final class TextExtractor {
-
- private static final Charset ASCII = Charset.forName("US-ASCII"); // fallback returned by getCharset(String) when a lookup fails
- private static final Charset WINDOWS_1252 = getCharset("WINDOWS-1252");
- private static final Charset MAC_ROMAN = getCharset("MacRoman");
- private static final Charset SHIFT_JIS = getCharset("Shift_JIS");
- private static final Charset WINDOWS_57011 = getCharset("windows-57011");
- private static final Charset WINDOWS_57010 = getCharset("windows-57010");
- private static final Charset WINDOWS_57009 = getCharset("windows-57009");
- private static final Charset WINDOWS_57008 = getCharset("windows-57008");
- private static final Charset WINDOWS_57007 = getCharset("windows-57007");
- private static final Charset WINDOWS_57006 = getCharset("windows-57006");
- private static final Charset WINDOWS_57005 = getCharset("windows-57005");
- private static final Charset WINDOWS_57004 = getCharset("windows-57004");
- private static final Charset WINDOWS_57003 = getCharset("windows-57003");
- private static final Charset X_ISCII91 = getCharset("x-ISCII91");
- private static final Charset X_MAC_CENTRAL_EUROPE = getCharset("x-MacCentralEurope");
- private static final Charset MAC_CYRILLIC = getCharset("MacCyrillic");
- private static final Charset X_JOHAB = getCharset("x-Johab");
- private static final Charset CP12582 = getCharset("CP1258"); // constants looked up by upper-case name (CP12582 ... CP4372) feed ANSICPG_MAP
- private static final Charset CP12572 = getCharset("CP1257");
- private static final Charset CP12562 = getCharset("CP1256");
- private static final Charset CP12552 = getCharset("CP1255");
- private static final Charset CP12542 = getCharset("CP1254");
- private static final Charset CP12532 = getCharset("CP1253");
- private static final Charset CP1252 = getCharset("CP1252");
- private static final Charset CP12512 = getCharset("CP1251");
- private static final Charset CP12502 = getCharset("CP1250");
- private static final Charset CP950 = getCharset("CP950");
- private static final Charset CP949 = getCharset("CP949");
- private static final Charset MS9362 = getCharset("MS936");
- private static final Charset MS8742 = getCharset("MS874");
- private static final Charset CP866 = getCharset("CP866");
- private static final Charset CP865 = getCharset("CP865");
- private static final Charset CP864 = getCharset("CP864");
- private static final Charset CP863 = getCharset("CP863");
- private static final Charset CP862 = getCharset("CP862");
- private static final Charset CP860 = getCharset("CP860");
- private static final Charset CP852 = getCharset("CP852");
- private static final Charset CP8502 = getCharset("CP850");
- private static final Charset CP819 = getCharset("CP819");
- private static final Charset WINDOWS_720 = getCharset("windows-720");
- private static final Charset WINDOWS_711 = getCharset("windows-711");
- private static final Charset WINDOWS_710 = getCharset("windows-710");
- private static final Charset WINDOWS_709 = getCharset("windows-709");
- private static final Charset ISO_8859_6 = getCharset("ISO-8859-6");
- private static final Charset CP4372 = getCharset("CP437");
- private static final Charset CP850 = getCharset("cp850"); // constants looked up by lower-case name feed FCHARSET_MAP; CP850 and CP437 also serve the pca / pc header words
- private static final Charset CP437 = getCharset("cp437");
- private static final Charset MS874 = getCharset("ms874");
- private static final Charset CP1257 = getCharset("cp1257");
- private static final Charset CP1256 = getCharset("cp1256");
- private static final Charset CP1255 = getCharset("cp1255");
- private static final Charset CP1258 = getCharset("cp1258");
- private static final Charset CP1254 = getCharset("cp1254");
- private static final Charset CP1253 = getCharset("cp1253");
- private static final Charset MS950 = getCharset("ms950");
- private static final Charset MS936 = getCharset("ms936");
- private static final Charset MS1361 = getCharset("ms1361");
- private static final Charset MS932 = getCharset("MS932");
- private static final Charset CP1251 = getCharset("cp1251");
- private static final Charset CP1250 = getCharset("cp1250");
- private static final Charset MAC_THAI = getCharset("MacThai");
- private static final Charset MAC_TURKISH = getCharset("MacTurkish");
- private static final Charset MAC_GREEK = getCharset("MacGreek");
- private static final Charset MAC_ARABIC = getCharset("MacArabic");
- private static final Charset MAC_HEBREW = getCharset("MacHebrew");
- private static final Charset JOHAB = getCharset("johab");
- private static final Charset BIG5 = getCharset("Big5");
- private static final Charset GB2312 = getCharset("GB2312");
- private static final Charset MS949 = getCharset("ms949");
- // The RTF doc has a "font table" that assigns ords
- // (f0, f1, f2, etc.) to fonts and charsets, using the
- // \fcharsetN control word. This map goes from that
- // N to the corresponding Java charset:
- private static final Map<Integer, Charset> FCHARSET_MAP =
- new HashMap<Integer, Charset>();
- // The RTF may specify the \ansicpgN charset in the
- // header; this map goes from that N to the corresponding
- // Java character set:
- private static final Map<Integer, Charset> ANSICPG_MAP =
- new HashMap<Integer, Charset>();
-
- static { // fills FCHARSET_MAP: \fcharsetN value -> Java charset
- FCHARSET_MAP.put(0, WINDOWS_1252); // ANSI
- // charset 1 is Default (not mapped here)
- // charset 2 is Symbol (not mapped here)
-
- FCHARSET_MAP.put(77, MAC_ROMAN); // Mac Roman
- FCHARSET_MAP.put(78, SHIFT_JIS); // Mac Shift Jis
- FCHARSET_MAP.put(79, MS949); // Mac Hangul
- FCHARSET_MAP.put(80, GB2312); // Mac GB2312
- FCHARSET_MAP.put(81, BIG5); // Mac Big5
- FCHARSET_MAP.put(82, JOHAB); // Mac Johab (old)
- FCHARSET_MAP.put(83, MAC_HEBREW); // Mac Hebrew
- FCHARSET_MAP.put(84, MAC_ARABIC); // Mac Arabic
- FCHARSET_MAP.put(85, MAC_GREEK); // Mac Greek
- FCHARSET_MAP.put(86, MAC_TURKISH); // Mac Turkish
- FCHARSET_MAP.put(87, MAC_THAI); // Mac Thai
- FCHARSET_MAP.put(88, CP1250); // Mac East Europe -- NOTE(review): maps to Windows cp1250 rather than a Mac charset; confirm intended
- FCHARSET_MAP.put(89, CP1251); // Mac Russian -- NOTE(review): maps to Windows cp1251 rather than MacCyrillic; confirm intended
-
- FCHARSET_MAP.put(128, MS932); // Shift JIS
- FCHARSET_MAP.put(129, MS949); // Hangul
- FCHARSET_MAP.put(130, MS1361); // Johab
- FCHARSET_MAP.put(134, MS936); // GB2312
- FCHARSET_MAP.put(136, MS950); // Big5
- FCHARSET_MAP.put(161, CP1253); // Greek
- FCHARSET_MAP.put(162, CP1254); // Turkish
- FCHARSET_MAP.put(163, CP1258); // Vietnamese
- FCHARSET_MAP.put(177, CP1255); // Hebrew
- FCHARSET_MAP.put(178, CP1256); // Arabic
- // FCHARSET_MAP.put( 179, "" ); // Arabic Traditional (no mapping)
- // FCHARSET_MAP.put( 180, "" ); // Arabic user (no mapping)
- // FCHARSET_MAP.put( 181, "" ); // Hebrew user (no mapping)
- FCHARSET_MAP.put(186, CP1257); // Baltic
-
- FCHARSET_MAP.put(204, CP1251); // Russian
- FCHARSET_MAP.put(222, MS874); // Thai
- FCHARSET_MAP.put(238, CP1250); // Eastern European
- FCHARSET_MAP.put(254, CP437); // PC 437
- FCHARSET_MAP.put(255, CP850); // OEM
- }
-
- // Fills ANSICPG_MAP: \ansicpgN code page number -> Java charset.
- // Charsets this JVM does not know were already replaced by the
- // ASCII fallback in getCharset(String).
- static {
-     ANSICPG_MAP.put(437, CP4372); // US IBM
-     ANSICPG_MAP.put(708, ISO_8859_6); // Arabic (ASMO 708)
-
-     // Each Arabic variant has its own code page number. They used to be
-     // registered under 710 three times, so the last put won and 711/720
-     // were never mapped.
-     ANSICPG_MAP.put(709, WINDOWS_709); // Arabic (ASMO 449+, BCON V4)
-     ANSICPG_MAP.put(710, WINDOWS_710); // Arabic (transparent Arabic)
-     ANSICPG_MAP.put(711, WINDOWS_711); // Arabic (Nafitha Enhanced)
-     ANSICPG_MAP.put(720, WINDOWS_720); // Arabic (transparent ASMO)
-
-     ANSICPG_MAP.put(819, CP819); // Windows 3.1 (US & Western Europe)
-     ANSICPG_MAP.put(850, CP8502); // IBM Multilingual
-     ANSICPG_MAP.put(852, CP852); // Eastern European
-     ANSICPG_MAP.put(860, CP860); // Portuguese
-     ANSICPG_MAP.put(862, CP862); // Hebrew
-     ANSICPG_MAP.put(863, CP863); // French Canadian
-     ANSICPG_MAP.put(864, CP864); // Arabic
-     ANSICPG_MAP.put(865, CP865); // Norwegian
-     ANSICPG_MAP.put(866, CP866); // Soviet Union
-     ANSICPG_MAP.put(874, MS8742); // Thai
-     ANSICPG_MAP.put(932, MS932); // Japanese
-     ANSICPG_MAP.put(936, MS9362); // Simplified Chinese
-     ANSICPG_MAP.put(949, CP949); // Korean
-     ANSICPG_MAP.put(950, CP950); // Traditional Chinese
-     ANSICPG_MAP.put(1250, CP12502); // Eastern European
-     ANSICPG_MAP.put(1251, CP12512); // Cyrillic
-     ANSICPG_MAP.put(1252, CP1252); // Western European
-     ANSICPG_MAP.put(1253, CP12532); // Greek
-     ANSICPG_MAP.put(1254, CP12542); // Turkish
-     ANSICPG_MAP.put(1255, CP12552); // Hebrew
-     ANSICPG_MAP.put(1256, CP12562); // Arabic
-     ANSICPG_MAP.put(1257, CP12572); // Baltic
-     ANSICPG_MAP.put(1258, CP12582); // Vietnamese
-     ANSICPG_MAP.put(1361, X_JOHAB); // Johab
-     ANSICPG_MAP.put(10000, MAC_ROMAN); // Mac Roman
-     ANSICPG_MAP.put(10001, SHIFT_JIS); // Mac Japan
-     ANSICPG_MAP.put(10004, MAC_ARABIC); // Mac Arabic
-     ANSICPG_MAP.put(10005, MAC_HEBREW); // Mac Hebrew
-     ANSICPG_MAP.put(10006, MAC_GREEK); // Mac Greek
-     ANSICPG_MAP.put(10007, MAC_CYRILLIC); // Mac Cyrillic
-     ANSICPG_MAP.put(10029, X_MAC_CENTRAL_EUROPE); // MAC Latin2
-     ANSICPG_MAP.put(10081, MAC_TURKISH); // Mac Turkish
-     ANSICPG_MAP.put(57002, X_ISCII91); // Devanagari
-
-     // TODO: in theory these other charsets are simple
-     // shifts off of Devanagari, so we could impl that
-     // here:
-     ANSICPG_MAP.put(57003, WINDOWS_57003); // Bengali
-     ANSICPG_MAP.put(57004, WINDOWS_57004); // Tamil
-     ANSICPG_MAP.put(57005, WINDOWS_57005); // Telugu
-     ANSICPG_MAP.put(57006, WINDOWS_57006); // Assamese
-     ANSICPG_MAP.put(57007, WINDOWS_57007); // Oriya
-     ANSICPG_MAP.put(57008, WINDOWS_57008); // Kannada
-     ANSICPG_MAP.put(57009, WINDOWS_57009); // Malayalam
-     ANSICPG_MAP.put(57010, WINDOWS_57010); // Gujarati
-     ANSICPG_MAP.put(57011, WINDOWS_57011); // Punjabi
- }
-
- // Scratch space that CharsetDecoder writes decoded chars into (see pushBytes):
- private final char[] outputArray = new char[128];
- private final CharBuffer outputBuffer = CharBuffer.wrap(outputArray);
- // Holds the font table from this RTF doc, mapping
- // the font number (from \fN control word) to the
- // corresponding charset:
- private final Map<Integer, Charset> fontToCharset =
- new HashMap<Integer, Charset>();
- // Group stack: when we open a new group, we push
- // the previous group state onto the stack; when we
- // close the group, we restore it
- private final LinkedList<GroupState> groupStates = new LinkedList<GroupState>();
- private final StringBuilder pendingBuffer = new StringBuilder(); // collects text seen while in the header or while fieldState == 1
- private final XHTMLContentHandler out;
- private final Metadata metadata;
- private final RTFEmbObjHandler embObjHandler;
- // How many next ansi chars we should skip; this
- // is 0 except when we are still in the "ansi
- // shadow" after seeing a unicode escape, at which
- // point it's set to the last ucN skip we had seen:
- int ansiSkip = 0;
- private int written = 0; // NOTE(review): not read or written in the part of the file shown here
- // Hold pending bytes (encoded in the current charset)
- // for text output:
- private byte[] pendingBytes = new byte[16];
- private int pendingByteCount;
- private ByteBuffer pendingByteBuffer = ByteBuffer.wrap(pendingBytes); // re-wrapped whenever pendingBytes is reallocated
- // Holds pending chars for text output
- private char[] pendingChars = new char[10];
- private int pendingCharCount;
- // Holds the letters of a still-being-tokenized control word
- private byte[] pendingControl = new byte[10];
- private int pendingControlCount;
- // Decoder and the charset it was created for; reused while the charset stays the same:
- private CharsetDecoder decoder;
- private Charset lastCharset;
- private Charset globalCharset = WINDOWS_1252;
- private int globalDefaultFont = -1; // -1 until a \deffN header word sets it
- private int curFontID = -1; // font number of the font-table entry currently being read
- // Current group state; in theory this initial
- // GroupState is unused because the RTF doc should
- // immediately open the top group (start with {):
- private GroupState groupState = new GroupState();
- private boolean inHeader = true;
- private int fontTableState; // 1 while inside the font table, set to 2 once the group depth drops below fontTableDepth
- private int fontTableDepth;
- // Non null if we are processing metadata (title,
- // keywords, etc.) inside the info group:
- private Property nextMetaData;
- private boolean inParagraph;
- // Non-zero if we are processing inside a field destination:
- private int fieldState;
- // Non-zero id of a list whose closing tag is still owed (see pendingListEnd())
- private int pendingListEnd;
- private Map<Integer, ListDescriptor> listTable = new HashMap<Integer, ListDescriptor>();
- private Map<Integer, ListDescriptor> listOverrideTable = new HashMap<Integer, ListDescriptor>();
- private Map<Integer, ListDescriptor> currentListTable;
- private ListDescriptor currentList;
- private int listTableLevel = -1;
- private boolean ignoreLists;
- // Non-null if we've seen the url for a HYPERLINK but not yet
- // its text:
- private String pendingURL;
- // Used to process the sub-groups inside the upr
- // group:
- private int uprState = -1;
- // Date parts filled in by the \yr, \mo, \dy, \hr and \min header words:
- private int year, month, day, hour, minute;
-
- /**
-  * Creates an extractor bound to the sinks it writes to.
-  *
-  * @param out           handler that receives the extracted text as XHTML events
-  * @param metadata      object that receives values found in the RTF header
-  * @param embObjHandler handler that receives embedded object and picture data
-  */
- public TextExtractor(XHTMLContentHandler out, Metadata metadata,
-         RTFEmbObjHandler embObjHandler) {
-     this.out = out;
-     this.embObjHandler = embObjHandler;
-     this.metadata = metadata;
- }
-
- /**
-  * Resolves a charset name through CharsetUtils.forName. The lookup is
-  * best-effort: any failure yields the ASCII charset instead of propagating,
-  * so the static charset constants of this class can always be initialised.
-  */
- private static Charset getCharset(String name) {
-     Charset resolved;
-     try {
-         resolved = CharsetUtils.forName(name);
-     } catch (Exception e) {
-         // Name unknown or unsupported here: fall back instead of failing
-         resolved = ASCII;
-     }
-     return resolved;
- }
-
- /** Returns true when {@code ch} is an ASCII hexadecimal digit (0-9, a-f or A-F). */
- protected static boolean isHexChar(int ch) {
-     if (ch >= '0' && ch <= '9') {
-         return true;
-     }
-     if (ch >= 'a' && ch <= 'f') {
-         return true;
-     }
-     return ch >= 'A' && ch <= 'F';
- }
-
- private static boolean isAlpha(int ch) {
- return (ch >= 'a' && ch <= 'z') ||
- (ch >= 'A' && ch <= 'Z');
- }
-
- /** Returns true when {@code ch} is an ASCII decimal digit (0-9). */
- private static boolean isDigit(int ch) {
-     return '0' <= ch && ch <= '9';
- }
-
- /**
-  * Converts one ASCII hexadecimal digit to its numeric value.
-  *
-  * @param ch a character for which {@link #isHexChar(int)} returns true
-  * @return the digit's value, 0 to 15
-  */
- protected static int hexValue(int ch) {
-     if (ch >= '0' && ch <= '9') {
-         return ch - '0';
-     } else if (ch >= 'a' && ch <= 'f') {
-         // Upper bound is 'f', not 'z': g-z are not hex digits
-         return 10 + (ch - 'a');
-     } else {
-         // Only A-F remain for valid input; anything else is a caller bug
-         assert ch >= 'A' && ch <= 'F' : "not a hex digit: " + ch;
-         return 10 + (ch - 'A');
-     }
- }
-
- /** Returns the current value of the ignore-lists flag (see {@link #setIgnoreLists(boolean)}). */
- public boolean isIgnoringLists() {
-     return this.ignoreLists;
- }
-
- /**
-  * Sets the ignore-lists flag. While it is set, list items are emitted as
-  * plain paragraphs and no list start/end tags are written.
-  *
-  * @param ignore true to suppress list markup
-  */
- public void setIgnoreLists(boolean ignore) {
-     ignoreLists = ignore;
- }
-
- // Flushes whichever pending buffer currently holds text: the
- // byte buffer when it is non-empty, otherwise the char buffer.
- private void pushText() throws IOException, SAXException, TikaException {
-     if (pendingByteCount == 0) {
-         pushChars();
-         return;
-     }
-     // Bytes and chars are never pending at the same time
-     assert pendingCharCount == 0;
-     pushBytes();
- }
-
- // Queues one byte (a code unit of the current charset) for
- // output. Inside a picture group the byte is handed to the
- // embedded-object handler as a metadata char instead.
- private void addOutputByte(int b) throws IOException, SAXException, TikaException {
-     assert b >= 0 && b < 256 : "byte value out of range: " + b;
-
-     // Flush pending chars first so output order is preserved
-     if (pendingCharCount != 0) {
-         pushChars();
-     }
-
-     if (groupState.pictDepth > 0) {
-         embObjHandler.writeMetadataChar((char) b);
-         return;
-     }
-
-     if (pendingByteCount == pendingBytes.length) {
-         // Buffer is full: grow it by a quarter and re-wrap it
-         final byte[] grown = new byte[(int) (pendingBytes.length * 1.25)];
-         System.arraycopy(pendingBytes, 0, grown, 0, pendingBytes.length);
-         pendingBytes = grown;
-         pendingByteBuffer = ByteBuffer.wrap(pendingBytes);
-     }
-     pendingBytes[pendingByteCount] = (byte) b;
-     pendingByteCount++;
- }
-
- // Appends one letter to the control word being tokenized,
- // growing the buffer by a quarter when it is full.
- private void addControl(int b) {
-     assert isAlpha(b);
-     final int capacity = pendingControl.length;
-     if (pendingControlCount == capacity) {
-         final byte[] grown = new byte[(int) (capacity * 1.25)];
-         System.arraycopy(pendingControl, 0, grown, 0, capacity);
-         pendingControl = grown;
-     }
-     pendingControl[pendingControlCount] = (byte) b;
-     pendingControlCount++;
- }
-
- // Queues one UTF16 code unit. Depending on state it goes to
- // pendingBuffer (header, or fieldState 1), to the embedded
- // object handler (sn / sv groups), or to the pending chars.
- private void addOutputChar(char ch) throws IOException, SAXException, TikaException {
-     // Flush pending bytes first so output order is preserved
-     if (pendingByteCount != 0) {
-         pushBytes();
-     }
-
-     if (inHeader || fieldState == 1) {
-         pendingBuffer.append(ch);
-         return;
-     }
-     if (groupState.sn == true || groupState.sv == true) {
-         embObjHandler.writeMetadataChar(ch);
-         return;
-     }
-
-     final int capacity = pendingChars.length;
-     if (pendingCharCount == capacity) {
-         // Buffer is full: grow it by a quarter
-         final char[] grown = new char[(int) (capacity * 1.25)];
-         System.arraycopy(pendingChars, 0, grown, 0, capacity);
-         pendingChars = grown;
-     }
-     pendingChars[pendingCharCount] = ch;
-     pendingCharCount++;
- }
-
- // Shallow-parses the whole document read from the given
- // stream, writing output to this.out and this.metadata.
- // The stream is wrapped so the tokenizer can push bytes back
- // (pushback capacity: 2 bytes).
- public void extract(InputStream in) throws IOException, SAXException, TikaException {
-     final PushbackInputStream pushback = new PushbackInputStream(in, 2);
-     extract(pushback);
- }
-
- private void extract(PushbackInputStream in) throws IOException, SAXException, TikaException { // main loop: dispatches every byte of the document
- out.startDocument();
-
- while (true) {
- final int b = in.read();
- if (b == -1) { // end of stream
- break;
- } else if (b == '\\') { // start of a control word, control symbol or escape
- parseControlToken(in);
- } else if (b == '{') { // flush buffered text before the group state changes
- pushText();
- processGroupStart(in);
- } else if (b == '}') {
- pushText();
- processGroupEnd();
- if (groupStates.isEmpty()) {
- // the document's closing brace has been parsed
- break;
- }
- } else if (groupState.objdata == true ||
- groupState.pictDepth == 1) { // object / picture payload goes to the embedded-object handler
- embObjHandler.writeHexChar(b);
- } else if (b != '\r' && b != '\n'
- && (!groupState.ignore || nextMetaData != null ||
- groupState.sn == true || groupState.sv == true)) {
- // Linefeed and carriage return are not
- // significant, so they never reach this branch
- if (ansiSkip != 0) { // still in the shadow of a unicode escape: drop this byte
- ansiSkip--;
- } else {
- addOutputByte(b);
- }
- }
- }
-
- endParagraph(false); // closes any open paragraph and any list still pending
- out.endDocument();
- }
-
- private void parseControlToken(PushbackInputStream in) throws IOException, SAXException, TikaException {
- int b = in.read();
- if (b == '\'') {
- // escaped hex char
- parseHexChar(in);
- } else if (isAlpha(b)) {
- // control word
- parseControlWord((char) b, in);
- } else if (b == '{' || b == '}' || b == '\\' || b == '\r' || b == '\n') {
- // escaped char
- addOutputByte(b);
- } else if (b != -1) {
- // control symbol, eg \* or \~
- processControlSymbol((char) b);
- }
- }
-
- /**
-  * Reads the two hex digits of a {@code \'hh} escape and queues the decoded
-  * byte, unless it lies in the shadow of a unicode escape and is skipped.
-  * A malformed escape is ignored; the offending byte is pushed back so it is
-  * parsed as ordinary content.
-  *
-  * @param in stream positioned just after the {@code \'}
-  */
- private void parseHexChar(PushbackInputStream in) throws IOException, SAXException, TikaException {
-     int hex1 = in.read();
-     if (!isHexChar(hex1)) {
-         // DOC ERROR (malformed hex escape): ignore
-         unreadUnlessEof(in, hex1);
-         return;
-     }
-
-     int hex2 = in.read();
-     if (!isHexChar(hex2)) {
-         // TODO: log a warning here, somehow?
-         // DOC ERROR (malformed hex escape):
-         // ignore
-         unreadUnlessEof(in, hex2);
-         return;
-     }
-
-     if (ansiSkip != 0) {
-         // Skip this ansi char since we are
-         // still in the shadow of a unicode
-         // escape:
-         ansiSkip--;
-     } else {
-         // Unescape:
-         addOutputByte(16 * hexValue(hex1) + hexValue(hex2));
-     }
- }
-
- // Pushes b back unless it is the end-of-stream marker. unread(-1)
- // would store the byte 0xFF, which the next read() would return
- // as if it were document content.
- private static void unreadUnlessEof(PushbackInputStream in, int b) throws IOException {
-     if (b != -1) {
-         in.unread(b);
-     }
- }
-
- /**
-  * Tokenizes one control word: the rest of its alphabetic name, an optional
-  * signed decimal parameter and an optional single delimiting space. It then
-  * dispatches to the processControlWord overload with or without a parameter
-  * and clears the control-word buffer.
-  *
-  * @param firstChar first letter of the control word, already read by the caller
-  * @param in        stream positioned just after {@code firstChar}
-  */
- private void parseControlWord(int firstChar, PushbackInputStream in) throws IOException, SAXException, TikaException {
-     addControl(firstChar);
-
-     int b = in.read();
-     while (isAlpha(b)) {
-         addControl(b);
-         b = in.read();
-     }
-
-     boolean hasParam = false;
-     boolean negParam = false;
-     if (b == '-') {
-         negParam = true;
-         hasParam = true;
-         b = in.read();
-     }
-
-     int param = 0;
-     while (isDigit(b)) {
-         param *= 10;
-         param += (b - '0');
-         hasParam = true;
-         b = in.read();
-     }
-
-     // A space is consumed as part of the control word but is not
-     // added to it; any other byte is content and goes back. At end
-     // of stream there is nothing to push back: unread(-1) would
-     // store a 0xFF byte that is later read as document content.
-     if (b != ' ' && b != -1) {
-         in.unread(b);
-     }
-
-     if (hasParam) {
-         if (negParam) {
-             param = -param;
-         }
-         processControlWord(param, in);
-     } else {
-         processControlWord();
-     }
-
-     pendingControlCount = 0;
- }
-
- private void lazyStartParagraph() throws IOException, SAXException, TikaException { // opens a <p> or <li> (and list tags as needed) unless a paragraph is already open
- if (!inParagraph) {
- // Ensure </i></b> order
- if (groupState.italic) {
- end("i");
- }
- if (groupState.bold) {
- end("b");
- }
- if (pendingListEnd != 0 && groupState.list != pendingListEnd) { // a different list (or none) follows: close the deferred one
- endList(pendingListEnd);
- pendingListEnd = 0;
- }
- if (inList() && pendingListEnd != groupState.list) { // not a continuation of the deferred list: open a new one
- startList(groupState.list);
- }
- if (inList()) {
- out.startElement("li");
- } else {
- out.startElement("p");
- }
-
- // Ensure <b><i> order
- if (groupState.bold) {
- start("b");
- }
- if (groupState.italic) {
- start("i");
- }
- inParagraph = true;
- }
- }
-
- private void endParagraph(boolean preserveStyles) throws IOException, SAXException, TikaException { // flushes text and closes the current <p>/<li>; preserveStyles re-opens a <p> carrying the bold/italic state
- pushText();
- // maintain consecutive new lines: an empty paragraph is still opened so that it can be closed below
- if (!inParagraph) {
- lazyStartParagraph();
- }
- if (inParagraph) {
- if (groupState.italic) {
- end("i");
- groupState.italic = preserveStyles; // style survives only when preserveStyles is set
- }
- if (groupState.bold) {
- end("b");
- groupState.bold = preserveStyles;
- }
- if (inList()) {
- out.endElement("li");
- } else {
- out.endElement("p");
- }
-
- if (preserveStyles && (groupState.bold || groupState.italic)) {
- start("p"); // NOTE(review): always a <p>, even when inList() is true -- confirm intended
- if (groupState.bold) {
- start("b");
- }
- if (groupState.italic) {
- start("i");
- }
- inParagraph = true;
- } else {
- inParagraph = false;
- }
- }
-
- // Ensure closing the list at document end
- if (!preserveStyles && pendingListEnd != 0) {
- endList(pendingListEnd);
- pendingListEnd = 0;
- }
- }
-
- // Sends the pending UTF16 code units, if any, to the out
- // ContentHandler, opening a paragraph first when needed.
- private void pushChars() throws IOException, SAXException, TikaException {
-     if (pendingCharCount == 0) {
-         return;
-     }
-     lazyStartParagraph();
-     out.characters(pendingChars, 0, pendingCharCount);
-     pendingCharCount = 0;
- }
-
- // Decodes the bytes buffered in pendingBytes with the decoder
- // for the current charset and passes the resulting UTF16 code
- // units on (see drainOutputBuffer). Bytes buffered inside an
- // ignored group are dropped unless metadata is being collected.
- // The byte buffer is empty when this returns.
- private void pushBytes() throws IOException, SAXException, TikaException {
-     final boolean shouldDecode =
-             pendingByteCount > 0 && (!groupState.ignore || nextMetaData != null);
-     if (shouldDecode) {
-         final CharsetDecoder activeDecoder = getDecoder();
-         pendingByteBuffer.limit(pendingByteCount);
-         assert pendingByteBuffer.position() == 0;
-         assert outputBuffer.position() == 0;
-
-         // endOfInput is true because, when we are called, we
-         // should have seen a complete sequence of characters
-         // for this charset. Repeat until the input is used up.
-         CoderResult status;
-         do {
-             status = activeDecoder.decode(pendingByteBuffer, outputBuffer, true);
-             drainOutputBuffer();
-         } while (status != CoderResult.UNDERFLOW);
-
-         // Let the decoder emit anything it is still holding
-         do {
-             status = activeDecoder.flush(outputBuffer);
-             drainOutputBuffer();
-         } while (status != CoderResult.UNDERFLOW);
-
-         // Reset for next decode
-         activeDecoder.reset();
-         pendingByteBuffer.position(0);
-     }
-
-     pendingByteCount = 0;
- }
-
- // Hands the chars currently in outputBuffer to pendingBuffer
- // (in the header, or in fieldState 1) or to the out
- // ContentHandler, then rewinds outputBuffer.
- private void drainOutputBuffer() throws IOException, SAXException, TikaException {
-     final int decoded = outputBuffer.position();
-     if (decoded <= 0) {
-         return;
-     }
-     if (inHeader || fieldState == 1) {
-         pendingBuffer.append(outputArray, 0, decoded);
-     } else {
-         lazyStartParagraph();
-         out.characters(outputArray, 0, decoded);
-     }
-     outputBuffer.position(0);
- }
-
- // NOTE: s must be ascii alpha only
- private boolean equals(String s) {
- if (pendingControlCount != s.length()) {
- return false;
- }
- for (int idx = 0; idx < pendingControlCount; idx++) {
- assert isAlpha(s.charAt(idx));
- if (((byte) s.charAt(idx)) != pendingControl[idx]) {
- return false;
- }
- }
- return true;
- }
-
- // Translates the control symbols that stand for a character
- // into that character. \* (ignorable destination) is already
- // handled by processGroupStart(); it and every other symbol
- // produce no output here.
- private void processControlSymbol(char ch) throws IOException, SAXException, TikaException {
-     if (ch == '~') {
-         // Non-breaking space -> unicode NON-BREAKING SPACE
-         addOutputChar('\u00a0');
-     } else if (ch == '-') {
-         // Optional hyphen -> unicode SOFT HYPHEN
-         addOutputChar('\u00ad');
-     } else if (ch == '_') {
-         // Non-breaking hyphen -> unicode NON-BREAKING HYPHEN
-         addOutputChar('\u2011');
-     }
- }
-
- private CharsetDecoder getDecoder() throws TikaException {
- Charset charset = getCharset();
-
- // Common case: charset is same as last time, so
- // just reuse it:
- if (lastCharset == null || !charset.equals(lastCharset)) {
- decoder = charset.newDecoder();
- decoder.onMalformedInput(CodingErrorAction.REPLACE);
- decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
- lastCharset = charset;
- }
-
- return decoder;
- }
-
- // Return current charset in-use
- private Charset getCharset() throws TikaException {
- // If a specific font (fN) was set, use its charset
- if (groupState.fontCharset != null) {
- return groupState.fontCharset;
- }
-
- // Else, if global default font (defN) was set, use that one
- if (globalDefaultFont != -1 && !inHeader) {
- Charset cs = fontToCharset.get(globalDefaultFont);
- if (cs != null) {
- return cs;
- }
- }
-
- // Else, use the global charset
- if (globalCharset == null) {
- throw new TikaException("unable to determine charset");
- }
-
- return globalCharset;
- }
-
- // Handles a control word that carries a numeric parameter; the word itself is in pendingControl:
- private void processControlWord(int param, PushbackInputStream in) throws IOException, SAXException, TikaException {
-
- // TODO: afN? (associated font number)
-
- // TODO: do these alter text output...?
- /*
- } else if (equals("stshfdbch")) {
- // font to be used by default in
- // style sheet for East Asian chars
- // arg N is font table entry
- } else if (equals("stshfloch")) {
- // font to be used by default in
- // style sheet for ASCII chars
- // arg N is font table entry
- } else if (equals("stshfhich")) {
- // font to be used by default in
- // style sheet for High Ansi chars
- // arg N is font table entry
- } else if (equals("stshfbi")) {
- // style sheet for Complex Scripts (BIDI) chars
- // arg N is font table entry
- */
-
- // TODO: inefficient that we check equals N times;
- // we'd get better perf w/ real lexer (eg
- // JFlex), which uses single-pass FSM to do cmp:
- if (inHeader) {
- if (equals("ansicpg")) {
- // ANSI codepage; unknown code pages leave the global charset unchanged
- Charset cs = ANSICPG_MAP.get(param);
- if (cs != null) {
- globalCharset = cs;
- }
- } else if (equals("deff")) {
- // Default font
- globalDefaultFont = param;
- } else if (equals("nofpages")) {
- metadata.add(Office.PAGE_COUNT, Integer.toString(param));
- } else if (equals("nofwords")) {
- metadata.add(Office.WORD_COUNT, Integer.toString(param));
- } else if (equals("nofchars")) {
- metadata.add(Office.CHARACTER_COUNT, Integer.toString(param));
- } else if (equals("yr")) {
- year = param;
- } else if (equals("mo")) {
- month = param;
- } else if (equals("dy")) {
- day = param;
- } else if (equals("hr")) {
- hour = param;
- } else if (equals("min")) {
- minute = param;
- }
-
- if (fontTableState == 1) {
- // Still inside font table -- record the
- // mappings of fN to the fcharset:
- if (groupState.depth < fontTableDepth) {
- fontTableState = 2;
- } else {
- if (equals("f")) {
- // Start new font definition
- curFontID = param;
- } else if (equals("fcharset")) {
- Charset cs = FCHARSET_MAP.get(param);
- if (cs != null) {
- fontToCharset.put(curFontID, cs);
- }
- }
- }
- }
-
- if (currentList != null) {
- if (equals("listid")) {
- currentList.id = param;
- currentListTable.put(currentList.id, currentList);
- } else if (equals("listtemplateid")) {
- currentList.templateID = param;
- } else if (equals("levelnfc") || equals("levelnfcn")) {
- //sanity check to make sure list information isn't corrupt
- if (listTableLevel > -1 &&
- listTableLevel < currentList.numberType.length) {
- currentList.numberType[listTableLevel] = param;
- }
- }
- }
- } else {
- // In document
- if (equals("b")) {
- // b0: turns bold off
- assert param == 0; // NOTE(review): any non-zero parameter trips this when assertions are enabled -- confirm intended
- if (groupState.bold) {
- pushText();
- if (groupState.italic) {
- end("i");
- }
- end("b");
- if (groupState.italic) {
- start("i");
- }
- groupState.bold = false;
- }
- } else if (equals("i")) {
- // i0: turns italic off
- assert param == 0;
- if (groupState.italic) {
- pushText();
- end("i");
- groupState.italic = false;
- }
- } else if (equals("f")) {
- // Change current font
- Charset fontCharset = fontToCharset.get(param);
-
- // Push any buffered text before changing
- // font:
- pushText();
-
- if (fontCharset != null) {
- groupState.fontCharset = fontCharset;
- } else {
- // DOC ERROR: font change referenced a
- // non-table'd font number
- // TODO: log a warning? Throw an exc?
- groupState.fontCharset = null;
- }
- } else if (equals("ls")) {
- groupState.list = param;
- } else if (equals("lslvl")) {
- groupState.listLevel = param;
- }
- }
-
- // Process unicode escape. This can appear in doc
- // or in header, since the metadata (info) fields
- // in the header can be unicode escaped as well:
- if (equals("u")) {
- // Unicode escape
- if (!groupState.ignore || groupState.sv || groupState.sn) {
- final char utf16CodeUnit = (char) (param & 0xffff);
- addOutputChar(utf16CodeUnit);
- }
-
- // After seeing a unicode escape we must
- // skip the next ucSkip ansi chars (the
- // "unicode shadow")
- ansiSkip = groupState.ucSkip;
- } else if (equals("uc")) {
- // Change unicode shadow length
- groupState.ucSkip = param;
- } else if (equals("bin")) {
- if (param >= 0) {
- if (groupState.pictDepth == 1) {
- try {
- embObjHandler.writeBytes(in, param);
- } catch (IOException e) {
- //param was out of bounds or something went wrong during writing.
- //skip this obj and move on
- //TODO: log.warn
- embObjHandler.reset();
- }
- } else {
- IOUtils.skipFully(in, param); // not directly inside a picture: skip the param bytes
- }
- } else {
- // log some warning?
- }
- }
- }
-
- // True when lists are not being ignored and the current group
- // has a non-zero list id.
- private boolean inList() {
-     if (ignoreLists) {
-         return false;
-     }
-     return groupState.list != 0;
- }
-
- /**
- * Marks the current list as pending to end. This is done to be able to merge list items of
- * the same list within the same enclosing list tag (ie. either <code>"ul"</code>, or
- * <code>"ol"</code>).
- */
- private void pendingListEnd() {
- pendingListEnd = groupState.list;
- groupState.list = 0;
- }
-
- /**
- * Emits the end tag of a list. Uses {@link #isUnorderedList(int)} to determine the list
- * type for the given <code>listID</code>.
- *
- * @param listID The ID of the list.
- * @throws IOException
- * @throws SAXException
- * @throws TikaException
- */
- private void endList(int listID) throws IOException, SAXException, TikaException {
- if (!ignoreLists) {
- out.endElement(isUnorderedList(listID) ? "ul" : "ol");
- }
- }
-
- /**
- * Emits the start tag of a list. Uses {@link #isUnorderedList(int)} to determine the list
- * type for the given <code>listID</code>.
- *
- * @param listID The ID of the list.
- * @throws IOException
- * @throws SAXException
- * @throws TikaException
- */
- private void startList(int listID) throws IOException, SAXException, TikaException {
- if (!ignoreLists) {
- out.startElement(isUnorderedList(listID) ? "ul" : "ol");
- }
- }
-
- private boolean isUnorderedList(int listID) {
- ListDescriptor list = listTable.get(listID);
- if (list != null) {
- return list.isUnordered(groupState.listLevel);
- }
- return true;
- }
-
- private void end(String tag) throws IOException, SAXException, TikaException {
- out.endElement(tag);
- }
-
- private void start(String tag) throws IOException, SAXException, TikaException {
- out.startElement(tag);
- }
-
- // Handle non-parameter control word:
- private void processControlWord() throws IOException, SAXException, TikaException {
- if (inHeader) {
- if (equals("ansi")) {
- globalCharset = WINDOWS_1252;
- } else if (equals("pca")) {
- globalCharset = CP850;
- } else if (equals("pc")) {
- globalCharset = CP437;
- } else if (equals("mac")) {
- globalCharset = MAC_ROMAN;
- }
-
- if (equals("colortbl") || equals("stylesheet") || equals("fonttbl")) {
- groupState.ignore = true;
- } else if (equals("listtable")) {
- currentListTable = listTable;
- } else if (equals("listoverridetable")) {
- currentListTable = listOverrideTable;
- }
-
- if (uprState == -1) {
- // TODO: we can also parse \creatim, \revtim,
- // \printim, \version, etc.
- if (equals("author")) {
- nextMetaData = TikaCoreProperties.CREATOR;
- } else if (equals("title")) {
- nextMetaData = TikaCoreProperties.TITLE;
- } else if (equals("subject")) {
- // TODO: Move to OO subject in Tika 2.0
- nextMetaData = TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT;
- } else if (equals("keywords")) {
- nextMetaData = TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT;
- } else if (equals("category")) {
- nextMetaData = OfficeOpenXMLCore.CATEGORY;
- } else if (equals("comment")) {
- nextMetaData = TikaCoreProperties.COMMENTS;
- } else if (equals("company")) {
- nextMetaData = OfficeOpenXMLExtended.COMPANY;
- } else if (equals("manager")) {
- nextMetaData = OfficeOpenXMLExtended.MANAGER;
- } else if (equals("template")) {
- nextMetaData = OfficeOpenXMLExtended.TEMPLATE;
- } else if (equals("creatim")) {
- nextMetaData = TikaCoreProperties.CREATED;
- }
- }
-
- if (fontTableState == 0) {
- // Didn't see font table yet
- if (equals("fonttbl")) {
- fontTableState = 1;
- fontTableDepth = groupState.depth;
- }
- } else if (fontTableState == 1) {
- // Inside font table
- if (groupState.depth < fontTableDepth) {
- fontTableState = 2;
- }
- }
-
- // List table handling
- if (currentListTable != null) {
- if (equals("list") || equals("listoverride")) {
- currentList = new ListDescriptor();
- listTableLevel = -1;
- } else if (currentList != null) {
- if (equals("liststylename")) {
- currentList.isStyle = true;
- } else if (equals("listlevel")) {
- listTableLevel++;
- }
- }
- }
-
- if (!groupState.ignore && (equals("par") || equals("pard") || equals("sect") || equals("sectd") || equals("plain") || equals("ltrch") || equals("rtlch"))) {
- inHeader = false;
- }
- } else {
- if (equals("b")) {
- if (!groupState.bold) {
- pushText();
- lazyStartParagraph();
- if (groupState.italic) {
- // Make sure nesting is always <b><i>
- end("i");
- }
- groupState.bold = true;
- start("b");
- if (groupState.italic) {
- start("i");
- }
- }
- } else if (equals("i")) {
- if (!groupState.italic) {
- pushText();
- lazyStartParagraph();
- groupState.italic = true;
- start("i");
- }
- }
- }
-
- final boolean ignored = groupState.ignore;
-
- if (equals("pard")) {
- // Reset styles
- pushText();
- if (groupState.italic) {
- end("i");
- groupState.italic = false;
- }
- if (groupState.bold) {
- end("b");
- groupState.bold = false;
- }
- if (inList()) { // && (groupStates.size() == 1 || groupStates.peekLast().list < 0))
- pendingListEnd();
- }
- } else if (equals("par")) {
- if (!ignored) {
- endParagraph(true);
- }
- } else if (equals("shptxt")) {
- pushText();
- // Text inside a shape
- groupState.ignore = false;
- } else if (equals("atnid")) {
- pushText();
- // Annotation ID
- groupState.ignore = false;
- } else if (equals("atnauthor")) {
- pushText();
- // Annotation author
- groupState.ignore = false;
- } else if (equals("annotation")) {
- pushText();
- // Annotation
- groupState.ignore = false;
- } else if (equals("listtext")) {
- groupState.ignore = true;
- } else if (equals("cell")) {
- // TODO: we should produce a table output here?
- //addOutputChar(' ');
- endParagraph(true);
- } else if (equals("sp")) {
- groupState.sp = true;
- } else if (equals("sn")) {
- embObjHandler.startSN();
- groupState.sn = true;
- } else if (equals("sv")) {
- embObjHandler.startSV();
- groupState.sv = true;
- } else if (equals("object")) {
- pushText();
- embObjHandler.setInObject(true);
- groupState.object = true;
- } else if (equals("objdata")) {
- groupState.objdata = true;
- embObjHandler.startObjData();
- } else if (equals("pict")) {
- pushText();
- // TODO: create img tag? but can that support
- // embedded image data?
- groupState.pictDepth = 1;
- embObjHandler.startPict();
- } else if (equals("line")) {
- if (!ignored) {
- addOutputChar('\n');
- }
- } else if (equals("column")) {
- if (!ignored) {
- addOutputChar(' ');
- }
- } else if (equals("page")) {
- if (!ignored) {
- addOutputChar('\n');
- }
- } else if (equals("softline")) {
- if (!ignored) {
- addOutputChar('\n');
- }
- } else if (equals("softcolumn")) {
- if (!ignored) {
- addOutputChar(' ');
- }
- } else if (equals("softpage")) {
- if (!ignored) {
- addOutputChar('\n');
- }
- } else if (equals("tab")) {
- if (!ignored) {
- addOutputChar('\t');
- }
- } else if (equals("upr")) {
- uprState = 0;
- } else if (equals("ud") && uprState == 1) {
- uprState = -1;
- // 2nd group inside the upr destination, which
- // contains the unicode encoding of the text, so
- // we want to keep that:
- groupState.ignore = false;
- } else if (equals("bullet")) {
- if (!ignored) {
- // unicode BULLET
- addOutputChar('\u2022');
- }
- } else if (equals("endash")) {
- if (!ignored) {
- // unicode EN DASH
- addOutputChar('\u2013');
- }
- } else if (equals("emdash")) {
- if (!ignored) {
- // unicode EM DASH
- addOutputChar('\u2014');
- }
- } else if (equals("enspace")) {
- if (!ignored) {
- // unicode EN SPACE
- addOutputChar('\u2002');
- }
- } else if (equals("qmspace")) {
- if (!ignored) {
- // quarter em space -> unicode FOUR-PER-EM SPACE
- addOutputChar('\u2005');
- }
- } else if (equals("emspace")) {
- if (!ignored) {
- // unicode EM SPACE
- addOutputChar('\u2003');
- }
- } else if (equals("lquote")) {
- if (!ignored) {
- // unicode LEFT SINGLE QUOTATION MARK
- addOutputChar('\u2018');
- }
- } else if (equals("rquote")) {
- if (!ignored) {
- // unicode RIGHT SINGLE QUOTATION MARK
- addOutputChar('\u2019');
- }
- } else if (equals("ldblquote")) {
- if (!ignored) {
- // unicode LEFT DOUBLE QUOTATION MARK
- addOutputChar('\u201C');
- }
- } else if (equals("rdblquote")) {
- if (!ignored) {
- // unicode RIGHT DOUBLE QUOTATION MARK
- addOutputChar('\u201D');
- }
- } else if (equals("fldinst")) {
- fieldState = 1;
- groupState.ignore = false;
- } else if (equals("fldrslt") && fieldState == 2) {
- assert pendingURL != null;
- lazyStartParagraph();
- out.startElement("a", "href", pendingURL);
- pendingURL = null;
- fieldState = 3;
- groupState.ignore = false;
- }
- }
-
- // Push new GroupState
- private void processGroupStart(PushbackInputStream in) throws IOException {
- ansiSkip = 0;
- // Push current groupState onto the stack
- groupStates.add(groupState);
-
- // Make new GroupState
- groupState = new GroupState(groupState);
- assert groupStates.size() == groupState.depth : "size=" + groupStates.size() + " depth=" + groupState.depth;
-
- if (uprState == 0) {
- uprState = 1;
- groupState.ignore = true;
- }
-
- // Check for ignorable groups. Note that
- // sometimes we un-ignore within this group, eg
- // when handling upr escape.
- int b2 = in.read();
- if (b2 == '\\') {
- int b3 = in.read();
- if (b3 == '*') {
- groupState.ignore = true;
- }
- in.unread(b3);
- }
- in.unread(b2);
- }
-
- // Pop current GroupState
- private void processGroupEnd() throws IOException, SAXException, TikaException {
- if (inHeader) {
- if (nextMetaData != null) {
- if (nextMetaData == TikaCoreProperties.CREATED) {
- Calendar cal = Calendar.getInstance(TimeZone.getDefault(), Locale.ROOT);
- cal.set(year, month - 1, day, hour, minute, 0);
- metadata.set(nextMetaData, cal.getTime());
- } else if (nextMetaData.isMultiValuePermitted()) {
- metadata.add(nextMetaData, pendingBuffer.toString());
- } else {
- metadata.set(nextMetaData, pendingBuffer.toString());
- }
- nextMetaData = null;
- }
- pendingBuffer.setLength(0);
- }
-
- assert groupState.depth > 0;
- ansiSkip = 0;
-
- if (groupState.objdata == true) {
- embObjHandler.handleCompletedObject();
- groupState.objdata = false;
- } else if (groupState.pictDepth > 0) {
- if (groupState.sn == true) {
- embObjHandler.endSN();
- } else if (groupState.sv == true) {
- embObjHandler.endSV();
- } else if (groupState.sp == true) {
- embObjHandler.endSP();
- } else if (groupState.pictDepth == 1) {
- embObjHandler.handleCompletedObject();
- }
- }
-
- if (groupState.object == true) {
- embObjHandler.setInObject(false);
- }
-
- // Be robust if RTF doc is corrupt (has too many
- // closing }s):
- // TODO: log a warning?
- if (groupStates.size() > 0) {
- // Restore group state:
- final GroupState outerGroupState = groupStates.removeLast();
-
- // Close italic, if outer does not have italic or
- // bold changed:
- if (groupState.italic) {
- if (!outerGroupState.italic ||
- groupState.bold != outerGroupState.bold) {
- end("i");
- groupState.italic = false;
- }
- }
-
- // Close bold
- if (groupState.bold && !outerGroupState.bold) {
- end("b");
- }
-
- // Open bold
- if (!groupState.bold && outerGroupState.bold) {
- start("b");
- }
-
- // Open italic
- if (!groupState.italic && outerGroupState.italic) {
- start("i");
- }
- groupState = outerGroupState;
- }
- assert groupStates.size() == groupState.depth;
-
- if (fieldState == 1) {
- String s = pendingBuffer.toString().trim();
- pendingBuffer.setLength(0);
- if (s.startsWith("HYPERLINK")) {
- s = s.substring(9).trim();
- // TODO: what other instructions can be in a
- // HYPERLINK destination?
- final boolean isLocalLink = s.contains("\\l ");
- int idx = s.indexOf('"');
- if (idx != -1) {
- int idx2 = s.indexOf('"', 1 + idx);
- if (idx2 != -1) {
- s = s.substring(1 + idx, idx2);
- }
- }
- pendingURL = (isLocalLink ? "#" : "") + s;
- fieldState = 2;
- } else {
- fieldState = 0;
- }
-
- // TODO: we could process the other known field
- // types. Right now, we will extract their text
- // inlined, but fail to record them in metadata
- // as a field value.
- } else if (fieldState == 3) {
- out.endElement("a");
- fieldState = 0;
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.rtf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.CodingErrorAction;
+import java.util.Calendar;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Locale;
+import java.util.Map;
+import java.util.TimeZone;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.CharsetUtils;
+import org.xml.sax.SAXException;
+
+/* Tokenizes and performs a "shallow" parse of the RTF
+ * document, just enough to properly decode the text.
+ *
+ * TODO: we should cutover to a "real" tokenizer (eg JFlex);
+ * it should give better perf, by replacing the excessive
+ * "else if" string compares with FSA traversal. */
+
+final class TextExtractor {
+
+ private static final Charset ASCII = Charset.forName("US-ASCII");
+ private static final Charset WINDOWS_1252 = getCharset("WINDOWS-1252");
+ private static final Charset MAC_ROMAN = getCharset("MacRoman");
+ private static final Charset SHIFT_JIS = getCharset("Shift_JIS");
+ private static final Charset WINDOWS_57011 = getCharset("windows-57011");
+ private static final Charset WINDOWS_57010 = getCharset("windows-57010");
+ private static final Charset WINDOWS_57009 = getCharset("windows-57009");
+ private static final Charset WINDOWS_57008 = getCharset("windows-57008");
+ private static final Charset WINDOWS_57007 = getCharset("windows-57007");
+ private static final Charset WINDOWS_57006 = getCharset("windows-57006");
+ private static final Charset WINDOWS_57005 = getCharset("windows-57005");
+ private static final Charset WINDOWS_57004 = getCharset("windows-57004");
+ private static final Charset WINDOWS_57003 = getCharset("windows-57003");
+ private static final Charset X_ISCII91 = getCharset("x-ISCII91");
+ private static final Charset X_MAC_CENTRAL_EUROPE = getCharset("x-MacCentralEurope");
+ private static final Charset MAC_CYRILLIC = getCharset("MacCyrillic");
+ private static final Charset X_JOHAB = getCharset("x-Johab");
+ private static final Charset CP12582 = getCharset("CP1258");
+ private static final Charset CP12572 = getCharset("CP1257");
+ private static final Charset CP12562 = getCharset("CP1256");
+ private static final Charset CP12552 = getCharset("CP1255");
+ private static final Charset CP12542 = getCharset("CP1254");
+ private static final Charset CP12532 = getCharset("CP1253");
+ private static final Charset CP1252 = getCharset("CP1252");
+ private static final Charset CP12512 = getCharset("CP1251");
+ private static final Charset CP12502 = getCharset("CP1250");
+ private static final Charset CP950 = getCharset("CP950");
+ private static final Charset CP949 = getCharset("CP949");
+ private static final Charset MS9362 = getCharset("MS936");
+ private static final Charset MS8742 = getCharset("MS874");
+ private static final Charset CP866 = getCharset("CP866");
+ private static final Charset CP865 = getCharset("CP865");
+ private static final Charset CP864 = getCharset("CP864");
+ private static final Charset CP863 = getCharset("CP863");
+ private static final Charset CP862 = getCharset("CP862");
+ private static final Charset CP860 = getCharset("CP860");
+ private static final Charset CP852 = getCharset("CP852");
+ private static final Charset CP8502 = getCharset("CP850");
+ private static final Charset CP819 = getCharset("CP819");
+ private static final Charset WINDOWS_720 = getCharset("windows-720");
+ private static final Charset WINDOWS_711 = getCharset("windows-711");
+ private static final Charset WINDOWS_710 = getCharset("windows-710");
+ private static final Charset WINDOWS_709 = getCharset("windows-709");
+ private static final Charset ISO_8859_6 = getCharset("ISO-8859-6");
+ private static final Charset CP4372 = getCharset("CP437");
+ private static final Charset CP850 = getCharset("cp850");
+ private static final Charset CP437 = getCharset("cp437");
+ private static final Charset MS874 = getCharset("ms874");
+ private static final Charset CP1257 = getCharset("cp1257");
+ private static final Charset CP1256 = getCharset("cp1256");
+ private static final Charset CP1255 = getCharset("cp1255");
+ private static final Charset CP1258 = getCharset("cp1258");
+ private static final Charset CP1254 = getCharset("cp1254");
+ private static final Charset CP1253 = getCharset("cp1253");
+ private static final Charset MS950 = getCharset("ms950");
+ private static final Charset MS936 = getCharset("ms936");
+ private static final Charset MS1361 = getCharset("ms1361");
+ private static final Charset MS932 = getCharset("MS932");
+ private static final Charset CP1251 = getCharset("cp1251");
+ private static final Charset CP1250 = getCharset("cp1250");
+ private static final Charset MAC_THAI = getCharset("MacThai");
+ private static final Charset MAC_TURKISH = getCharset("MacTurkish");
+ private static final Charset MAC_GREEK = getCharset("MacGreek");
+ private static final Charset MAC_ARABIC = getCharset("MacArabic");
+ private static final Charset MAC_HEBREW = getCharset("MacHebrew");
+ private static final Charset JOHAB = getCharset("johab");
+ private static final Charset BIG5 = getCharset("Big5");
+ private static final Charset GB2312 = getCharset("GB2312");
+ private static final Charset MS949 = getCharset("ms949");
+ // The RTF doc has a "font table" that assigns ords
+ // (f0, f1, f2, etc.) to fonts and charsets, using the
+ // \fcharsetN control word. This mapping maps from the
+ // N to corresponding Java charset:
+ private static final Map<Integer, Charset> FCHARSET_MAP =
+ new HashMap<Integer, Charset>();
+ // The RTF may specify the \ansicpgN charset in the
+ // header; this maps the N to the corresponding Java
+ // character set:
+ private static final Map<Integer, Charset> ANSICPG_MAP =
+ new HashMap<Integer, Charset>();
+
+ // Populates FCHARSET_MAP: key is the N of a font-table \fcharsetN control
+ // word, value is the Java charset used to decode text in that font.
+ // Values not registered here (e.g. 1, 2, 179-181) have no entry in the map.
+ static {
+ FCHARSET_MAP.put(0, WINDOWS_1252); // ANSI
+ // charset 1 is Default
+ // charset 2 is Symbol
+
+ FCHARSET_MAP.put(77, MAC_ROMAN); // Mac Roman
+ FCHARSET_MAP.put(78, SHIFT_JIS); // Mac Shift Jis
+ FCHARSET_MAP.put(79, MS949); // Mac Hangul
+ FCHARSET_MAP.put(80, GB2312); // Mac GB2312
+ FCHARSET_MAP.put(81, BIG5); // Mac Big5
+ FCHARSET_MAP.put(82, JOHAB); // Mac Johab (old)
+ FCHARSET_MAP.put(83, MAC_HEBREW); // Mac Hebrew
+ FCHARSET_MAP.put(84, MAC_ARABIC); // Mac Arabic
+ FCHARSET_MAP.put(85, MAC_GREEK); // Mac Greek
+ FCHARSET_MAP.put(86, MAC_TURKISH); // Mac Turkish
+ FCHARSET_MAP.put(87, MAC_THAI); // Mac Thai
+ // NOTE(review): 88 and 89 are labelled as Mac charsets but are mapped to
+ // the Windows code pages cp1250/cp1251, while the \ansicpg table uses
+ // x-MacCentralEurope and MacCyrillic -- confirm this is intentional.
+ FCHARSET_MAP.put(88, CP1250); // Mac East Europe
+ FCHARSET_MAP.put(89, CP1251); // Mac Russian
+
+ FCHARSET_MAP.put(128, MS932); // Shift JIS
+ FCHARSET_MAP.put(129, MS949); // Hangul
+ FCHARSET_MAP.put(130, MS1361); // Johab
+ FCHARSET_MAP.put(134, MS936); // GB2312
+ FCHARSET_MAP.put(136, MS950); // Big5
+ FCHARSET_MAP.put(161, CP1253); // Greek
+ FCHARSET_MAP.put(162, CP1254); // Turkish
+ FCHARSET_MAP.put(163, CP1258); // Vietnamese
+ FCHARSET_MAP.put(177, CP1255); // Hebrew
+ FCHARSET_MAP.put(178, CP1256); // Arabic
+ // FCHARSET_MAP.put( 179, "" ); // Arabic Traditional
+ // FCHARSET_MAP.put( 180, "" ); // Arabic user
+ // FCHARSET_MAP.put( 181, "" ); // Hebrew user
+ FCHARSET_MAP.put(186, CP1257); // Baltic
+
+ FCHARSET_MAP.put(204, CP1251); // Russian
+ FCHARSET_MAP.put(222, MS874); // Thai
+ FCHARSET_MAP.put(238, CP1250); // Eastern European
+ FCHARSET_MAP.put(254, CP437); // PC 437
+ FCHARSET_MAP.put(255, CP850); // OEM
+ }
+
+    // Populates ANSICPG_MAP: key is the N of the header's \ansicpgN control
+    // word, value is the Java charset used to decode the document's text.
+    // Each code page is registered under its own number; registering two
+    // charsets under one key would silently drop the earlier one.
+    static {
+        ANSICPG_MAP.put(437, CP4372);      // US IBM
+        ANSICPG_MAP.put(708, ISO_8859_6);  // Arabic (ASMO 708)
+
+        ANSICPG_MAP.put(709, WINDOWS_709); // Arabic (ASMO 449+, BCON V4)
+        ANSICPG_MAP.put(710, WINDOWS_710); // Arabic (transparent Arabic)
+        ANSICPG_MAP.put(711, WINDOWS_711); // Arabic (Nafitha Enhanced)
+        ANSICPG_MAP.put(720, WINDOWS_720); // Arabic (transparent ASMO)
+        ANSICPG_MAP.put(819, CP819);       // Windows 3.1 (US & Western Europe)
+
+        ANSICPG_MAP.put(850, CP8502);      // IBM Multilingual
+        ANSICPG_MAP.put(852, CP852);       // Eastern European
+        ANSICPG_MAP.put(860, CP860);       // Portuguese
+        ANSICPG_MAP.put(862, CP862);       // Hebrew
+        ANSICPG_MAP.put(863, CP863);       // French Canadian
+        ANSICPG_MAP.put(864, CP864);       // Arabic
+        ANSICPG_MAP.put(865, CP865);       // Norwegian
+        ANSICPG_MAP.put(866, CP866);       // Soviet Union
+        ANSICPG_MAP.put(874, MS8742);      // Thai
+        ANSICPG_MAP.put(932, MS932);       // Japanese
+        ANSICPG_MAP.put(936, MS9362);      // Simplified Chinese
+        ANSICPG_MAP.put(949, CP949);       // Korean
+        ANSICPG_MAP.put(950, CP950);       // Traditional Chinese
+        ANSICPG_MAP.put(1250, CP12502);    // Eastern European
+        ANSICPG_MAP.put(1251, CP12512);    // Cyrillic
+        ANSICPG_MAP.put(1252, CP1252);     // Western European
+        ANSICPG_MAP.put(1253, CP12532);    // Greek
+        ANSICPG_MAP.put(1254, CP12542);    // Turkish
+        ANSICPG_MAP.put(1255, CP12552);    // Hebrew
+        ANSICPG_MAP.put(1256, CP12562);    // Arabic
+        ANSICPG_MAP.put(1257, CP12572);    // Baltic
+        ANSICPG_MAP.put(1258, CP12582);    // Vietnamese
+        ANSICPG_MAP.put(1361, X_JOHAB);    // Johab
+        ANSICPG_MAP.put(10000, MAC_ROMAN); // Mac Roman
+        ANSICPG_MAP.put(10001, SHIFT_JIS); // Mac Japan
+        ANSICPG_MAP.put(10004, MAC_ARABIC); // Mac Arabic
+        ANSICPG_MAP.put(10005, MAC_HEBREW); // Mac Hebrew
+        ANSICPG_MAP.put(10006, MAC_GREEK); // Mac Greek
+        ANSICPG_MAP.put(10007, MAC_CYRILLIC); // Mac Cyrillic
+        ANSICPG_MAP.put(10029, X_MAC_CENTRAL_EUROPE); // MAC Latin2
+        ANSICPG_MAP.put(10081, MAC_TURKISH); // Mac Turkish
+        ANSICPG_MAP.put(57002, X_ISCII91); // Devanagari
+
+        // TODO: in theory these other charsets are simple
+        // shifts off of Devanagari, so we could impl that
+        // here:
+        ANSICPG_MAP.put(57003, WINDOWS_57003); // Bengali
+        ANSICPG_MAP.put(57004, WINDOWS_57004); // Tamil
+        ANSICPG_MAP.put(57005, WINDOWS_57005); // Telugu
+        ANSICPG_MAP.put(57006, WINDOWS_57006); // Assamese
+        ANSICPG_MAP.put(57007, WINDOWS_57007); // Oriya
+        ANSICPG_MAP.put(57008, WINDOWS_57008); // Kannada
+        ANSICPG_MAP.put(57009, WINDOWS_57009); // Malayalam
+        ANSICPG_MAP.put(57010, WINDOWS_57010); // Gujarati
+        ANSICPG_MAP.put(57011, WINDOWS_57011); // Punjabi
+    }
+
+ // Used when we decode bytes -> chars using CharsetDecoder:
+ private final char[] outputArray = new char[128];
+ private final CharBuffer outputBuffer = CharBuffer.wrap(outputArray);
+ // Holds the font table from this RTF doc, mapping
+ // the font number (from \fN control word) to the
+ // corresponding charset:
+ private final Map<Integer, Charset> fontToCharset =
+ new HashMap<Integer, Charset>();
+ // Group stack: when we open a new group, we push
+ // the previous group state onto the stack; when we
+ // close the group, we restore it
+ private final LinkedList<GroupState> groupStates = new LinkedList<GroupState>();
+ private final StringBuilder pendingBuffer = new StringBuilder();
+ private final XHTMLContentHandler out;
+ private final Metadata metadata;
+ private final RTFEmbObjHandler embObjHandler;
+ // How many next ansi chars we should skip; this
+ // is 0 except when we are still in the "ansi
+ // shadow" after seeing a unicode escape, at which
+ // point it's set to the last ucN skip we had seen:
+ int ansiSkip = 0;
+ private int written = 0;
+ // Hold pending bytes (encoded in the current charset)
+ // for text output:
+ private byte[] pendingBytes = new byte[16];
+ private int pendingByteCount;
+ private ByteBuffer pendingByteBuffer = ByteBuffer.wrap(pendingBytes);
+ // Holds pending chars for text output
+ private char[] pendingChars = new char[10];
+ private int pendingCharCount;
+ // Holds chars for a still-being-tokenized control word
+ private byte[] pendingControl = new byte[10];
+ private int pendingControlCount;
+ // Reused when possible:
+ private CharsetDecoder decoder;
+ private Charset lastCharset;
+ private Charset globalCharset = WINDOWS_1252;
+ private int globalDefaultFont = -1;
+ private int curFontID = -1;
+ // Current group state; in theory this initial
+ // GroupState is unused because the RTF doc should
+ // immediately open the top group (start with {):
+ private GroupState groupState = new GroupState();
+ private boolean inHeader = true;
+ private int fontTableState;
+ private int fontTableDepth;
+ // Non null if we are processing metadata (title,
+ // keywords, etc.) inside the info group:
+ private Property nextMetaData;
+ private boolean inParagraph;
+ // Non-zero if we are processing inside a field destination:
+ private int fieldState;
+ // Non-zero list index
+ private int pendingListEnd;
+ private Map<Integer, ListDescriptor> listTable = new HashMap<Integer, ListDescriptor>();
+ private Map<Integer, ListDescriptor> listOverrideTable = new HashMap<Integer, ListDescriptor>();
+ private Map<Integer, ListDescriptor> currentListTable;
+ private ListDescriptor currentList;
+ private int listTableLevel = -1;
+ private boolean ignoreLists;
+ // Non-null if we've seen the url for a HYPERLINK but not yet
+ // its text:
+ private String pendingURL;
+ // Used to process the sub-groups inside the upr
+ // group:
+ private int uprState = -1;
+ // Used when extracting CREATION date:
+ private int year, month, day, hour, minute;
+
+    /**
+     * Creates an extractor bound to the given output sinks.
+     *
+     * @param out           handler that receives the XHTML SAX events
+     * @param metadata      receives the document properties found in the header
+     * @param embObjHandler receives the data of embedded objects and pictures
+     */
+    public TextExtractor(XHTMLContentHandler out, Metadata metadata,
+                         RTFEmbObjHandler embObjHandler) {
+        this.out = out;
+        this.metadata = metadata;
+        this.embObjHandler = embObjHandler;
+    }
+
+    /**
+     * Looks up a charset by name, falling back to US-ASCII when the lookup
+     * fails for any reason (e.g. the JVM does not ship that charset).
+     *
+     * @param name charset name to resolve
+     * @return the resolved charset, or {@code ASCII} if it cannot be resolved
+     */
+    private static Charset getCharset(String name) {
+        Charset resolved;
+        try {
+            resolved = CharsetUtils.forName(name);
+        } catch (Exception lookupFailure) {
+            // Deliberate best effort: an unavailable charset must not break class loading.
+            resolved = ASCII;
+        }
+        return resolved;
+    }
+
+    /**
+     * Tells whether {@code ch} is an ASCII hexadecimal digit
+     * ({@code 0-9}, {@code a-f} or {@code A-F}).
+     *
+     * @param ch byte value (or -1 for end of stream) to classify
+     * @return {@code true} only for the 22 ASCII hex digit characters
+     */
+    protected static boolean isHexChar(int ch) {
+        if (ch < '0') {
+            return false;
+        }
+        if (ch <= '9') {
+            return true;
+        }
+        if (ch >= 'a') {
+            return ch <= 'f';
+        }
+        return ch >= 'A' && ch <= 'F';
+    }
+
+    /**
+     * Tells whether {@code ch} is an ASCII letter ({@code a-z} or {@code A-Z}).
+     *
+     * @param ch byte value (or -1 for end of stream) to classify
+     * @return {@code true} only for the 52 ASCII letters
+     */
+    private static boolean isAlpha(int ch) {
+        if (ch >= 'a') {
+            return ch <= 'z';
+        }
+        return ch >= 'A' && ch <= 'Z';
+    }
+
+    /**
+     * Tells whether {@code ch} is an ASCII decimal digit ({@code 0-9}).
+     *
+     * @param ch byte value (or -1 for end of stream) to classify
+     * @return {@code true} only for {@code '0'} through {@code '9'}
+     */
+    private static boolean isDigit(int ch) {
+        return !(ch < '0' || ch > '9');
+    }
+
+    /**
+     * Converts one ASCII hexadecimal digit to its numeric value (0-15).
+     *
+     * <p>Callers are expected to check the character with {@code isHexChar}
+     * first. The letter ranges are limited to {@code a-f}/{@code A-F} so that a
+     * non-hex letter trips the assertion instead of yielding a value above 15.
+     *
+     * @param ch a hex digit character
+     * @return the value of the digit, in the range 0 to 15
+     */
+    protected static int hexValue(int ch) {
+        if (ch >= '0' && ch <= '9') {
+            return ch - '0';
+        } else if (ch >= 'a' && ch <= 'f') {
+            return 10 + (ch - 'a');
+        } else {
+            assert ch >= 'A' && ch <= 'F' : "not a hex digit: " + ch;
+            return 10 + (ch - 'A');
+        }
+    }
+
+ /**
+ * Returns the current value of the <code>ignoreLists</code> flag.
+ *
+ * @return <code>true</code> if list handling is switched off
+ */
+ public boolean isIgnoringLists() {
+ return ignoreLists;
+ }
+
+ /**
+ * Sets the <code>ignoreLists</code> flag.
+ *
+ * @param ignore <code>true</code> to switch list handling off
+ */
+ public void setIgnoreLists(boolean ignore) {
+ this.ignoreLists = ignore;
+ }
+
+    // Flushes whatever text is buffered: the pending bytes if there are any,
+    // otherwise the pending chars.
+    private void pushText() throws IOException, SAXException, TikaException {
+        if (pendingByteCount == 0) {
+            pushChars();
+            return;
+        }
+        // Bytes are pending, so no chars may be pending alongside them.
+        assert pendingCharCount == 0;
+        pushBytes();
+    }
+
+    // Queues one byte (a unit in the current charset) for output. Pending
+    // chars are flushed first. Inside a picture the byte is handed to the
+    // embedded-object handler instead of being buffered.
+    private void addOutputByte(int b) throws IOException, SAXException, TikaException {
+        assert b >= 0 && b < 256 : "byte value out of range: " + b;
+
+        if (pendingCharCount != 0) {
+            pushChars();
+        }
+
+        if (groupState.pictDepth > 0) {
+            embObjHandler.writeMetadataChar((char) b);
+            return;
+        }
+
+        if (pendingByteCount == pendingBytes.length) {
+            // Buffer is full: grow it by a quarter and re-wrap the ByteBuffer view.
+            final int grownLength = (int) (pendingBytes.length * 1.25);
+            final byte[] grown = new byte[grownLength];
+            System.arraycopy(pendingBytes, 0, grown, 0, pendingBytes.length);
+            pendingBytes = grown;
+            pendingByteBuffer = ByteBuffer.wrap(grown);
+        }
+        pendingBytes[pendingByteCount] = (byte) b;
+        pendingByteCount++;
+    }
+
+    // Appends one letter to the control word that is currently being tokenized.
+    private void addControl(int b) {
+        assert isAlpha(b);
+        final int capacity = pendingControl.length;
+        if (pendingControlCount == capacity) {
+            // Buffer is full: grow it by a quarter, keeping the bytes collected so far.
+            final byte[] grown = new byte[(int) (capacity * 1.25)];
+            System.arraycopy(pendingControl, 0, grown, 0, capacity);
+            pendingControl = grown;
+        }
+        pendingControl[pendingControlCount] = (byte) b;
+        pendingControlCount++;
+    }
+
+    // Queues one UTF-16 code unit for output. Pending bytes are flushed first.
+    // The char goes to pendingBuffer while in the header or while collecting a
+    // field instruction (fieldState == 1), to the embedded-object handler inside
+    // an sn/sv group, and to the pending char buffer otherwise.
+    private void addOutputChar(char ch) throws IOException, SAXException, TikaException {
+        if (pendingByteCount != 0) {
+            pushBytes();
+        }
+
+        if (inHeader || fieldState == 1) {
+            pendingBuffer.append(ch);
+            return;
+        }
+
+        if (groupState.sn || groupState.sv) {
+            embObjHandler.writeMetadataChar(ch);
+            return;
+        }
+
+        final int capacity = pendingChars.length;
+        if (pendingCharCount == capacity) {
+            // Buffer is full: grow it by a quarter, keeping the chars collected so far.
+            final char[] grown = new char[(int) (capacity * 1.25)];
+            System.arraycopy(pendingChars, 0, grown, 0, capacity);
+            pendingChars = grown;
+        }
+        pendingChars[pendingCharCount] = ch;
+        pendingCharCount++;
+    }
+
+    /**
+     * Shallow-parses the whole document, writing the text to {@code this.out}
+     * and the document properties to {@code this.metadata}.
+     *
+     * <p>The stream is wrapped with a two-byte pushback buffer, because the
+     * parser looks ahead and then un-reads up to two bytes.
+     *
+     * @param in the raw RTF bytes
+     */
+    public void extract(InputStream in) throws IOException, SAXException, TikaException {
+        final PushbackInputStream pushback = new PushbackInputStream(in, 2);
+        extract(pushback);
+    }
+
+    // Main tokenizer loop: dispatches on each byte until end of stream or until
+    // the brace that closes the outermost group has been processed.
+    private void extract(PushbackInputStream in) throws IOException, SAXException, TikaException {
+        out.startDocument();
+
+        int b;
+        readLoop:
+        while ((b = in.read()) != -1) {
+            switch (b) {
+                case '\\':
+                    parseControlToken(in);
+                    break;
+                case '{':
+                    pushText();
+                    processGroupStart(in);
+                    break;
+                case '}':
+                    pushText();
+                    processGroupEnd();
+                    if (groupStates.isEmpty()) {
+                        // parsed document closing brace
+                        break readLoop;
+                    }
+                    break;
+                default:
+                    if (groupState.objdata || groupState.pictDepth == 1) {
+                        // Object or picture payload is hex text for the embedded-object handler.
+                        embObjHandler.writeHexChar(b);
+                    } else if (b == '\r' || b == '\n') {
+                        // Linefeed and carriage return are not significant
+                    } else if (!groupState.ignore || nextMetaData != null
+                            || groupState.sn || groupState.sv) {
+                        if (ansiSkip != 0) {
+                            // Still skipping the ansi chars that follow a unicode escape.
+                            ansiSkip--;
+                        } else {
+                            addOutputByte(b);
+                        }
+                    }
+                    break;
+            }
+        }
+
+        endParagraph(false);
+        out.endDocument();
+    }
+
+ private void parseControlToken(PushbackInputStream in) throws IOException, SAXException, TikaException {
+ int b = in.read();
+ if (b == '\'') {
+ // escaped hex char
+ parseHexChar(in);
+ } else if (isAlpha(b)) {
+ // control word
+ parseControlWord((char) b, in);
+ } else if (b == '{' || b == '}' || b == '\\' || b == '\r' || b == '\n') {
+ // escaped char
+ addOutputByte(b);
+ } else if (b != -1) {
+ // control symbol, eg \* or \~
+ processControlSymbol((char) b);
+ }
+ }
+
+    // Parses the two hex digits of a \'hh escape and queues the decoded byte,
+    // unless it falls in the ansi shadow of a preceding unicode escape.
+    // A malformed escape is dropped; the offending byte is pushed back so the
+    // main loop sees it. At end of stream nothing is pushed back:
+    // unread(-1) would inject a spurious 0xFF byte into the input.
+    private void parseHexChar(PushbackInputStream in) throws IOException, SAXException, TikaException {
+        int hex1 = in.read();
+        if (!isHexChar(hex1)) {
+            // DOC ERROR (malformed hex escape): ignore
+            if (hex1 != -1) {
+                in.unread(hex1);
+            }
+            return;
+        }
+
+        int hex2 = in.read();
+        if (!isHexChar(hex2)) {
+            // TODO: log a warning here, somehow?
+            // DOC ERROR (malformed hex escape):
+            // ignore
+            if (hex2 != -1) {
+                in.unread(hex2);
+            }
+            return;
+        }
+
+        if (ansiSkip != 0) {
+            // Skip this ansi char since we are
+            // still in the shadow of a unicode
+            // escape:
+            ansiSkip--;
+        } else {
+            // Unescape:
+            addOutputByte(16 * hexValue(hex1) + hexValue(hex2));
+        }
+    }
+
+ /**
+  * Parses a control word and its optional numeric parameter, then dispatches to
+  * {@code processControlWord(param, in)} or {@code processControlWord()}.
+  *
+  * <p>The alphabetic characters are passed to {@code addControl}. An optional
+  * '-' followed by decimal digits forms the parameter; a '-' with no digits
+  * still counts as a parameter with value 0. A single space after the control
+  * word is consumed as its delimiter. Any other delimiter is pushed back for
+  * the main loop, except end of stream (-1): {@code PushbackInputStream.unread(int)}
+  * would store that as the byte 0xFF, which would then be parsed as text.
+  *
+  * @param firstChar first (alphabetic) character of the control word, already read
+  * @param in        stream positioned just after {@code firstChar}
+  */
+ private void parseControlWord(int firstChar, PushbackInputStream in) throws IOException, SAXException, TikaException {
+     addControl(firstChar);
+
+     int b = in.read();
+     while (isAlpha(b)) {
+         addControl(b);
+         b = in.read();
+     }
+
+     boolean hasParam = false;
+     boolean negParam = false;
+     if (b == '-') {
+         negParam = true;
+         hasParam = true;
+         b = in.read();
+     }
+
+     int param = 0;
+     while (isDigit(b)) {
+         param *= 10;
+         param += (b - '0');
+         hasParam = true;
+         b = in.read();
+     }
+
+     // space is consumed as part of the
+     // control word, but is not added to the
+     // control word; EOF must not be pushed back
+     if (b != ' ' && b != -1) {
+         in.unread(b);
+     }
+
+     if (hasParam) {
+         if (negParam) {
+             param = -param;
+         }
+         processControlWord(param, in);
+     } else {
+         processControlWord();
+     }
+
+     // Reset the count so the next control word starts from an empty buffer
+     pendingControlCount = 0;
+ }
+
+ /**
+  * Opens a paragraph ({@code p}) or list item ({@code li}) element if none is
+  * currently open, handling any pending list end/start first and re-applying
+  * the current bold/italic state inside the new element. Does nothing when
+  * already inside a paragraph.
+  */
+ private void lazyStartParagraph() throws IOException, SAXException, TikaException {
+     if (inParagraph) {
+         return;
+     }
+
+     // Ensure </i></b> order
+     if (groupState.italic) {
+         end("i");
+     }
+     if (groupState.bold) {
+         end("b");
+     }
+
+     // Finish a list that was left pending, unless we are still in that same list
+     if (pendingListEnd != 0 && groupState.list != pendingListEnd) {
+         endList(pendingListEnd);
+         pendingListEnd = 0;
+     }
+     if (inList() && pendingListEnd != groupState.list) {
+         startList(groupState.list);
+     }
+
+     out.startElement(inList() ? "li" : "p");
+
+     // Ensure <b><i> order
+     if (groupState.bold) {
+         start("b");
+     }
+     if (groupState.italic) {
+         start("i");
+     }
+     inParagraph = true;
+ }
+
+ private void endParagraph(boolean preserveStyles) throws IOException, SAXException, TikaException {
+ pushText();
+ //maintain consecutive new lines
+ if (!inParagraph) {
+ lazyStartParagraph();
+ }
+ if (inParagraph) {
+ if (groupState.italic) {
+ end("i");
+ groupState.italic = preserveStyles;
+ }
+ if (groupState.bold) {
+ end("b");
+ groupState.bold = preserveStyles;
+ }
+ if (inList()) {
+ out.endElement("li");
+ } else {
+ out.endElement("p");
+ }
+
+ if (preserveStyles && (groupState.bold || groupState.italic)) {
+ start("p");
+ if (groupState.bold) {
+ start("b");
+ }
+ if (groupState.italic) {
+ start("i");
+ }
+ inParagraph = true;
+ } else {
+ inParagraph = false;
+ }
+ }
+
+ // Ensure closing the list at document end
+ if (!preserveStyles && pendingListEnd != 0) {
+ endList(pendingListEnd);
+ pendingListEnd = 0;
+ }
+ }
+
+ // Sends any buffered UTF16 code units in pendingChars to the out
+ // ContentHandler (starting a paragraph first if one is not open)
+ // and empties the buffer. No-op when nothing is buffered.
+ private void pushChars() throws IOException, SAXException, TikaException {
+     if (pendingCharCount == 0) {
+         return;
+     }
+     lazyStartParagraph();
+     out.characters(pendingChars, 0, pendingCharCount);
+     pendingCharCount = 0;
+ }
+
+ // Decodes the bytes buffered in pendingByteBuffer into UTF16 code
+ // units using the current decoder and hands them on: appended to
+ // pendingBuffer while in the header or when fieldState == 1,
+ // otherwise sent to the out ContentHandler. Bytes buffered while the
+ // group is ignored (and no metadata is pending) are dropped. The
+ // pending byte count is always reset.
+ private void pushBytes() throws IOException, SAXException, TikaException {
+     if (pendingByteCount > 0 && (!groupState.ignore || nextMetaData != null)) {
+
+         final CharsetDecoder activeDecoder = getDecoder();
+         pendingByteBuffer.limit(pendingByteCount);
+         assert pendingByteBuffer.position() == 0;
+         assert outputBuffer.position() == 0;
+
+         CoderResult status;
+
+         // We pass true for endOfInput because, when
+         // we are called, we should have seen a
+         // complete sequence of characters for this
+         // charset:
+         do {
+             status = activeDecoder.decode(pendingByteBuffer, outputBuffer, true);
+             drainOutputBuffer();
+         } while (status != CoderResult.UNDERFLOW);
+
+         // Flush whatever the decoder still holds internally
+         do {
+             status = activeDecoder.flush(outputBuffer);
+             drainOutputBuffer();
+         } while (status != CoderResult.UNDERFLOW);
+
+         // Reset for next decode
+         activeDecoder.reset();
+         pendingByteBuffer.position(0);
+     }
+
+     pendingByteCount = 0;
+ }
+
+ // Forwards the characters currently in outputBuffer (read through
+ // outputArray, presumably its backing array) to pendingBuffer or to
+ // the out ContentHandler, then rewinds outputBuffer. No-op when the
+ // buffer is empty.
+ private void drainOutputBuffer() throws IOException, SAXException, TikaException {
+     final int count = outputBuffer.position();
+     if (count <= 0) {
+         return;
+     }
+     if (inHeader || fieldState == 1) {
+         pendingBuffer.append(outputArray, 0, count);
+     } else {
+         lazyStartParagraph();
+         out.characters(outputArray, 0, count);
+     }
+     outputBuffer.position(0);
+ }
+
+ // NOTE: s must be ascii alpha only
+ private boolean equals(String s) {
+ if (pendingControlCount != s.length()) {
+ return false;
+ }
+ for (int idx = 0; idx < pendingControlCount; idx++) {
+ assert isAlpha(s.charAt(idx));
+ if (((byte) s.charAt(idx)) != pendingControl[idx]) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ private void processControlSymbol(char ch) throws IOException, SAXException, TikaException {
+ switch (ch) {
+ case '~':
+ // Non-breaking space -> unicode NON-BREAKING SPACE
+ addOutputChar('\u00a0');
+ break;
+ case '*':
+ // Ignorable destination (control words defined after
+ // the 1987 RTF spec). These are already handled by
+ // processGroupStart()
+ break;
+ case '-':
+ // Optional hyphen -> unicode SOFT HYPHEN
+ addOutputChar('\u00ad');
+ break;
+ case '_':
+ // Non-breaking hyphen -> unicode NON-BREAKING HYPHEN
+ addOutputChar('\u2011');
+ break;
+ default:
+ break;
+ }
+ }
+
+ private CharsetDecoder getDecoder() throws TikaException {
+ Charset charset = getCharset();
+
+ // Common case: charset is same as last time, so
+ // just reuse it:
+ if (lastCharset == null || !charset.equals(lastCharset)) {
+ decoder = charset.newDecoder();
+ decoder.onMalformedInput(CodingErrorAction.REPLACE);
+ decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
+ lastCharset = charset;
+ }
+
+ return decoder;
+ }
+
+ // Return current charset in-use
+ private Charset getCharset() throws TikaException {
+ // If a specific font (fN) was set, use its charset
+ if (groupState.fontCharset != null) {
+ return groupState.fontCharset;
+ }
+
+ // Else, if global default font (defN) was set, use that one
+ if (globalDefaultFont != -1 && !inHeader) {
+ Charset cs = fontToCharset.get(globalDefaultFont);
+ if (cs != null) {
+ return cs;
+ }
+ }
+
+ /
<TRUNCATED>