You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2011/09/15 18:42:07 UTC

svn commit: r1171171 [1/3] - in /tika/trunk: ./ tika-core/src/main/java/org/apache/tika/sax/ tika-core/src/main/java/org/apache/tika/utils/ tika-parsers/src/main/java/org/apache/tika/parser/rtf/ tika-parsers/src/test/java/org/apache/tika/ tika-parsers/...

Author: mikemccand
Date: Thu Sep 15 16:42:06 2011
New Revision: 1171171

URL: http://svn.apache.org/viewvc?rev=1171171&view=rev
Log:
TIKA-683: new RTF parser that performs its own direct shallow parse (instead of using RTFEditorKit from javax.swing)

Added:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java   (with props)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFBoldItalic.rtf   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFControls.rtf   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFInvalidUnicode.rtf   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFUmlautSpaces2.rtf   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFUnicodeUCNControlWordCharacterDoubling.rtf   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFVarious.rtf   (with props)
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1171171&r1=1171170&r2=1171171&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Thu Sep 15 16:42:06 2011
@@ -6,6 +6,14 @@ The most notable changes in Tika 1.0 ove
 
  * A parser for CHM help files was added. (TIKA-245)
 
+ * The RTF parser was rewritten to perform its own direct shallow
+   parse of the RTF content, instead of using RTFEditorKit from
+   javax.swing.  This fixes several issues in the old parser,
+   including doubling of Unicode characters in certain cases
+   (TIKA-683), exceptions on malformed RTF docs (TIKA-666), and
+   missing text from some elements (header/footer, hyperlinks,
+   footnotes, text inside pictures).
+
  
 Release 0.9 - 02/13/2011
 

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java?rev=1171171&r1=1171170&r2=1171171&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java Thu Sep 15 16:42:06 2011
@@ -16,6 +16,11 @@
  */
 package org.apache.tika.sax;
 
+/*
+import java.util.ArrayList;
+import java.util.List;
+*/
+
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -200,12 +205,48 @@ public class SafeContentHandler extends 
         output.write(REPLACEMENT, 0, REPLACEMENT.length);
     }
 
+
+    /*
+    private final List<String> elements = new ArrayList<String>();
+
+    // Called only from assert
+    private boolean verifyStartElement(String name) {
+        // TODO: we could strengthen this to do full
+        // XHTML validation, e.g. you shouldn't start p inside
+        // another p (but ODF parser, at least, seems to
+        // violate this):
+        //if (name.equals("p")) {
+        //assert elements.size() == 0 || !elements.get(elements.size()-1).equals("p");
+        //}
+        elements.add(name);
+        return true;
+    }
+
+    // Called only from assert
+    private boolean verifyEndElement(String name) {
+        assert elements.size() > 0: "end tag=" + name + " with no startElement";
+        final String currentElement = elements.get(elements.size()-1);
+        assert currentElement.equals(name): "mismatched elements open=" + currentElement + " close=" + name;
+        elements.remove(elements.size()-1);
+        return true;
+    }
+
+    // Called only from assert
+    private boolean verifyEndDocument() {
+        assert elements.size() == 0;
+        return true;
+    }
+    */
+
     //------------------------------------------------------< ContentHandler >
 
     @Override
     public void startElement(
             String uri, String localName, String name, Attributes atts)
             throws SAXException {
+        // TODO: enable this, but some parsers currently
+        // trip it
+        //assert verifyStartElement(name);
         // Look for any invalid characters in attribute values.
         for (int i = 0; i < atts.getLength(); i++) {
             if (isInvalid(atts.getValue(i))) {
@@ -231,6 +272,23 @@ public class SafeContentHandler extends 
     }
 
     @Override
+    public void endElement(String uri, String localName, String name)
+            throws SAXException {
+        // TODO: enable this, but some parsers currently
+        // trip it
+        //assert verifyEndElement(name);
+        super.endElement(uri, localName, name);
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+        // TODO: enable this, but some parsers currently
+        // trip it
+        //assert verifyEndDocument();
+        super.endDocument();
+    }
+
+    @Override
     public void characters(char[] ch, int start, int length)
             throws SAXException {
         filter(ch, start, length, charactersOutput);

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java?rev=1171171&r1=1171170&r2=1171171&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java Thu Sep 15 16:42:06 2011
@@ -16,6 +16,8 @@
  */
 package org.apache.tika.utils;
 
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
 import java.nio.charset.Charset;
 import java.nio.charset.IllegalCharsetNameException;
 import java.util.HashMap;
@@ -51,6 +53,9 @@ public class CharsetUtils {
      */
     public static boolean isSupported(String charsetName) {
         try {
+            if (isSupportedICU != null && ((Boolean) isSupportedICU.invoke(null, charsetName)).booleanValue()) {
+                return true;
+            }
             return Charset.isSupported(charsetName);
         } catch (IllegalCharsetNameException e) {
             return false;
@@ -103,11 +108,54 @@ public class CharsetUtils {
         }
         
         try {
-            Charset cs = Charset.forName(result);
+            Charset cs = forName(result);
             return cs.name();
         } catch (Exception e) {
             return null;
         }
     }
 
+    private static Method getCharsetICU;
+    private static Method isSupportedICU;
+
+    static {
+        // See if we can load the icu4j CharsetICU class
+        Class icuCharset = null;
+        try  {
+            icuCharset = CharsetUtils.class.getClassLoader().loadClass("com.ibm.icu.charset.CharsetICU");
+        } 
+        catch (ClassNotFoundException e)  {
+        }
+        if (icuCharset != null) {
+            try {
+                getCharsetICU = icuCharset.getMethod("forNameICU", String.class);
+            } catch (Throwable t) {
+                throw new RuntimeException(t);
+            }
+            try {
+                isSupportedICU = icuCharset.getMethod("isSupported", String.class);
+            } catch (Throwable t) {
+            }
+            // TODO: would be nice to somehow log that we
+            // successfully found ICU
+        }
+    }
+
+    /** Returns Charset impl, if one exists.  This method
+     *  optionally uses ICU4J's CharsetICU.forNameICU,
+     *  if it is found on the classpath, else only uses
+     *  JDK's builtin Charset.forName. */
+    public static Charset forName(String name) {
+        if (getCharsetICU != null) {
+            try {
+                Charset cs = (Charset) getCharsetICU.invoke(null, name);
+                if (cs != null) {
+                    return cs;
+                }
+            } catch (InvocationTargetException ite) {
+            } catch (IllegalAccessException iae) {
+            }
+        }
+        return Charset.forName(name);
+    }
 }

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java?rev=1171171&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java Thu Sep 15 16:42:06 2011
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.rtf;
+
+/* Holds all state associated with the current RTF group, i.e. {
+ * ... }. */
+
+class GroupState {
+    public int depth;
+    public boolean bold;
+    public boolean italic;
+    // True if we are skipping all text in current group,
+    // eg if group leads with a \*:
+    public boolean ignore;
+    // Default is 1 if no uc control has been seen yet:
+    public int ucSkip = 1;
+    public String fontCharset;
+
+    // Create default (root) GroupState
+    public GroupState() {
+    }
+
+    // Create new GroupState, inheriting all properties from current one, adding 1 to the depth
+    public GroupState(GroupState other) {
+        bold = other.bold;
+        italic = other.italic;
+        ignore = other.ignore;
+        ucSkip = other.ucSkip;
+        fontCharset = other.fontCharset;
+        depth = 1+other.depth;
+    }
+}

Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java?rev=1171171&r1=1171170&r2=1171171&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java Thu Sep 15 16:42:06 2011
@@ -16,33 +16,13 @@
  */
 package org.apache.tika.parser.rtf;
 
-import java.io.BufferedOutputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.UnsupportedEncodingException;
 import java.util.Collections;
-import java.util.HashMap;
-import java.util.LinkedList;
-import java.util.Map;
 import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import javax.swing.text.AttributeSet;
-import javax.swing.text.BadLocationException;
-import javax.swing.text.DefaultStyledDocument;
-import javax.swing.text.Document;
-import javax.swing.text.StyleContext;
-import javax.swing.text.rtf.RTFEditorKit;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TaggedInputStream;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
@@ -59,60 +39,6 @@ public class RTFParser extends AbstractP
     private static final Set<MediaType> SUPPORTED_TYPES = Collections
             .singleton(MediaType.application("rtf"));
 
-    private static final Pattern F_PATTERN = Pattern.compile("\\\\a?f([0-9]+)");
-
-    private static final Pattern FCHARSET_PATTERN = Pattern
-            .compile("\\\\fcharset[0-9]+");
-
-    private static final Pattern ANSICPG_PATTERN = Pattern
-            .compile("\\\\ansicpg[0-9]+");
-
-    private static final Pattern DEFAULT_FONT_PATTERN = Pattern.compile("\\\\deff(0-9)+");
-
-    private static final Pattern FONT_FAMILY_PATTERN = Pattern.compile("\\\\f(nil|roman|swiss|modern|script|decor|tech|bidi)");
-
-    private static Map<Integer, String> FONTSET_MAP = new HashMap<Integer, String>();
-    static {
-        FONTSET_MAP.put(0, "windows-1251"); // ANSI
-        // charset 1 is Default
-        // charset 2 is Symbol
-
-        FONTSET_MAP.put(77, "MacRoman"); // Mac Roman
-        FONTSET_MAP.put(78, "Shift_JIS"); // Mac Shift Jis
-        FONTSET_MAP.put(79, "ms949"); // Mac Hangul
-        FONTSET_MAP.put(80, "GB2312"); // Mac GB2312
-        FONTSET_MAP.put(81, "Big5"); // Mac Big5
-        FONTSET_MAP.put(82, "johab"); // Mac Johab (old)
-        FONTSET_MAP.put(83, "MacHebrew"); // Mac Hebrew
-        FONTSET_MAP.put(84, "MacArabic"); // Mac Arabic
-        FONTSET_MAP.put(85, "MacGreek"); // Mac Greek
-        FONTSET_MAP.put(86, "MacTurkish"); // Mac Turkish
-        FONTSET_MAP.put(87, "MacThai"); // Mac Thai
-        FONTSET_MAP.put(88, "cp1250"); // Mac East Europe
-        FONTSET_MAP.put(89, "cp1251"); // Mac Russian
-
-        FONTSET_MAP.put(128, "MS932"); // Shift JIS
-        FONTSET_MAP.put(129, "ms949"); // Hangul
-        FONTSET_MAP.put(130, "ms1361"); // Johab
-        FONTSET_MAP.put(134, "ms936"); // GB2312
-        FONTSET_MAP.put(136, "ms950"); // Big5
-        FONTSET_MAP.put(161, "cp1253"); // Greek
-        FONTSET_MAP.put(162, "cp1254"); // Turkish
-        FONTSET_MAP.put(163, "cp1258"); // Vietnamese
-        FONTSET_MAP.put(177, "cp1255"); // Hebrew
-        FONTSET_MAP.put(178, "cp1256"); // Arabic
-        // FONTSET_MAP.put( 179, "" ); // Arabic Traditional
-        // FONTSET_MAP.put( 180, "" ); // Arabic user
-        // FONTSET_MAP.put( 181, "" ); // Hebrew user
-        FONTSET_MAP.put(186, "cp1257"); // Baltic
-
-        FONTSET_MAP.put(204, "cp1251"); // Russian
-        FONTSET_MAP.put(222, "ms874"); // Thai
-        FONTSET_MAP.put(238, "cp1250"); // Eastern European
-        FONTSET_MAP.put(254, "cp437"); // PC 437
-        FONTSET_MAP.put(255, "cp850"); // OEM
-    }
-
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return SUPPORTED_TYPES;
     }
@@ -120,294 +46,15 @@ public class RTFParser extends AbstractP
     public void parse(
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
+        throws IOException, SAXException, TikaException {
         TaggedInputStream tagged = new TaggedInputStream(stream);
-        TemporaryResources tmp = new TemporaryResources();
         try {
-            File tempFile = tmp.createTemporaryFile();
-            createUnicodeRtfTempFile(tempFile, stream);
-
-            InputStream in = TikaInputStream.get(tempFile);
-            try {
-                Document sd = new CustomStyledDocument();
-                new RTFEditorKit().read(in, sd, 0);
-
-                XHTMLContentHandler xhtml =
-                    new XHTMLContentHandler(handler, metadata);
-                xhtml.startDocument();
-                xhtml.element("p", sd.getText(0, sd.getLength()));
-                xhtml.endDocument();
-                
-                // TODO Extract some of the metadata
-                metadata.add(Metadata.CONTENT_TYPE, "application/rtf");
-            } finally {
-                in.close();
-            }
+            final TextExtractor ert = new TextExtractor(new XHTMLContentHandler(handler, metadata), metadata);
+            ert.extract(stream);
+            metadata.add(Metadata.CONTENT_TYPE, "application/rtf");
         } catch (IOException e) {
             tagged.throwIfCauseOf(e);
             throw new TikaException("Error parsing an RTF document", e);
-        } catch (BadLocationException e) {
-            throw new TikaException("Error parsing an RTF document", e);
-        } catch (NullPointerException e) {
-            // TIKA-621: RTF parsing fails with Java 7 early access
-            // on 64bit platforms
-            throw new TikaException("Error parsing an RTF document", e);
-        } finally {
-            tmp.dispose();
-        }
-    }
-
-    private String escapeByUnicode(String data, String enc) {
-        StringBuilder dataBuf = new StringBuilder(data.length() + 16);
-        StringBuilder keywordBuf = new StringBuilder(4);
-        StringBuilder origDataBuf = new StringBuilder();
-        ByteArrayOutputStream baos = new ByteArrayOutputStream();
-        for (int i = 0; i < data.length(); i++) {
-            char c1 = data.charAt(i);
-            keywordBuf.append(c1);
-            if (c1 == '\\' && data.length()>i+1) {
-                i++;
-                char c2 = data.charAt(i);
-                keywordBuf.append(c2);
-                if (c2 == '\'') {
-                    i++;
-                    char c3 = data.charAt(i);
-                    keywordBuf.append(c3);
-                    if ((c3 >= '0' && c3 <= '9') || (c3 >= 'a' && c3 <= 'f')
-                            || (c3 >= 'A' && c3 <= 'F')) {
-                        i++;
-                        char c4 = data.charAt(i);
-                        keywordBuf.append(c4);
-                        if ((c4 >= '0' && c4 <= '9')
-                                || (c4 >= 'a' && c4 <= 'f')
-                                || (c4 >= 'A' && c4 <= 'F')) {
-                            int value = Integer.parseInt(
-                                    String.valueOf(new char[] { c3, c4 }), 16);
-                            baos.write(value);
-                            origDataBuf.append(keywordBuf.toString());
-                            keywordBuf.delete(0, 4);
-                            continue;
-                        }
-                    }
-                }
-            }
-            if (baos.size() != 0) {
-                try {
-                    appendUnicodeStr(dataBuf, new String(baos.toByteArray(),
-                            enc));
-                } catch (UnsupportedEncodingException e) {
-                    dataBuf.append(origDataBuf.toString());
-                }
-                origDataBuf.delete(0, origDataBuf.length());
-                baos.reset();
-            }
-            dataBuf.append(keywordBuf.toString());
-            keywordBuf.delete(0, 4);
-        }
-
-        if (baos.size() != 0) {
-            try {
-                appendUnicodeStr(dataBuf, new String(baos.toByteArray(), enc));
-            } catch (UnsupportedEncodingException e) {
-                dataBuf.append(origDataBuf.toString());
-            }
-        }
-
-        return dataBuf.toString();
-    }
-
-    private void appendUnicodeStr(StringBuilder dataBuf, String value) {
-        for (int j = 0; j < value.length(); j++) {
-            char ch = value.charAt(j);
-            if (ch >= 20 && ch < 80) {
-                dataBuf.append(ch);
-            } else {
-                dataBuf.append("{\\u");
-                dataBuf.append((int) ch);
-                dataBuf.append('}');
-            }
-        }
-    }
-
-    private void createUnicodeRtfTempFile(File tempFile, InputStream in)
-            throws IOException {
-        OutputStream out =
-            new BufferedOutputStream(new FileOutputStream(tempFile));
-        try {
-            String defaultCharset = "windows-1251"; // ansi
-            String defaultFont = "0";
-            Map<String, String> fontTableMap = new HashMap<String, String>();
-            StringBuilder dataBuf = new StringBuilder(255);
-            int ch;
-            LinkedList<String> charsetQueue = new LinkedList<String>();
-            int depth = 0;
-            String prevFt = null;
-            int prevCh = -1;
-            while ((ch = in.read()) != -1) {
-                if ( ((ch == '{' || ch == '}') && prevCh!='\\') || ( ch == ' ' && (! FONT_FAMILY_PATTERN.matcher(dataBuf.toString()).find())) ) {
-                    if (charsetQueue.size() > depth + 1) {
-                        charsetQueue.removeLast();
-                    }
-
-                    String data = dataBuf.toString();
-                    data = data.replace("\\cell","\\u0020\\cell");
-
-                    if(data.indexOf("\\colortbl")!=-1){
-                        // End of font table, clear last/previous font encountered.
-                        prevFt = null;
-                    }
-
-                    if (depth == 1) {
-                        // check control words for a default charset
-                        String cset = loadAnsiCpg(data);
-                        if (cset != null) {
-                            defaultCharset = cset;
-                        }
-                        Matcher matcher = DEFAULT_FONT_PATTERN.matcher(data);
-                        if(matcher.find()){
-                            defaultFont = matcher.group(1);
-                        }
-                    }
-
-                    String ft = loadFontTable(data);
-                    String charset = loadCharset(data);
-                    if (ft != null && charset != null) {
-                        fontTableMap.put(ft, charset);
-                    }
-
-                    if (ft == null && prevCh == ' ') {
-                        ft = prevFt;
-                    } else if (ft != null) {
-                        prevFt = ft;
-                    }
-                    if(ft==null){
-                        ft = defaultFont;
-                    }
-
-                    // set a current charset
-                    if (charset == null && ft != null) {
-                        charset = fontTableMap.get(ft);
-                    }
-                    if (charset == null && charsetQueue.size() > 0) {
-                        charset = charsetQueue.getLast();
-                    }
-                    if (charset == null) {
-                        charset = defaultCharset;
-                    }
-
-                    // add the current charset to a queue
-                    if (charsetQueue.size() < depth + 1) {
-                        charsetQueue.add(charset);
-                    }
-
-                    String escapedStr = "windows-1251".equals(charset) ? data
-                            : escapeByUnicode(data, charset);
-                    out.write(escapedStr.getBytes("UTF-8"));
-                    out.write(ch);
-                    dataBuf.delete(0, dataBuf.length());
-
-                    prevCh = ch;
-
-                    // update a depth
-                    if (ch == '{') {
-                        depth++;
-                    } else if (ch == '}') {
-                        depth--;
-                    }
-                } else {
-                    dataBuf.append((char) ch);
-                }
-            }
-        } finally {
-            out.close();
-        }
-    }
-
-    private String loadFontTable(String line) {
-        Matcher m = F_PATTERN.matcher(line);
-        String font = null;
-        while((m.find())) {
-            font = m.group(1);
-        }
-        return font;
-    }
-
-    private String loadAnsiCpg(String line) {
-        Matcher m = ANSICPG_PATTERN.matcher(line);
-        String charset = null;
-        if (m.find()) {
-            int encVal;
-            try {
-                encVal = Integer.parseInt(m.group().substring(8));
-                charset = FONTSET_MAP.get(encVal);
-            } catch (NumberFormatException e) {
-                // ignore
-            }
-        }
-
-        return charset;
-    }
-
-    private String loadCharset(String line) {
-        Matcher m = FCHARSET_PATTERN.matcher(line);
-        String charset = null;
-        if (m.find()) {
-            int encVal;
-            try {
-                encVal = Integer.parseInt(m.group().substring(9));
-            } catch (NumberFormatException e) {
-                encVal = 0;
-            }
-            charset = FONTSET_MAP.get(encVal);
         }
-
-        return charset;
-    }
-
-    /**
-     * Customized version of {@link DefaultStyledDocument}. Adds whitespace
-     * to places where words otherwise could have run together (see
-     * <a href="https://issues.apache.org/jira/browse/TIKA-392">TIKA-392</a>),
-     * and works around the problem of Swing expecting a GUI environment (see
-     * <a href="https://issues.apache.org/jira/browse/TIKA-282">TIKA-282</a>).
-     */
-    private static class CustomStyledDocument extends DefaultStyledDocument {
-        private boolean isPrevUnicode = false;
-
-        public CustomStyledDocument() {
-            super(new NoReclaimStyleContext());
-        }
-
-        @Override
-        public void insertString(int offs, String str, AttributeSet a)
-                throws BadLocationException {
-            boolean isUnicode = str.length() == 1 && str.charAt(0) > 127;
-
-            if (offs > 0 && offs == getLength() && !isPrevUnicode && !isUnicode) {
-                super.insertString(offs, " ", a);
-                super.insertString(getLength(), str, a);
-            } else {
-                super.insertString(offs, str, a);
-            }
-
-            isPrevUnicode = isUnicode;
-        }
-
-    }
-
-    /**
-     * A workaround to
-     * <a href="https://issues.apache.org/jira/browse/TIKA-282">TIKA-282</a>:
-     * RTF parser expects a GUI environment. This class simply disables the
-     * troublesome SwingUtilities.isEventDispatchThread() call that's made in
-     * the {@link StyleContext#reclaim(AttributeSet)} method.
-     */
-    private static class NoReclaimStyleContext extends StyleContext {
-
-        /** Ignored. */
-        public void reclaim(AttributeSet a) {
-        }
-
     }
-
 }

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java?rev=1171171&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java Thu Sep 15 16:42:06 2011
@@ -0,0 +1,1027 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.rtf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.CodingErrorAction;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Map;
+
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.CharsetUtils;
+import org.xml.sax.SAXException;
+
+/* Tokenizes and performs a "shallow" parse of the RTF
+ * document, just enough to properly decode the text.
+ *
+ * TODO: we should cutover to a "real" tokenizer (eg JFlex);
+ * it should give better perf, by replacing the excessive
+ * "else if" string compares with FSA traversal. */
+
+final class TextExtractor {
+
+    // Hold pending bytes (encoded in the current charset)
+    // for text output:
+    private byte[] pendingBytes = new byte[16];
+    private int pendingByteCount;
+    // ByteBuffer view over pendingBytes that is handed to the
+    // CharsetDecoder; it is re-wrapped whenever pendingBytes
+    // is reallocated:
+    private ByteBuffer pendingByteBuffer = ByteBuffer.wrap(pendingBytes);
+      
+    // Holds pending chars for text output
+    private char[] pendingChars = new char[10];
+    private int pendingCharCount;
+
+    // Holds chars for a still-being-tokenized control word
+    private byte[] pendingControl = new byte[10];
+    private int pendingControlCount;
+
+    // Used when we decode bytes -> chars using CharsetDecoder:
+    private final char[] outputArray = new char[128];
+    private final CharBuffer outputBuffer = CharBuffer.wrap(outputArray);
+
+    // Reused when possible: the decoder is only re-created
+    // when the charset in use differs from lastCharset
+    private CharsetDecoder decoder;
+    private String lastCharset;
+
+    // Charset used when neither the current font nor the
+    // default font supplies one; replaced by the header's
+    // ansi/pc/pca/mac and ansicpgN control words:
+    private String globalCharset = "windows-1252";
+    // Font number from the header's deffN control word;
+    // -1 means no default font was declared:
+    private int globalDefaultFont = -1;
+    // Font number of the font table entry currently being
+    // defined (set by fN while inside the font table):
+    private int curFontID = -1;
+
+    // Holds the font table from this RTF doc, mapping
+    // the font number (from \fN control word) to the
+    // corresponding charset:
+    private final Map<Integer,String> fontToCharset = new HashMap<Integer,String>();
+
+    // Group stack: when we open a new group, we push
+    // the previous group state onto the stack; when we
+    // close the group, we restore it
+    private final LinkedList<GroupState> groupStates = new LinkedList<GroupState>();
+
+    // Current group state; in theory this initial
+    // GroupState is unused because the RTF doc should
+    // immediately open the top group (start with {):
+    private GroupState groupState = new GroupState();
+
+    // True until a body-starting control word (par, pard,
+    // sect, sectd, plain, ltrch, rtlch) is seen outside an
+    // ignored group; while true, text goes to headerBuffer:
+    private boolean inHeader = true;
+    // 0 = font table not seen yet, 1 = inside the font
+    // table, 2 = font table finished:
+    private int fontTableState;
+    // groupState.depth recorded when fonttbl was seen:
+    private int fontTableDepth;
+
+    // Non null if we are processing metadata (title,
+    // keywords, etc.) inside the info group:
+    private String nextMetaData;
+    // True while a <p> element is open on the output handler:
+    private boolean inParagraph;
+
+    // Collects text that is produced while inHeader is true:
+    private final StringBuilder headerBuffer = new StringBuilder();
+
+    // Used to process the sub-groups inside the upr
+    // group; metadata control words are only recognized
+    // while this is -1 (presumably "not inside upr" --
+    // confirm against the group handling code):
+    private int uprState = -1;
+
+    private final XHTMLContentHandler out;
+    private final Metadata metadata;
+
+    // How many next ansi chars we should skip; this
+    // is 0 except when we are still in the "ansi
+    // shadow" after seeing a unicode escape, at which
+    // point it's set to the last ucN skip we had seen:
+    int ansiSkip = 0;
+
+    // The RTF doc has a "font table" that assigns ords
+    // (f0, f1, f2, etc.) to fonts and charsets, using the
+    // \fcharsetN control word.  This mapping maps from the
+    // N to corresponding Java charset.  Values of N with no
+    // entry here leave the font without a charset, so text
+    // in that font falls back to the default font / global
+    // charset:
+    private static final Map<Integer, String> FCHARSET_MAP = new HashMap<Integer, String>();
+    static {
+        FCHARSET_MAP.put(0, "windows-1252"); // ANSI
+        // charset 1 is Default
+        // charset 2 is Symbol
+
+        // Macintosh charsets.
+        // NOTE(review): 88 and 89 are labelled as Mac charsets
+        // but map to Windows code pages -- confirm this is
+        // intended.
+        FCHARSET_MAP.put(77, "MacRoman"); // Mac Roman
+        FCHARSET_MAP.put(78, "Shift_JIS"); // Mac Shift Jis
+        FCHARSET_MAP.put(79, "ms949"); // Mac Hangul
+        FCHARSET_MAP.put(80, "GB2312"); // Mac GB2312
+        FCHARSET_MAP.put(81, "Big5"); // Mac Big5
+        FCHARSET_MAP.put(82, "johab"); // Mac Johab (old)
+        FCHARSET_MAP.put(83, "MacHebrew"); // Mac Hebrew
+        FCHARSET_MAP.put(84, "MacArabic"); // Mac Arabic
+        FCHARSET_MAP.put(85, "MacGreek"); // Mac Greek
+        FCHARSET_MAP.put(86, "MacTurkish"); // Mac Turkish
+        FCHARSET_MAP.put(87, "MacThai"); // Mac Thai
+        FCHARSET_MAP.put(88, "cp1250"); // Mac East Europe
+        FCHARSET_MAP.put(89, "cp1251"); // Mac Russian
+
+        // Windows / DOS charsets
+        FCHARSET_MAP.put(128, "MS932"); // Shift JIS
+        FCHARSET_MAP.put(129, "ms949"); // Hangul
+        FCHARSET_MAP.put(130, "ms1361"); // Johab
+        FCHARSET_MAP.put(134, "ms936"); // GB2312
+        FCHARSET_MAP.put(136, "ms950"); // Big5
+        FCHARSET_MAP.put(161, "cp1253"); // Greek
+        FCHARSET_MAP.put(162, "cp1254"); // Turkish
+        FCHARSET_MAP.put(163, "cp1258"); // Vietnamese
+        FCHARSET_MAP.put(177, "cp1255"); // Hebrew
+        FCHARSET_MAP.put(178, "cp1256"); // Arabic
+        // No Java charset is registered for the following:
+        // FCHARSET_MAP.put( 179, "" ); // Arabic Traditional
+        // FCHARSET_MAP.put( 180, "" ); // Arabic user
+        // FCHARSET_MAP.put( 181, "" ); // Hebrew user
+        FCHARSET_MAP.put(186, "cp1257"); // Baltic
+
+        FCHARSET_MAP.put(204, "cp1251"); // Russian
+        FCHARSET_MAP.put(222, "ms874"); // Thai
+        FCHARSET_MAP.put(238, "cp1250"); // Eastern European
+        FCHARSET_MAP.put(254, "cp437"); // PC 437
+        FCHARSET_MAP.put(255, "cp850"); // OEM
+    }
+
+    // The RTF may specify the \ansicpgN charset in the
+    // header; this maps the N to the corresponding Java
+    // character set.  A code page with no entry here leaves
+    // the global charset unchanged.  Each code page number
+    // must appear exactly once: a repeated key silently
+    // replaces the earlier mapping.
+
+    private static final Map<Integer, String> ANSICPG_MAP = new HashMap<Integer, String>();
+    static {
+        ANSICPG_MAP.put(437, "CP437");   // US IBM
+        ANSICPG_MAP.put(708, "ISO-8859-6");   // Arabic (ASMO 708)
+
+        ANSICPG_MAP.put(709, "windows-709");  // Arabic (ASMO 449+, BCON V4)
+        ANSICPG_MAP.put(710, "windows-710");  // Arabic (transparent Arabic)
+        ANSICPG_MAP.put(711, "windows-711");  // Arabic (Nafitha Enhanced)
+        ANSICPG_MAP.put(720, "windows-720");  // Arabic (transparent ASMO)
+        ANSICPG_MAP.put(819, "CP819");  // Windows 3.1 (US & Western Europe)
+        ANSICPG_MAP.put(850, "CP850");  // IBM Multilingual
+        ANSICPG_MAP.put(852, "CP852");  // Eastern European
+        ANSICPG_MAP.put(860, "CP860");  // Portuguese
+        ANSICPG_MAP.put(862, "CP862");  // Hebrew
+        ANSICPG_MAP.put(863, "CP863");  // French Canadian
+        ANSICPG_MAP.put(864, "CP864");  // Arabic
+        ANSICPG_MAP.put(865, "CP865");  // Norwegian
+        ANSICPG_MAP.put(866, "CP866");  // Soviet Union
+        ANSICPG_MAP.put(874, "MS874");  // Thai
+        ANSICPG_MAP.put(932, "MS932");  // Japanese
+        ANSICPG_MAP.put(936, "MS936");  // Simplified Chinese
+        ANSICPG_MAP.put(949, "CP949");  // Korean
+        ANSICPG_MAP.put(950, "CP950");  // Traditional Chinese
+        ANSICPG_MAP.put(1250, "CP1250");  // Eastern European
+        ANSICPG_MAP.put(1251, "CP1251");  // Cyrillic
+        ANSICPG_MAP.put(1252, "CP1252");  // Western European
+        ANSICPG_MAP.put(1253, "CP1253");  // Greek
+        ANSICPG_MAP.put(1254, "CP1254");  // Turkish
+        ANSICPG_MAP.put(1255, "CP1255");  // Hebrew
+        ANSICPG_MAP.put(1256, "CP1256");  // Arabic
+        ANSICPG_MAP.put(1257, "CP1257");  // Baltic
+        ANSICPG_MAP.put(1258, "CP1258");  // Vietnamese
+        ANSICPG_MAP.put(1361, "x-Johab");  // Johab
+        ANSICPG_MAP.put(10000, "MacRoman");  // Mac Roman
+        ANSICPG_MAP.put(10001, "Shift_JIS");  // Mac Japan
+        ANSICPG_MAP.put(10004, "MacArabic");  // Mac Arabic
+        ANSICPG_MAP.put(10005, "MacHebrew");  // Mac Hebrew
+        ANSICPG_MAP.put(10006, "MacGreek");  // Mac Greek
+        ANSICPG_MAP.put(10007, "MacCyrillic");  // Mac Cyrillic
+        ANSICPG_MAP.put(10029, "x-MacCentralEurope");  // MAC Latin2
+        ANSICPG_MAP.put(10081, "MacTurkish");  // Mac Turkish
+        ANSICPG_MAP.put(57002, "x-ISCII91");   // Devanagari
+
+        // TODO: in theory these other charsets are simple
+        // shifts off of Devanagari, so we could impl that
+        // here:
+        ANSICPG_MAP.put(57003, "windows-57003");   // Bengali
+        ANSICPG_MAP.put(57004, "windows-57004");   // Tamil
+        ANSICPG_MAP.put(57005, "windows-57005");   // Telugu
+        ANSICPG_MAP.put(57006, "windows-57006");   // Assamese
+        ANSICPG_MAP.put(57007, "windows-57007");   // Oriya
+        ANSICPG_MAP.put(57008, "windows-57008");   // Kannada
+        ANSICPG_MAP.put(57009, "windows-57009");   // Malayalam
+        ANSICPG_MAP.put(57010, "windows-57010");   // Gujarati
+        ANSICPG_MAP.put(57011, "windows-57011");   // Punjabi
+    }
+
+    // Creates an extractor that sends body text to the given
+    // handler and is given the Metadata object for the
+    // document being parsed.
+    public TextExtractor(XHTMLContentHandler out, Metadata metadata) {
+        this.out = out;
+        this.metadata = metadata;
+    }
+
+    // True iff ch is an ASCII hexadecimal digit: 0-9, a-f or A-F
+    private static boolean isHexChar(char ch) {
+        if (ch >= '0' && ch <= '9') {
+            return true;
+        }
+        if (ch >= 'a' && ch <= 'f') {
+            return true;
+        }
+        return ch >= 'A' && ch <= 'F';
+    }
+
+    // True iff ch is an ASCII letter (a-z or A-Z)
+    private static boolean isAlpha(char ch) {
+        final boolean lower = ch >= 'a' && ch <= 'z';
+        final boolean upper = ch >= 'A' && ch <= 'Z';
+        return lower || upper;
+    }
+
+    // True iff ch is an ASCII decimal digit (0-9)
+    private static boolean isDigit(char ch) {
+        final int offset = ch - '0';
+        return offset >= 0 && offset <= 9;
+    }
+
+    // Returns the numeric value (0..15) of a hex digit.
+    // ch must satisfy isHexChar; the letter ranges are kept
+    // in step with isHexChar (a-f / A-F) so that the
+    // assertion below actually catches a non-hex letter
+    // instead of letting g-z through as a value above 15.
+    private static int hexValue(char ch) {
+        if (ch >= '0' && ch <= '9') {
+            return ch - '0';
+        } else if (ch >= 'a' && ch <= 'f') {
+            return 10 + (ch - 'a');
+        } else {
+            assert ch >= 'A' && ch <= 'F';
+            return 10 + (ch - 'A');
+        }
+    }
+
+    // Flushes whichever pending buffer currently holds text:
+    // the byte buffer if it is non-empty, otherwise the
+    // char buffer.
+    private void pushText() throws IOException, SAXException, TikaException {
+        if (pendingByteCount == 0) {
+            pushChars();
+            return;
+        }
+        // Bytes and chars are not expected to be pending at
+        // the same time:
+        assert pendingCharCount == 0;
+        pushBytes();
+    }
+
+    // Appends one byte (a unit in the current charset) to
+    // the pending byte buffer, first flushing any pending
+    // chars so that output order is preserved.
+    private void addOutputByte(byte b) throws IOException, SAXException, TikaException {
+
+        if (pendingCharCount != 0) {
+            pushChars();
+        }
+
+        final int capacity = pendingBytes.length;
+        if (pendingByteCount == capacity) {
+            // Buffer is full: grow it by 25% and re-wrap the
+            // ByteBuffer view around the new array
+            final byte[] grown = new byte[(int) (capacity*1.25)];
+            System.arraycopy(pendingBytes, 0, grown, 0, capacity);
+            pendingBytes = grown;
+            pendingByteBuffer = ByteBuffer.wrap(grown);
+        }
+        pendingBytes[pendingByteCount] = b;
+        pendingByteCount++;
+    }
+
+    // Appends one (ASCII letter) byte to the control word
+    // that is currently being tokenized.
+    private void addControl(byte b) {
+        assert isAlpha((char) b);
+        final int capacity = pendingControl.length;
+        if (pendingControlCount == capacity) {
+            // Buffer is full: grow it by 25%
+            final byte[] grown = new byte[(int) (capacity*1.25)];
+            System.arraycopy(pendingControl, 0, grown, 0, capacity);
+            pendingControl = grown;
+        }
+        pendingControl[pendingControlCount] = b;
+        pendingControlCount++;
+    }
+
+    // Buffers one UTF16 code unit for output, first flushing
+    // any pending bytes so that output order is preserved.
+    // While in the header the unit goes straight to
+    // headerBuffer instead of the pending char buffer.
+    private void addOutputChar(char ch) throws IOException, SAXException, TikaException {
+        if (pendingByteCount != 0) {
+            pushBytes();
+        }
+
+        if (inHeader) {
+            headerBuffer.append(ch);
+            return;
+        }
+
+        final int capacity = pendingChars.length;
+        if (pendingCharCount == capacity) {
+            // Buffer is full: grow it by 25%
+            final char[] grown = new char[(int) (capacity*1.25)];
+            System.arraycopy(pendingChars, 0, grown, 0, capacity);
+            pendingChars = grown;
+        }
+        pendingChars[pendingCharCount] = ch;
+        pendingCharCount++;
+    }
+
+    // Shallow parses the entire doc, writing output to
+    // this.out and this.metadata.
+    //
+    // The stream is read one byte at a time and fed through
+    // a small state machine:
+    //   0: plain text and group delimiters
+    //   1: saw a backslash
+    //   2: saw backslash + quote (hex escape), expecting the
+    //      first hex digit
+    //   3: saw the first hex digit, expecting the second
+    //   4: inside a control word
+    //   5: inside a control word's numeric parameter
+    //
+    // NOTE: when the stream ends the loop exits whatever the
+    // current state is, so a control word or hex escape that
+    // is cut off by end-of-stream is dropped unprocessed.
+    public void extract(InputStream in) throws IOException, SAXException, TikaException {
+        out.startDocument();
+
+        int state = 0;
+        // One-byte lookahead; -2 means "nothing pushed back"
+        // (-1 cannot be used since it is the end-of-stream
+        // value of read()):
+        int pushBack = -2;
+        boolean negParam = false;
+        // First digit of a hex escape, kept between states 2 and 3:
+        char hex1 = 0;
+        long param = 0;
+
+        while (true) {
+            final int b;
+            if (pushBack != -2) {
+                b = pushBack;
+                pushBack = -2;
+            } else {
+                b = in.read();
+            }
+            if (b == -1) {
+                break;
+            }
+
+            // NOTE: b is an unsigned byte value (0..255) as
+            // returned by read(); the original note here expects
+            // RTF input to be 7-bit (ie < 128), but nothing in
+            // this loop enforces that.  We use a char for
+            // convenience in the testing below:
+            final char ch = (char) b;
+
+            switch (state) {
+
+            case 0:
+                if (ch == '\\') {
+                    state = 1;
+                } else if (ch == '{') {
+                    pushText();
+                    processGroupStart();
+                } else if (ch == '}') {
+                    pushText();
+                    processGroupEnd();
+                } else if (ch != '\r' && ch != '\n' && (!groupState.ignore || nextMetaData != null)) {
+                    // Linefeed and carriage return are not
+                    // significant
+                    if (ansiSkip != 0) {
+                        // Still in the shadow of a unicode
+                        // escape: drop this ansi char
+                        ansiSkip--;
+                    } else {
+                        addOutputByte((byte) ch);
+                    }
+                }
+                break;
+
+            // saw \
+            case 1:
+                if (ch == '\'') {
+                    // escaped hex char
+                    state = 2;
+                } else if (isAlpha(ch)) {
+                    // control word
+                    //pushText();
+                    addControl((byte) ch);
+                    state = 4;
+                } else if (ch == '{' || ch == '}' || ch == '\\' || ch == '\r' || ch == '\n') {
+                    // escaped char
+                    addOutputByte((byte) ch);
+                    state = 0;
+                } else {
+                    // control symbol, eg \* or \~
+                    //pushText();
+                    processControlSymbol(ch);
+                    state = 0;
+                }
+                break;
+
+            // saw \'
+            case 2:
+                if (isHexChar(ch)) {
+                    hex1 = ch;
+                    state = 3;
+                } else {
+                    // DOC ERROR (malformed hex escape): ignore;
+                    // note the offending char is consumed
+                    state = 0;
+                }
+                break;
+
+            // saw \'x
+            case 3: 
+                if (isHexChar(ch)) {
+                    if (ansiSkip != 0) {
+                        // Skip this ansi char since we are
+                        // still in the shadow of a unicode
+                        // escape:
+                        ansiSkip--;
+                    } else {
+                        // Unescape:
+                        addOutputByte((byte) (16*hexValue(hex1) + hexValue(ch)));
+                    }
+                    state = 0;
+                } else {
+                    // TODO: log a warning here, somehow?
+                    // DOC ERROR (malformed hex escape):
+                    // ignore
+                    state = 0;
+                }
+                break;
+
+            // inside control word
+            case 4:
+                if (isAlpha(ch)) {
+                    // still in control word
+                    addControl((byte) ch);
+                } else if (ch == '-') {
+                    // end of control word, start of negative parameter
+                    negParam = true;
+                    param = 0;
+                    state = 5;
+                } else if (isDigit(ch)) {
+                    // end of control word, start of positive parameter
+                    negParam = false;
+                    param = (long) (ch - '0');
+                    state = 5;
+                } else if (ch == ' ') {
+                    // space is consumed as part of the
+                    // control word, but is not added to the
+                    // control word
+                    processControlWord();
+                    pendingControlCount = 0;
+                    state = 0;
+                } else {
+                    processControlWord();
+                    pendingControlCount = 0;
+                    // eps transition back to start state:
+                    // the delimiter is re-read in state 0
+                    pushBack = ch;
+                    state = 0;
+                }
+                break;
+
+            // inside control word's numeric param
+            case 5:
+                if (isDigit(ch)) {
+                    param = (10*param) + (long) (ch - '0');
+                } else {
+                    if (negParam) {
+                        param = -param;
+                    }
+                    processControlWord(param);
+                    pendingControlCount = 0;
+                    if (ch != ' ') {
+                        // space is consumed as part of the
+                        // control word; anything else is
+                        // re-read in state 0
+                        pushBack = ch;
+                    }
+                    state = 0;
+                }
+                break;
+
+            default:
+                throw new RuntimeException("invalid state");
+            }
+        }
+
+        // Flush remaining text and close any open paragraph:
+        endParagraph(false);
+        out.endDocument();
+    }
+
+    // Opens a <p> element if none is open yet, wrapping the
+    // paragraph start so that the current bold/italic styles
+    // are closed before it and re-opened inside it.
+    private void lazyStartParagraph() throws IOException, SAXException, TikaException {
+        if (inParagraph) {
+            return;
+        }
+
+        final boolean italic = groupState.italic;
+        final boolean bold = groupState.bold;
+
+        // Close in </i></b> order
+        if (italic) {
+            end("i");
+        }
+        if (bold) {
+            end("b");
+        }
+
+        out.startElement("p");
+
+        // Re-open in <b><i> order
+        if (bold) {
+            start("b");
+        }
+        if (italic) {
+            start("i");
+        }
+        inParagraph = true;
+    }
+
+    // Flushes pending text and, if a paragraph is open,
+    // closes it (closing italic, then bold, first).  When
+    // preserveStyles is true and a style was active, a new
+    // paragraph is opened right away with those styles
+    // re-applied; otherwise the closed styles are cleared
+    // from the group state.
+    private void endParagraph(boolean preserveStyles) throws IOException, SAXException, TikaException {
+        pushText();
+        if (!inParagraph) {
+            return;
+        }
+
+        if (groupState.italic) {
+            end("i");
+            groupState.italic = preserveStyles;
+        }
+        if (groupState.bold) {
+            end("b");
+            groupState.bold = preserveStyles;
+        }
+        out.endElement("p");
+
+        final boolean reopen = preserveStyles && (groupState.bold || groupState.italic);
+        if (reopen) {
+            start("p");
+            if (groupState.bold) {
+                start("b");
+            }
+            if (groupState.italic) {
+                start("i");
+            }
+        }
+        inParagraph = reopen;
+    }
+
+    // Sends any pending UTF16 units to the out ContentHandler
+    // (opening a paragraph first if needed) and empties the
+    // pending char buffer.
+    private void pushChars() throws IOException, SAXException, TikaException {
+        if (pendingCharCount == 0) {
+            return;
+        }
+        lazyStartParagraph();
+        out.characters(pendingChars, 0, pendingCharCount);
+        pendingCharCount = 0;
+    }
+
+    // Decodes the buffered bytes in pendingBytes
+    // into UTF16 code units, and sends the characters
+    // to the out ContentHandler, if we are in the body,
+    // else appends the characters to the headerBuffer.
+    // Bytes buffered inside an ignored group are discarded,
+    // unless a metadata field is being collected.  The
+    // pending byte buffer is always empty on return.
+    private void pushBytes() throws IOException, SAXException, TikaException {
+        if (pendingByteCount > 0 && (!groupState.ignore || nextMetaData != null)) {
+
+            final CharsetDecoder decoder = getDecoder();
+            pendingByteBuffer.limit(pendingByteCount);
+            assert pendingByteBuffer.position() == 0;
+            assert outputBuffer.position() == 0;
+
+            // Keep decoding only while the output buffer
+            // overflowed; any other result ends the loop, so
+            // an unexpected error result cannot make us
+            // spin forever.
+            CoderResult result;
+            do {
+                // We pass true for endOfInput because, when
+                // we are called, we should have seen a
+                // complete sequence of characters for this
+                // charset:
+                result = decoder.decode(pendingByteBuffer, outputBuffer, true);
+                emitDecodedChars();
+            } while (result.isOverflow());
+
+            // Drain whatever the decoder still holds internally:
+            do {
+                result = decoder.flush(outputBuffer);
+                emitDecodedChars();
+            } while (result.isOverflow());
+
+            // Reset for next decode
+            decoder.reset();
+            pendingByteBuffer.position(0);
+        }
+
+        pendingByteCount = 0;
+    }
+
+    // Hands the chars currently held in outputBuffer to the
+    // headerBuffer (while in the header) or to the out
+    // ContentHandler (in the body), then rewinds outputBuffer.
+    private void emitDecodedChars() throws IOException, SAXException, TikaException {
+        final int pos = outputBuffer.position();
+        if (pos > 0) {
+            if (inHeader) {
+                headerBuffer.append(outputArray, 0, pos);
+            } else {
+                lazyStartParagraph();
+                out.characters(outputArray, 0, pos);
+            }
+            outputBuffer.position(0);
+        }
+    }
+
+    // Returns true iff the control word buffered so far is
+    // exactly s.  NOTE: s must be ascii alpha only
+    private boolean equals(String s) {
+        final int len = s.length();
+        if (len != pendingControlCount) {
+            return false;
+        }
+        int idx = 0;
+        while (idx < len) {
+            final char expected = s.charAt(idx);
+            assert isAlpha(expected);
+            if (pendingControl[idx] != (byte) expected) {
+                return false;
+            }
+            idx++;
+        }
+        return true;
+    }
+
+    // Handles a control symbol, ie a backslash followed by a
+    // single non-letter char.  Symbols not listed here are
+    // ignored.
+    private void processControlSymbol(char ch) throws IOException, SAXException, TikaException {
+        if (ch == '*') {
+            // Ignorable destination (control words defined
+            // after the 1987 RTF spec).  Note that
+            // sometimes we un-ignore within this group, eg
+            // when handling upr escape.
+            groupState.ignore = true;
+        } else if (ch == '~') {
+            // Non-breaking space -> unicode NON-BREAKING SPACE
+            addOutputChar('\u00a0');
+        } else if (ch == '-') {
+            // Optional hyphen -> unicode SOFT HYPHEN
+            addOutputChar('\u00ad');
+        } else if (ch == '_') {
+            // Non-breaking hyphen -> unicode NON-BREAKING HYPHEN
+            addOutputChar('\u2011');
+        }
+    }
+
+    // Returns a decoder for the charset currently in use.
+    // The previous decoder is reused as long as the charset
+    // has not changed; a new one is configured to replace
+    // malformed and unmappable input rather than fail.
+    private CharsetDecoder getDecoder() throws TikaException {
+        final String charset = getCharset();
+
+        // Common case: same charset as last time
+        if (charset.equals(lastCharset)) {
+            return decoder;
+        }
+
+        decoder = CharsetUtils.forName(charset).newDecoder();
+        if (decoder == null) {
+            throw new TikaException("cannot find decoder for charset=" + charset);
+        }
+        decoder.onMalformedInput(CodingErrorAction.REPLACE);
+        decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
+        lastCharset = charset;
+        return decoder;
+    }
+
+    // Returns the charset currently in use.  Lookup order:
+    // the current font's charset, then (in the body only)
+    // the default font's charset, then the global charset.
+    private String getCharset() throws TikaException {
+        // A specific font (fN) was set: use its charset
+        final String fromFont = groupState.fontCharset;
+        if (fromFont != null) {
+            return fromFont;
+        }
+
+        // A global default font (deffN) was set: use that
+        if (!inHeader && globalDefaultFont != -1) {
+            final String fromDefaultFont = fontToCharset.get(globalDefaultFont);
+            if (fromDefaultFont != null) {
+                return fromDefaultFont;
+            }
+        }
+
+        // Otherwise fall back to the global charset
+        if (globalCharset != null) {
+            return globalCharset;
+        }
+        throw new TikaException("unable to determine charset");
+    }
+
+    // Handle control word that takes a parameter; the word
+    // itself is in pendingControl[0..pendingControlCount).
+    // Param is long because spec says max value is 1+ Integer.MAX_VALUE!
+    //
+    // In the header this records the ANSI code page, the
+    // default font and the font table entries; in the body
+    // it handles b0 / i0 and font changes.  Unicode escapes
+    // (uN) and the unicode skip count (ucN) are handled in
+    // both places.
+    private void processControlWord(long param) throws IOException, SAXException, TikaException {
+
+        // TODO: afN?  (associated font number)
+
+        // TODO: do these alter text output...?
+        /*
+            } else if (equals("stshfdbch")) {
+                // font to be used by default in
+                // style sheet for East Asian chars
+                // arg N is font table entry
+            } else if (equals("stshfloch")) {
+                // font to be used by default in
+                // style sheet for ASCII chars
+                // arg N is font table entry
+            } else if (equals("stshfhich")) {
+                // font to be used by default in
+                // style sheet for High Ansi chars
+                // arg N is font table entry
+            } else if (equals("stshfbi")) {
+                // style sheet for Complex Scripts (BIDI) chars
+                // arg N is font table entry
+                */
+
+        // TODO: inefficient that we check equals N times;
+        // we'd get better perf w/ real lexer (eg
+        // JFlex), which uses single-pass FSM to do cmp:
+        if (inHeader) {
+            if (equals("ansicpg")) {
+                // ANSI codepage; an unknown code page leaves
+                // the global charset unchanged
+                final String cs = ANSICPG_MAP.get((int) param);
+                if (cs != null) {
+                    globalCharset = cs;
+                }
+            } else if (equals("deff")) {
+                // Default font
+                globalDefaultFont = (int) param;
+            }
+
+            if (fontTableState == 1) {
+                // Still inside font table -- record the
+                // mappings of fN to the fcharset:
+                if (groupState.depth < fontTableDepth) {
+                    // We have left the font table's group
+                    fontTableState = 2;
+                } else {
+                    if (equals("f")) {
+                        // Start new font definition
+                        curFontID = (int) param;
+                    } else if (equals("fcharset")) {
+                        // An unknown fcharset leaves this font
+                        // without a charset mapping
+                        final String cs = FCHARSET_MAP.get((int) param);
+                        if (cs != null) {
+                            fontToCharset.put(curFontID, cs);
+                        }
+                    }
+                }
+            }
+        } else {
+            // In document
+            if (equals("b")) {
+                // b0
+                // NOTE(review): with assertions disabled any bN,
+                // including a nonzero N, takes this "turn bold
+                // off" path -- confirm that is acceptable.
+                assert param == 0;
+                if (groupState.bold) {
+                    pushText();
+                    // Keep <b><i> nesting: close italic around
+                    // the </b> and re-open it afterwards
+                    if (groupState.italic) {
+                        end("i");
+                    }
+                    end("b");
+                    if (groupState.italic) {
+                        start("i");
+                    }
+                    groupState.bold = false;
+                }
+            } else if (equals("i")) {
+                // i0
+                // NOTE(review): same as for b0 above -- a nonzero
+                // iN also lands here when assertions are off.
+                assert param == 0;
+                if (groupState.italic) {
+                    pushText();
+                    end("i");
+                    groupState.italic = false;
+                }
+            } else if (equals("f")) {
+                // Change current font
+                final String fontCharset = fontToCharset.get((int) param);
+                if (fontCharset != null) {
+                    groupState.fontCharset = fontCharset;
+                } else {
+                    // DOC ERROR: font change referenced a
+                    // non-table'd font number
+                    // TODO: log a warning?  Throw an exc?
+                    groupState.fontCharset = null;
+                }
+            }
+        }
+
+        // Process unicode escape.  This can appear in doc
+        // or in header, since the metadata (info) fields
+        // in the header can be unicode escaped as well:
+        if (pendingControl[0] == 'u') {
+            if (pendingControlCount == 1) {
+                // Unicode escape; the parameter is reduced to
+                // its low 16 bits, so a negative value maps
+                // into the upper half of the UTF16 range
+                if (!groupState.ignore) {
+                    final char utf16CodeUnit = (char) (((int) param) & 0xffff);
+                    addOutputChar(utf16CodeUnit);
+                }
+
+                // After seeing a unicode escape we must
+                // skip the next ucSkip ansi chars (the
+                // "unicode shadow")
+                ansiSkip = groupState.ucSkip;
+            } else if (pendingControlCount == 2 && pendingControl[1] == 'c') {
+                // Change unicode shadow length
+                groupState.ucSkip = (int) param;
+            }
+        }
+    }
+
+    // Closes the given element on the out ContentHandler
+    private void end(String tag) throws IOException, SAXException, TikaException {
+        out.endElement(tag);
+    }
+
+    // Opens the given element on the out ContentHandler
+    private void start(String tag) throws IOException, SAXException, TikaException {
+        out.startElement(tag);
+    }
+
+    /**
+     * Handles a control word that carries no numeric parameter.
+     *
+     * <p>The word is tested with {@code equals(String)}, a helper defined
+     * elsewhere in this class (presumably it compares its argument with the
+     * control word that was just read; confirm against its definition).
+     *
+     * <p>First stage, while {@code inHeader} is set:
+     * <ul>
+     *   <li>sets {@code globalCharset} for ansi, pca, pc and mac;</li>
+     *   <li>flags the current group as ignored for colortbl, stylesheet
+     *       and fonttbl;</li>
+     *   <li>records in {@code nextMetaData} which metadata field the word
+     *       names (author, title, ...), but only while {@code uprState}
+     *       is -1;</li>
+     *   <li>tracks entering and leaving the font table through
+     *       {@code fontTableState} and {@code fontTableDepth};</li>
+     *   <li>clears {@code inHeader} on par, pard, sect, sectd, plain,
+     *       ltrch or rtlch when the current group is not ignored.</li>
+     * </ul>
+     * First stage, when {@code inHeader} is not set on entry: only b and
+     * i are handled, opening the matching element if the group state says
+     * it is not open yet.
+     *
+     * <p>Second stage, run in both modes: pard closes any open italic and
+     * bold elements; par and cell end the paragraph; shptxt clears the
+     * group's ignore flag and pict sets it; upr sets {@code uprState} to
+     * 0 and ud (only when {@code uprState} is 1) resets it to -1 and
+     * clears the ignore flag; the remaining words each emit one output
+     * character.
+     *
+     * <p>The declared exceptions are propagated from the helper methods
+     * called here, whose definitions are outside this part of the file.
+     */
+    private void processControlWord() throws IOException, SAXException, TikaException {
+        if (inHeader) {
+            if (equals("ansi")) {
+                globalCharset = "cp1252";
+            } else if (equals("pca")) { 
+                globalCharset = "cp850";
+            } else if (equals("pc")) { 
+                globalCharset = "cp437";
+            } else if (equals("mac")) { 
+                globalCharset = "MacRoman";
+            }
+
+            if (equals("colortbl") || equals("stylesheet") || equals("fonttbl")) {
+                groupState.ignore = true; // flag the whole group as ignored
+            }
+
+            if (uprState == -1) { // -1 is also the value stored once "ud" has been handled below
+                // TODO: we can also parse \creatim, \revtim,
+                // \printim, \version, \nofpages, \nofwords,
+                // \nofchars, etc.
+                if (equals("author")) {
+                    nextMetaData = Metadata.AUTHOR;
+                } else if (equals("title")) {
+                    nextMetaData = Metadata.TITLE;
+                } else if (equals("subject")) {
+                    nextMetaData = Metadata.SUBJECT;
+                } else if (equals("keywords")) {
+                    nextMetaData = Metadata.KEYWORDS;
+                } else if (equals("category")) {
+                    nextMetaData = Metadata.CATEGORY;
+                } else if (equals("comment")) {
+                    nextMetaData = Metadata.COMMENT;
+                } else if (equals("company")) {
+                    nextMetaData = Metadata.COMPANY;
+                } else if (equals("manager")) {
+                    nextMetaData = Metadata.MANAGER;
+                } else if (equals("template")) {
+                    nextMetaData = Metadata.TEMPLATE;
+                }
+            }
+
+            if (fontTableState == 0) {
+                // Didn't see font table yet
+                if (equals("fonttbl")) {
+                    fontTableState = 1;
+                    fontTableDepth = groupState.depth; // remembered so that leaving this group can be detected
+                }
+            } else if (fontTableState == 1) {
+                // Inside font table
+                if (groupState.depth < fontTableDepth) {
+                    fontTableState = 2; // now shallower than the font table group
+                }
+            }
+
+            if (!groupState.ignore && (equals("par") || equals("pard") || equals("sect") || equals("sectd") || equals("plain") || equals("ltrch") || equals("rtlch"))) {
+                inHeader = false; // these words end header processing
+            }
+        } else { // inHeader was not set on entry
+            if (equals("b")) {
+                if (!groupState.bold) {
+                    pushText();
+                    lazyStartParagraph();
+                    if (groupState.italic) {
+                        // Make sure nesting is always <b><i>
+                        end("i");
+                    }
+                    groupState.bold = true;
+                    start("b");
+                    if (groupState.italic) {
+                        start("i"); // reopen italic inside the new <b>
+                    }
+                }
+            } else if (equals("i")) {
+                if (!groupState.italic) {
+                    pushText();
+                    lazyStartParagraph();
+                    groupState.italic = true;
+                    start("i");
+                }
+            }
+        }
+
+        if (equals("pard")) { // this chain runs whether or not inHeader is set
+            // Reset styles
+            pushText();
+            if (groupState.italic) {
+                end("i");
+                groupState.italic = false;
+            }
+            if (groupState.bold) {
+                end("b");
+                groupState.bold = false;
+            }
+        } else if (equals("par")) {
+            endParagraph(true);
+        } else if (equals("shptxt")) {
+            pushText();
+            // Text inside a shape
+            groupState.ignore = false;
+        } else if (equals("cell")) {
+            // TODO: we should produce a table output here?
+            //addOutputChar(' ');
+            endParagraph(true);
+        } else if (equals("pict")) {
+            pushText();
+            // TODO: create img tag?  but can that support
+            // embedded image data?
+            groupState.ignore = true;
+        } else if (equals("line")) {
+            addOutputChar('\n');
+        } else if (equals("column")) {
+            addOutputChar(' ');
+        } else if (equals("page")) {
+            addOutputChar('\n');
+        } else if (equals("softline")) {
+            addOutputChar('\n');
+        } else if (equals("softcolumn")) {
+            addOutputChar(' ');
+        } else if (equals("softpage")) {
+            addOutputChar('\n');
+        } else if (equals("tab")) {
+            addOutputChar('\t');
+        } else if (equals("upr")) {
+            uprState = 0; // processGroupStart moves this to 1 when the next group opens
+        } else if (equals("ud") && uprState == 1) {
+            uprState = -1;
+            // 2nd group inside the upr destination, which
+            // contains the unicode encoding of the text, so
+            // we want to keep that:
+            groupState.ignore = false;
+        } else if (equals("bullet")) {
+            // unicode BULLET
+            addOutputChar('\u2022');
+        } else if (equals("endash")) {
+            // unicode EN DASH
+            addOutputChar('\u2013');
+        } else if (equals("emdash")) {
+            // unicode EM DASH
+            addOutputChar('\u2014');
+        } else if (equals("enspace")) {
+            // unicode EN SPACE
+            addOutputChar('\u2002');
+        } else if (equals("qmspace")) {
+            // quarter em space -> unicode FOUR-PER-EM SPACE
+            addOutputChar('\u2005');
+        } else if (equals("emspace")) {
+            // unicode EM SPACE
+            addOutputChar('\u2003');
+        } else if (equals("lquote")) {
+            // unicode LEFT SINGLE QUOTATION MARK
+            addOutputChar('\u2018');
+        } else if (equals("rquote")) {
+            // unicode RIGHT SINGLE QUOTATION MARK
+            addOutputChar('\u2019');
+        } else if (equals("ldblquote")) {
+            // unicode LEFT DOUBLE QUOTATION MARK
+            addOutputChar('\u201C');
+        } else if (equals("rdblquote")) {
+            // unicode RIGHT DOUBLE QUOTATION MARK
+            addOutputChar('\u201D');
+        }
+    }
+
+    // Push new GroupState
+    private void processGroupStart() throws IOException {
+        ansiSkip = 0;
+        // Push current groupState onto the stack
+        groupStates.add(groupState);
+
+        // Make new GroupState
+        groupState = new GroupState(groupState);
+        assert groupStates.size() == groupState.depth: "size=" + groupStates.size() + " depth=" + groupState.depth;
+
+        if (uprState == 0) {
+            uprState = 1;
+            groupState.ignore = true;
+        }
+    }
+
+    // Pop current GroupState: restore the enclosing group's state and
+    // re-synchronise the open <b>/<i> elements with it.
+    //
+    // While inHeader is set, the contents of headerBuffer are first stored
+    // under nextMetaData (when one is pending) and the buffer is cleared.
+    //
+    // A closing brace that has no matching opening brace (corrupt input)
+    // finds groupStates empty; such a brace is ignored instead of letting
+    // removeLast() fail with an unchecked NoSuchElementException.
+    //
+    // The declared exceptions are propagated from start()/end() and the
+    // metadata handling.
+    private void processGroupEnd() throws IOException, SAXException, TikaException {
+
+        if (inHeader) {
+            if (nextMetaData != null) {
+                metadata.add(nextMetaData, headerBuffer.toString());
+                nextMetaData = null;
+            }
+            headerBuffer.setLength(0);
+        }
+
+        ansiSkip = 0;
+
+        // Be robust if the RTF doc is corrupt (has too many closing
+        // braces): there is no outer state to restore, so keep the
+        // current groupState untouched.
+        if (groupStates.isEmpty()) {
+            return;
+        }
+
+        assert groupState.depth > 0;
+
+        // Restore group state:
+        final GroupState outerGroupState = groupStates.removeLast();
+
+        // Close italic, if outer does not have italic or
+        // bold changed:
+        if (groupState.italic) {
+            if (!outerGroupState.italic ||
+                groupState.bold != outerGroupState.bold) {
+                end("i");
+                // Cleared so the "Open italic" step below can reopen it
+                // inside the right bold nesting
+                groupState.italic = false;
+            }
+        }
+
+        // Close bold
+        if (groupState.bold && !outerGroupState.bold) {
+            end("b");
+        }
+
+        // Open bold
+        if (!groupState.bold && outerGroupState.bold) {
+            start("b");
+        }
+
+        // Open italic
+        if (!groupState.italic && outerGroupState.italic) {
+            start("i");
+        }
+        groupState = outerGroupState;
+        assert groupStates.size() == groupState.depth;
+    }
+}

Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java?rev=1171171&r1=1171170&r2=1171171&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java Thu Sep 15 16:42:06 2011
@@ -55,59 +55,6 @@ public class TestParsers extends TikaTes
         assertEquals(s1, s2);
     }
 
-    public void testRTFExtraction() throws Exception {
-        File file = getResourceAsFile("/test-documents/testRTF.rtf");
-        String s1 = ParseUtils.getStringContent(file, tc);
-        String s2 = ParseUtils.getStringContent(file, tc, "application/rtf");
-        assertEquals(s1, s2);
-    }
-
-    public void testRTFms932Extraction() throws Exception {
-        File file = getResourceAsFile("/test-documents/testRTF-ms932.rtf");
-        String s1 = ParseUtils.getStringContent(file, tc);
-        String s2 = ParseUtils.getStringContent(file, tc, "application/rtf");
-        assertEquals(s1, s2);
-        // Hello in Japanese
-        assertTrue(s1.contains("\u3053\u3093\u306b\u3061\u306f"));
-    }
-
-    public void testRTFUmlautSpacesExtraction() throws Exception {
-        File file = getResourceAsFile("/test-documents/testRTFUmlautSpaces.rtf");
-        String s1 = ParseUtils.getStringContent(file, tc);
-        String s2 = ParseUtils.getStringContent(file, tc, "application/rtf");
-        assertEquals(s1, s2);
-        assertTrue(s1.contains("\u00DCbersicht"));
-    }
-
-    public void testRTFWordPadCzechCharactersExtraction() throws Exception {
-        File file = getResourceAsFile("/test-documents/testRTFWordPadCzechCharacters.rtf");
-        String s1 = ParseUtils.getStringContent(file, tc);
-        String s2 = ParseUtils.getStringContent(file, tc, "application/rtf");
-        assertEquals(s1, s2);
-        assertTrue(s1.contains("\u010Cl\u00E1nek t\u00FDdne"));
-        assertTrue(s1.contains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty"));
-    }
-
-    public void testRTFWord2010CzechCharactersExtraction() throws Exception {
-        File file = getResourceAsFile("/test-documents/testRTFWord2010CzechCharacters.rtf");
-        String s1 = ParseUtils.getStringContent(file, tc);
-        String s2 = ParseUtils.getStringContent(file, tc, "application/rtf");
-        assertEquals(s1, s2);
-        assertTrue(s1.contains("\u010Cl\u00E1nek t\u00FDdne"));
-        assertTrue(s1.contains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty"));
-    }
-
-    public void testRTFTableCellSeparation() throws Exception {
-        File file = getResourceAsFile("/test-documents/testRTFTableCellSeparation.rtf");
-        String s1 = ParseUtils.getStringContent(file, tc);
-        String s2 = ParseUtils.getStringContent(file, tc, "application/rtf");
-        assertEquals(s1, s2);
-        String content = s1;
-        content = content.replaceAll("\ufffd", " ");
-        content = content.replaceAll("\\s+"," ");
-        assertTrue(content.contains("a b c d \u00E4 \u00EB \u00F6 \u00FC"));
-    }
-
     public void testXMLExtraction() throws Exception {
         File file = getResourceAsFile("/test-documents/testXML.xml");
         String s1 = ParseUtils.getStringContent(file, tc);

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=1171171&r1=1171170&r2=1171171&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java Thu Sep 15 16:42:06 2011
@@ -74,6 +74,7 @@ public class PowerPointParserTest extend
         assertContains("Here is a citation:", content);
         assertContains("Figure 1 This is a caption for Figure 1", content);
         assertContains("(Kramer)", content);
+        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
         assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," "));
         assertContains("This is a hyperlink", content);
         assertContains("Here is a list:", content);

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1171171&r1=1171170&r2=1171171&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Thu Sep 15 16:42:06 2011
@@ -212,6 +212,7 @@ public class WordParserTest extends Tika
         assertContains("Here is a citation:", content);
         assertContains("Figure 1 This is a caption for Figure 1", content);
         assertContains("(Kramer)", content);
+        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
         assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," "));
         assertContains("This is a hyperlink", content);
         assertContains("Here is a list:", content);

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1171171&r1=1171170&r2=1171171&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Thu Sep 15 16:42:06 2011
@@ -482,6 +482,7 @@ public class OOXMLParserTest extends Tik
         assertContains("Here is a citation:", content);
         assertContains("Figure 1 This is a caption for Figure 1", content);
         assertContains("(Kramer)", content);
+        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
         assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," "));
         assertContains("This is a hyperlink", content);
         assertContains("Here is a list:", content);
@@ -551,6 +552,7 @@ public class OOXMLParserTest extends Tik
         assertContains("Here is a citation:", content);
         assertContains("Figure 1 This is a caption for Figure 1", content);
         assertContains("(Kramer)", content);
+        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
         assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," "));
         assertContains("This is a hyperlink", content);
         assertContains("Here is a list:", content);

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1171171&r1=1171170&r2=1171171&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Thu Sep 15 16:42:06 2011
@@ -171,6 +171,7 @@ public class PDFParserTest extends TikaT
         assertContains("Here is a citation:", content);
         assertContains("Figure 1 This is a caption for Figure 1", content);
         assertContains("(Kramer)", content);
+        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
         assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," "));
         assertContains("This is a hyperlink", content);
         assertContains("Here is a list:", content);