You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/06 04:50:57 UTC

svn commit: r1723223 [13/32] - in /tika/branches/2.x: tika-core/src/test/resources/META-INF/ tika-core/src/test/resources/META-INF/services/ tika-parser-modules/ tika-parser-modules/tika-advanced-module/ tika-parser-modules/tika-advanced-module/src/ ti...

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.DublinCore;
+import org.apache.tika.metadata.MSOffice;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.PagedText;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.xml.AttributeDependantMetadataHandler;
+import org.apache.tika.parser.xml.AttributeMetadataHandler;
+import org.apache.tika.parser.xml.ElementMetadataHandler;
+import org.apache.tika.parser.xml.MetadataHandler;
+import org.apache.tika.parser.xml.XMLParser;
+import org.apache.tika.sax.TeeContentHandler;
+import org.apache.tika.sax.xpath.CompositeMatcher;
+import org.apache.tika.sax.xpath.Matcher;
+import org.apache.tika.sax.xpath.MatchingContentHandler;
+import org.apache.tika.sax.xpath.XPathParser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for OpenDocument <code>meta.xml</code> files.
+ */
+public class OpenDocumentMetaParser extends XMLParser {
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -8739250869531737584L;
+
+    private static final String META_NS = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0";
+    private static final XPathParser META_XPATH = new XPathParser("meta", META_NS);
+
+    /**
+     * @see OfficeOpenXMLCore#SUBJECT
+     * @deprecated use OfficeOpenXMLCore#SUBJECT
+     */
+    @Deprecated
+    private static final Property TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR =
+            Property.composite(Office.INITIAL_AUTHOR,
+                    new Property[]{Property.externalText("initial-creator")});
+
+    private static ContentHandler getDublinCoreHandler(
+            Metadata metadata, Property property, String element) {
+        return new ElementMetadataHandler(
+                DublinCore.NAMESPACE_URI_DC, element,
+                metadata, property);
+    }
+
+    private static ContentHandler getMeta(
+            ContentHandler ch, Metadata md, Property property, String element) {
+        Matcher matcher = new CompositeMatcher(
+                META_XPATH.parse("//meta:" + element),
+                META_XPATH.parse("//meta:" + element + "//text()"));
+        ContentHandler branch =
+                new MatchingContentHandler(new MetadataHandler(md, property), matcher);
+        return new TeeContentHandler(ch, branch);
+    }
+
+    private static ContentHandler getUserDefined(
+            ContentHandler ch, Metadata md) {
+        Matcher matcher = new CompositeMatcher(
+                META_XPATH.parse("//meta:user-defined/@meta:name"),
+                META_XPATH.parse("//meta:user-defined//text()"));
+        // eg <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes custom:Info1=Text1
+        ContentHandler branch = new MatchingContentHandler(
+                new AttributeDependantMetadataHandler(md, "meta:name", Metadata.USER_DEFINED_METADATA_NAME_PREFIX),
+                matcher);
+        return new TeeContentHandler(ch, branch);
+    }
+
+    @Deprecated
+    private static ContentHandler getStatistic(
+            ContentHandler ch, Metadata md, String name, String attribute) {
+        Matcher matcher =
+                META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
+        ContentHandler branch = new MatchingContentHandler(
+                new AttributeMetadataHandler(META_NS, attribute, md, name), matcher);
+        return new TeeContentHandler(ch, branch);
+    }
+
+    private static ContentHandler getStatistic(
+            ContentHandler ch, Metadata md, Property property, String attribute) {
+        Matcher matcher =
+                META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
+        ContentHandler branch = new MatchingContentHandler(
+                new AttributeMetadataHandler(META_NS, attribute, md, property), matcher);
+        return new TeeContentHandler(ch, branch);
+    }
+
+    protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context) {
+        // We can no longer extend DcXMLParser due to the handling of dc:subject and dc:date
+        // Process the Dublin Core Attributes 
+        ch = new TeeContentHandler(super.getContentHandler(ch, md, context),
+                getDublinCoreHandler(md, TikaCoreProperties.TITLE, "title"),
+                getDublinCoreHandler(md, TikaCoreProperties.CREATOR, "creator"),
+                getDublinCoreHandler(md, TikaCoreProperties.DESCRIPTION, "description"),
+                getDublinCoreHandler(md, TikaCoreProperties.PUBLISHER, "publisher"),
+                getDublinCoreHandler(md, TikaCoreProperties.CONTRIBUTOR, "contributor"),
+                getDublinCoreHandler(md, TikaCoreProperties.TYPE, "type"),
+                getDublinCoreHandler(md, TikaCoreProperties.FORMAT, "format"),
+                getDublinCoreHandler(md, TikaCoreProperties.IDENTIFIER, "identifier"),
+                getDublinCoreHandler(md, TikaCoreProperties.LANGUAGE, "language"),
+                getDublinCoreHandler(md, TikaCoreProperties.RIGHTS, "rights"));
+
+        // Process the OO Meta Attributes
+        ch = getMeta(ch, md, TikaCoreProperties.CREATED, "creation-date");
+        // ODF uses dc:date for modified
+        ch = new TeeContentHandler(ch, new ElementMetadataHandler(
+                DublinCore.NAMESPACE_URI_DC, "date",
+                md, TikaCoreProperties.MODIFIED));
+
+        // ODF uses dc:subject for description
+        ch = new TeeContentHandler(ch, new ElementMetadataHandler(
+                DublinCore.NAMESPACE_URI_DC, "subject",
+                md, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT));
+        ch = getMeta(ch, md, TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, "keyword");
+
+        ch = getMeta(ch, md, Property.externalText(MSOffice.EDIT_TIME), "editing-duration");
+        ch = getMeta(ch, md, Property.externalText("editing-cycles"), "editing-cycles");
+        ch = getMeta(ch, md, TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR, "initial-creator");
+        ch = getMeta(ch, md, Property.externalText("generator"), "generator");
+
+        // Process the user defined Meta Attributes
+        ch = getUserDefined(ch, md);
+
+        // Process the OO Statistics Attributes
+        ch = getStatistic(ch, md, Office.OBJECT_COUNT, "object-count");
+        ch = getStatistic(ch, md, Office.IMAGE_COUNT, "image-count");
+        ch = getStatistic(ch, md, Office.PAGE_COUNT, "page-count");
+        ch = getStatistic(ch, md, PagedText.N_PAGES, "page-count");
+        ch = getStatistic(ch, md, Office.TABLE_COUNT, "table-count");
+        ch = getStatistic(ch, md, Office.PARAGRAPH_COUNT, "paragraph-count");
+        ch = getStatistic(ch, md, Office.WORD_COUNT, "word-count");
+        ch = getStatistic(ch, md, Office.CHARACTER_COUNT, "character-count");
+
+        // Legacy, Tika-1.0 style attributes
+        // TODO Remove these in Tika 2.0
+        ch = getStatistic(ch, md, MSOffice.OBJECT_COUNT, "object-count");
+        ch = getStatistic(ch, md, MSOffice.IMAGE_COUNT, "image-count");
+        ch = getStatistic(ch, md, MSOffice.PAGE_COUNT, "page-count");
+        ch = getStatistic(ch, md, MSOffice.TABLE_COUNT, "table-count");
+        ch = getStatistic(ch, md, MSOffice.PARAGRAPH_COUNT, "paragraph-count");
+        ch = getStatistic(ch, md, MSOffice.WORD_COUNT, "word-count");
+        ch = getStatistic(ch, md, MSOffice.CHARACTER_COUNT, "character-count");
+
+        // Legacy Statistics Attributes, replaced with real keys above
+        // TODO Remove these shortly, eg after Tika 1.1 (TIKA-770)
+        ch = getStatistic(ch, md, "nbPage", "page-count");
+        ch = getStatistic(ch, md, "nbPara", "paragraph-count");
+        ch = getStatistic(ch, md, "nbWord", "word-count");
+        ch = getStatistic(ch, md, "nbCharacter", "character-count");
+        ch = getStatistic(ch, md, "nbTab", "table-count");
+        ch = getStatistic(ch, md, "nbObject", "object-count");
+        ch = getStatistic(ch, md, "nbImg", "image-count");
+
+        // Normalise the rest
+        ch = new NSNormalizerContentHandler(ch);
+        return ch;
+    }
+
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        super.parse(stream, handler, metadata, context);
+        // Copy subject to description for OO2
+        String odfSubject = metadata.get(OfficeOpenXMLCore.SUBJECT);
+        if (odfSubject != null && !odfSubject.equals("") &&
+                (metadata.get(TikaCoreProperties.DESCRIPTION) == null || metadata.get(TikaCoreProperties.DESCRIPTION).equals(""))) {
+            metadata.set(TikaCoreProperties.DESCRIPTION, odfSubject);
+        }
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,205 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+import java.util.zip.ZipInputStream;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.EndDocumentShieldingContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * OpenOffice parser
+ */
+public class OpenDocumentParser extends AbstractParser {
+
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -6410276875438618287L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.application("vnd.sun.xml.writer"),
+                    MediaType.application("vnd.oasis.opendocument.text"),
+                    MediaType.application("vnd.oasis.opendocument.graphics"),
+                    MediaType.application("vnd.oasis.opendocument.presentation"),
+                    MediaType.application("vnd.oasis.opendocument.spreadsheet"),
+                    MediaType.application("vnd.oasis.opendocument.chart"),
+                    MediaType.application("vnd.oasis.opendocument.image"),
+                    MediaType.application("vnd.oasis.opendocument.formula"),
+                    MediaType.application("vnd.oasis.opendocument.text-master"),
+                    MediaType.application("vnd.oasis.opendocument.text-web"),
+                    MediaType.application("vnd.oasis.opendocument.text-template"),
+                    MediaType.application("vnd.oasis.opendocument.graphics-template"),
+                    MediaType.application("vnd.oasis.opendocument.presentation-template"),
+                    MediaType.application("vnd.oasis.opendocument.spreadsheet-template"),
+                    MediaType.application("vnd.oasis.opendocument.chart-template"),
+                    MediaType.application("vnd.oasis.opendocument.image-template"),
+                    MediaType.application("vnd.oasis.opendocument.formula-template"),
+                    MediaType.application("x-vnd.oasis.opendocument.text"),
+                    MediaType.application("x-vnd.oasis.opendocument.graphics"),
+                    MediaType.application("x-vnd.oasis.opendocument.presentation"),
+                    MediaType.application("x-vnd.oasis.opendocument.spreadsheet"),
+                    MediaType.application("x-vnd.oasis.opendocument.chart"),
+                    MediaType.application("x-vnd.oasis.opendocument.image"),
+                    MediaType.application("x-vnd.oasis.opendocument.formula"),
+                    MediaType.application("x-vnd.oasis.opendocument.text-master"),
+                    MediaType.application("x-vnd.oasis.opendocument.text-web"),
+                    MediaType.application("x-vnd.oasis.opendocument.text-template"),
+                    MediaType.application("x-vnd.oasis.opendocument.graphics-template"),
+                    MediaType.application("x-vnd.oasis.opendocument.presentation-template"),
+                    MediaType.application("x-vnd.oasis.opendocument.spreadsheet-template"),
+                    MediaType.application("x-vnd.oasis.opendocument.chart-template"),
+                    MediaType.application("x-vnd.oasis.opendocument.image-template"),
+                    MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
+
+    private static final String META_NAME = "meta.xml";
+
+    private Parser meta = new OpenDocumentMetaParser();
+
+    private Parser content = new OpenDocumentContentParser();
+
+    public Parser getMetaParser() {
+        return meta;
+    }
+
+    public void setMetaParser(Parser meta) {
+        this.meta = meta;
+    }
+
+    public Parser getContentParser() {
+        return content;
+    }
+
+    public void setContentParser(Parser content) {
+        this.content = content;
+    }
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler baseHandler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        // Open the Zip stream
+        // Use a File if we can, and an already open zip is even better
+        ZipFile zipFile = null;
+        ZipInputStream zipStream = null;
+        if (stream instanceof TikaInputStream) {
+            TikaInputStream tis = (TikaInputStream) stream;
+            Object container = ((TikaInputStream) stream).getOpenContainer();
+            if (container instanceof ZipFile) {
+                zipFile = (ZipFile) container;
+            } else if (tis.hasFile()) {
+                zipFile = new ZipFile(tis.getFile());
+            } else {
+                zipStream = new ZipInputStream(stream);
+            }
+        } else {
+            zipStream = new ZipInputStream(stream);
+        }
+
+        // Prepare to handle the content
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);
+
+        // As we don't know which of the metadata or the content
+        //  we'll hit first, catch the endDocument call initially
+        EndDocumentShieldingContentHandler handler =
+                new EndDocumentShieldingContentHandler(xhtml);
+
+        // If we can, process the metadata first, then the
+        //  rest of the file afterwards
+        // Only possible to guarantee that when opened from a file not a stream
+        ZipEntry entry = null;
+        if (zipFile != null) {
+            entry = zipFile.getEntry(META_NAME);
+            handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
+
+            Enumeration<? extends ZipEntry> entries = zipFile.entries();
+            while (entries.hasMoreElements()) {
+                entry = entries.nextElement();
+                if (!META_NAME.equals(entry.getName())) {
+                    handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
+                }
+            }
+            zipFile.close();
+        } else {
+            do {
+                entry = zipStream.getNextEntry();
+                handleZipEntry(entry, zipStream, metadata, context, handler);
+            } while (entry != null);
+            zipStream.close();
+        }
+
+        // Only now call the end document
+        if (handler.getEndDocumentWasCalled()) {
+            handler.reallyEndDocument();
+        }
+    }
+
+    private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
+                                ParseContext context, EndDocumentShieldingContentHandler handler)
+            throws IOException, SAXException, TikaException {
+        if (entry == null) return;
+
+        if (entry.getName().equals("mimetype")) {
+            String type = IOUtils.toString(zip, UTF_8);
+            metadata.set(Metadata.CONTENT_TYPE, type);
+        } else if (entry.getName().equals(META_NAME)) {
+            meta.parse(zip, new DefaultHandler(), metadata, context);
+        } else if (entry.getName().endsWith("content.xml")) {
+            if (content instanceof OpenDocumentContentParser) {
+                ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
+            } else {
+                // Foreign content parser was set:
+                content.parse(zip, handler, metadata, context);
+            }
+        } else if (entry.getName().endsWith("styles.xml")) {
+            if (content instanceof OpenDocumentContentParser) {
+                ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
+            } else {
+                // Foreign content parser was set:
+                content.parse(zip, handler, metadata, context);
+            }
+        }
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.opendocument;
+
+import org.apache.tika.parser.odf.OpenDocumentParser;
+
+/**
+ * OpenOffice parser
+ * <p>
+ * Empty subclass of {@link OpenDocumentParser}, kept only under its old
+ * name; it adds no behaviour of its own.
+ *
+ * @deprecated Use the {@link OpenDocumentParser} class instead.
+ *             This class was originally scheduled for removal in
+ *             Apache Tika 1.0.
+ */
+@Deprecated
+public class OpenOfficeParser extends OpenDocumentParser {
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/GroupState.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/GroupState.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/GroupState.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/GroupState.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.rtf;
+
+import java.nio.charset.Charset;
+
+/*
+ * Mutable record of everything tracked for the RTF group currently
+ * being read, ie { ... }.
+ */
+
+class GroupState {
+    // Nesting level; a group created from a parent gets the parent's depth plus one
+    public int depth;
+    public boolean bold;
+    public boolean italic;
+    // Set while all text of the current group is being skipped,
+    // eg when the group leads with a \*:
+    public boolean ignore;
+    // Stays at 1 until a uc control has been seen:
+    public int ucSkip = 1;
+    public int list;
+    public int listLevel;
+    public Charset fontCharset;
+    // inside objdata
+    public boolean objdata;
+    // depth inside pict, 1 = at pict level
+    public int pictDepth;
+    // inside a picprop key/value pair
+    public boolean sp;
+    // inside a picprop name
+    public boolean sn;
+    // inside a picprop value
+    public boolean sv;
+    // inside an embedded object or not
+    public boolean object;
+
+    // Builds the default (root) state
+    public GroupState() {
+    }
+
+    // Builds the state for a nested group: formatting, list and charset settings
+    // are taken over from the enclosing group and the depth goes up by one
+    public GroupState(GroupState other) {
+        this.depth = other.depth + 1;
+
+        this.bold = other.bold;
+        this.italic = other.italic;
+        this.ignore = other.ignore;
+        this.ucSkip = other.ucSkip;
+
+        this.list = other.list;
+        this.listLevel = other.listLevel;
+        this.fontCharset = other.fontCharset;
+
+        // Only keep counting pict depth while the parent is already inside a pict
+        if (other.pictDepth > 0) {
+            this.pictDepth = other.pictDepth + 1;
+        } else {
+            this.pictDepth = 0;
+        }
+        // object, sn, sv and sp are deliberately not taken over
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.rtf;
+
+/**
+ * Contains the information for a single list in the list or list override tables.
+ */
+public class ListDescriptor {
+    public final static int NUMBER_TYPE_BULLET = 23;
+
+    public int id;
+    // We record this but don't make use of it today:
+    public int templateID;
+    // We record this but don't make use of it today:
+    public boolean isStyle;
+    // Number type per list level; nine slots, indexed by level
+    public int[] numberType = new int[9];
+
+    /**
+     * Tells whether the given list level is a bullet (unordered) level.
+     *
+     * @param level zero-based list level
+     * @return <code>true</code> if the number type recorded for the level is
+     *         {@link #NUMBER_TYPE_BULLET}; <code>false</code> otherwise,
+     *         including when the level has no slot in {@link #numberType}
+     */
+    public boolean isUnordered(int level) {
+        // A level outside the array has no recorded number type, so report it
+        // as not-a-bullet rather than failing with an index error
+        if (level < 0 || level >= numberType.length) {
+            return false;
+        }
+        return numberType[level] == NUMBER_TYPE_BULLET;
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,287 @@
+package org.apache.tika.parser.rtf; 
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.io.FilenameUtils;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.RTFMetadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * This class buffers data from embedded objects and pictures.
+ * <p/>
+ * When the parser has finished an object or picture and called
+ * {@link #handleCompletedObject()}, this will write the object
+ * to the {@link #handler}.
+ * <p/>
+ * This (in combination with TextExtractor) expects basically a flat parse.  It will pull out
+ * all pict whether they are tied to objdata or are intended
+ * to be standalone.
+ * <p/>
+ * This tries to pull metadata around a pict that is encoded
+ * with {sp {sn} {sv}} types of data.  This information
+ * sometimes contains the name and even full file path of the original file.
+ */
+class RTFEmbObjHandler {
+
+    private static final String EMPTY_STRING = "";
+    private final ContentHandler handler;
+    private final ParseContext context;
+    //buffer for the bytes of the object/pict that is currently being recorded
+    private final ByteArrayOutputStream os;
+    //scratch buffer for the text of the current sn (name) or sv (value)
+    private final StringBuilder sb = new StringBuilder();
+    //don't need atomic, do need mutable
+    private final AtomicInteger unknownFilenameCount = new AtomicInteger();
+    //high hex cached for writing hexpair chars (data)
+    private int hi = -1;
+    private int thumbCount = 0;
+    private boolean inObject = false;
+    private String sv = EMPTY_STRING;
+    private String sn = EMPTY_STRING;
+    //metadata for the object/pict that is currently being recorded
+    private Metadata metadata;
+    private EMB_STATE state = EMB_STATE.NADA;
+
+    /**
+     * @param handler  handler that the embedded objects/picts are written to
+     * @param metadata not used by this class; each embedded object gets its own
+     *                 fresh {@link Metadata}
+     * @param context  parse context that is consulted for the
+     *                 {@link EmbeddedDocumentExtractor} and the {@link TikaConfig}
+     */
+    protected RTFEmbObjHandler(ContentHandler handler, Metadata metadata, ParseContext context) {
+        this.handler = handler;
+        this.context = context;
+        this.os = new ByteArrayOutputStream();
+        //start with an empty per-object metadata so that endSP() cannot hit a
+        //null reference when it is called before startPict()/startObjData()
+        this.metadata = new Metadata();
+    }
+
+    /**
+     * Start recording a pict with fresh metadata.
+     */
+    protected void startPict() {
+        state = EMB_STATE.PICT;
+        metadata = new Metadata();
+    }
+
+    /**
+     * Start recording objdata with fresh metadata.
+     */
+    protected void startObjData() {
+        state = EMB_STATE.OBJDATA;
+        metadata = new Metadata();
+    }
+
+    /**
+     * Start collecting a metadata name; the name is prefixed with
+     * {@link RTFMetadata#RTF_PICT_META_PREFIX}.
+     */
+    protected void startSN() {
+        sb.setLength(0);
+        sb.append(RTFMetadata.RTF_PICT_META_PREFIX);
+    }
+
+    protected void endSN() {
+        sn = sb.toString();
+    }
+
+    /**
+     * Start collecting a metadata value.
+     */
+    protected void startSV() {
+        sb.setLength(0);
+    }
+
+    protected void endSV() {
+        sv = sb.toString();
+    }
+
+    //end metadata pair
+    protected void endSP() {
+        metadata.add(sn, sv);
+    }
+
+    protected boolean getInObject() {
+        return inObject;
+    }
+
+    protected void setInObject(boolean v) {
+        inObject = v;
+    }
+
+    /**
+     * Appends a character to the sn/sv text that is currently being collected.
+     */
+    protected void writeMetadataChar(char c) {
+        sb.append(c);
+    }
+
+    /**
+     * Consumes one character of hex-pair encoded data.  The first hex char of a
+     * pair is cached; the second one completes the byte, which is then buffered.
+     *
+     * @param b character read from the stream, or -1 at the end of the stream
+     * @throws IOException   if the decoded value does not fit
+     * @throws TikaException if the end of the stream is hit
+     */
+    protected void writeHexChar(int b) throws IOException, TikaException {
+        //if not hexchar, ignore
+        //white space is common
+        if (TextExtractor.isHexChar(b)) {
+            if (hi == -1) {
+                hi = 16 * TextExtractor.hexValue(b);
+            } else {
+                long sum = hi + TextExtractor.hexValue(b);
+                if (sum > Integer.MAX_VALUE || sum < 0) {
+                    throw new IOException("hex char to byte overflow");
+                }
+
+                os.write((int) sum);
+
+                hi = -1;
+            }
+            return;
+        }
+        if (b == -1) {
+            throw new TikaException("hit end of stream before finishing byte pair");
+        }
+    }
+
+    /**
+     * Reads exactly <code>len</code> raw bytes from the stream and buffers them.
+     *
+     * @param is  stream to read the binary data from
+     * @param len number of bytes to read
+     * @throws IOException   if <code>len</code> is negative or larger than
+     *                       {@link RTFParser#getMaxBytesForEmbeddedObject()}
+     * @throws TikaException if the stream ends before <code>len</code> bytes were read
+     */
+    protected void writeBytes(InputStream is, int len) throws IOException, TikaException {
+        if (len < 0 || len > RTFParser.getMaxBytesForEmbeddedObject()) {
+            throw new IOException("length of bytes to read out of bounds: " + len);
+        }
+
+        byte[] bytes = new byte[len];
+        int totalRead = 0;
+        //a single read() may legitimately return fewer bytes than requested,
+        //so keep reading until the buffer is full or the stream really ends
+        while (totalRead < len) {
+            int n = is.read(bytes, totalRead, len - totalRead);
+            if (n < 0) {
+                throw new TikaException("unexpected end of file: need " + len +
+                        " bytes of binary data, found " + totalRead);
+            }
+            totalRead += n;
+        }
+        os.write(bytes);
+    }
+
+    /**
+     * Call this when the objdata/pict has completed.  The buffered bytes are handed
+     * to the embedded document extractor and the state of this handler is reset,
+     * whether or not the extraction succeeded.
+     *
+     * @throws IOException
+     * @throws SAXException
+     * @throws TikaException
+     */
+    protected void handleCompletedObject() throws IOException, SAXException, TikaException {
+        EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class);
+
+        if (embeddedExtractor == null) {
+            embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
+        }
+
+        try {
+            byte[] bytes = os.toByteArray();
+            if (state == EMB_STATE.OBJDATA) {
+                RTFObjDataParser objParser = new RTFObjDataParser();
+                try {
+                    byte[] objBytes = objParser.parse(bytes, metadata, unknownFilenameCount);
+                    extractObj(objBytes, handler, embeddedExtractor, metadata);
+                } catch (IOException e) {
+                    //swallow.  If anything goes wrong, ignore.
+                }
+            } else if (state == EMB_STATE.PICT) {
+                String filePath = metadata.get(RTFMetadata.RTF_PICT_META_PREFIX + "wzDescription");
+                if (filePath != null && filePath.length() > 0) {
+                    metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filePath);
+                    metadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(filePath));
+                }
+                metadata.set(RTFMetadata.THUMBNAIL, Boolean.toString(inObject));
+                extractObj(bytes, handler, embeddedExtractor, metadata);
+            }
+            //else EMB_STATE.NADA: swallow...no start for pict or embed?!
+        } finally {
+            //never carry buffered bytes or metadata over to the next object
+            reset();
+        }
+    }
+
+    /**
+     * Hands the bytes to the embedded extractor, making up a resource name
+     * first if the metadata does not contain one yet.
+     *
+     * @param bytes bytes of the embedded object; nothing happens if this is null
+     */
+    private void extractObj(byte[] bytes, ContentHandler handler,
+                            EmbeddedDocumentExtractor embeddedExtractor, Metadata metadata)
+            throws SAXException, IOException, TikaException {
+
+        if (bytes == null) {
+            return;
+        }
+
+        metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(bytes.length));
+
+        if (!embeddedExtractor.shouldParseEmbedded(metadata)) {
+            return;
+        }
+
+        TikaInputStream stream = TikaInputStream.get(bytes);
+        //the stream is closed even if the name/extension detection fails
+        try {
+            if (metadata.get(Metadata.RESOURCE_NAME_KEY) == null) {
+                String extension = getExtension(stream, metadata);
+                stream.reset();
+                if (inObject && state == EMB_STATE.PICT) {
+                    metadata.set(Metadata.RESOURCE_NAME_KEY, "thumbnail_" + thumbCount++ + extension);
+                    metadata.set(RTFMetadata.THUMBNAIL, "true");
+                } else {
+                    metadata.set(Metadata.RESOURCE_NAME_KEY, "file_" + unknownFilenameCount.getAndIncrement() +
+                            extension);
+                }
+            }
+            embeddedExtractor.parseEmbedded(
+                    stream,
+                    new EmbeddedContentHandler(handler),
+                    metadata, false);
+        } finally {
+            stream.close();
+        }
+    }
+
+    /**
+     * Picks a file extension for an object that has no resource name.  Detection
+     * is only run if the metadata has no content type yet.
+     *
+     * @return extension of the detected mime type, or ".bin" if a content
+     * type was already set or if the detection failed
+     */
+    private String getExtension(TikaInputStream is, Metadata metadata) {
+        String cType = metadata.get(Metadata.CONTENT_TYPE);
+        TikaConfig config = getConfig();
+        if (cType == null) {
+            Detector detector = config.getDetector();
+            try {
+                MediaType mediaType = detector.detect(is, metadata);
+                MimeTypes types = config.getMimeRepository();
+                MimeType mime = types.forName(mediaType.toString());
+                //NOTE(review): this records only the subtype (not type/subtype)
+                //as the content type -- confirm that this is intended
+                metadata.set(Metadata.CONTENT_TYPE, mediaType.getSubtype());
+                return mime.getExtension();
+            } catch (IOException e) {
+                //swallow; fall back to the generic extension
+            } catch (MimeTypeException e) {
+                //swallow; fall back to the generic extension
+            }
+        }
+        return ".bin";
+    }
+
+    /**
+     * @return the {@link TikaConfig} from the parse context, or the default
+     * config if the context has none
+     */
+    private TikaConfig getConfig() {
+        TikaConfig config = context.get(TikaConfig.class);
+        if (config == null) {
+            config = TikaConfig.getDefaultConfig();
+        }
+        return config;
+    }
+
+    /**
+     * reset state after each object.
+     * Do not reset unknown file number.
+     */
+    protected void reset() {
+        state = EMB_STATE.NADA;
+        os.reset();
+        metadata = new Metadata();
+        hi = -1;
+        sv = EMPTY_STRING;
+        sn = EMPTY_STRING;
+        sb.setLength(0);
+    }
+
+    private enum EMB_STATE {
+        PICT, //recording pict data
+        OBJDATA, //recording objdata
+        NADA
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,315 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.tika.parser.rtf;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Locale;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.io.FilenameUtils;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.Ole10Native;
+import org.apache.poi.poifs.filesystem.Ole10NativeException;
+import org.apache.poi.util.IOUtils;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.RTFMetadata;
+import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
+
+/**
+ * Many thanks to Simon Mourier for:
+ * http://stackoverflow.com/questions/14779647/extract-embedded-image-object-in-rtf
+ * and for granting permission to use his code in Tika.
+ */
+class RTFObjDataParser {
+
+    private final static int[] INT_LE_POWS = new int[]{
+            1, 256, 65536, 16777216
+    };
+
+    private final static String WIN_ASCII = "WINDOWS-1252";
+
+    /**
+     * Parses the embedded object/pict string
+     *
+     * @param bytes actual bytes (already converted from the 
+     *  hex pair string stored in the embedded object data into actual bytes or read
+     *  as raw binary bytes)
+     * @return a SimpleRTFEmbObj or null
+     * @throws IOException if there are any surprise surprises during parsing
+     */
+
+    /**
+     * @param bytes
+     * @param metadata             incoming metadata
+     * @param unknownFilenameCount
+     * @return byte[] for contents of obj data
+     * @throws IOException
+     */
+    protected byte[] parse(byte[] bytes, Metadata metadata, AtomicInteger unknownFilenameCount)
+            throws IOException {
+        ByteArrayInputStream is = new ByteArrayInputStream(bytes);
+        long version = readUInt(is);
+        metadata.add(RTFMetadata.EMB_APP_VERSION, Long.toString(version));
+
+        long formatId = readUInt(is);
+        //2 is an embedded object. 1 is a link.
+        if (formatId != 2L) {
+            return null;
+        }
+        String className = readLengthPrefixedAnsiString(is).trim();
+        String topicName = readLengthPrefixedAnsiString(is).trim();
+        String itemName = readLengthPrefixedAnsiString(is).trim();
+
+        if (className != null && className.length() > 0) {
+            metadata.add(RTFMetadata.EMB_CLASS, className);
+        }
+        if (topicName != null && topicName.length() > 0) {
+            metadata.add(RTFMetadata.EMB_TOPIC, topicName);
+        }
+        if (itemName != null && itemName.length() > 0) {
+            metadata.add(RTFMetadata.EMB_ITEM, itemName);
+        }
+
+        long dataSz = readUInt(is);
+
+        //readBytes tests for reading too many bytes
+        byte[] embObjBytes = readBytes(is, dataSz);
+
+        if (className.toLowerCase(Locale.ROOT).equals("package")) {
+            return handlePackage(embObjBytes, metadata);
+        } else if (className.toLowerCase(Locale.ROOT).equals("pbrush")) {
+            //simple bitmap bytes
+            return embObjBytes;
+        } else {
+            ByteArrayInputStream embIs = new ByteArrayInputStream(embObjBytes);
+            if (NPOIFSFileSystem.hasPOIFSHeader(embIs)) {
+                try {
+                    return handleEmbeddedPOIFS(embIs, metadata, unknownFilenameCount);
+                } catch (IOException e) {
+                    //swallow
+                }
+            }
+        }
+        return embObjBytes;
+    }
+
+
+    //will throw IOException if not actually POIFS
+    //can return null byte[]
+    private byte[] handleEmbeddedPOIFS(InputStream is, Metadata metadata,
+                                       AtomicInteger unknownFilenameCount)
+            throws IOException {
+
+        byte[] ret = null;
+        try (NPOIFSFileSystem fs = new NPOIFSFileSystem(is)) {
+
+            DirectoryNode root = fs.getRoot();
+
+            if (root == null) {
+                return ret;
+            }
+
+            if (root.hasEntry("Package")) {
+                Entry ooxml = root.getEntry("Package");
+                TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml));
+
+                ByteArrayOutputStream out = new ByteArrayOutputStream();
+
+                IOUtils.copy(stream, out);
+                ret = out.toByteArray();
+            } else {
+                //try poifs
+                POIFSDocumentType type = POIFSDocumentType.detectType(root);
+                if (type == POIFSDocumentType.OLE10_NATIVE) {
+                    try {
+                        // Try to un-wrap the OLE10Native record:
+                        Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root);
+                        ret = ole.getDataBuffer();
+                    } catch (Ole10NativeException ex) {
+                        // Not a valid OLE10Native record, skip it
+                    }
+                } else if (type == POIFSDocumentType.COMP_OBJ) {
+
+                    DocumentEntry contentsEntry;
+                    try {
+                        contentsEntry = (DocumentEntry) root.getEntry("CONTENTS");
+                    } catch (FileNotFoundException ioe) {
+                        contentsEntry = (DocumentEntry) root.getEntry("Contents");
+                    }
+
+                    try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) {
+                        ret = new byte[contentsEntry.getSize()];
+                        inp.readFully(ret);
+                    }
+                } else {
+
+                    ByteArrayOutputStream out = new ByteArrayOutputStream();
+                    is.reset();
+                    IOUtils.copy(is, out);
+                    ret = out.toByteArray();
+                    metadata.set(Metadata.RESOURCE_NAME_KEY, "file_" + unknownFilenameCount.getAndIncrement() + "." + type.getExtension());
+                    metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
+                }
+            }
+        }
+        return ret;
+    }
+
+
+    /**
+     * can return null if there is a linked object
+     * instead of an embedded file
+     */
+    private byte[] handlePackage(byte[] pkgBytes, Metadata metadata) throws IOException {
+        //now parse the package header
+        ByteArrayInputStream is = new ByteArrayInputStream(pkgBytes);
+        readUShort(is);
+
+        String displayName = readAnsiString(is);
+
+        //should we add this to the metadata?
+        readAnsiString(is); //iconFilePath
+        readUShort(is); //iconIndex
+        int type = readUShort(is); //type
+
+        //1 is link, 3 is embedded object
+        //this only handles embedded objects
+        if (type != 3) {
+            return null;
+        }
+        //should we really be ignoring this filePathLen?
+        readUInt(is); //filePathLen
+
+        String ansiFilePath = readAnsiString(is); //filePath
+        long bytesLen = readUInt(is);
+        byte[] objBytes = initByteArray(bytesLen);
+        is.read(objBytes);
+        StringBuilder unicodeFilePath = new StringBuilder();
+
+        try {
+            long unicodeLen = readUInt(is);
+
+            for (int i = 0; i < unicodeLen; i++) {
+                int lo = is.read();
+                int hi = is.read();
+                int sum = lo + 256 * hi;
+                if (hi == -1 || lo == -1) {
+                    //stream ran out; empty SB and stop
+                    unicodeFilePath.setLength(0);
+                    break;
+                }
+                unicodeFilePath.append((char) sum);
+            }
+        } catch (IOException e) {
+            //swallow; the unicode file path is optional and might not happen
+            unicodeFilePath.setLength(0);
+        }
+        String fileNameToUse = "";
+        String pathToUse = "";
+        if (unicodeFilePath.length() > 0) {
+            String p = unicodeFilePath.toString();
+            fileNameToUse = p;
+            pathToUse = p;
+        } else {
+            fileNameToUse = displayName == null ? "" : displayName;
+            pathToUse = ansiFilePath == null ? "" : ansiFilePath;
+        }
+        metadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(fileNameToUse));
+        metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, pathToUse);
+
+        return objBytes;
+    }
+
+
+    private int readUShort(InputStream is) throws IOException {
+        int lo = is.read();
+        int hi = is.read() * 256;
+        if (lo == -1 || hi == -1) {
+            throw new IOException("Hit end of stream before reading little endian unsigned short.");
+        }
+        return hi + lo;
+    }
+
+    private long readUInt(InputStream is) throws IOException {
+        long sum = 0;
+        for (int i = 0; i < 4; i++) {
+            int v = is.read();
+            if (v == -1) {
+                throw new IOException("Hit end of stream before finishing little endian unsigned int.");
+            }
+            sum += v * (long) INT_LE_POWS[i];
+        }
+        return sum;
+    }
+
+    private String readAnsiString(InputStream is) throws IOException {
+        StringBuilder sb = new StringBuilder();
+        int c = is.read();
+        while (c > 0) {
+            sb.append((char) c);
+            c = is.read();
+        }
+        if (c == -1) {
+            throw new IOException("Hit end of stream before end of AnsiString");
+        }
+        return sb.toString();
+    }
+
+    private String readLengthPrefixedAnsiString(InputStream is) throws IOException {
+        long len = readUInt(is);
+        byte[] bytes = readBytes(is, len);
+        try {
+            return new String(bytes, WIN_ASCII);
+        } catch (UnsupportedEncodingException e) {
+            //shouldn't ever happen
+            throw new IOException("Unsupported encoding");
+        }
+    }
+
+
+    private byte[] readBytes(InputStream is, long len) throws IOException {
+        //initByteArray tests for "reading of too many bytes"
+        byte[] bytes = initByteArray(len);
+        int read = is.read(bytes);
+        if (read != len) {
+            throw new IOException("Hit end of stream before reading all bytes");
+        }
+
+        return bytes;
+    }
+
+    private byte[] initByteArray(long len) throws IOException {
+        if (len < 0 || len > RTFParser.getMaxBytesForEmbeddedObject()) {
+            throw new IOException("Requested length for reading bytes is out of bounds: " + len);
+        }
+        return new byte[(int) len];
+
+    }
+}
+

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFParser.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.rtf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.io.input.TaggedInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * RTF parser
+ */
+public class RTFParser extends AbstractParser {
+
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -4165069489372320313L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.application("rtf"));
+    /**
+     * maximum number of bytes per embedded object/pict (default: 20MB)
+     */
+    private static int EMB_OBJ_MAX_BYTES = 20 * 1024 * 1024; //20MB
+
+    /**
+     * See {@link #setMaxBytesForEmbeddedObject(int)}.
+     *
+     * @return maximum number of bytes allowed for an embedded object.
+     */
+    public static int getMaxBytesForEmbeddedObject() {
+        return EMB_OBJ_MAX_BYTES;
+    }
+
+    /**
+     * Bytes for embedded objects are currently cached in memory.
+     * If something goes wrong during the parsing of an embedded object,
+     * it is possible that a read length may be crazily too long
+     * and cause a heap crash.
+     *
+     * @param max maximum number of bytes to allow for embedded objects.  If
+     *            the embedded object has more than this number of bytes, skip it.
+     */
+    public static void setMaxBytesForEmbeddedObject(int max) {
+        EMB_OBJ_MAX_BYTES = max;
+    }
+
+    /**
+     * @return the single supported type, application/rtf
+     */
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses an RTF document: sets the content type to application/rtf and
+     * runs the text extraction, which also handles embedded objects and picts.
+     *
+     * @throws IOException   if the document stream itself could not be read
+     * @throws SAXException  if the handler fails
+     * @throws TikaException if the document could not be parsed, including any
+     *                       IOException that was not caused by the document stream
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        metadata.set(Metadata.CONTENT_TYPE, "application/rtf");
+        TaggedInputStream tagged = new TaggedInputStream(stream);
+        try {
+            XHTMLContentHandler xhtmlHandler = new XHTMLContentHandler(handler, metadata);
+            RTFEmbObjHandler embObjHandler = new RTFEmbObjHandler(xhtmlHandler, metadata, context);
+            final TextExtractor ert = new TextExtractor(xhtmlHandler, metadata, embObjHandler);
+            //read through the tagged stream: only exceptions thrown by the tagged
+            //wrapper can be recognized by throwIfCauseOf() below
+            ert.extract(tagged);
+        } catch (IOException e) {
+            //rethrow genuine I/O failures of the document stream as they are
+            tagged.throwIfCauseOf(e);
+            throw new TikaException("Error parsing an RTF document", e);
+        }
+    }
+}