You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/06 04:50:57 UTC
svn commit: r1723223 [13/32] - in /tika/branches/2.x:
tika-core/src/test/resources/META-INF/
tika-core/src/test/resources/META-INF/services/ tika-parser-modules/
tika-parser-modules/tika-advanced-module/
tika-parser-modules/tika-advanced-module/src/ ti...
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.DublinCore;
+import org.apache.tika.metadata.MSOffice;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.PagedText;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.xml.AttributeDependantMetadataHandler;
+import org.apache.tika.parser.xml.AttributeMetadataHandler;
+import org.apache.tika.parser.xml.ElementMetadataHandler;
+import org.apache.tika.parser.xml.MetadataHandler;
+import org.apache.tika.parser.xml.XMLParser;
+import org.apache.tika.sax.TeeContentHandler;
+import org.apache.tika.sax.xpath.CompositeMatcher;
+import org.apache.tika.sax.xpath.Matcher;
+import org.apache.tika.sax.xpath.MatchingContentHandler;
+import org.apache.tika.sax.xpath.XPathParser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for OpenDocument <code>meta.xml</code> files.
+ */
+public class OpenDocumentMetaParser extends XMLParser {
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = -8739250869531737584L;
+
+ private static final String META_NS = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0";
+ private static final XPathParser META_XPATH = new XPathParser("meta", META_NS);
+
+ /**
+ * @see OfficeOpenXMLCore#SUBJECT
+ * @deprecated use OfficeOpenXMLCore#SUBJECT
+ */
+ @Deprecated
+ private static final Property TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR =
+ Property.composite(Office.INITIAL_AUTHOR,
+ new Property[]{Property.externalText("initial-creator")});
+
+ private static ContentHandler getDublinCoreHandler(
+ Metadata metadata, Property property, String element) {
+ return new ElementMetadataHandler(
+ DublinCore.NAMESPACE_URI_DC, element,
+ metadata, property);
+ }
+
+ private static ContentHandler getMeta(
+ ContentHandler ch, Metadata md, Property property, String element) {
+ Matcher matcher = new CompositeMatcher(
+ META_XPATH.parse("//meta:" + element),
+ META_XPATH.parse("//meta:" + element + "//text()"));
+ ContentHandler branch =
+ new MatchingContentHandler(new MetadataHandler(md, property), matcher);
+ return new TeeContentHandler(ch, branch);
+ }
+
+ private static ContentHandler getUserDefined(
+ ContentHandler ch, Metadata md) {
+ Matcher matcher = new CompositeMatcher(
+ META_XPATH.parse("//meta:user-defined/@meta:name"),
+ META_XPATH.parse("//meta:user-defined//text()"));
+ // eg <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes custom:Info1=Text1
+ ContentHandler branch = new MatchingContentHandler(
+ new AttributeDependantMetadataHandler(md, "meta:name", Metadata.USER_DEFINED_METADATA_NAME_PREFIX),
+ matcher);
+ return new TeeContentHandler(ch, branch);
+ }
+
+ @Deprecated
+ private static ContentHandler getStatistic(
+ ContentHandler ch, Metadata md, String name, String attribute) {
+ Matcher matcher =
+ META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
+ ContentHandler branch = new MatchingContentHandler(
+ new AttributeMetadataHandler(META_NS, attribute, md, name), matcher);
+ return new TeeContentHandler(ch, branch);
+ }
+
+ private static ContentHandler getStatistic(
+ ContentHandler ch, Metadata md, Property property, String attribute) {
+ Matcher matcher =
+ META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
+ ContentHandler branch = new MatchingContentHandler(
+ new AttributeMetadataHandler(META_NS, attribute, md, property), matcher);
+ return new TeeContentHandler(ch, branch);
+ }
+
+ protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context) {
+ // We can no longer extend DcXMLParser due to the handling of dc:subject and dc:date
+ // Process the Dublin Core Attributes
+ ch = new TeeContentHandler(super.getContentHandler(ch, md, context),
+ getDublinCoreHandler(md, TikaCoreProperties.TITLE, "title"),
+ getDublinCoreHandler(md, TikaCoreProperties.CREATOR, "creator"),
+ getDublinCoreHandler(md, TikaCoreProperties.DESCRIPTION, "description"),
+ getDublinCoreHandler(md, TikaCoreProperties.PUBLISHER, "publisher"),
+ getDublinCoreHandler(md, TikaCoreProperties.CONTRIBUTOR, "contributor"),
+ getDublinCoreHandler(md, TikaCoreProperties.TYPE, "type"),
+ getDublinCoreHandler(md, TikaCoreProperties.FORMAT, "format"),
+ getDublinCoreHandler(md, TikaCoreProperties.IDENTIFIER, "identifier"),
+ getDublinCoreHandler(md, TikaCoreProperties.LANGUAGE, "language"),
+ getDublinCoreHandler(md, TikaCoreProperties.RIGHTS, "rights"));
+
+ // Process the OO Meta Attributes
+ ch = getMeta(ch, md, TikaCoreProperties.CREATED, "creation-date");
+ // ODF uses dc:date for modified
+ ch = new TeeContentHandler(ch, new ElementMetadataHandler(
+ DublinCore.NAMESPACE_URI_DC, "date",
+ md, TikaCoreProperties.MODIFIED));
+
+ // ODF uses dc:subject for description
+ ch = new TeeContentHandler(ch, new ElementMetadataHandler(
+ DublinCore.NAMESPACE_URI_DC, "subject",
+ md, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT));
+ ch = getMeta(ch, md, TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, "keyword");
+
+ ch = getMeta(ch, md, Property.externalText(MSOffice.EDIT_TIME), "editing-duration");
+ ch = getMeta(ch, md, Property.externalText("editing-cycles"), "editing-cycles");
+ ch = getMeta(ch, md, TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR, "initial-creator");
+ ch = getMeta(ch, md, Property.externalText("generator"), "generator");
+
+ // Process the user defined Meta Attributes
+ ch = getUserDefined(ch, md);
+
+ // Process the OO Statistics Attributes
+ ch = getStatistic(ch, md, Office.OBJECT_COUNT, "object-count");
+ ch = getStatistic(ch, md, Office.IMAGE_COUNT, "image-count");
+ ch = getStatistic(ch, md, Office.PAGE_COUNT, "page-count");
+ ch = getStatistic(ch, md, PagedText.N_PAGES, "page-count");
+ ch = getStatistic(ch, md, Office.TABLE_COUNT, "table-count");
+ ch = getStatistic(ch, md, Office.PARAGRAPH_COUNT, "paragraph-count");
+ ch = getStatistic(ch, md, Office.WORD_COUNT, "word-count");
+ ch = getStatistic(ch, md, Office.CHARACTER_COUNT, "character-count");
+
+ // Legacy, Tika-1.0 style attributes
+ // TODO Remove these in Tika 2.0
+ ch = getStatistic(ch, md, MSOffice.OBJECT_COUNT, "object-count");
+ ch = getStatistic(ch, md, MSOffice.IMAGE_COUNT, "image-count");
+ ch = getStatistic(ch, md, MSOffice.PAGE_COUNT, "page-count");
+ ch = getStatistic(ch, md, MSOffice.TABLE_COUNT, "table-count");
+ ch = getStatistic(ch, md, MSOffice.PARAGRAPH_COUNT, "paragraph-count");
+ ch = getStatistic(ch, md, MSOffice.WORD_COUNT, "word-count");
+ ch = getStatistic(ch, md, MSOffice.CHARACTER_COUNT, "character-count");
+
+ // Legacy Statistics Attributes, replaced with real keys above
+ // TODO Remove these shortly, eg after Tika 1.1 (TIKA-770)
+ ch = getStatistic(ch, md, "nbPage", "page-count");
+ ch = getStatistic(ch, md, "nbPara", "paragraph-count");
+ ch = getStatistic(ch, md, "nbWord", "word-count");
+ ch = getStatistic(ch, md, "nbCharacter", "character-count");
+ ch = getStatistic(ch, md, "nbTab", "table-count");
+ ch = getStatistic(ch, md, "nbObject", "object-count");
+ ch = getStatistic(ch, md, "nbImg", "image-count");
+
+ // Normalise the rest
+ ch = new NSNormalizerContentHandler(ch);
+ return ch;
+ }
+
+ @Override
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ super.parse(stream, handler, metadata, context);
+ // Copy subject to description for OO2
+ String odfSubject = metadata.get(OfficeOpenXMLCore.SUBJECT);
+ if (odfSubject != null && !odfSubject.equals("") &&
+ (metadata.get(TikaCoreProperties.DESCRIPTION) == null || metadata.get(TikaCoreProperties.DESCRIPTION).equals(""))) {
+ metadata.set(TikaCoreProperties.DESCRIPTION, odfSubject);
+ }
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,205 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+import java.util.zip.ZipInputStream;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.EndDocumentShieldingContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * OpenOffice parser
+ */
+public class OpenDocumentParser extends AbstractParser {
+
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = -6410276875438618287L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.application("vnd.sun.xml.writer"),
+ MediaType.application("vnd.oasis.opendocument.text"),
+ MediaType.application("vnd.oasis.opendocument.graphics"),
+ MediaType.application("vnd.oasis.opendocument.presentation"),
+ MediaType.application("vnd.oasis.opendocument.spreadsheet"),
+ MediaType.application("vnd.oasis.opendocument.chart"),
+ MediaType.application("vnd.oasis.opendocument.image"),
+ MediaType.application("vnd.oasis.opendocument.formula"),
+ MediaType.application("vnd.oasis.opendocument.text-master"),
+ MediaType.application("vnd.oasis.opendocument.text-web"),
+ MediaType.application("vnd.oasis.opendocument.text-template"),
+ MediaType.application("vnd.oasis.opendocument.graphics-template"),
+ MediaType.application("vnd.oasis.opendocument.presentation-template"),
+ MediaType.application("vnd.oasis.opendocument.spreadsheet-template"),
+ MediaType.application("vnd.oasis.opendocument.chart-template"),
+ MediaType.application("vnd.oasis.opendocument.image-template"),
+ MediaType.application("vnd.oasis.opendocument.formula-template"),
+ MediaType.application("x-vnd.oasis.opendocument.text"),
+ MediaType.application("x-vnd.oasis.opendocument.graphics"),
+ MediaType.application("x-vnd.oasis.opendocument.presentation"),
+ MediaType.application("x-vnd.oasis.opendocument.spreadsheet"),
+ MediaType.application("x-vnd.oasis.opendocument.chart"),
+ MediaType.application("x-vnd.oasis.opendocument.image"),
+ MediaType.application("x-vnd.oasis.opendocument.formula"),
+ MediaType.application("x-vnd.oasis.opendocument.text-master"),
+ MediaType.application("x-vnd.oasis.opendocument.text-web"),
+ MediaType.application("x-vnd.oasis.opendocument.text-template"),
+ MediaType.application("x-vnd.oasis.opendocument.graphics-template"),
+ MediaType.application("x-vnd.oasis.opendocument.presentation-template"),
+ MediaType.application("x-vnd.oasis.opendocument.spreadsheet-template"),
+ MediaType.application("x-vnd.oasis.opendocument.chart-template"),
+ MediaType.application("x-vnd.oasis.opendocument.image-template"),
+ MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
+
+ private static final String META_NAME = "meta.xml";
+
+ private Parser meta = new OpenDocumentMetaParser();
+
+ private Parser content = new OpenDocumentContentParser();
+
+ public Parser getMetaParser() {
+ return meta;
+ }
+
+ public void setMetaParser(Parser meta) {
+ this.meta = meta;
+ }
+
+ public Parser getContentParser() {
+ return content;
+ }
+
+ public void setContentParser(Parser content) {
+ this.content = content;
+ }
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler baseHandler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+
+ // Open the Zip stream
+ // Use a File if we can, and an already open zip is even better
+ ZipFile zipFile = null;
+ ZipInputStream zipStream = null;
+ if (stream instanceof TikaInputStream) {
+ TikaInputStream tis = (TikaInputStream) stream;
+ Object container = ((TikaInputStream) stream).getOpenContainer();
+ if (container instanceof ZipFile) {
+ zipFile = (ZipFile) container;
+ } else if (tis.hasFile()) {
+ zipFile = new ZipFile(tis.getFile());
+ } else {
+ zipStream = new ZipInputStream(stream);
+ }
+ } else {
+ zipStream = new ZipInputStream(stream);
+ }
+
+ // Prepare to handle the content
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);
+
+ // As we don't know which of the metadata or the content
+ // we'll hit first, catch the endDocument call initially
+ EndDocumentShieldingContentHandler handler =
+ new EndDocumentShieldingContentHandler(xhtml);
+
+ // If we can, process the metadata first, then the
+ // rest of the file afterwards
+ // Only possible to guarantee that when opened from a file not a stream
+ ZipEntry entry = null;
+ if (zipFile != null) {
+ entry = zipFile.getEntry(META_NAME);
+ handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
+
+ Enumeration<? extends ZipEntry> entries = zipFile.entries();
+ while (entries.hasMoreElements()) {
+ entry = entries.nextElement();
+ if (!META_NAME.equals(entry.getName())) {
+ handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
+ }
+ }
+ zipFile.close();
+ } else {
+ do {
+ entry = zipStream.getNextEntry();
+ handleZipEntry(entry, zipStream, metadata, context, handler);
+ } while (entry != null);
+ zipStream.close();
+ }
+
+ // Only now call the end document
+ if (handler.getEndDocumentWasCalled()) {
+ handler.reallyEndDocument();
+ }
+ }
+
+ private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
+ ParseContext context, EndDocumentShieldingContentHandler handler)
+ throws IOException, SAXException, TikaException {
+ if (entry == null) return;
+
+ if (entry.getName().equals("mimetype")) {
+ String type = IOUtils.toString(zip, UTF_8);
+ metadata.set(Metadata.CONTENT_TYPE, type);
+ } else if (entry.getName().equals(META_NAME)) {
+ meta.parse(zip, new DefaultHandler(), metadata, context);
+ } else if (entry.getName().endsWith("content.xml")) {
+ if (content instanceof OpenDocumentContentParser) {
+ ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
+ } else {
+ // Foreign content parser was set:
+ content.parse(zip, handler, metadata, context);
+ }
+ } else if (entry.getName().endsWith("styles.xml")) {
+ if (content instanceof OpenDocumentContentParser) {
+ ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
+ } else {
+ // Foreign content parser was set:
+ content.parse(zip, handler, metadata, context);
+ }
+ }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.opendocument;
+
+import org.apache.tika.parser.odf.OpenDocumentParser;
+
+/**
+ * OpenOffice parser
+ *
+ * @deprecated Use the {@link OpenDocumentParser} class instead.
+ * This class will be removed in Apache Tika 1.0.
+ */
+public class OpenOfficeParser extends OpenDocumentParser {
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/GroupState.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/GroupState.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/GroupState.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/GroupState.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.rtf;
+
+import java.nio.charset.Charset;
+
/*
 * Holds all state associated with the current RTF group, i.e. the
 * settings in force between a matching pair of braces { ... }.
 */

class GroupState {
    public int depth;
    public boolean bold;
    public boolean italic;
    // True when all text in the current group should be skipped,
    // e.g. when the group leads with a \*:
    public boolean ignore;
    // Defaults to 1 until a uc control word has been seen:
    public int ucSkip = 1;
    public int list;
    public int listLevel;
    public Charset fontCharset;
    // inside an objdata group
    public boolean objdata;
    // depth within a pict group; 1 = at the pict level itself
    public int pictDepth;
    // inside a picprop key/value pair
    public boolean sp;
    // inside a picprop's name
    public boolean sn;
    // inside a picprop's value
    public boolean sv;
    // inside an embedded object
    public boolean object;

    /** Creates the default (root) group state. */
    public GroupState() {
    }

    /**
     * Creates a child state one level deeper than {@code parent},
     * inheriting its inheritable properties. The picprop and object
     * flags (sp, sn, sv, object) are deliberately not inherited.
     */
    public GroupState(GroupState parent) {
        depth = parent.depth + 1;
        bold = parent.bold;
        italic = parent.italic;
        ignore = parent.ignore;
        ucSkip = parent.ucSkip;
        list = parent.list;
        listLevel = parent.listLevel;
        fontCharset = parent.fontCharset;
        pictDepth = (parent.pictDepth > 0) ? parent.pictDepth + 1 : 0;
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.rtf;
+
/**
 * Contains the information for a single list in the RTF list or
 * list-override tables.
 */
public class ListDescriptor {
    public final static int NUMBER_TYPE_BULLET = 23;

    public int id;
    // Recorded but not currently used:
    public int templateID;
    // Recorded but not currently used:
    public boolean isStyle;
    public int[] numberType = new int[9];

    /**
     * Returns true when the given list level uses bullet numbering,
     * i.e. the list is unordered at that level.
     */
    public boolean isUnordered(int level) {
        return NUMBER_TYPE_BULLET == numberType[level];
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,287 @@
+package org.apache.tika.parser.rtf;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.io.FilenameUtils;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.RTFMetadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * This class buffers data from embedded objects and pictures.
+ * <p/>
+ * <p/>
+ * <p/>
+ * When the parser has finished an object or picture and called
+ * {@link #handleCompletedObject()}, this will write the object
+ * to the {@link #handler}.
+ * <p/>
+ * <p/>
+ * <p/>
+ * This (in combination with TextExtractor) expects basically a flat parse. It will pull out
+ * all pict whether they are tied to objdata or are intended
+ * to be standalone.
+ * <p/>
+ * <p/>
+ * This tries to pull metadata around a pict that is encoded
+ * with {sp {sn} {sv}} types of data. This information
+ * sometimes contains the name and even full file path of the original file.
+ */
+class RTFEmbObjHandler {
+
+ private static final String EMPTY_STRING = "";
+ private final ContentHandler handler;
+
+
+ private final ParseContext context;
+ private final ByteArrayOutputStream os;
+ //high hex cached for writing hexpair chars (data)
+ private int hi = -1;
+ private int thumbCount = 0;
+ //don't need atomic, do need mutable
+ private AtomicInteger unknownFilenameCount = new AtomicInteger();
+ private boolean inObject = false;
+ private String sv = EMPTY_STRING;
+ private String sn = EMPTY_STRING;
+ private StringBuilder sb = new StringBuilder();
+ private Metadata metadata;
+ private EMB_STATE state = EMB_STATE.NADA;
+ protected RTFEmbObjHandler(ContentHandler handler, Metadata metadata, ParseContext context) {
+ this.handler = handler;
+ this.context = context;
+ os = new ByteArrayOutputStream();
+ }
+
+ protected void startPict() {
+ state = EMB_STATE.PICT;
+ metadata = new Metadata();
+ }
+
+ protected void startObjData() {
+ state = EMB_STATE.OBJDATA;
+ metadata = new Metadata();
+ }
+
+ protected void startSN() {
+ sb.setLength(0);
+ sb.append(RTFMetadata.RTF_PICT_META_PREFIX);
+ }
+
+ protected void endSN() {
+ sn = sb.toString();
+ }
+
+ protected void startSV() {
+ sb.setLength(0);
+ }
+
+ protected void endSV() {
+ sv = sb.toString();
+ }
+
+ //end metadata pair
+ protected void endSP() {
+ metadata.add(sn, sv);
+ }
+
+ protected boolean getInObject() {
+ return inObject;
+ }
+
+ protected void setInObject(boolean v) {
+ inObject = v;
+ }
+
+ protected void writeMetadataChar(char c) {
+ sb.append(c);
+ }
+
+ protected void writeHexChar(int b) throws IOException, TikaException {
+ //if not hexchar, ignore
+ //white space is common
+ if (TextExtractor.isHexChar(b)) {
+ if (hi == -1) {
+ hi = 16 * TextExtractor.hexValue(b);
+ } else {
+ long sum = hi + TextExtractor.hexValue(b);
+ if (sum > Integer.MAX_VALUE || sum < 0) {
+ throw new IOException("hex char to byte overflow");
+ }
+
+ os.write((int) sum);
+
+ hi = -1;
+ }
+ return;
+ }
+ if (b == -1) {
+ throw new TikaException("hit end of stream before finishing byte pair");
+ }
+ }
+
+ protected void writeBytes(InputStream is, int len) throws IOException, TikaException {
+ if (len < 0 || len > RTFParser.getMaxBytesForEmbeddedObject()) {
+ throw new IOException("length of bytes to read out of bounds: " + len);
+ }
+
+ byte[] bytes = new byte[len];
+ int bytesRead = is.read(bytes);
+ if (bytesRead < len) {
+ throw new TikaException("unexpected end of file: need " + len +
+ " bytes of binary data, found " + (len - bytesRead));
+ }
+ os.write(bytes);
+ }
+
+ /**
+ * Call this when the objdata/pict has completed
+ *
+ * @throws IOException
+ * @throws SAXException
+ * @throws TikaException
+ */
+ protected void handleCompletedObject() throws IOException, SAXException, TikaException {
+ EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class);
+
+ if (embeddedExtractor == null) {
+ embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
+ }
+
+ byte[] bytes = os.toByteArray();
+ if (state == EMB_STATE.OBJDATA) {
+ RTFObjDataParser objParser = new RTFObjDataParser();
+ try {
+ byte[] objBytes = objParser.parse(bytes, metadata, unknownFilenameCount);
+ extractObj(objBytes, handler, embeddedExtractor, metadata);
+ } catch (IOException e) {
+ //swallow. If anything goes wrong, ignore.
+ }
+ } else if (state == EMB_STATE.PICT) {
+ String filePath = metadata.get(RTFMetadata.RTF_PICT_META_PREFIX + "wzDescription");
+ if (filePath != null && filePath.length() > 0) {
+ metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filePath);
+ metadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(filePath));
+ }
+ metadata.set(RTFMetadata.THUMBNAIL, Boolean.toString(inObject));
+ extractObj(bytes, handler, embeddedExtractor, metadata);
+
+ } else if (state == EMB_STATE.NADA) {
+ //swallow...no start for pict or embed?!
+ }
+ reset();
+ }
+
+ private void extractObj(byte[] bytes, ContentHandler handler,
+ EmbeddedDocumentExtractor embeddedExtractor, Metadata metadata)
+ throws SAXException, IOException, TikaException {
+
+ if (bytes == null) {
+ return;
+ }
+
+ metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(bytes.length));
+
+ if (embeddedExtractor.shouldParseEmbedded(metadata)) {
+ TikaInputStream stream = TikaInputStream.get(bytes);
+ if (metadata.get(Metadata.RESOURCE_NAME_KEY) == null) {
+ String extension = getExtension(stream, metadata);
+ stream.reset();
+ if (inObject && state == EMB_STATE.PICT) {
+ metadata.set(Metadata.RESOURCE_NAME_KEY, "thumbnail_" + thumbCount++ + extension);
+ metadata.set(RTFMetadata.THUMBNAIL, "true");
+ } else {
+ metadata.set(Metadata.RESOURCE_NAME_KEY, "file_" + unknownFilenameCount.getAndIncrement() +
+ extension);
+ }
+ }
+ try {
+ embeddedExtractor.parseEmbedded(
+ stream,
+ new EmbeddedContentHandler(handler),
+ metadata, false);
+ } finally {
+ stream.close();
+ }
+ }
+ }
+
+ private String getExtension(TikaInputStream is, Metadata metadata) {
+ String cType = metadata.get(Metadata.CONTENT_TYPE);
+ TikaConfig config = getConfig();
+ if (cType == null) {
+ Detector detector = config.getDetector();
+ try {
+ MediaType mediaType = detector.detect(is, metadata);
+ MimeTypes types = config.getMimeRepository();
+ MimeType mime = types.forName(mediaType.toString());
+ metadata.set(Metadata.CONTENT_TYPE, mediaType.getSubtype());
+ return mime.getExtension();
+ } catch (IOException e) {
+ //swallow
+ } catch (MimeTypeException e) {
+
+ }
+ }
+ return ".bin";
+ }
+
+ private TikaConfig getConfig() {
+ TikaConfig config = context.get(TikaConfig.class);
+ if (config == null) {
+ config = TikaConfig.getDefaultConfig();
+ }
+ return config;
+ }
+
+ /**
+ * reset state after each object.
+ * Do not reset unknown file number.
+ */
+ protected void reset() {
+ state = EMB_STATE.NADA;
+ os.reset();
+ metadata = new Metadata();
+ hi = -1;
+ sv = EMPTY_STRING;
+ sn = EMPTY_STRING;
+ sb.setLength(0);
+ }
+
+ private enum EMB_STATE {
+ PICT, //recording pict data
+ OBJDATA, //recording objdata
+ NADA
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,315 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.tika.parser.rtf;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Locale;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.io.FilenameUtils;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.Ole10Native;
+import org.apache.poi.poifs.filesystem.Ole10NativeException;
+import org.apache.poi.util.IOUtils;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.RTFMetadata;
+import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
+
+/**
+ * Many thanks to Simon Mourier for:
+ * http://stackoverflow.com/questions/14779647/extract-embedded-image-object-in-rtf
+ * and for granting permission to use his code in Tika.
+ */
+class RTFObjDataParser {
+
+ private final static int[] INT_LE_POWS = new int[]{
+ 1, 256, 65536, 16777216
+ };
+
+ private final static String WIN_ASCII = "WINDOWS-1252";
+
+ /**
+ * Parses the embedded object/pict string
+ *
+ * @param bytes actual bytes (already converted from the
+ * hex pair string stored in the embedded object data into actual bytes or read
+ * as raw binary bytes)
+ * @return a SimpleRTFEmbObj or null
+ * @throws IOException if there are any surprise surprises during parsing
+ */
+
+ /**
+ * @param bytes
+ * @param metadata incoming metadata
+ * @param unknownFilenameCount
+ * @return byte[] for contents of obj data
+ * @throws IOException
+ */
+ protected byte[] parse(byte[] bytes, Metadata metadata, AtomicInteger unknownFilenameCount)
+ throws IOException {
+ ByteArrayInputStream is = new ByteArrayInputStream(bytes);
+ long version = readUInt(is);
+ metadata.add(RTFMetadata.EMB_APP_VERSION, Long.toString(version));
+
+ long formatId = readUInt(is);
+ //2 is an embedded object. 1 is a link.
+ if (formatId != 2L) {
+ return null;
+ }
+ String className = readLengthPrefixedAnsiString(is).trim();
+ String topicName = readLengthPrefixedAnsiString(is).trim();
+ String itemName = readLengthPrefixedAnsiString(is).trim();
+
+ if (className != null && className.length() > 0) {
+ metadata.add(RTFMetadata.EMB_CLASS, className);
+ }
+ if (topicName != null && topicName.length() > 0) {
+ metadata.add(RTFMetadata.EMB_TOPIC, topicName);
+ }
+ if (itemName != null && itemName.length() > 0) {
+ metadata.add(RTFMetadata.EMB_ITEM, itemName);
+ }
+
+ long dataSz = readUInt(is);
+
+ //readBytes tests for reading too many bytes
+ byte[] embObjBytes = readBytes(is, dataSz);
+
+ if (className.toLowerCase(Locale.ROOT).equals("package")) {
+ return handlePackage(embObjBytes, metadata);
+ } else if (className.toLowerCase(Locale.ROOT).equals("pbrush")) {
+ //simple bitmap bytes
+ return embObjBytes;
+ } else {
+ ByteArrayInputStream embIs = new ByteArrayInputStream(embObjBytes);
+ if (NPOIFSFileSystem.hasPOIFSHeader(embIs)) {
+ try {
+ return handleEmbeddedPOIFS(embIs, metadata, unknownFilenameCount);
+ } catch (IOException e) {
+ //swallow
+ }
+ }
+ }
+ return embObjBytes;
+ }
+
+
+ //will throw IOException if not actually POIFS
+ //can return null byte[]
+ private byte[] handleEmbeddedPOIFS(InputStream is, Metadata metadata,
+ AtomicInteger unknownFilenameCount)
+ throws IOException {
+
+ byte[] ret = null;
+ try (NPOIFSFileSystem fs = new NPOIFSFileSystem(is)) {
+
+ DirectoryNode root = fs.getRoot();
+
+ if (root == null) {
+ return ret;
+ }
+
+ if (root.hasEntry("Package")) {
+ Entry ooxml = root.getEntry("Package");
+ TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml));
+
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+
+ IOUtils.copy(stream, out);
+ ret = out.toByteArray();
+ } else {
+ //try poifs
+ POIFSDocumentType type = POIFSDocumentType.detectType(root);
+ if (type == POIFSDocumentType.OLE10_NATIVE) {
+ try {
+ // Try to un-wrap the OLE10Native record:
+ Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root);
+ ret = ole.getDataBuffer();
+ } catch (Ole10NativeException ex) {
+ // Not a valid OLE10Native record, skip it
+ }
+ } else if (type == POIFSDocumentType.COMP_OBJ) {
+
+ DocumentEntry contentsEntry;
+ try {
+ contentsEntry = (DocumentEntry) root.getEntry("CONTENTS");
+ } catch (FileNotFoundException ioe) {
+ contentsEntry = (DocumentEntry) root.getEntry("Contents");
+ }
+
+ try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) {
+ ret = new byte[contentsEntry.getSize()];
+ inp.readFully(ret);
+ }
+ } else {
+
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ is.reset();
+ IOUtils.copy(is, out);
+ ret = out.toByteArray();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, "file_" + unknownFilenameCount.getAndIncrement() + "." + type.getExtension());
+ metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
+ }
+ }
+ }
+ return ret;
+ }
+
+
+ /**
+ * can return null if there is a linked object
+ * instead of an embedded file
+ */
+ private byte[] handlePackage(byte[] pkgBytes, Metadata metadata) throws IOException {
+ //now parse the package header
+ ByteArrayInputStream is = new ByteArrayInputStream(pkgBytes);
+ readUShort(is);
+
+ String displayName = readAnsiString(is);
+
+ //should we add this to the metadata?
+ readAnsiString(is); //iconFilePath
+ readUShort(is); //iconIndex
+ int type = readUShort(is); //type
+
+ //1 is link, 3 is embedded object
+ //this only handles embedded objects
+ if (type != 3) {
+ return null;
+ }
+ //should we really be ignoring this filePathLen?
+ readUInt(is); //filePathLen
+
+ String ansiFilePath = readAnsiString(is); //filePath
+ long bytesLen = readUInt(is);
+ byte[] objBytes = initByteArray(bytesLen);
+ is.read(objBytes);
+ StringBuilder unicodeFilePath = new StringBuilder();
+
+ try {
+ long unicodeLen = readUInt(is);
+
+ for (int i = 0; i < unicodeLen; i++) {
+ int lo = is.read();
+ int hi = is.read();
+ int sum = lo + 256 * hi;
+ if (hi == -1 || lo == -1) {
+ //stream ran out; empty SB and stop
+ unicodeFilePath.setLength(0);
+ break;
+ }
+ unicodeFilePath.append((char) sum);
+ }
+ } catch (IOException e) {
+ //swallow; the unicode file path is optional and might not happen
+ unicodeFilePath.setLength(0);
+ }
+ String fileNameToUse = "";
+ String pathToUse = "";
+ if (unicodeFilePath.length() > 0) {
+ String p = unicodeFilePath.toString();
+ fileNameToUse = p;
+ pathToUse = p;
+ } else {
+ fileNameToUse = displayName == null ? "" : displayName;
+ pathToUse = ansiFilePath == null ? "" : ansiFilePath;
+ }
+ metadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(fileNameToUse));
+ metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, pathToUse);
+
+ return objBytes;
+ }
+
+
+ private int readUShort(InputStream is) throws IOException {
+ int lo = is.read();
+ int hi = is.read() * 256;
+ if (lo == -1 || hi == -1) {
+ throw new IOException("Hit end of stream before reading little endian unsigned short.");
+ }
+ return hi + lo;
+ }
+
+ private long readUInt(InputStream is) throws IOException {
+ long sum = 0;
+ for (int i = 0; i < 4; i++) {
+ int v = is.read();
+ if (v == -1) {
+ throw new IOException("Hit end of stream before finishing little endian unsigned int.");
+ }
+ sum += v * (long) INT_LE_POWS[i];
+ }
+ return sum;
+ }
+
+ private String readAnsiString(InputStream is) throws IOException {
+ StringBuilder sb = new StringBuilder();
+ int c = is.read();
+ while (c > 0) {
+ sb.append((char) c);
+ c = is.read();
+ }
+ if (c == -1) {
+ throw new IOException("Hit end of stream before end of AnsiString");
+ }
+ return sb.toString();
+ }
+
+ private String readLengthPrefixedAnsiString(InputStream is) throws IOException {
+ long len = readUInt(is);
+ byte[] bytes = readBytes(is, len);
+ try {
+ return new String(bytes, WIN_ASCII);
+ } catch (UnsupportedEncodingException e) {
+ //shouldn't ever happen
+ throw new IOException("Unsupported encoding");
+ }
+ }
+
+
+ private byte[] readBytes(InputStream is, long len) throws IOException {
+ //initByteArray tests for "reading of too many bytes"
+ byte[] bytes = initByteArray(len);
+ int read = is.read(bytes);
+ if (read != len) {
+ throw new IOException("Hit end of stream before reading all bytes");
+ }
+
+ return bytes;
+ }
+
+ private byte[] initByteArray(long len) throws IOException {
+ if (len < 0 || len > RTFParser.getMaxBytesForEmbeddedObject()) {
+ throw new IOException("Requested length for reading bytes is out of bounds: " + len);
+ }
+ return new byte[(int) len];
+
+ }
+}
+
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/rtf/RTFParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.rtf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.io.input.TaggedInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * RTF parser
+ */
+public class RTFParser extends AbstractParser {
+
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = -4165069489372320313L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.singleton(MediaType.application("rtf"));
+ /**
+ * maximum number of bytes per embedded object/pict (default: 20MB)
+ */
+ private static int EMB_OBJ_MAX_BYTES = 20 * 1024 * 1024; //20MB
+
+ /**
+ * See {@link #setMaxBytesForEmbeddedObject(int)}.
+ *
+ * @return maximum number of bytes allowed for an embedded object.
+ */
+ public static int getMaxBytesForEmbeddedObject() {
+ return EMB_OBJ_MAX_BYTES;
+ }
+
+ /**
+ * Bytes for embedded objects are currently cached in memory.
+ * If something goes wrong during the parsing of an embedded object,
+ * it is possible that a read length may be crazily too long
+ * and cause a heap crash.
+ *
+ * @param max maximum number of bytes to allow for embedded objects. If
+ * the embedded object has more than this number of bytes, skip it.
+ */
+ public static void setMaxBytesForEmbeddedObject(int max) {
+ EMB_OBJ_MAX_BYTES = max;
+ }
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ metadata.set(Metadata.CONTENT_TYPE, "application/rtf");
+ TaggedInputStream tagged = new TaggedInputStream(stream);
+ try {
+ XHTMLContentHandler xhtmlHandler = new XHTMLContentHandler(handler, metadata);
+ RTFEmbObjHandler embObjHandler = new RTFEmbObjHandler(xhtmlHandler, metadata, context);
+ final TextExtractor ert = new TextExtractor(xhtmlHandler, metadata, embObjHandler);
+ ert.extract(stream);
+ } catch (IOException e) {
+ tagged.throwIfCauseOf(e);
+ throw new TikaException("Error parsing an RTF document", e);
+ }
+ }
+}