You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/06 04:50:57 UTC
svn commit: r1723223 [30/32] - in /tika/branches/2.x:
tika-core/src/test/resources/META-INF/
tika-core/src/test/resources/META-INF/services/ tika-parser-modules/
tika-parser-modules/tika-advanced-module/
tika-parser-modules/tika-advanced-module/src/ ti...
Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,808 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iptc;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Collections;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Set;
+import java.util.TimeZone;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
 * Parser for IPTC ANPA news wire feeds (ANPA-1312 and several agency
 * variants: AP, Reuters, Bloomberg, New York Times).
 */
public class IptcAnpaParser implements Parser {
    /** Serial version UID */
    private static final long serialVersionUID = -6062820170212879115L;

    /** The single media type this parser advertises: text/vnd.iptc.anpa. */
    private static final MediaType TYPE =
            MediaType.text("vnd.iptc.anpa");

    /** Immutable singleton set returned by {@link #getSupportedTypes}. */
    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.singleton(TYPE);
+
    /** Returns the single supported media type (text/vnd.iptc.anpa). */
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+
+ HashMap<String,String> properties = this.loadProperties(stream);
+ this.setMetadata(metadata, properties);
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ // TODO: put body content here
+ xhtml.startElement("p");
+ String body = clean(properties.get("body"));
+ if (body != null)
+ xhtml.characters(body);
+ xhtml.endElement("p");
+ xhtml.endDocument();
+ }
+
+ /**
+ * @deprecated This method will be removed in Apache Tika 1.0.
+ */
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata)
+ throws IOException, SAXException, TikaException {
+ parse(stream, handler, metadata, new ParseContext());
+ }
+
+
    // Known wire-feed format codes.  NOTE(review): these are conceptually
    // constants and would normally be static final; left as instance fields to
    // keep this change documentation-only.
    private int FMT_ANPA_1312   = 0x00;  // "NAA 89-3 (ANPA 1312)"
    private int FMT_ANPA_UPI    = 0x01;  // "United Press International ANPA 1312 variant"
    private int FMT_ANPA_UPI_DL = 0x02;  // "United Press International Down-Load Message"
    private int FMT_IPTC_7901   = 0x03;  // "IPTC7901 Recommended Message Format"
    private int FMT_IPTC_PHOTO  = 0x04;  // "IPTC-NAA Digital Newsphoto Parameter Record"
    private int FMT_IPTC_CHAR   = 0x05;  // "IPTC Unstructured Character Oriented File Format (UCOFF)"
    private int FMT_NITF        = 0x06;  // "News Industry Text Format (NITF)"
    private int FMT_NITF_TT     = 0x07;  // "Tidningarnas Telegrambyra NITF version (TTNITF DTD)"
    private int FMT_NITF_RB     = 0x08;  // "Ritzaus Bureau NITF version (RBNITF DTD)"
    private int FMT_IPTC_AP     = 0x09;  // "Associated Press news wire format"
    private int FMT_IPTC_BLM    = 0x0A;  // "Bloomberg News news wire format"
    private int FMT_IPTC_NYT    = 0x0B;  // "New York Times news wire format"
    private int FMT_IPTC_RTR    = 0x0C;  // "Reuters news wire format"

    // Detected format of the message currently being parsed.
    private int FORMAT = FMT_ANPA_1312;  // assume the default format to be ANPA-1312

    // ANPA transmission framing control characters.
    private final static char SOH = 0x01;  // start of header (ctrl-a)
    private final static char STX = 0x02;  // start of text (ctrl-b)
    private final static char ETX = 0x03;  // end of text (ctrl-c)
    private final static char EOT = 0x04;  // end of transmission (ctrl-d)
    private final static char SYN = 0x16;  // synchronous idle (ctrl-v)

    private final static char BS = 0x08;   // the backspace character (used for diacriticals)
    private final static char TB = 0x09;   // the tab character
    private final static char LF = 0x0A;   // line feed
    private final static char FF = 0x0C;   // form feed
    private final static char CR = 0x0D;   // carriage return
    private final static char XQ = 0x11;   // device control (ctrl-q)
    private final static char XS = 0x13;   // device control (ctrl-s)
    private final static char FS = 0x1F;   // a field delimiter

    private final static char HY = 0x2D;   // hyphen
    private final static char SP = 0x20;   // the blank space
    private final static char LT = 0x3C;   // less than
    private final static char EQ = 0x3D;   // equals sign
    private final static char CT = 0x5E;   // carat

    // Windows-1252 "smart quote" code points, normalized to ASCII by clean().
    private final static char SL = 0x91;   // single-quote left
    private final static char SR = 0x92;   // single-quote right
    private final static char DL = 0x93;   // double-quote left
    private final static char DR = 0x94;   // double-quote right
+
+
    /**
     * Scans the news message and stores the metadata and data into a map.
     * Sniffs the agency format first, then walks the stream section by
     * section (residual, header, body, footer), delegating each to its
     * parse helper.
     *
     * @param is the raw wire-feed stream
     * @return map of extracted properties (body, title, author, dates, ...)
     */
    private HashMap<String,String> loadProperties(InputStream is) {

        HashMap<String,String> properties = new HashMap<String,String>();

        // detect the wire agency before pulling the stream apart
        FORMAT = this.scanFormat(is);

        // consume any leftover residue from a preceding message; the bytes
        // themselves are intentionally discarded
        byte[] residual = this.getSection(is,"residual");

        byte[] header = this.getSection(is,"header");
        parseHeader(header, properties);

        byte[] body = this.getSection(is,"body");
        parseBody(body, properties);

        byte[] footer = this.getSection(is,"footer");
        parseFooter(footer, properties);

        return (properties);
    }
+
+
+ private int scanFormat(InputStream is) {
+ int format = this.FORMAT;
+ int maxsize = 524288; // 512K
+
+ byte[] buf = new byte[maxsize];
+ try {
+ if (is.markSupported()) {
+ is.mark(maxsize);
+ }
+ int msgsize = is.read(buf); // read in at least the full data
+
+ String message = (new String(buf, UTF_8)).toLowerCase(Locale.ROOT);
+ // these are not if-then-else, because we want to go from most common
+ // and fall through to least. this is imperfect, as these tags could
+ // show up in other agency stories, but i can't find a spec or any
+ // explicit codes to identify the wire source in the message itself
+
+ if (message.contains("ap-wf")) {
+ format = this.FMT_IPTC_AP;
+ }
+ if (message.contains("reuters")) {
+ format = this.FMT_IPTC_RTR;
+ }
+ if (message.contains("new york times")) {
+ format = this.FMT_IPTC_NYT;
+ }
+ if (message.contains("bloomberg news")) {
+ format = this.FMT_IPTC_BLM;
+ }
+ }
+ catch (IOException eio) {
+ // we are in an unstable state
+ }
+
+ try {
+ if (is.markSupported()) {
+ is.reset();
+ }
+ }
+ catch (IOException eio) {
+ // we are in an unstable state
+ }
+ return (format);
+ }
+
+
    /** Overrides the detected wire format with one of the FMT_* codes. */
    private void setFormat(int format) {
        this.FORMAT = format;
    }
+
+
+ private String getFormatName() {
+
+ String name = "";
+
+ if (FORMAT == this.FMT_IPTC_AP) {
+ name = "Associated Press";
+ }
+
+ else if(FORMAT == this.FMT_IPTC_BLM) {
+ name = "Bloomberg";
+ }
+
+ else if(FORMAT == this.FMT_IPTC_NYT) {
+ name = "New York Times";
+ }
+
+ else if(FORMAT == this.FMT_IPTC_RTR) {
+ name = "Reuters";
+ }
+
+ return (name);
+ }
+
+
+ private byte[] getSection(InputStream is, String name) {
+
+ byte[] value = new byte[0];
+
+ if (name.equals("residual")) {
+ // the header shouldn't be more than 1k, but just being generous here
+ int maxsize = 8192; // 8K
+ byte bstart = SYN; // check for SYN [0x16 : ctrl-v] (may have leftover residue from preceding message)
+ byte bfinish = SOH; // check for SOH [0x01 : ctrl-a] (typically follows a pair of SYN [0x16 : ctrl-v])
+ value = getSection(is, maxsize, bstart, bfinish, true);
+ }
+
+ else if(name.equals("header")) {
+ // the header shouldn't be more than 1k, but just being generous here
+ int maxsize = 8192; // 8K
+ byte bstart = SOH; // check for SOH [0x01 : ctrl-a] (typically follows a pair of SYN [0x16 : ctrl-v])
+ byte bfinish = STX; // check for STX [0x02 : ctrl-b] (marks end of header, beginning of message)
+ value = getSection(is, maxsize, bstart, bfinish, true);
+ }
+
+ else if (name.equals("body")) {
+ // the message shouldn't be more than 16k (?), leaving plenty of space
+ int maxsize = 524288; // 512K
+ byte bstart = STX; // check for STX [0x02 : ctrl-b] (marks end of header, beginning of message)
+ byte bfinish = ETX; // check for ETX [0x03 : ctrl-c] (marks end of message, beginning of footer)
+ value = getSection(is, maxsize, bstart, bfinish, true);
+ }
+
+ else if (name.equals("footer")) {
+ // the footer shouldn't be more than 1k , leaving plenty of space
+ int maxsize = 8192; // 8K
+ byte bstart = ETX; // check for ETX [0x03 : ctrl-c] (marks end of message, beginning of footer)
+ byte bfinish = EOT; // check for EOT [0x04 : ctrl-d] (marks end of transmission)
+ value = getSection(is, maxsize, bstart, bfinish, true);
+ }
+
+ return (value);
+ }
+
+
+ private byte[] getSection(InputStream is, int maxsize, byte bstart, byte bfinish, boolean ifincomplete) {
+ byte[] value = new byte[0];
+
+ try {
+ boolean started = false; // check if we have found the start flag
+ boolean finished = false; // check if we have found the finish flag
+ int read = 0; // the number of bytes we read
+ int start = 0; // the position after the start flag
+
+ // TODO: this only pulls back 8K of data on a read, regardless of buffer size
+ // more nefariously, it caps at a total 8K, through all sections
+ int streammax = is.available();
+ maxsize = Math.min(maxsize, streammax);
+
+ is.mark(maxsize);
+ byte[] buf = new byte[maxsize];
+ int totsize = 0;
+ int remainder = maxsize - totsize;
+ while (remainder > 0) {
+ int msgsize = is.read(buf, maxsize-remainder, maxsize); // read in at least the full data
+ if (msgsize == -1) {
+ remainder = msgsize = 0;
+ }
+ remainder -= msgsize;
+ totsize += msgsize;
+ }
+
+ // scan through the provided input stream
+ for (read=0; read < totsize; read++) {
+ byte b = buf[read];
+
+ if (!started) {
+ started = (b == bstart);
+ start = read + 1;
+ continue;
+ }
+
+ if (finished = (b == bfinish)) {
+/*
+ is.reset();
+ long skipped = is.skip((long)read);
+ if (skipped != read) {
+ // we are in an unstable state
+ }
+ is.mark(1);
+ */
+ break;
+ }
+
+ // load from the stream until we run out of characters, or hit the termination byte
+ continue;
+ }
+
+ // move the input stream back to where it was initially
+ is.reset();
+
+ if (finished) {
+ // now, we want to reset the stream to be sitting right on top of the finish marker
+ is.skip(read);
+ value = new byte[read-start];
+ System.arraycopy(buf, start, value, 0, read-start);
+ }
+ else {
+ if (ifincomplete && started) {
+ // the caller wants anything that was read, and we finished the stream or buffer
+ value = new byte[read-start];
+ System.arraycopy(buf, start, value, 0, read-start);
+ }
+ }
+ }
+ catch (IOException eio) {
+ // something invalid occurred, return an empty string
+ }
+
+ return (value);
+ }
+
+
    /**
     * Parses the header (envelope) section: service id, category, subject
     * heading, and the date/time stamp.
     *
     * NOTE(review): the parsed values are currently discarded — nothing is
     * written into {@code properties} (see the comment near the end), and
     * {@code env_urgency}/{@code hdr_edcode} are never populated.  The return
     * value only reports whether anything at all was recognized.
     *
     * @param value      the raw header bytes (between SOH and STX)
     * @param properties output map (currently unused by this method)
     * @return true if any envelope/header fields were recognized
     */
    private boolean parseHeader(byte[] value, HashMap<String,String> properties) {
        boolean added = false;

        String env_serviceid = "";
        String env_category = "";
        String env_urgency = "";   // never populated; reserved for future use
        String hdr_edcode = "";    // never populated; reserved for future use
        String hdr_subject = "";
        String hdr_date = "";
        String hdr_time = "";

        int read = 0;  // cursor into value; shared by all the loops below

        while (read < value.length) {

            // pull apart the envelope, getting the service id (....\x1f)
            while (read < value.length) {
                byte val_next = value[read++];
                if (val_next != FS) {
                    env_serviceid += (char)(val_next & 0xff); // convert the byte to an unsigned int
                }
                else {
                    break;
                }
            }

            // pull apart the envelope, getting the category (....\x13\x11)
            while (read < value.length) {
                byte val_next = value[read++];
                if (val_next != XS) { // the end of the envelope is marked (\x13)
                    env_category += (char)(val_next & 0xff); // convert the byte to an unsigned int
                }
                else {
                    // NOTE(review): if XS is the very last byte, this indexes
                    // past the end of the array — confirm headers always carry
                    // a trailing \x11 byte
                    val_next = value[read]; // get the remaining byte (\x11)
                    if (val_next == XQ) {
                        read++;
                    }
                    break;
                }
            }

            // pull apart the envelope, getting the subject heading
            while (read < value.length) {
                boolean subject = true;
                byte val_next = value[read++];
                while ((subject) && (val_next != SP) && (val_next != 0x00)) { // ignore the envelope subject
                    hdr_subject += (char)(val_next & 0xff); // convert the byte to an unsigned int
                    val_next = (read < value.length) ? value[read++] : 0x00;
                    while (val_next == SP) { // consume all the spaces
                        subject = false;
                        val_next = (read < value.length) ? value[read++] : 0x00;
                        if (val_next != SP) {
                            --read; // otherwise we eat into the next section
                        }
                    }
                }
                if (!subject) {
                    break;
                }
            }

            // pull apart the envelope, getting the date and time
            while (read < value.length) {
                byte val_next = value[read++];
                if (hdr_date.length() == 0) {
                    // first numeric run is the date
                    while (((val_next >= (byte)0x30) && (val_next <= (byte)0x39)) // consume all numerics and hyphens
                            || (val_next == HY)) {
                        hdr_date += (char)(val_next & 0xff); // convert the byte to an unsigned int
                        val_next = (read < value.length) ? value[read++] : 0x00;
                    }
                }
                else if (val_next == SP) {
                    while (val_next == SP) { // consume all the spaces
                        val_next = (read < value.length) ? value[read++] : 0x00;
                    }
                    continue;
                }
                else {
                    // second numeric run is the time
                    while (((val_next >= (byte)0x30) && (val_next <= (byte)0x39)) // consume all numerics and hyphens
                            || (val_next == HY)) {
                        hdr_time += (char)(val_next & 0xff); // convert the byte to an unsigned int
                        val_next = (read < value.length) ? value[read++] : 0x00;
                    }
                }
            }
            break; // don't let this run back through and start thrashing metadata
        }

        // if we were saving any of these values, we would set the properties map here

        added = (env_serviceid.length() + env_category.length() + hdr_subject.length() +
                hdr_date.length() + hdr_time.length()) > 0;
        return added;
    }
+
    /**
     * Parses the body section (between STX and ETX): extracts the heading,
     * title, author/source metadata lines, and the story text, storing them
     * into {@code properties} under the keys "body", "title", "subject",
     * "author", and "source".
     *
     * The cursor logic is order-sensitive: each sub-loop consumes exactly one
     * logical line and may push a synthetic '^' marker back into the array to
     * normalize agency variants (Reuters/Bloomberg) that omit it.
     *
     * NOTE(review): string concatenation in these loops is O(n^2); a
     * StringBuilder would be faster, left unchanged to keep this
     * documentation-only.
     *
     * @param value      the raw body bytes
     * @param properties output map, receives body/title/subject/author/source
     * @return true if any body fields were recognized
     */
    private boolean parseBody(byte[] value, HashMap<String,String> properties) {
        boolean added = false;

        String bdy_heading = "";
        String bdy_title = "";
        String bdy_source = "";
        String bdy_author = "";
        String bdy_body = "";

        int read = 0;         // cursor into value; shared by all sub-loops
        boolean done = false;

        while (!done && (read < value.length)) {

            // pull apart the body, getting the heading (^....\x0d\x0a)
            while (read < value.length) {
                byte val_next = value[read++];
                if (val_next == CT) { // start of a new section , first is the heading
                    val_next = (read < value.length) ? value[read++] : 0x00;
                    // AP, NYT, and Bloomberg end with < , Reuters with EOL
                    while ((val_next != LT) && (val_next != CR) && (val_next != LF)) { // less than delimiter (\x3c) and not EOL
                        bdy_heading += (char)(val_next & 0xff); // convert the byte to an unsigned int
                        val_next = (read < value.length) ? value[read++] : 0x00;
                        if (read > value.length) { break; } // shouldn't ever hit this, but save a NPE
                    }
                    if (val_next == LT) {
                        // hit the delimiter, carry on
                        val_next = (read < value.length) ? value[read++] : 0x00;
                    }
                    while (bdy_heading.length() > 0 && ((val_next == CR) || (val_next == LF))) {
                        val_next = (read < value.length) ? value[read++] : 0x00; // skip the new lines
                        if ((val_next != CR) && (val_next != LF)) {
                            --read;
                        }
                    }
                }
                else {
                    // this will only be hit on poorly-formed files

                    // for reuters, the heading does not start with the ^, so we push one back into the stream
                    if (FORMAT == this.FMT_IPTC_RTR) {
                        if (val_next != CT) {
                            // for any non-whitespace, we need to go back an additional step to not destroy the data
                            if ((val_next != SP) && (val_next != TB) && (val_next != CR) && (val_next != LF)) {
                                // if the very first byte is data, we have to shift the whole array, and stuff in a carat
                                if (read == 1) {
                                    byte[] resize = new byte[value.length + 1];
                                    System.arraycopy(value, 0, resize, 1, value.length);
                                    value = resize;
                                }
                            }
                            value[--read] = CT;
                            continue;
                        }
                    }
                }
                break;
            }

            // pull apart the body, getting the title (^....\x0d\x0a)
            while (read < value.length) {
                byte val_next = value[read++];
                if (val_next == CT) { // start of a new section , first is the heading
                    val_next = (read < value.length) ? value[read++] : 0x00;
                    // AP, NYT, and Bloomberg end with < , Reuters with EOL
                    while ((val_next != LT) && (val_next != CT) && (val_next != CR) && (val_next != LF)) { // less than delimiter (\x3c), or carat (\x5e) and not EOL
                        bdy_title += (char)(val_next & 0xff); // convert the byte to an unsigned int
                        val_next = (read < value.length) ? value[read++] : 0x00;
                        if (read > value.length) { break; } // shouldn't ever hit this, but save a NPE
                    }

                    if (val_next == CT) { // start of a new section , when first didn't finish cleanly
                        --read;
                    }

                    if (val_next == LT) {
                        // hit the delimiter, carry on
                        val_next = (read < value.length) ? value[read++] : 0x00;
                    }

                    while (bdy_title.length() > 0 && ((val_next == CR) || (val_next == LF))) {
                        val_next = (read < value.length) ? value[read++] : 0x00; // skip the new lines
                        if ((val_next != CR) && (val_next != LF)) {
                            --read;
                        }
                    }
                }
                else {
                    // this will only be hit on poorly-formed files

                    // for bloomberg, the title does not start with the ^, so we push one back into the stream
                    if (FORMAT == this.FMT_IPTC_BLM) {
                        if (val_next == TB) {
                            value[--read] = CT;
                            continue;
                        }
                    }

                    // for reuters, the title does not start with the ^, so we push one back into the stream
                    if (FORMAT == this.FMT_IPTC_RTR) {
                        if (val_next != CT) {
                            // for any non-whitespace, we need to go back an additional step to not destroy the data
                            if ((val_next != SP) && (val_next != TB) && (val_next != CR) && (val_next != LF)) {
                                --read;
                            }
                            value[--read] = CT;
                            continue;
                        }
                    }
                }
                break;
            }


            // at this point, we have a variable number of metadata lines, with various orders
            // we scan the start of each line for the special character, and run to the end character
            boolean metastarted = false;  // once true, stray lines go to the body instead of the title
            String longline = "";         // carries a key across a line continuation
            String longkey = "";
            while (read < value.length) {
                byte val_next = value[read++];

                // eat up whitespace before committing to the next section
                if ((val_next == SP) || (val_next == TB) || (val_next == CR) || (val_next == LF)) {
                    continue;
                }

                if (val_next == CT) { // start of a new section , could be authors, sources, etc
                    val_next = (read < value.length) ? value[read++] : 0x00;
                    String tmp_line = "";
                    while ((val_next != LT) && (val_next != CT) && (val_next != CR) && (val_next != LF) && (val_next != 0)) {
                        // less than delimiter (\x3c), maybe also badly formed with just new line
                        tmp_line += (char)(val_next & 0xff); // convert the byte to an unsigned int
                        val_next = (read < value.length) ? value[read++] : 0x00;
                        if (read > value.length) { break; } // shouldn't ever hit this, but save a NPE
                    }

                    if (val_next == CT) { // start of a new section , when first didn't finish cleanly
                        --read;
                    }

                    if (val_next == LT) {
                        // hit the delimiter, carry on
                        val_next = (read < value.length) ? value[read++] : 0x00;
                    }

                    while ((val_next == CR) || (val_next == LF)) {
                        val_next = (read < value.length) ? value[read++] : 0x00; // skip the new lines
                        if ((val_next != CR) && (val_next != LF)) {
                            --read;
                        }
                    }
                    if (tmp_line.toLowerCase(Locale.ROOT).startsWith("by") || longline.equals("bdy_author")) {
                        longkey = "bdy_author";

                        // prepend a space to subsequent line, so it gets parsed consistent with the lead line
                        tmp_line = (longline.equals(longkey) ? " " : "") + tmp_line;

                        // we have an author candidate; trim at the first delimiter
                        int term = tmp_line.length();
                        term = Math.min(term, (tmp_line.contains("<") ? tmp_line.indexOf("<") : term));
                        term = Math.min(term, (tmp_line.contains("=") ? tmp_line.indexOf("=") : term));
                        term = Math.min(term, (tmp_line.contains("\n") ? tmp_line.indexOf("\n") : term));
                        term = (term > 0 ) ? term : tmp_line.length();
                        bdy_author += tmp_line.substring(tmp_line.indexOf(" "), term);
                        metastarted = true;
                        longline = ((tmp_line.contains("=")) && (!longline.equals(longkey)) ? longkey : "");
                    }
                    else if (FORMAT == this.FMT_IPTC_BLM) {
                        String byline = " by ";
                        if (tmp_line.toLowerCase(Locale.ROOT).contains(byline)) {
                            longkey = "bdy_author";

                            int term = tmp_line.length();
                            term = Math.min(term, (tmp_line.contains("<") ? tmp_line.indexOf("<") : term));
                            term = Math.min(term, (tmp_line.contains("=") ? tmp_line.indexOf("=") : term));
                            term = Math.min(term, (tmp_line.contains("\n") ? tmp_line.indexOf("\n") : term));
                            term = (term > 0 ) ? term : tmp_line.length();
                            // for bloomberg, the author line sits below their copyright statement
                            bdy_author += tmp_line.substring(tmp_line.toLowerCase(Locale.ROOT).indexOf(byline) + byline.length(), term) + " ";
                            metastarted = true;
                            longline = ((tmp_line.contains("=")) && (!longline.equals(longkey)) ? longkey : "");
                        }
                        else if(tmp_line.toLowerCase(Locale.ROOT).startsWith("c.")) {
                            // the author line for bloomberg is a multiline starting with c.2011 Bloomberg News
                            // then containing the author info on the next line
                            if (val_next == TB) {
                                value[--read] = CT;
                                continue;
                            }
                        }
                        else if(tmp_line.toLowerCase(Locale.ROOT).trim().startsWith("(") && tmp_line.toLowerCase(Locale.ROOT).trim().endsWith(")")) {
                            // the author line may have one or more comment lines between the copyright
                            // statement, and the By AUTHORNAME line
                            if (val_next == TB) {
                                value[--read] = CT;
                                continue;
                            }
                        }
                    }

                    else if (tmp_line.toLowerCase(Locale.ROOT).startsWith("eds") || longline.equals("bdy_source")) {
                        longkey = "bdy_source";
                        // prepend a space to subsequent line, so it gets parsed consistent with the lead line
                        tmp_line = (longline.equals(longkey) ? " " : "") + tmp_line;

                        // we have a source candidate; trim at the first delimiter
                        int term = tmp_line.length();
                        term = Math.min(term, (tmp_line.contains("<") ? tmp_line.indexOf("<") : term));
                        term = Math.min(term, (tmp_line.contains("=") ? tmp_line.indexOf("=") : term));
//                        term = Math.min(term, (tmp_line.indexOf("\n") > -1 ? tmp_line.indexOf("\n") : term));
                        term = (term > 0 ) ? term : tmp_line.length();
                        bdy_source += tmp_line.substring(tmp_line.indexOf(" ") + 1, term) + " ";
                        metastarted = true;
                        longline = (!longline.equals(longkey) ? longkey : "");
                    }
                    else {
                        // this has fallen all the way through. trap it as part of the subject,
                        // rather than just losing it
                        if (!metastarted) {
                            bdy_title += " , " + tmp_line; // not sure where else to put this but in the title
                        }
                        else {
                            // what to do with stuff that is metadata, which falls after metadata lines started?
                            bdy_body += " " + tmp_line + " , "; // not sure where else to put this but in the title
                        }
                    }
                }
                else { // we're on to the main body
                    while ((read < value.length) && (val_next != 0)) {
                        // read until the train runs out of tracks
                        bdy_body += (char)(val_next & 0xff); // convert the byte to an unsigned int
                        val_next = (read < value.length) ? value[read++] : 0x00;
                        if (read > value.length) { break; } // shouldn't ever hit this, but save a NPE
                    }

                }
                // we would normally break here, but just let this read out to the end
            }
            done = true; // don't let this run back through and start thrashing metadata
        }
        properties.put("body", bdy_body);
        properties.put("title", bdy_title);
        properties.put("subject", bdy_heading);
        properties.put("author", bdy_author);
        properties.put("source", bdy_source);

        added = (bdy_body.length() + bdy_title.length() + bdy_heading.length() + bdy_author.length() +
                bdy_source.length()) > 0;
        return added;
    }
+
+
+ private boolean parseFooter(byte[] value, HashMap<String,String> properties) {
+ boolean added = false;
+
+ String ftr_source = "";
+ String ftr_datetime = "";
+
+ int read = 0;
+ boolean done = false;
+
+ while (!done && (read < value.length)) {
+
+ // pull apart the footer, getting the news feed source (^....\x0d\x0a)
+ byte val_next = value[read++];
+ byte val_peek = (read < value.length) ? value[read+1] : 0x00; // skip the new lines
+
+ while (((val_next < (byte)0x30) || (val_next > (byte)0x39)) && (val_next != 0)) { // consume all non-numerics first
+ ftr_source += (char)(val_next & 0xff); // convert the byte to an unsigned int
+ val_next = (read < value.length) ? value[read] : 0x00; // attempt to read until end of stream
+ read++;
+ if (read > value.length) { break; } // shouldn't ever hit this, but save a NPE
+ }
+
+ while ((val_next != LT) && (val_next != CR) && (val_next != LF) && (val_next != 0)) { // get as much timedate as possible
+ // this is an american format, so arrives as mm-dd-yy HHiizzz
+ ftr_datetime += (char)(val_next & 0xff); // convert the byte to an unsigned int
+ val_next = (read < value.length) ? value[read++] : 0x00; // skip the new lines
+ if (read > value.length) { break; } // shouldn't ever hit this, but save a NPE
+ }
+ if (val_next == LT) {
+ // hit the delimiter, carry on
+ val_next = (read < value.length) ? value[read++] : 0x00;
+ }
+
+ if (ftr_datetime.length() > 0) {
+ // we want to pass this back in a more friendly format
+ String format_out = "yyyy-MM-dd'T'HH:mm:ss'Z'";
+ Date dateunix = new Date();
+ try {
+ // standard ap format
+ String format_in = "MM-dd-yy HHmmzzz";
+
+ if (FORMAT == this.FMT_IPTC_RTR) {
+ // standard reuters format
+ format_in = "HH:mm MM-dd-yy";
+ }
+ SimpleDateFormat dfi = new SimpleDateFormat(format_in, Locale.ROOT);
+ dfi.setTimeZone(TimeZone.getTimeZone("UTC"));
+ dateunix = dfi.parse(ftr_datetime);
+ }
+ catch (ParseException ep) {
+ // failed, but this will just fall through to setting the date to now
+ }
+ SimpleDateFormat dfo = new SimpleDateFormat(format_out, Locale.ROOT);
+ dfo.setTimeZone(TimeZone.getTimeZone("UTC"));
+ ftr_datetime = dfo.format(dateunix);
+ }
+ while ((val_next == CR) || (val_next == LF)) {
+ val_next = (read < value.length) ? value[read++] : 0x00; // skip the new lines
+ if ((val_next != CR) && (val_next != LF)) {
+ --read;
+ }
+ }
+ done = true; // don't let this run back through and start thrashing metadata
+ }
+
+ properties.put("publisher", ftr_source);
+ properties.put("created", ftr_datetime);
+ properties.put("modified", ftr_datetime);
+
+ added = (ftr_source.length() + ftr_datetime.length()) > 0;
+ return added;
+ }
+
+
    /**
     * Copies the parsed properties into the Tika metadata object.
     *
     * The publisher is set from the detected wire-format agency name via
     * {@link #getFormatName()}, not from the footer-derived "publisher"
     * property that parseFooter stores.
     *
     * @param metadata   the Tika metadata object to populate
     * @param properties the map produced by loadProperties()
     */
    private void setMetadata(Metadata metadata, HashMap<String,String> properties) {

        // every property that gets set must be non-null, or it will cause NPE
        // in other consuming applications, like Lucene
        metadata.set(Metadata.CONTENT_TYPE, clean("text/anpa-1312"));
        metadata.set(TikaCoreProperties.TITLE, clean(properties.get("title")));
        metadata.set(TikaCoreProperties.KEYWORDS, clean(properties.get("subject")));
        metadata.set(TikaCoreProperties.CREATOR, clean(properties.get("author")));
        metadata.set(TikaCoreProperties.CREATED, clean(properties.get("created")));
        metadata.set(TikaCoreProperties.MODIFIED, clean(properties.get("modified")));
        metadata.set(TikaCoreProperties.SOURCE, clean(properties.get("source")));
        metadata.set(TikaCoreProperties.PUBLISHER, clean(this.getFormatName()));
    }
+
+ private String clean(String value) {
+ if (value == null) {
+ value = "";
+ }
+
+ value = value.replaceAll("``", "`");
+ value = value.replaceAll("''", "'");
+ value = value.replaceAll(new String(new char[] {SL}), "'");
+ value = value.replaceAll(new String(new char[] {SR}), "'");
+ value = value.replaceAll(new String(new char[] {DL}), "\"");
+ value = value.replaceAll(new String(new char[] {DR}), "\"");
+ value = value.trim();
+
+ return (value);
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,274 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mail;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.james.mime4j.MimeException;
+import org.apache.james.mime4j.codec.DecodeMonitor;
+import org.apache.james.mime4j.codec.DecoderUtil;
+import org.apache.james.mime4j.dom.address.Address;
+import org.apache.james.mime4j.dom.address.AddressList;
+import org.apache.james.mime4j.dom.address.Mailbox;
+import org.apache.james.mime4j.dom.address.MailboxList;
+import org.apache.james.mime4j.dom.field.AddressListField;
+import org.apache.james.mime4j.dom.field.DateTimeField;
+import org.apache.james.mime4j.dom.field.MailboxListField;
+import org.apache.james.mime4j.dom.field.ParsedField;
+import org.apache.james.mime4j.dom.field.UnstructuredField;
+import org.apache.james.mime4j.field.LenientFieldParser;
+import org.apache.james.mime4j.parser.ContentHandler;
+import org.apache.james.mime4j.stream.BodyDescriptor;
+import org.apache.james.mime4j.stream.Field;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Bridge between mime4j's content handler and the generic Sax content handler
+ * used by Tika. See
+ * http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/ContentHandler.html
+ */
+class MailContentHandler implements ContentHandler {
+
+ private boolean strictParsing = false;
+
+ private XHTMLContentHandler handler;
+ private Metadata metadata;
+ private EmbeddedDocumentExtractor extractor;
+
+ private boolean inPart = false;
+
+ MailContentHandler(XHTMLContentHandler xhtml, Metadata metadata, ParseContext context, boolean strictParsing) {
+ this.handler = xhtml;
+ this.metadata = metadata;
+ this.strictParsing = strictParsing;
+
+ // Fetch / Build an EmbeddedDocumentExtractor with which
+ // to handle/process the parts/attachments
+
+ // Was an EmbeddedDocumentExtractor explicitly supplied?
+ this.extractor = context.get(EmbeddedDocumentExtractor.class);
+
+ // If there's no EmbeddedDocumentExtractor, then try using a normal parser
+ // This will ensure that the contents are made available to the user, so
+ // the see the text, but without fine-grained control/extraction
+ // (This also maintains backward compatibility with older versions!)
+ if (this.extractor == null) {
+ // If the user gave a parser, use that, if not the default
+ Parser parser = context.get(AutoDetectParser.class);
+ if (parser == null) {
+ parser = context.get(Parser.class);
+ }
+ if (parser == null) {
+ TikaConfig tikaConfig = context.get(TikaConfig.class);
+ if (tikaConfig == null) {
+ tikaConfig = TikaConfig.getDefaultConfig();
+ }
+ parser = new AutoDetectParser(tikaConfig.getParser());
+ }
+ ParseContext ctx = new ParseContext();
+ ctx.set(Parser.class, parser);
+ extractor = new ParsingEmbeddedDocumentExtractor(ctx);
+ }
+ }
+
+ public void body(BodyDescriptor body, InputStream is) throws MimeException,
+ IOException {
+ // use a different metadata object
+ // in order to specify the mime type of the
+ // sub part without damaging the main metadata
+
+ Metadata submd = new Metadata();
+ submd.set(Metadata.CONTENT_TYPE, body.getMimeType());
+ submd.set(Metadata.CONTENT_ENCODING, body.getCharset());
+
+ try {
+ if (extractor.shouldParseEmbedded(submd)) {
+ extractor.parseEmbedded(is, handler, submd, false);
+ }
+ } catch (SAXException e) {
+ throw new MimeException(e);
+ }
+ }
+
+ public void endBodyPart() throws MimeException {
+ try {
+ handler.endElement("p");
+ handler.endElement("div");
+ } catch (SAXException e) {
+ throw new MimeException(e);
+ }
+ }
+
+ public void endHeader() throws MimeException {
+ }
+
+ public void startMessage() throws MimeException {
+ try {
+ handler.startDocument();
+ } catch (SAXException e) {
+ throw new MimeException(e);
+ }
+ }
+
+ public void endMessage() throws MimeException {
+ try {
+ handler.endDocument();
+ } catch (SAXException e) {
+ throw new MimeException(e);
+ }
+ }
+
+ public void endMultipart() throws MimeException {
+ inPart = false;
+ }
+
+ public void epilogue(InputStream is) throws MimeException, IOException {
+ }
+
+ /**
+ * Header for the whole message or its parts
+ *
+ * @see http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/
+ * Field.html
+ */
+ public void field(Field field) throws MimeException {
+ // inPart indicates whether these metadata correspond to the
+ // whole message or its parts
+ if (inPart) {
+ return;
+ }
+
+ try {
+ String fieldname = field.getName();
+ ParsedField parsedField = LenientFieldParser.getParser().parse(
+ field, DecodeMonitor.SILENT);
+ if (fieldname.equalsIgnoreCase("From")) {
+ MailboxListField fromField = (MailboxListField) parsedField;
+ MailboxList mailboxList = fromField.getMailboxList();
+ if (fromField.isValidField() && mailboxList != null) {
+ for (Address address : mailboxList) {
+ String from = getDisplayString(address);
+ metadata.add(Metadata.MESSAGE_FROM, from);
+ metadata.add(TikaCoreProperties.CREATOR, from);
+ }
+ } else {
+ String from = stripOutFieldPrefix(field, "From:");
+ if (from.startsWith("<")) {
+ from = from.substring(1);
+ }
+ if (from.endsWith(">")) {
+ from = from.substring(0, from.length() - 1);
+ }
+ metadata.add(Metadata.MESSAGE_FROM, from);
+ metadata.add(TikaCoreProperties.CREATOR, from);
+ }
+ } else if (fieldname.equalsIgnoreCase("Subject")) {
+ metadata.add(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE,
+ ((UnstructuredField) parsedField).getValue());
+ } else if (fieldname.equalsIgnoreCase("To")) {
+ processAddressList(parsedField, "To:", Metadata.MESSAGE_TO);
+ } else if (fieldname.equalsIgnoreCase("CC")) {
+ processAddressList(parsedField, "Cc:", Metadata.MESSAGE_CC);
+ } else if (fieldname.equalsIgnoreCase("BCC")) {
+ processAddressList(parsedField, "Bcc:", Metadata.MESSAGE_BCC);
+ } else if (fieldname.equalsIgnoreCase("Date")) {
+ DateTimeField dateField = (DateTimeField) parsedField;
+ metadata.set(TikaCoreProperties.CREATED, dateField.getDate());
+ }
+ } catch (RuntimeException me) {
+ if (strictParsing) {
+ throw me;
+ }
+ }
+ }
+
+ private void processAddressList(ParsedField field, String addressListType,
+ String metadataField) throws MimeException {
+ AddressListField toField = (AddressListField) field;
+ if (toField.isValidField()) {
+ AddressList addressList = toField.getAddressList();
+ for (Address address : addressList) {
+ metadata.add(metadataField, getDisplayString(address));
+ }
+ } else {
+ String to = stripOutFieldPrefix(field,
+ addressListType);
+ for (String eachTo : to.split(",")) {
+ metadata.add(metadataField, eachTo.trim());
+ }
+ }
+ }
+
+ private String getDisplayString(Address address) {
+ if (address instanceof Mailbox) {
+ Mailbox mailbox = (Mailbox) address;
+ String name = mailbox.getName();
+ if (name != null && name.length() > 0) {
+ name = DecoderUtil.decodeEncodedWords(name, DecodeMonitor.SILENT);
+ return name + " <" + mailbox.getAddress() + ">";
+ } else {
+ return mailbox.getAddress();
+ }
+ } else {
+ return address.toString();
+ }
+ }
+
+ public void preamble(InputStream is) throws MimeException, IOException {
+ }
+
+ public void raw(InputStream is) throws MimeException, IOException {
+ }
+
+ public void startBodyPart() throws MimeException {
+ try {
+ handler.startElement("div", "class", "email-entry");
+ handler.startElement("p");
+ } catch (SAXException e) {
+ throw new MimeException(e);
+ }
+ }
+
+ public void startHeader() throws MimeException {
+ // TODO Auto-generated method stub
+
+ }
+
+ public void startMultipart(BodyDescriptor descr) throws MimeException {
+ inPart = true;
+ }
+
+ private String stripOutFieldPrefix(Field field, String fieldname) {
+ String temp = field.getRaw().toString();
+ int loc = fieldname.length();
+ while (temp.charAt(loc) == ' ') {
+ loc++;
+ }
+ return temp.substring(loc);
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mail;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.james.mime4j.MimeException;
+import org.apache.james.mime4j.parser.MimeStreamParser;
+import org.apache.james.mime4j.stream.MimeConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
/**
 * Uses apache-mime4j to parse emails. Each part is treated with the
 * corresponding parser and displayed within elements.
 * <p/>
 * A {@link MimeConfig} object can be passed in the parsing context
 * to better control the parsing process.
 *
 * @author jnioche@digitalpebble.com
 */
public class RFC822Parser extends AbstractParser {
    /**
     * Serial version UID
     */
    private static final long serialVersionUID = -5504243905998074168L;

    private static final Set<MediaType> SUPPORTED_TYPES = Collections
            .singleton(MediaType.parse("message/rfc822"));

    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Parses an RFC 822 message: mime4j walks the MIME structure and a
     * {@link MailContentHandler} maps headers to metadata and delegates each
     * body part to the embedded document extractor.
     *
     * @throws TikaException on a parse failure not caused by the underlying stream
     */
    public void parse(InputStream stream, ContentHandler handler,
                      Metadata metadata, ParseContext context) throws IOException,
            SAXException, TikaException {
        // Get the mime4j configuration, or use a default one
        MimeConfig config = new MimeConfig();
        config.setMaxLineLen(100000);
        config.setMaxHeaderLen(100000); // max length of any individual header
        // a MimeConfig supplied in the context overrides the defaults above
        config = context.get(MimeConfig.class, config);

        MimeStreamParser parser = new MimeStreamParser(config);
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);

        MailContentHandler mch = new MailContentHandler(
                xhtml, metadata, context, config.isStrictParsing());
        parser.setContentHandler(mch);
        // decode base64 / quoted-printable bodies before they reach the handler
        parser.setContentDecoding(true);

        TikaInputStream tstream = TikaInputStream.get(stream);
        try {
            parser.parse(tstream);
        } catch (IOException e) {
            // rethrow as-is if the failure came from the underlying stream
            tstream.throwIfCauseOf(e);
            throw new TikaException("Failed to parse an email message", e);
        } catch (MimeException e) {
            // Unwrap the exception in case it was not thrown by mime4j
            Throwable cause = e.getCause();
            if (cause instanceof TikaException) {
                throw (TikaException) cause;
            } else if (cause instanceof SAXException) {
                throw (SAXException) cause;
            } else {
                throw new TikaException("Failed to parse an email message", e);
            }
        }
    }

}
Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,209 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Collections;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Queue;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Mbox (mailbox) parser. This version extracts each mail from Mbox and uses the
+ * DelegatingParser to process each mail.
+ */
+public class MboxParser extends AbstractParser {
+
+ public static final String MBOX_MIME_TYPE = "application/mbox";
+ public static final String MBOX_RECORD_DIVIDER = "From ";
+ public static final int MAIL_MAX_SIZE = 50000000;
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = -1762689436731160661L;
+ private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("mbox"));
+ private static final Pattern EMAIL_HEADER_PATTERN = Pattern.compile("([^ ]+):[ \t]*(.*)");
+ private static final Pattern EMAIL_ADDRESS_PATTERN = Pattern.compile("<(.*@.*)>");
+
+ private static final String EMAIL_HEADER_METADATA_PREFIX = "MboxParser-";
+ private static final String EMAIL_FROMLINE_METADATA = EMAIL_HEADER_METADATA_PREFIX + "from";
+ private final Map<Integer, Metadata> trackingMetadata = new HashMap<Integer, Metadata>();
+ private boolean tracking = false;
+
+ public static Date parseDate(String headerContent) throws ParseException {
+ SimpleDateFormat dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US);
+ return dateFormat.parse(headerContent);
+ }
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, TikaException, SAXException {
+
+ EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class,
+ new ParsingEmbeddedDocumentExtractor(context));
+
+ String charsetName = "windows-1252";
+
+ metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
+ metadata.set(Metadata.CONTENT_ENCODING, charsetName);
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+ InputStreamReader isr = new InputStreamReader(stream, charsetName);
+ try (BufferedReader reader = new BufferedReader(isr)) {
+ String curLine = reader.readLine();
+ int mailItem = 0;
+ do {
+ if (curLine.startsWith(MBOX_RECORD_DIVIDER)) {
+ Metadata mailMetadata = new Metadata();
+ Queue<String> multiline = new LinkedList<String>();
+ mailMetadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
+ mailMetadata.set(Metadata.CONTENT_TYPE, "message/rfc822");
+ curLine = reader.readLine();
+
+ ByteArrayOutputStream message = new ByteArrayOutputStream(100000);
+ do {
+ if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
+ String latestLine = multiline.poll();
+ latestLine += " " + curLine.trim();
+ multiline.add(latestLine);
+ } else {
+ multiline.add(curLine);
+ }
+
+ message.write(curLine.getBytes(charsetName));
+ message.write(0x0A);
+ curLine = reader.readLine();
+ }
+ while (curLine != null && !curLine.startsWith(MBOX_RECORD_DIVIDER) && message.size() < MAIL_MAX_SIZE);
+
+ for (String item : multiline) {
+ saveHeaderInMetadata(mailMetadata, item);
+ }
+
+ ByteArrayInputStream messageStream = new ByteArrayInputStream(message.toByteArray());
+ message = null;
+
+ if (extractor.shouldParseEmbedded(mailMetadata)) {
+ extractor.parseEmbedded(messageStream, xhtml, mailMetadata, true);
+ }
+
+ if (tracking) {
+ getTrackingMetadata().put(mailItem++, mailMetadata);
+ }
+ } else {
+ curLine = reader.readLine();
+ }
+
+ } while (curLine != null && !Thread.currentThread().isInterrupted());
+ }
+
+ xhtml.endDocument();
+ }
+
+ public boolean isTracking() {
+ return tracking;
+ }
+
+ public void setTracking(boolean tracking) {
+ this.tracking = tracking;
+ }
+
+ public Map<Integer, Metadata> getTrackingMetadata() {
+ return trackingMetadata;
+ }
+
+ private void saveHeaderInMetadata(Metadata metadata, String curLine) {
+ Matcher headerMatcher = EMAIL_HEADER_PATTERN.matcher(curLine);
+ if (!headerMatcher.matches()) {
+ return; // ignore malformed header lines
+ }
+
+ String headerTag = headerMatcher.group(1).toLowerCase(Locale.ROOT);
+ String headerContent = headerMatcher.group(2);
+
+ if (headerTag.equalsIgnoreCase("From")) {
+ metadata.set(TikaCoreProperties.CREATOR, headerContent);
+ } else if (headerTag.equalsIgnoreCase("To") || headerTag.equalsIgnoreCase("Cc")
+ || headerTag.equalsIgnoreCase("Bcc")) {
+ Matcher address = EMAIL_ADDRESS_PATTERN.matcher(headerContent);
+ if (address.find()) {
+ metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, address.group(1));
+ } else if (headerContent.indexOf('@') > -1) {
+ metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, headerContent);
+ }
+
+ String property = Metadata.MESSAGE_TO;
+ if (headerTag.equalsIgnoreCase("Cc")) {
+ property = Metadata.MESSAGE_CC;
+ } else if (headerTag.equalsIgnoreCase("Bcc")) {
+ property = Metadata.MESSAGE_BCC;
+ }
+ metadata.add(property, headerContent);
+ } else if (headerTag.equalsIgnoreCase("Subject")) {
+ metadata.add(Metadata.SUBJECT, headerContent);
+ } else if (headerTag.equalsIgnoreCase("Date")) {
+ try {
+ Date date = parseDate(headerContent);
+ metadata.set(TikaCoreProperties.CREATED, date);
+ } catch (ParseException e) {
+ // ignoring date because format was not understood
+ }
+ } else if (headerTag.equalsIgnoreCase("Message-Id")) {
+ metadata.set(TikaCoreProperties.IDENTIFIER, headerContent);
+ } else if (headerTag.equalsIgnoreCase("In-Reply-To")) {
+ metadata.set(TikaCoreProperties.RELATION, headerContent);
+ } else if (headerTag.equalsIgnoreCase("Content-Type")) {
+ // TODO - key off content-type in headers to
+ // set mapping to use for content and convert if necessary.
+
+ metadata.add(Metadata.CONTENT_TYPE, headerContent);
+ metadata.set(TikaCoreProperties.FORMAT, headerContent);
+ } else {
+ metadata.add(EMAIL_HEADER_METADATA_PREFIX + headerTag, headerContent);
+ }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,203 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import static java.lang.String.valueOf;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static java.util.Collections.singleton;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Set;
+
+import com.pff.PSTAttachment;
+import com.pff.PSTFile;
+import com.pff.PSTFolder;
+import com.pff.PSTMessage;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Parser for MS Outlook PST email storage files
+ */
+public class OutlookPSTParser extends AbstractParser {
+
+ private static final long serialVersionUID = 620998217748364063L;
+
+ public static final MediaType MS_OUTLOOK_PST_MIMETYPE = MediaType.application("vnd.ms-outlook-pst");
+ private static final Set<MediaType> SUPPORTED_TYPES = singleton(MS_OUTLOOK_PST_MIMETYPE);
+
+ private static AttributesImpl createAttribute(String attName, String attValue) {
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", attName, attName, "CDATA", attValue);
+ return attributes;
+ }
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+
+ // Use the delegate parser to parse the contained document
+ EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class,
+ new ParsingEmbeddedDocumentExtractor(context));
+
+ metadata.set(Metadata.CONTENT_TYPE, MS_OUTLOOK_PST_MIMETYPE.toString());
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+ TikaInputStream in = TikaInputStream.get(stream);
+ PSTFile pstFile = null;
+ try {
+ pstFile = new PSTFile(in.getFile().getPath());
+ metadata.set(Metadata.CONTENT_LENGTH, valueOf(pstFile.getFileHandle().length()));
+ boolean isValid = pstFile.getFileHandle().getFD().valid();
+ metadata.set("isValid", valueOf(isValid));
+ if (isValid) {
+ parseFolder(xhtml, pstFile.getRootFolder(), embeddedExtractor);
+ }
+ } catch (Exception e) {
+ throw new TikaException(e.getMessage(), e);
+ } finally {
+ if (pstFile != null && pstFile.getFileHandle() != null) {
+ try {
+ pstFile.getFileHandle().close();
+ } catch (IOException e) {
+ //swallow closing exception
+ }
+ }
+ }
+
+ xhtml.endDocument();
+ }
+
+ private void parseFolder(XHTMLContentHandler handler, PSTFolder pstFolder, EmbeddedDocumentExtractor embeddedExtractor)
+ throws Exception {
+ if (pstFolder.getContentCount() > 0) {
+ PSTMessage pstMail = (PSTMessage) pstFolder.getNextChild();
+ while (pstMail != null) {
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+ attributes.addAttribute("", "id", "id", "CDATA", pstMail.getInternetMessageId());
+ handler.startElement("div", attributes);
+ handler.element("h1", pstMail.getSubject());
+
+ parserMailItem(handler, pstMail, embeddedExtractor);
+ parseMailAttachments(handler, pstMail, embeddedExtractor);
+
+ handler.endElement("div");
+
+ pstMail = (PSTMessage) pstFolder.getNextChild();
+ }
+ }
+
+ if (pstFolder.hasSubfolders()) {
+ for (PSTFolder pstSubFolder : pstFolder.getSubFolders()) {
+ handler.startElement("div", createAttribute("class", "email-folder"));
+ handler.element("h1", pstSubFolder.getDisplayName());
+ parseFolder(handler, pstSubFolder, embeddedExtractor);
+ handler.endElement("div");
+ }
+ }
+ }
+
+ private void parserMailItem(XHTMLContentHandler handler, PSTMessage pstMail, EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException {
+ Metadata mailMetadata = new Metadata();
+ mailMetadata.set(Metadata.RESOURCE_NAME_KEY, pstMail.getInternetMessageId());
+ mailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, pstMail.getInternetMessageId());
+ mailMetadata.set(TikaCoreProperties.IDENTIFIER, pstMail.getInternetMessageId());
+ mailMetadata.set(TikaCoreProperties.TITLE, pstMail.getSubject());
+ mailMetadata.set(Metadata.MESSAGE_FROM, pstMail.getSenderName());
+ mailMetadata.set(TikaCoreProperties.CREATOR, pstMail.getSenderName());
+ mailMetadata.set(TikaCoreProperties.CREATED, pstMail.getCreationTime());
+ mailMetadata.set(TikaCoreProperties.MODIFIED, pstMail.getLastModificationTime());
+ mailMetadata.set(TikaCoreProperties.COMMENTS, pstMail.getComment());
+ mailMetadata.set("descriptorNodeId", valueOf(pstMail.getDescriptorNodeId()));
+ mailMetadata.set("senderEmailAddress", pstMail.getSenderEmailAddress());
+ mailMetadata.set("recipients", pstMail.getRecipientsString());
+ mailMetadata.set("displayTo", pstMail.getDisplayTo());
+ mailMetadata.set("displayCC", pstMail.getDisplayCC());
+ mailMetadata.set("displayBCC", pstMail.getDisplayBCC());
+ mailMetadata.set("importance", valueOf(pstMail.getImportance()));
+ mailMetadata.set("priority", valueOf(pstMail.getPriority()));
+ mailMetadata.set("flagged", valueOf(pstMail.isFlagged()));
+
+ byte[] mailContent = pstMail.getBody().getBytes(UTF_8);
+ embeddedExtractor.parseEmbedded(new ByteArrayInputStream(mailContent), handler, mailMetadata, true);
+ }
+
+ private void parseMailAttachments(XHTMLContentHandler xhtml, PSTMessage email, EmbeddedDocumentExtractor embeddedExtractor)
+ throws TikaException {
+ int numberOfAttachments = email.getNumberOfAttachments();
+ for (int i = 0; i < numberOfAttachments; i++) {
+ File tempFile = null;
+ try {
+ PSTAttachment attach = email.getAttachment(i);
+
+ // Get the filename; both long and short filenames can be used for attachments
+ String filename = attach.getLongFilename();
+ if (filename.isEmpty()) {
+ filename = attach.getFilename();
+ }
+
+ xhtml.element("p", filename);
+
+ Metadata attachMeta = new Metadata();
+ attachMeta.set(Metadata.RESOURCE_NAME_KEY, filename);
+ attachMeta.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filename);
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+ attributes.addAttribute("", "id", "id", "CDATA", filename);
+ xhtml.startElement("div", attributes);
+ if (embeddedExtractor.shouldParseEmbedded(attachMeta)) {
+ TemporaryResources tmp = new TemporaryResources();
+ try {
+ TikaInputStream tis = TikaInputStream.get(attach.getFileInputStream(), tmp);
+ embeddedExtractor.parseEmbedded(tis, xhtml, attachMeta, true);
+ } finally {
+ tmp.dispose();
+ }
+ }
+ xhtml.endElement("div");
+
+ } catch (Exception e) {
+ throw new TikaException("Unable to unpack document stream", e);
+ } finally {
+ if (tempFile != null)
+ tempFile.delete();
+ }
+ }
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector Wed Jan 6 03:50:50 2016
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+org.apache.tika.parser.html.HtmlEncodingDetector
Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Wed Jan 6 03:50:50 2016
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+org.apache.tika.parser.feed.FeedParser
+org.apache.tika.parser.html.HtmlParser
+org.apache.tika.parser.mail.RFC822Parser
+org.apache.tika.parser.mbox.MboxParser
+org.apache.tika.parser.mbox.OutlookPSTParser
+org.apache.tika.parser.iptc.IptcAnpaParser
Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/org/apache/tika/parser/ctakes/CTAKESConfig.properties
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/org/apache/tika/parser/ctakes/CTAKESConfig.properties?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/org/apache/tika/parser/ctakes/CTAKESConfig.properties (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/org/apache/tika/parser/ctakes/CTAKESConfig.properties Wed Jan 6 03:50:50 2016
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+aeDescriptorPath=/ctakes-clinical-pipeline/desc/analysis_engine/AggregatePlaintextUMLSProcessor.xml
+text=true
+annotationProps=BEGIN,END,ONTOLOGY_CONCEPT_ARR
+separatorChar=:
+metadata=Study Title,Study Description
+UMLSUser=
+UMLSPass=
Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.feed;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Unit tests for {@link FeedParser}: verifies that RSS and Atom test
+ * documents are parsed and that their title/description metadata is
+ * extracted into {@link TikaCoreProperties}.
+ */
+public class FeedParserTest {
+
+    @Test
+    public void testRSSParser() throws Exception {
+        // try-with-resources ensures the test fixture stream is closed.
+        try (InputStream input = FeedParserTest.class.getResourceAsStream(
+                "/test-documents/rsstest.rss")) {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            ParseContext context = new ParseContext();
+
+            new FeedParser().parse(input, handler, metadata, context);
+
+            String content = handler.toString();
+            // assertNotNull gives a clearer failure message than the
+            // previous assertFalse(content == null) idiom.
+            assertNotNull(content);
+
+            assertEquals("Sample RSS File for Junit test",
+                    metadata.get(TikaCoreProperties.DESCRIPTION));
+            assertEquals("TestChannel", metadata.get(TikaCoreProperties.TITLE));
+
+            // TODO find a way of testing the paragraphs and anchors
+        }
+    }
+
+    @Test
+    public void testAtomParser() throws Exception {
+        try (InputStream input = FeedParserTest.class.getResourceAsStream(
+                "/test-documents/testATOM.atom")) {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            ParseContext context = new ParseContext();
+
+            new FeedParser().parse(input, handler, metadata, context);
+
+            String content = handler.toString();
+            // See note above: prefer assertNotNull over assertFalse(x == null).
+            assertNotNull(content);
+
+            assertEquals("Sample Atom File for Junit test",
+                    metadata.get(TikaCoreProperties.DESCRIPTION));
+            assertEquals("Test Atom Feed", metadata.get(TikaCoreProperties.TITLE));
+
+            // TODO Check some more
+        }
+    }
+
+}