You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 11:11:09 UTC
[03/39] tika git commit: Convert new lines from windows to unix
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java
index 45f0388..da046aa 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java
@@ -1,43 +1,43 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import java.util.Locale;
-
-/**
- * Alternative HTML mapping rules that pass the input HTML as-is without any
- * modifications.
- *
- * @since Apache Tika 0.8
- */
-public class IdentityHtmlMapper implements HtmlMapper {
-
- public static final HtmlMapper INSTANCE = new IdentityHtmlMapper();
-
- public boolean isDiscardElement(String name) {
- return false;
- }
-
- public String mapSafeAttribute(String elementName, String attributeName) {
- return attributeName.toLowerCase(Locale.ENGLISH);
- }
-
- public String mapSafeElement(String name) {
- return name.toLowerCase(Locale.ENGLISH);
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.util.Locale;
+
+/**
+ * Alternative HTML mapping rules that pass the input HTML as-is without any
+ * modifications.
+ *
+ * @since Apache Tika 0.8
+ */
+public class IdentityHtmlMapper implements HtmlMapper {
+
+ public static final HtmlMapper INSTANCE = new IdentityHtmlMapper();
+
+ public boolean isDiscardElement(String name) {
+ return false;
+ }
+
+ public String mapSafeAttribute(String elementName, String attributeName) {
+ return attributeName.toLowerCase(Locale.ENGLISH);
+ }
+
+ public String mapSafeElement(String name) {
+ return name.toLowerCase(Locale.ENGLISH);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java
index 336ae75..221a87a 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java
@@ -1,78 +1,78 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import javax.xml.XMLConstants;
-import java.util.Locale;
-
-import org.apache.tika.sax.ContentHandlerDecorator;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-/**
- * Content handler decorator that downgrades XHTML elements to
- * old-style HTML elements before passing them on to the decorated
- * content handler. This downgrading consists of dropping all namespaces
- * (and namespaced attributes) and uppercasing all element names.
- * Used by the {@link HtmlParser} to make all incoming HTML look the same.
- */
-class XHTMLDowngradeHandler extends ContentHandlerDecorator {
-
- public XHTMLDowngradeHandler(ContentHandler handler) {
- super(handler);
- }
-
- @Override
- public void startElement(
- String uri, String localName, String name, Attributes atts)
- throws SAXException {
- String upper = localName.toUpperCase(Locale.ENGLISH);
-
- AttributesImpl attributes = new AttributesImpl();
- for (int i = 0; i < atts.getLength(); i++) {
- String auri = atts.getURI(i);
- String local = atts.getLocalName(i);
- String qname = atts.getQName(i);
- if (XMLConstants.NULL_NS_URI.equals(auri)
- && !local.equals(XMLConstants.XMLNS_ATTRIBUTE)
- && !qname.startsWith(XMLConstants.XMLNS_ATTRIBUTE + ":")) {
- attributes.addAttribute(
- auri, local, qname, atts.getType(i), atts.getValue(i));
- }
- }
-
- super.startElement(XMLConstants.NULL_NS_URI, upper, upper, attributes);
- }
-
- @Override
- public void endElement(String uri, String localName, String name)
- throws SAXException {
- String upper = localName.toUpperCase(Locale.ENGLISH);
- super.endElement(XMLConstants.NULL_NS_URI, upper, upper);
- }
-
- @Override
- public void startPrefixMapping(String prefix, String uri) {
- }
-
- @Override
- public void endPrefixMapping(String prefix) {
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import javax.xml.XMLConstants;
+import java.util.Locale;
+
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Content handler decorator that downgrades XHTML elements to
+ * old-style HTML elements before passing them on to the decorated
+ * content handler. This downgrading consists of dropping all namespaces
+ * (and namespaced attributes) and uppercasing all element names.
+ * Used by the {@link HtmlParser} to make all incoming HTML look the same.
+ */
+class XHTMLDowngradeHandler extends ContentHandlerDecorator {
+
+ public XHTMLDowngradeHandler(ContentHandler handler) {
+ super(handler);
+ }
+
+ @Override
+ public void startElement(
+ String uri, String localName, String name, Attributes atts)
+ throws SAXException {
+ String upper = localName.toUpperCase(Locale.ENGLISH);
+
+ AttributesImpl attributes = new AttributesImpl();
+ for (int i = 0; i < atts.getLength(); i++) {
+ String auri = atts.getURI(i);
+ String local = atts.getLocalName(i);
+ String qname = atts.getQName(i);
+ if (XMLConstants.NULL_NS_URI.equals(auri)
+ && !local.equals(XMLConstants.XMLNS_ATTRIBUTE)
+ && !qname.startsWith(XMLConstants.XMLNS_ATTRIBUTE + ":")) {
+ attributes.addAttribute(
+ auri, local, qname, atts.getType(i), atts.getValue(i));
+ }
+ }
+
+ super.startElement(XMLConstants.NULL_NS_URI, upper, upper, attributes);
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String name)
+ throws SAXException {
+ String upper = localName.toUpperCase(Locale.ENGLISH);
+ super.endElement(XMLConstants.NULL_NS_URI, upper, upper);
+ }
+
+ @Override
+ public void startPrefixMapping(String prefix, String uri) {
+ }
+
+ @Override
+ public void endPrefixMapping(String prefix) {
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index 9740eff..2c8942e 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -1,376 +1,376 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mail;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.text.DateFormat;
-import java.text.DateFormatSymbols;
-import java.text.ParseException;
-import java.text.SimpleDateFormat;
-import java.util.Date;
-import java.util.Locale;
-import java.util.TimeZone;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.james.mime4j.MimeException;
-import org.apache.james.mime4j.codec.DecodeMonitor;
-import org.apache.james.mime4j.codec.DecoderUtil;
-import org.apache.james.mime4j.dom.address.Address;
-import org.apache.james.mime4j.dom.address.AddressList;
-import org.apache.james.mime4j.dom.address.Mailbox;
-import org.apache.james.mime4j.dom.address.MailboxList;
-import org.apache.james.mime4j.dom.field.AddressListField;
-import org.apache.james.mime4j.dom.field.DateTimeField;
-import org.apache.james.mime4j.dom.field.MailboxListField;
-import org.apache.james.mime4j.dom.field.ParsedField;
-import org.apache.james.mime4j.dom.field.UnstructuredField;
-import org.apache.james.mime4j.field.LenientFieldParser;
-import org.apache.james.mime4j.parser.ContentHandler;
-import org.apache.james.mime4j.stream.BodyDescriptor;
-import org.apache.james.mime4j.stream.Field;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.SAXException;
-
-import static org.apache.tika.utils.DateUtils.MIDDAY;
-import static org.apache.tika.utils.DateUtils.UTC;
-
-/**
- * Bridge between mime4j's content handler and the generic Sax content handler
- * used by Tika. See
- * http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/ContentHandler.html
- */
-class MailContentHandler implements ContentHandler {
-
- //TIKA-1970 Mac Mail's format
- private static final Pattern GENERAL_TIME_ZONE_NO_MINUTES_PATTERN =
- Pattern.compile("(?:UTC|GMT)([+-])(\\d?\\d)\\Z");
-
- //find a time ending in am/pm without a space: 10:30am and
- //use this pattern to insert space: 10:30 am
- private static final Pattern AM_PM = Pattern.compile("(?i)(\\d)([ap]m)\\b");
-
- private static final DateFormat[] ALTERNATE_DATE_FORMATS = new DateFormat[] {
- //note that the string is "cleaned" before processing:
- //1) condense multiple whitespace to single space
- //2) trim()
- //3) strip out commas
- //4) insert space before am/pm
-
- //May 16 2016 1:32am
- createDateFormat("MMM dd yy hh:mm a", null),
-
- //this is a standard pattern handled by mime4j;
- //but mime4j fails with leading whitespace
- createDateFormat("EEE d MMM yy HH:mm:ss Z", UTC),
-
- createDateFormat("EEE d MMM yy HH:mm:ss z", UTC),
-
- createDateFormat("EEE d MMM yy HH:mm:ss", null),// no timezone
-
- createDateFormat("EEEEE MMM d yy hh:mm a", null),// Sunday, May 15 2016 1:32 PM
-
- //16 May 2016 at 09:30:32 GMT+1 (Mac Mail TIKA-1970)
- createDateFormat("d MMM yy 'at' HH:mm:ss z", UTC), // UTC/Zulu
-
- createDateFormat("yy-MM-dd HH:mm:ss", null),
-
- createDateFormat("MM/dd/yy hh:mm a", null, false),
-
- //now dates without times
- createDateFormat("MMM d yy", MIDDAY, false),
- createDateFormat("EEE d MMM yy", MIDDAY, false),
- createDateFormat("d MMM yy", MIDDAY, false),
- createDateFormat("yy/MM/dd", MIDDAY, false),
- createDateFormat("MM/dd/yy", MIDDAY, false)
- };
-
- private static DateFormat createDateFormat(String format, TimeZone timezone) {
- return createDateFormat(format, timezone, true);
- }
-
- private static DateFormat createDateFormat(String format, TimeZone timezone, boolean isLenient) {
- SimpleDateFormat sdf =
- new SimpleDateFormat(format, new DateFormatSymbols(Locale.US));
- if (timezone != null) {
- sdf.setTimeZone(timezone);
- }
- sdf.setLenient(isLenient);
- return sdf;
- }
-
- private boolean strictParsing = false;
-
- private XHTMLContentHandler handler;
- private Metadata metadata;
- private EmbeddedDocumentExtractor extractor;
-
- private boolean inPart = false;
-
- MailContentHandler(XHTMLContentHandler xhtml, Metadata metadata, ParseContext context, boolean strictParsing) {
- this.handler = xhtml;
- this.metadata = metadata;
- this.strictParsing = strictParsing;
-
- // Fetch / Build an EmbeddedDocumentExtractor with which
- // to handle/process the parts/attachments
-
- // Was an EmbeddedDocumentExtractor explicitly supplied?
- this.extractor = context.get(EmbeddedDocumentExtractor.class);
-
- // If there's no EmbeddedDocumentExtractor, then try using a normal parser
- // This will ensure that the contents are made available to the user, so
- // the see the text, but without fine-grained control/extraction
- // (This also maintains backward compatibility with older versions!)
- if (this.extractor == null) {
- // If the user gave a parser, use that, if not the default
- Parser parser = context.get(AutoDetectParser.class);
- if (parser == null) {
- parser = context.get(Parser.class);
- }
- if (parser == null) {
- TikaConfig tikaConfig = context.get(TikaConfig.class);
- if (tikaConfig == null) {
- tikaConfig = TikaConfig.getDefaultConfig();
- }
- parser = new AutoDetectParser(tikaConfig.getParser());
- }
- ParseContext ctx = new ParseContext();
- ctx.set(Parser.class, parser);
- extractor = new ParsingEmbeddedDocumentExtractor(ctx);
- }
- }
-
- public void body(BodyDescriptor body, InputStream is) throws MimeException,
- IOException {
- // use a different metadata object
- // in order to specify the mime type of the
- // sub part without damaging the main metadata
-
- Metadata submd = new Metadata();
- submd.set(Metadata.CONTENT_TYPE, body.getMimeType());
- submd.set(Metadata.CONTENT_ENCODING, body.getCharset());
-
- try {
- if (extractor.shouldParseEmbedded(submd)) {
- extractor.parseEmbedded(is, handler, submd, false);
- }
- } catch (SAXException e) {
- throw new MimeException(e);
- }
- }
-
- public void endBodyPart() throws MimeException {
- try {
- handler.endElement("p");
- handler.endElement("div");
- } catch (SAXException e) {
- throw new MimeException(e);
- }
- }
-
- public void endHeader() throws MimeException {
- }
-
- public void startMessage() throws MimeException {
- try {
- handler.startDocument();
- } catch (SAXException e) {
- throw new MimeException(e);
- }
- }
-
- public void endMessage() throws MimeException {
- try {
- handler.endDocument();
- } catch (SAXException e) {
- throw new MimeException(e);
- }
- }
-
- public void endMultipart() throws MimeException {
- inPart = false;
- }
-
- public void epilogue(InputStream is) throws MimeException, IOException {
- }
-
- /**
- * Header for the whole message or its parts
- *
- * @see <a href="http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/">
- * http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/</a>
- * Field.html
- */
- public void field(Field field) throws MimeException {
- // inPart indicates whether these metadata correspond to the
- // whole message or its parts
- if (inPart) {
- return;
- }
-
- try {
- String fieldname = field.getName();
- ParsedField parsedField = LenientFieldParser.getParser().parse(
- field, DecodeMonitor.SILENT);
- if (fieldname.equalsIgnoreCase("From")) {
- MailboxListField fromField = (MailboxListField) parsedField;
- MailboxList mailboxList = fromField.getMailboxList();
- if (fromField.isValidField() && mailboxList != null) {
- for (Address address : mailboxList) {
- String from = getDisplayString(address);
- metadata.add(Metadata.MESSAGE_FROM, from);
- metadata.add(TikaCoreProperties.CREATOR, from);
- }
- } else {
- String from = stripOutFieldPrefix(field, "From:");
- if (from.startsWith("<")) {
- from = from.substring(1);
- }
- if (from.endsWith(">")) {
- from = from.substring(0, from.length() - 1);
- }
- metadata.add(Metadata.MESSAGE_FROM, from);
- metadata.add(TikaCoreProperties.CREATOR, from);
- }
- } else if (fieldname.equalsIgnoreCase("Subject")) {
- metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE,
- ((UnstructuredField) parsedField).getValue());
- } else if (fieldname.equalsIgnoreCase("To")) {
- processAddressList(parsedField, "To:", Metadata.MESSAGE_TO);
- } else if (fieldname.equalsIgnoreCase("CC")) {
- processAddressList(parsedField, "Cc:", Metadata.MESSAGE_CC);
- } else if (fieldname.equalsIgnoreCase("BCC")) {
- processAddressList(parsedField, "Bcc:", Metadata.MESSAGE_BCC);
- } else if (fieldname.equalsIgnoreCase("Date")) {
- DateTimeField dateField = (DateTimeField) parsedField;
- Date date = dateField.getDate();
- if (date == null) {
- date = tryOtherDateFormats(field.getBody());
- }
- metadata.set(TikaCoreProperties.CREATED, date);
- }
- } catch (RuntimeException me) {
- if (strictParsing) {
- throw me;
- }
- }
- }
-
- private static synchronized Date tryOtherDateFormats(String text) {
- if (text == null) {
- return null;
- }
- //strip out additional spaces and trim
- text = text.replaceAll("\\s+", " ").trim();
-
- //strip out commas
- text = text.replaceAll(",", "");
- Matcher matcher = GENERAL_TIME_ZONE_NO_MINUTES_PATTERN.matcher(text);
- if (matcher.find()) {
- text = matcher.replaceFirst("GMT$1$2:00");
- }
-
- matcher = AM_PM.matcher(text);
- if (matcher.find()) {
- text = matcher.replaceFirst("$1 $2");
- }
-
- for (DateFormat format : ALTERNATE_DATE_FORMATS) {
- try {
- return format.parse(text);
- } catch (ParseException e) {
- }
- }
- return null;
- }
-
- private void processAddressList(ParsedField field, String addressListType,
- String metadataField) throws MimeException {
- AddressListField toField = (AddressListField) field;
- if (toField.isValidField()) {
- AddressList addressList = toField.getAddressList();
- for (Address address : addressList) {
- metadata.add(metadataField, getDisplayString(address));
- }
- } else {
- String to = stripOutFieldPrefix(field,
- addressListType);
- for (String eachTo : to.split(",")) {
- metadata.add(metadataField, eachTo.trim());
- }
- }
- }
-
- private String getDisplayString(Address address) {
- if (address instanceof Mailbox) {
- Mailbox mailbox = (Mailbox) address;
- String name = mailbox.getName();
- if (name != null && name.length() > 0) {
- name = DecoderUtil.decodeEncodedWords(name, DecodeMonitor.SILENT);
- return name + " <" + mailbox.getAddress() + ">";
- } else {
- return mailbox.getAddress();
- }
- } else {
- return address.toString();
- }
- }
-
- public void preamble(InputStream is) throws MimeException, IOException {
- }
-
- public void raw(InputStream is) throws MimeException, IOException {
- }
-
- public void startBodyPart() throws MimeException {
- try {
- handler.startElement("div", "class", "email-entry");
- handler.startElement("p");
- } catch (SAXException e) {
- throw new MimeException(e);
- }
- }
-
- public void startHeader() throws MimeException {
- // TODO Auto-generated method stub
-
- }
-
- public void startMultipart(BodyDescriptor descr) throws MimeException {
- inPart = true;
- }
-
- private String stripOutFieldPrefix(Field field, String fieldname) {
- String temp = field.getRaw().toString();
- int loc = fieldname.length();
- while (temp.charAt(loc) == ' ') {
- loc++;
- }
- return temp.substring(loc);
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mail;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.DateFormat;
+import java.text.DateFormatSymbols;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.Locale;
+import java.util.TimeZone;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.james.mime4j.MimeException;
+import org.apache.james.mime4j.codec.DecodeMonitor;
+import org.apache.james.mime4j.codec.DecoderUtil;
+import org.apache.james.mime4j.dom.address.Address;
+import org.apache.james.mime4j.dom.address.AddressList;
+import org.apache.james.mime4j.dom.address.Mailbox;
+import org.apache.james.mime4j.dom.address.MailboxList;
+import org.apache.james.mime4j.dom.field.AddressListField;
+import org.apache.james.mime4j.dom.field.DateTimeField;
+import org.apache.james.mime4j.dom.field.MailboxListField;
+import org.apache.james.mime4j.dom.field.ParsedField;
+import org.apache.james.mime4j.dom.field.UnstructuredField;
+import org.apache.james.mime4j.field.LenientFieldParser;
+import org.apache.james.mime4j.parser.ContentHandler;
+import org.apache.james.mime4j.stream.BodyDescriptor;
+import org.apache.james.mime4j.stream.Field;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+import static org.apache.tika.utils.DateUtils.MIDDAY;
+import static org.apache.tika.utils.DateUtils.UTC;
+
+/**
+ * Bridge between mime4j's content handler and the generic Sax content handler
+ * used by Tika. See
+ * http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/ContentHandler.html
+ */
+class MailContentHandler implements ContentHandler {
+
+ //TIKA-1970 Mac Mail's format
+ private static final Pattern GENERAL_TIME_ZONE_NO_MINUTES_PATTERN =
+ Pattern.compile("(?:UTC|GMT)([+-])(\\d?\\d)\\Z");
+
+ //find a time ending in am/pm without a space: 10:30am and
+ //use this pattern to insert space: 10:30 am
+ private static final Pattern AM_PM = Pattern.compile("(?i)(\\d)([ap]m)\\b");
+
+ private static final DateFormat[] ALTERNATE_DATE_FORMATS = new DateFormat[] {
+ //note that the string is "cleaned" before processing:
+ //1) condense multiple whitespace to single space
+ //2) trim()
+ //3) strip out commas
+ //4) insert space before am/pm
+
+ //May 16 2016 1:32am
+ createDateFormat("MMM dd yy hh:mm a", null),
+
+ //this is a standard pattern handled by mime4j;
+ //but mime4j fails with leading whitespace
+ createDateFormat("EEE d MMM yy HH:mm:ss Z", UTC),
+
+ createDateFormat("EEE d MMM yy HH:mm:ss z", UTC),
+
+ createDateFormat("EEE d MMM yy HH:mm:ss", null),// no timezone
+
+ createDateFormat("EEEEE MMM d yy hh:mm a", null),// Sunday, May 15 2016 1:32 PM
+
+ //16 May 2016 at 09:30:32 GMT+1 (Mac Mail TIKA-1970)
+ createDateFormat("d MMM yy 'at' HH:mm:ss z", UTC), // UTC/Zulu
+
+ createDateFormat("yy-MM-dd HH:mm:ss", null),
+
+ createDateFormat("MM/dd/yy hh:mm a", null, false),
+
+ //now dates without times
+ createDateFormat("MMM d yy", MIDDAY, false),
+ createDateFormat("EEE d MMM yy", MIDDAY, false),
+ createDateFormat("d MMM yy", MIDDAY, false),
+ createDateFormat("yy/MM/dd", MIDDAY, false),
+ createDateFormat("MM/dd/yy", MIDDAY, false)
+ };
+
+ private static DateFormat createDateFormat(String format, TimeZone timezone) {
+ return createDateFormat(format, timezone, true);
+ }
+
+ private static DateFormat createDateFormat(String format, TimeZone timezone, boolean isLenient) {
+ SimpleDateFormat sdf =
+ new SimpleDateFormat(format, new DateFormatSymbols(Locale.US));
+ if (timezone != null) {
+ sdf.setTimeZone(timezone);
+ }
+ sdf.setLenient(isLenient);
+ return sdf;
+ }
+
+ private boolean strictParsing = false;
+
+ private XHTMLContentHandler handler;
+ private Metadata metadata;
+ private EmbeddedDocumentExtractor extractor;
+
+ private boolean inPart = false;
+
+ MailContentHandler(XHTMLContentHandler xhtml, Metadata metadata, ParseContext context, boolean strictParsing) {
+ this.handler = xhtml;
+ this.metadata = metadata;
+ this.strictParsing = strictParsing;
+
+ // Fetch / Build an EmbeddedDocumentExtractor with which
+ // to handle/process the parts/attachments
+
+ // Was an EmbeddedDocumentExtractor explicitly supplied?
+ this.extractor = context.get(EmbeddedDocumentExtractor.class);
+
+ // If there's no EmbeddedDocumentExtractor, then try using a normal parser
+ // This will ensure that the contents are made available to the user, so
+ // the see the text, but without fine-grained control/extraction
+ // (This also maintains backward compatibility with older versions!)
+ if (this.extractor == null) {
+ // If the user gave a parser, use that, if not the default
+ Parser parser = context.get(AutoDetectParser.class);
+ if (parser == null) {
+ parser = context.get(Parser.class);
+ }
+ if (parser == null) {
+ TikaConfig tikaConfig = context.get(TikaConfig.class);
+ if (tikaConfig == null) {
+ tikaConfig = TikaConfig.getDefaultConfig();
+ }
+ parser = new AutoDetectParser(tikaConfig.getParser());
+ }
+ ParseContext ctx = new ParseContext();
+ ctx.set(Parser.class, parser);
+ extractor = new ParsingEmbeddedDocumentExtractor(ctx);
+ }
+ }
+
+ public void body(BodyDescriptor body, InputStream is) throws MimeException,
+ IOException {
+ // use a different metadata object
+ // in order to specify the mime type of the
+ // sub part without damaging the main metadata
+
+ Metadata submd = new Metadata();
+ submd.set(Metadata.CONTENT_TYPE, body.getMimeType());
+ submd.set(Metadata.CONTENT_ENCODING, body.getCharset());
+
+ try {
+ if (extractor.shouldParseEmbedded(submd)) {
+ extractor.parseEmbedded(is, handler, submd, false);
+ }
+ } catch (SAXException e) {
+ throw new MimeException(e);
+ }
+ }
+
+ public void endBodyPart() throws MimeException {
+ try {
+ handler.endElement("p");
+ handler.endElement("div");
+ } catch (SAXException e) {
+ throw new MimeException(e);
+ }
+ }
+
+ public void endHeader() throws MimeException {
+ }
+
+ public void startMessage() throws MimeException {
+ try {
+ handler.startDocument();
+ } catch (SAXException e) {
+ throw new MimeException(e);
+ }
+ }
+
+ public void endMessage() throws MimeException {
+ try {
+ handler.endDocument();
+ } catch (SAXException e) {
+ throw new MimeException(e);
+ }
+ }
+
+ public void endMultipart() throws MimeException {
+ inPart = false;
+ }
+
+ public void epilogue(InputStream is) throws MimeException, IOException {
+ }
+
+ /**
+ * Header for the whole message or its parts
+ *
+ * @see <a href="http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/">
+ * http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/</a>
+ * Field.html
+ */
+ public void field(Field field) throws MimeException {
+ // inPart indicates whether these metadata correspond to the
+ // whole message or its parts
+ if (inPart) {
+ return;
+ }
+
+ try {
+ String fieldname = field.getName();
+ ParsedField parsedField = LenientFieldParser.getParser().parse(
+ field, DecodeMonitor.SILENT);
+ if (fieldname.equalsIgnoreCase("From")) {
+ MailboxListField fromField = (MailboxListField) parsedField;
+ MailboxList mailboxList = fromField.getMailboxList();
+ if (fromField.isValidField() && mailboxList != null) {
+ for (Address address : mailboxList) {
+ String from = getDisplayString(address);
+ metadata.add(Metadata.MESSAGE_FROM, from);
+ metadata.add(TikaCoreProperties.CREATOR, from);
+ }
+ } else {
+ String from = stripOutFieldPrefix(field, "From:");
+ if (from.startsWith("<")) {
+ from = from.substring(1);
+ }
+ if (from.endsWith(">")) {
+ from = from.substring(0, from.length() - 1);
+ }
+ metadata.add(Metadata.MESSAGE_FROM, from);
+ metadata.add(TikaCoreProperties.CREATOR, from);
+ }
+ } else if (fieldname.equalsIgnoreCase("Subject")) {
+ metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE,
+ ((UnstructuredField) parsedField).getValue());
+ } else if (fieldname.equalsIgnoreCase("To")) {
+ processAddressList(parsedField, "To:", Metadata.MESSAGE_TO);
+ } else if (fieldname.equalsIgnoreCase("CC")) {
+ processAddressList(parsedField, "Cc:", Metadata.MESSAGE_CC);
+ } else if (fieldname.equalsIgnoreCase("BCC")) {
+ processAddressList(parsedField, "Bcc:", Metadata.MESSAGE_BCC);
+ } else if (fieldname.equalsIgnoreCase("Date")) {
+ DateTimeField dateField = (DateTimeField) parsedField;
+ Date date = dateField.getDate();
+ if (date == null) {
+ date = tryOtherDateFormats(field.getBody());
+ }
+ metadata.set(TikaCoreProperties.CREATED, date);
+ }
+ } catch (RuntimeException me) {
+ if (strictParsing) {
+ throw me;
+ }
+ }
+ }
+
+ private static synchronized Date tryOtherDateFormats(String text) {
+ if (text == null) {
+ return null;
+ }
+ //strip out additional spaces and trim
+ text = text.replaceAll("\\s+", " ").trim();
+
+ //strip out commas
+ text = text.replaceAll(",", "");
+ Matcher matcher = GENERAL_TIME_ZONE_NO_MINUTES_PATTERN.matcher(text);
+ if (matcher.find()) {
+ text = matcher.replaceFirst("GMT$1$2:00");
+ }
+
+ matcher = AM_PM.matcher(text);
+ if (matcher.find()) {
+ text = matcher.replaceFirst("$1 $2");
+ }
+
+ for (DateFormat format : ALTERNATE_DATE_FORMATS) {
+ try {
+ return format.parse(text);
+ } catch (ParseException e) {
+ }
+ }
+ return null;
+ }
+
+ private void processAddressList(ParsedField field, String addressListType,
+ String metadataField) throws MimeException {
+ AddressListField toField = (AddressListField) field;
+ if (toField.isValidField()) {
+ AddressList addressList = toField.getAddressList();
+ for (Address address : addressList) {
+ metadata.add(metadataField, getDisplayString(address));
+ }
+ } else {
+ String to = stripOutFieldPrefix(field,
+ addressListType);
+ for (String eachTo : to.split(",")) {
+ metadata.add(metadataField, eachTo.trim());
+ }
+ }
+ }
+
+ private String getDisplayString(Address address) {
+ if (address instanceof Mailbox) {
+ Mailbox mailbox = (Mailbox) address;
+ String name = mailbox.getName();
+ if (name != null && name.length() > 0) {
+ name = DecoderUtil.decodeEncodedWords(name, DecodeMonitor.SILENT);
+ return name + " <" + mailbox.getAddress() + ">";
+ } else {
+ return mailbox.getAddress();
+ }
+ } else {
+ return address.toString();
+ }
+ }
+
+ public void preamble(InputStream is) throws MimeException, IOException {
+ }
+
+ public void raw(InputStream is) throws MimeException, IOException {
+ }
+
+ public void startBodyPart() throws MimeException {
+ try {
+ handler.startElement("div", "class", "email-entry");
+ handler.startElement("p");
+ } catch (SAXException e) {
+ throw new MimeException(e);
+ }
+ }
+
+ public void startHeader() throws MimeException {
+ // TODO Auto-generated method stub
+
+ }
+
+ public void startMultipart(BodyDescriptor descr) throws MimeException {
+ inPart = true;
+ }
+
+ private String stripOutFieldPrefix(Field field, String fieldname) {
+ String temp = field.getRaw().toString();
+ int loc = fieldname.length();
+ while (temp.charAt(loc) == ' ') {
+ loc++;
+ }
+ return temp.substring(loc);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
index 9ac02a7..6299d3f 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
@@ -1,95 +1,95 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mail;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.james.mime4j.MimeException;
-import org.apache.james.mime4j.parser.MimeStreamParser;
-import org.apache.james.mime4j.stream.MimeConfig;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Uses apache-mime4j to parse emails. Each part is treated with the
- * corresponding parser and displayed within elements.
- * <p/>
- * A {@link MimeEntityConfig} object can be passed in the parsing context
- * to better control the parsing process.
- *
- * @author jnioche@digitalpebble.com
- */
-public class RFC822Parser extends AbstractParser {
- /**
- * Serial version UID
- */
- private static final long serialVersionUID = -5504243905998074168L;
-
- private static final Set<MediaType> SUPPORTED_TYPES = Collections
- .singleton(MediaType.parse("message/rfc822"));
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- public void parse(InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context) throws IOException,
- SAXException, TikaException {
- // Get the mime4j configuration, or use a default one
- MimeConfig config = new MimeConfig();
- config.setMaxLineLen(100000);
- config.setMaxHeaderLen(100000); // max length of any individual header
- config = context.get(MimeConfig.class, config);
-
- MimeStreamParser parser = new MimeStreamParser(config);
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-
- MailContentHandler mch = new MailContentHandler(
- xhtml, metadata, context, config.isStrictParsing());
- parser.setContentHandler(mch);
- parser.setContentDecoding(true);
-
- TikaInputStream tstream = TikaInputStream.get(stream);
- try {
- parser.parse(tstream);
- } catch (IOException e) {
- tstream.throwIfCauseOf(e);
- throw new TikaException("Failed to parse an email message", e);
- } catch (MimeException e) {
- // Unwrap the exception in case it was not thrown by mime4j
- Throwable cause = e.getCause();
- if (cause instanceof TikaException) {
- throw (TikaException) cause;
- } else if (cause instanceof SAXException) {
- throw (SAXException) cause;
- } else {
- throw new TikaException("Failed to parse an email message", e);
- }
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mail;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.james.mime4j.MimeException;
+import org.apache.james.mime4j.parser.MimeStreamParser;
+import org.apache.james.mime4j.stream.MimeConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for {@code message/rfc822} e-mail messages, built on apache-mime4j.
+ * The message is streamed through a mime4j {@link MimeStreamParser} whose
+ * events are delivered to a {@link MailContentHandler}.
+ * <p/>
+ * A {@link MimeConfig} object can be passed in the parsing context to better
+ * control the parsing process; when none is supplied, a default configuration
+ * with raised line and header length limits is used.
+ *
+ * @author jnioche@digitalpebble.com
+ */
+public class RFC822Parser extends AbstractParser {
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -5504243905998074168L;
+
+    /** The single media type handled by this parser. */
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections
+            .singleton(MediaType.parse("message/rfc822"));
+
+    /**
+     * Builds the fallback mime4j configuration used when the parse context
+     * does not carry one.
+     */
+    private static MimeConfig defaultMimeConfig() {
+        MimeConfig defaults = new MimeConfig();
+        defaults.setMaxLineLen(100000);
+        defaults.setMaxHeaderLen(100000); // max length of any individual header
+        return defaults;
+    }
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses an e-mail message from the given stream.
+     *
+     * @param stream   the message to parse
+     * @param handler  receives the XHTML output (wrapped in an {@link XHTMLContentHandler})
+     * @param metadata metadata object handed to the mail content handler
+     * @param context  parse context; may carry a {@link MimeConfig} override
+     * @throws IOException   declared by the signature
+     * @throws SAXException  if a SAX failure was wrapped in a {@link MimeException}
+     * @throws TikaException if parsing fails, or if a TikaException was wrapped
+     *                       in a {@link MimeException}
+     */
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+        // Use the mime4j configuration from the context, or the default one
+        MimeConfig mimeConfig = context.get(MimeConfig.class, defaultMimeConfig());
+
+        MimeStreamParser mimeParser = new MimeStreamParser(mimeConfig);
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
+        MailContentHandler mailHandler = new MailContentHandler(
+                xhtml, metadata, context, mimeConfig.isStrictParsing());
+        mimeParser.setContentHandler(mailHandler);
+        mimeParser.setContentDecoding(true);
+
+        TikaInputStream tis = TikaInputStream.get(stream);
+        try {
+            mimeParser.parse(tis);
+        } catch (IOException ioFailure) {
+            // Give the stream wrapper the first chance to rethrow before wrapping
+            tis.throwIfCauseOf(ioFailure);
+            throw new TikaException("Failed to parse an email message", ioFailure);
+        } catch (MimeException mimeFailure) {
+            // Unwrap the exception in case it was not thrown by mime4j
+            Throwable cause = mimeFailure.getCause();
+            if (cause instanceof TikaException) {
+                throw (TikaException) cause;
+            }
+            if (cause instanceof SAXException) {
+                throw (SAXException) cause;
+            }
+            throw new TikaException("Failed to parse an email message", mimeFailure);
+        }
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java
index 5be4b0b..cc10dd2 100644
--- a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java
+++ b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java
@@ -1,75 +1,75 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.feed;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class FeedParserTest {
- @Test
- public void testRSSParser() throws Exception {
- try (InputStream input = FeedParserTest.class.getResourceAsStream(
- "/test-documents/rsstest.rss")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- ParseContext context = new ParseContext();
-
- new FeedParser().parse(input, handler, metadata, context);
-
- String content = handler.toString();
- assertFalse(content == null);
-
- assertEquals("Sample RSS File for Junit test",
- metadata.get(TikaCoreProperties.DESCRIPTION));
- assertEquals("TestChannel", metadata.get(TikaCoreProperties.TITLE));
-
- // TODO find a way of testing the paragraphs and anchors
- }
- }
-
-
- @Test
- public void testAtomParser() throws Exception {
- try (InputStream input = FeedParserTest.class.getResourceAsStream(
- "/test-documents/testATOM.atom")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- ParseContext context = new ParseContext();
-
- new FeedParser().parse(input, handler, metadata, context);
-
- String content = handler.toString();
- assertFalse(content == null);
-
- assertEquals("Sample Atom File for Junit test",
- metadata.get(TikaCoreProperties.DESCRIPTION));
- assertEquals("Test Atom Feed", metadata.get(TikaCoreProperties.TITLE));
-
- // TODO Check some more
- }
- }
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.feed;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class FeedParserTest {
+    /** Parses the sample RSS document and checks its description and title metadata. */
+    @Test
+    public void testRSSParser() throws Exception {
+        assertFeedMetadata("/test-documents/rsstest.rss",
+                "Sample RSS File for Junit test", "TestChannel");
+
+        // TODO find a way of testing the paragraphs and anchors
+    }
+
+
+    /** Parses the sample Atom document and checks its description and title metadata. */
+    @Test
+    public void testAtomParser() throws Exception {
+        assertFeedMetadata("/test-documents/testATOM.atom",
+                "Sample Atom File for Junit test", "Test Atom Feed");
+
+        // TODO Check some more
+    }
+
+    /**
+     * Runs {@link FeedParser} over a classpath resource, then asserts that the
+     * extracted text is not null and that the description and title metadata
+     * match the expected values.
+     *
+     * @param resourcePath        classpath location of the feed document
+     * @param expectedDescription expected {@code TikaCoreProperties.DESCRIPTION} value
+     * @param expectedTitle       expected {@code TikaCoreProperties.TITLE} value
+     */
+    private static void assertFeedMetadata(String resourcePath,
+            String expectedDescription, String expectedTitle) throws Exception {
+        try (InputStream input = FeedParserTest.class.getResourceAsStream(resourcePath)) {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            ParseContext context = new ParseContext();
+
+            new FeedParser().parse(input, handler, metadata, context);
+
+            String content = handler.toString();
+            assertFalse(content == null);
+
+            assertEquals(expectedDescription,
+                    metadata.get(TikaCoreProperties.DESCRIPTION));
+            assertEquals(expectedTitle, metadata.get(TikaCoreProperties.TITLE));
+        }
+    }
+
+}