You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2015/05/29 16:36:24 UTC
svn commit: r1682489 [11/14] - in /tika/trunk:
tika-parsers/src/main/java/org/apache/tika/parser/html/
tika-parsers/src/main/java/org/apache/tika/parser/image/
tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/
tika-parsers/src/main/java/org/...
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java Fri May 29 14:36:21 2015
@@ -33,15 +33,15 @@ import org.junit.Test;
import org.xml.sax.helpers.DefaultHandler;
public class JpegParserTest {
-
+
private final Parser parser = new JpegParser();
-
+
@Test
public void testJPEG() throws Exception {
Metadata metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
InputStream stream =
- getClass().getResourceAsStream("/test-documents/testJPEG_EXIF.jpg");
+ getClass().getResourceAsStream("/test-documents/testJPEG_EXIF.jpg");
parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
// Core EXIF/TIFF tags
@@ -49,7 +49,7 @@ public class JpegParserTest {
assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH));
assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
-
+
assertEquals("6.25E-4", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1600
assertEquals("5.6", metadata.get(Metadata.F_NUMBER));
assertEquals("false", metadata.get(Metadata.FLASH_FIRED));
@@ -62,24 +62,24 @@ public class JpegParserTest {
assertEquals("240.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
assertEquals("240.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
-
+
// Check that EXIF/TIFF tags come through with their raw values too
// (This may be removed for Tika 1.0, as we support more of them
// with explicit Metadata entries)
assertEquals("Canon EOS 40D", metadata.get("Model"));
-
+
// Common tags
//assertEquals("2009-10-02T23:02:49", metadata.get(Metadata.LAST_MODIFIED));
assertEquals("Date/Time Original for when the photo was taken, unspecified time zone",
"2009-08-11T09:09:45", metadata.get(TikaCoreProperties.CREATED));
List<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
assertTrue("'canon-55-250' expected in " + keywords, keywords.contains("canon-55-250"));
- assertTrue("'moscow-birds' expected in " + keywords, keywords.contains("moscow-birds"));
+ assertTrue("'moscow-birds' expected in " + keywords, keywords.contains("moscow-birds"));
assertTrue("'serbor' expected in " + keywords, keywords.contains("serbor"));
assertFalse(keywords.contains("canon-55-250 moscow-birds serbor"));
List<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
assertTrue("'canon-55-250' expected in " + subject, subject.contains("canon-55-250"));
- assertTrue("'moscow-birds' expected in " + subject, subject.contains("moscow-birds"));
+ assertTrue("'moscow-birds' expected in " + subject, subject.contains("moscow-birds"));
assertTrue("'serbor' expected in " + subject, subject.contains("serbor"));
assertFalse(subject.contains("canon-55-250 moscow-birds serbor"));
}
@@ -92,19 +92,19 @@ public class JpegParserTest {
Metadata metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
InputStream stream =
- getClass().getResourceAsStream("/test-documents/testJPEG_GEO.jpg");
+ getClass().getResourceAsStream("/test-documents/testJPEG_GEO.jpg");
parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
+
// Geo tags
assertEquals("12.54321", metadata.get(Metadata.LATITUDE));
assertEquals("-54.1234", metadata.get(Metadata.LONGITUDE));
-
+
// Core EXIF/TIFF tags
assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH));
assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
-
+
assertEquals("6.25E-4", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1600
assertEquals("5.6", metadata.get(Metadata.F_NUMBER));
assertEquals("false", metadata.get(Metadata.FLASH_FIRED));
@@ -117,7 +117,7 @@ public class JpegParserTest {
assertEquals("240.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
assertEquals("240.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
-
+
// Common tags
assertEquals("Date/Time Original for when the photo was taken, unspecified time zone",
"2009-08-11T09:09:45", metadata.get(TikaCoreProperties.CREATED));
@@ -131,48 +131,48 @@ public class JpegParserTest {
/**
* Test for an image with the geographic information stored in a slightly
- * different way, see TIKA-915 for details
+ * different way, see TIKA-915 for details
* Disabled for now, pending a fix to the underlying library
*/
@Test
public void testJPEGGeo2() throws Exception {
- Metadata metadata = new Metadata();
- metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
- InputStream stream =
- getClass().getResourceAsStream("/test-documents/testJPEG_GEO_2.jpg");
- parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
- // Geo tags should be there with 5dp, and not rounded
- assertEquals("51.575762", metadata.get(Metadata.LATITUDE));
- assertEquals("-1.567886", metadata.get(Metadata.LONGITUDE));
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+ InputStream stream =
+ getClass().getResourceAsStream("/test-documents/testJPEG_GEO_2.jpg");
+ parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+ // Geo tags should be there with 5dp, and not rounded
+ assertEquals("51.575762", metadata.get(Metadata.LATITUDE));
+ assertEquals("-1.567886", metadata.get(Metadata.LONGITUDE));
}
-
+
@Test
public void testJPEGTitleAndDescription() throws Exception {
Metadata metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
InputStream stream =
- getClass().getResourceAsStream("/test-documents/testJPEG_commented.jpg");
+ getClass().getResourceAsStream("/test-documents/testJPEG_commented.jpg");
parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
+
// embedded comments with non-ascii characters
assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE));
assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR)); // Dublin Core
// xmp handles spaces in keywords, returns "bird watching, nature reserve, coast, grazelands"
// but we have to replace them with underscore
-
+
List<String> keywords = Arrays.asList(metadata.getValues(Metadata.KEYWORDS));
assertTrue(keywords.contains("coast"));
assertTrue(keywords.contains("bird watching"));
assertEquals(keywords, Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)));
-
+
// Core EXIF/TIFF tags
assertEquals("103", metadata.get(Metadata.IMAGE_WIDTH));
assertEquals("77", metadata.get(Metadata.IMAGE_LENGTH));
assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
-
+
assertEquals("1.0E-6", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1000000
assertEquals("2.8", metadata.get(Metadata.F_NUMBER));
assertEquals("4.6", metadata.get(Metadata.FOCAL_LENGTH));
@@ -183,35 +183,35 @@ public class JpegParserTest {
assertEquals("1", metadata.get(Metadata.ORIENTATION)); // Not present
assertEquals("300.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
assertEquals("300.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
- assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
+ assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
}
-
+
@Test
public void testJPEGTitleAndDescriptionPhotoshop() throws Exception {
Metadata metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
InputStream stream =
- getClass().getResourceAsStream("/test-documents/testJPEG_commented_pspcs2mac.jpg");
+ getClass().getResourceAsStream("/test-documents/testJPEG_commented_pspcs2mac.jpg");
parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
+
// embedded comments with non-ascii characters
assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE));
assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR));
List<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
- assertTrue("got " + keywords, keywords.contains("bird watching"));
+ assertTrue("got " + keywords, keywords.contains("bird watching"));
List<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
- assertTrue("got " + subject, subject.contains("bird watching"));
+ assertTrue("got " + subject, subject.contains("bird watching"));
}
-
+
@Test
public void testJPEGTitleAndDescriptionXnviewmp() throws Exception {
Metadata metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
InputStream stream =
- getClass().getResourceAsStream("/test-documents/testJPEG_commented_xnviewmp026.jpg");
+ getClass().getResourceAsStream("/test-documents/testJPEG_commented_xnviewmp026.jpg");
parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
+
// XnViewMp's default comment dialog has only comment, not headline.
// Comment is embedded only if "Write comments in XMP" is enabled in settings
assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
@@ -220,31 +220,31 @@ public class JpegParserTest {
String[] subject = metadata.getValues(TikaCoreProperties.KEYWORDS);
List<String> keywords = Arrays.asList(subject);
assertTrue("'coast'" + " not in " + keywords, keywords.contains("coast"));
- assertTrue("'nature reserve'" + " not in " + keywords, keywords.contains("nature reserve"));
+ assertTrue("'nature reserve'" + " not in " + keywords, keywords.contains("nature reserve"));
}
-
+
@Test
public void testJPEGoddTagComponent() throws Exception {
- Metadata metadata = new Metadata();
- metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
- InputStream stream =
- getClass().getResourceAsStream("/test-documents/testJPEG_oddTagComponent.jpg");
- parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
-
- assertEquals(null, metadata.get(TikaCoreProperties.TITLE));
- assertEquals(null, metadata.get(TikaCoreProperties.DESCRIPTION));
- assertEquals("251", metadata.get(Metadata.IMAGE_WIDTH));
- assertEquals("384", metadata.get(Metadata.IMAGE_LENGTH));
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+ InputStream stream =
+ getClass().getResourceAsStream("/test-documents/testJPEG_oddTagComponent.jpg");
+ parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+ assertEquals(null, metadata.get(TikaCoreProperties.TITLE));
+ assertEquals(null, metadata.get(TikaCoreProperties.DESCRIPTION));
+ assertEquals("251", metadata.get(Metadata.IMAGE_WIDTH));
+ assertEquals("384", metadata.get(Metadata.IMAGE_LENGTH));
}
-
+
@Test
public void testJPEGEmptyEXIFDateTime() throws Exception {
Metadata metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
InputStream stream =
- getClass().getResourceAsStream("/test-documents/testJPEG_EXIF_emptyDateTime.jpg");
+ getClass().getResourceAsStream("/test-documents/testJPEG_EXIF_emptyDateTime.jpg");
parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
assertEquals("300.0", metadata.get(TIFF.RESOLUTION_HORIZONTAL));
assertEquals("300.0", metadata.get(TIFF.RESOLUTION_VERTICAL));
- }
+ }
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java Fri May 29 14:36:21 2015
@@ -54,6 +54,13 @@ import org.xml.sax.helpers.DefaultHandle
public class RFC822ParserTest extends TikaTest {
+ private static InputStream getStream(String name) {
+ InputStream stream = Thread.currentThread().getContextClassLoader()
+ .getResourceAsStream(name);
+ assertNotNull("Test file not found " + name, stream);
+ return stream;
+ }
+
@Test
public void testSimple() {
Parser parser = new RFC822Parser();
@@ -73,9 +80,9 @@ public class RFC822ParserTest extends Ti
verify(handler).endDocument();
//note no leading spaces, and no quotes
assertEquals("Julien Nioche (JIRA) <ji...@apache.org>", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed",
+ assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed",
metadata.get(TikaCoreProperties.TITLE));
- assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed",
+ assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed",
metadata.get(Metadata.SUBJECT));
} catch (Exception e) {
fail("Exception thrown: " + e.getMessage());
@@ -103,11 +110,11 @@ public class RFC822ParserTest extends Ti
verify(handler, times(multipackExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
verify(handler, times(multipackExpectedTimes)).endElement(XHTMLContentHandler.XHTML, "p", "p");
verify(handler).endDocument();
-
+
} catch (Exception e) {
fail("Exception thrown: " + e.getMessage());
}
-
+
//repeat, this time looking at content
parser = new RFC822Parser();
metadata = new Metadata();
@@ -172,35 +179,35 @@ public class RFC822ParserTest extends Ti
parser.parse(stream, handler, metadata, new ParseContext());
//tests correct decoding of internationalized headers, both
//quoted-printable (Q) and Base64 (B).
- assertEquals("Keld J\u00F8rn Simonsen <ke...@dkuug.dk>",
+ assertEquals("Keld J\u00F8rn Simonsen <ke...@dkuug.dk>",
metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("If you can read this you understand the example.",
+ assertEquals("If you can read this you understand the example.",
metadata.get(TikaCoreProperties.TITLE));
- assertEquals("If you can read this you understand the example.",
+ assertEquals("If you can read this you understand the example.",
metadata.get(Metadata.SUBJECT));
} catch (Exception e) {
fail("Exception thrown: " + e.getMessage());
}
}
-
+
/**
* The from isn't in the usual form.
* See TIKA-618
*/
@Test
public void testUnusualFromAddress() throws Exception {
- Parser parser = new RFC822Parser();
- Metadata metadata = new Metadata();
- InputStream stream = getStream("test-documents/testRFC822_oddfrom");
- ContentHandler handler = mock(DefaultHandler.class);
-
- parser.parse(stream, handler, metadata, new ParseContext());
- assertEquals("Saved by Windows Internet Explorer 7",
- metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Air Permit Programs | Air & Radiation | US EPA",
- metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Air Permit Programs | Air & Radiation | US EPA",
- metadata.get(Metadata.SUBJECT));
+ Parser parser = new RFC822Parser();
+ Metadata metadata = new Metadata();
+ InputStream stream = getStream("test-documents/testRFC822_oddfrom");
+ ContentHandler handler = mock(DefaultHandler.class);
+
+ parser.parse(stream, handler, metadata, new ParseContext());
+ assertEquals("Saved by Windows Internet Explorer 7",
+ metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Air Permit Programs | Air & Radiation | US EPA",
+ metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Air Permit Programs | Air & Radiation | US EPA",
+ metadata.get(Metadata.SUBJECT));
}
/**
@@ -236,36 +243,36 @@ public class RFC822ParserTest extends Ti
new ByteArrayInputStream(data), handler, metadata, context);
assertEquals(name.trim(), metadata.get(TikaCoreProperties.CREATOR));
}
-
+
/**
* Test for TIKA-678 - not all headers may be present
*/
@Test
public void testSomeMissingHeaders() throws Exception {
- Parser parser = new RFC822Parser();
- Metadata metadata = new Metadata();
- InputStream stream = getStream("test-documents/testRFC822-limitedheaders");
- ContentHandler handler = new BodyContentHandler();
-
- parser.parse(stream, handler, metadata, new ParseContext());
- assertEquals(true, metadata.isMultiValued(TikaCoreProperties.CREATOR));
- assertEquals("xyz", metadata.getValues(TikaCoreProperties.CREATOR)[0]);
- assertEquals("abc", metadata.getValues(TikaCoreProperties.CREATOR)[1]);
- assertEquals(true, metadata.isMultiValued(Metadata.MESSAGE_FROM));
- assertEquals("xyz", metadata.getValues(Metadata.MESSAGE_FROM)[0]);
- assertEquals("abc", metadata.getValues(Metadata.MESSAGE_FROM)[1]);
- assertEquals(true, metadata.isMultiValued(Metadata.MESSAGE_TO));
- assertEquals("abc", metadata.getValues(Metadata.MESSAGE_TO)[0]);
- assertEquals("def", metadata.getValues(Metadata.MESSAGE_TO)[1]);
- assertEquals("abcd", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("abcd", metadata.get(Metadata.SUBJECT));
- assertContains("bar biz bat", handler.toString());
+ Parser parser = new RFC822Parser();
+ Metadata metadata = new Metadata();
+ InputStream stream = getStream("test-documents/testRFC822-limitedheaders");
+ ContentHandler handler = new BodyContentHandler();
+
+ parser.parse(stream, handler, metadata, new ParseContext());
+ assertEquals(true, metadata.isMultiValued(TikaCoreProperties.CREATOR));
+ assertEquals("xyz", metadata.getValues(TikaCoreProperties.CREATOR)[0]);
+ assertEquals("abc", metadata.getValues(TikaCoreProperties.CREATOR)[1]);
+ assertEquals(true, metadata.isMultiValued(Metadata.MESSAGE_FROM));
+ assertEquals("xyz", metadata.getValues(Metadata.MESSAGE_FROM)[0]);
+ assertEquals("abc", metadata.getValues(Metadata.MESSAGE_FROM)[1]);
+ assertEquals(true, metadata.isMultiValued(Metadata.MESSAGE_TO));
+ assertEquals("abc", metadata.getValues(Metadata.MESSAGE_TO)[0]);
+ assertEquals("def", metadata.getValues(Metadata.MESSAGE_TO)[1]);
+ assertEquals("abcd", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("abcd", metadata.get(Metadata.SUBJECT));
+ assertContains("bar biz bat", handler.toString());
}
-
+
/**
* Test TIKA-1028 - If the mail contains an encrypted attachment (or
- * an attachment that others triggers an error), parsing should carry
- * on for the remainder regardless
+ * an attachment that others triggers an error), parsing should carry
+ * on for the remainder regardless
*/
@Test
public void testEncryptedZipAttachment() throws Exception {
@@ -275,40 +282,40 @@ public class RFC822ParserTest extends Ti
InputStream stream = getStream("test-documents/testRFC822_encrypted_zip");
ContentHandler handler = new BodyContentHandler();
parser.parse(stream, handler, metadata, context);
-
+
// Check we got the metadata
assertEquals("Juha Haaga <ju...@gmail.com>", metadata.get(Metadata.MESSAGE_FROM));
assertEquals("Test mail for Tika", metadata.get(TikaCoreProperties.TITLE));
-
+
// Check we got the message text, for both Plain Text and HTML
assertContains("Includes encrypted zip file", handler.toString());
assertContains("password is \"test\".", handler.toString());
assertContains("This is the Plain Text part", handler.toString());
assertContains("This is the HTML part", handler.toString());
-
+
// We won't get the contents of the zip file, but we will get the name
assertContains("text.txt", handler.toString());
assertNotContained("ENCRYPTED ZIP FILES", handler.toString());
-
+
// Try again, this time with the password supplied
// Check that we also get the zip's contents as well
context.set(PasswordProvider.class, new PasswordProvider() {
public String getPassword(Metadata metadata) {
return "test";
}
- });
+ });
stream = getStream("test-documents/testRFC822_encrypted_zip");
handler = new BodyContentHandler();
parser.parse(stream, handler, metadata, context);
-
+
assertContains("Includes encrypted zip file", handler.toString());
assertContains("password is \"test\".", handler.toString());
assertContains("This is the Plain Text part", handler.toString());
assertContains("This is the HTML part", handler.toString());
-
+
// We do get the name of the file in the encrypted zip file
assertContains("text.txt", handler.toString());
-
+
// TODO Upgrade to a version of Commons Compress with Encryption
// support, then verify we get the contents of the text file
// held within the encrypted zip
@@ -317,10 +324,10 @@ public class RFC822ParserTest extends Ti
assertContains("ENCRYPTED ZIP FILES", handler.toString());
assertContains("TIKA-1028", handler.toString());
}
-
+
/**
* Test TIKA-1028 - Ensure we can get the contents of an
- * un-encrypted zip file
+ * un-encrypted zip file
*/
@Test
public void testNormalZipAttachment() throws Exception {
@@ -330,26 +337,26 @@ public class RFC822ParserTest extends Ti
InputStream stream = getStream("test-documents/testRFC822_normal_zip");
ContentHandler handler = new BodyContentHandler();
parser.parse(stream, handler, metadata, context);
-
+
// Check we got the metadata
assertEquals("Juha Haaga <ju...@gmail.com>", metadata.get(Metadata.MESSAGE_FROM));
assertEquals("Test mail for Tika", metadata.get(TikaCoreProperties.TITLE));
-
+
// Check we got the message text, for both Plain Text and HTML
assertContains("Includes a normal, unencrypted zip file", handler.toString());
assertContains("This is the Plain Text part", handler.toString());
assertContains("This is the HTML part", handler.toString());
-
+
// We get both name and contents of the zip file's contents
assertContains("text.txt", handler.toString());
assertContains("TEST DATA FOR TIKA.", handler.toString());
assertContains("This is text inside an unencrypted zip file", handler.toString());
assertContains("TIKA-1028", handler.toString());
}
-
+
/**
* TIKA-1222 When requested, ensure that the various attachments of
- * the mail come through properly as embedded resources
+ * the mail come through properly as embedded resources
*/
@Test
public void testGetAttachmentsAsEmbeddedResources() throws Exception {
@@ -364,11 +371,11 @@ public class RFC822ParserTest extends Ti
if (tis != null)
tis.close();
}
-
+
// Check we found all 3 parts
assertEquals(3, tracker.filenames.size());
assertEquals(3, tracker.mediaTypes.size());
-
+
// No filenames available
assertEquals(null, tracker.filenames.get(0));
assertEquals(null, tracker.filenames.get(1));
@@ -378,11 +385,4 @@ public class RFC822ParserTest extends Ti
assertEquals(MediaType.TEXT_HTML, tracker.mediaTypes.get(1));
assertEquals(MediaType.image("gif"), tracker.mediaTypes.get(2));
}
-
- private static InputStream getStream(String name) {
- InputStream stream = Thread.currentThread().getContextClassLoader()
- .getResourceAsStream(name);
- assertNotNull("Test file not found " + name, stream);
- return stream;
- }
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java Fri May 29 14:36:21 2015
@@ -35,137 +35,137 @@ import org.xml.sax.ContentHandler;
public class MboxParserTest {
- protected ParseContext recursingContext;
- private Parser autoDetectParser;
- private TypeDetector typeDetector;
- private MboxParser mboxParser;
-
- @Before
- public void setUp() throws Exception {
- typeDetector = new TypeDetector();
- autoDetectParser = new AutoDetectParser(typeDetector);
- recursingContext = new ParseContext();
- recursingContext.set(Parser.class, autoDetectParser);
-
- mboxParser = new MboxParser();
- mboxParser.setTracking(true);
- }
-
- @Test
- public void testSimple() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
- InputStream stream = getStream("/test-documents/simple.mbox");
-
- try {
- mboxParser.parse(stream, handler, metadata, recursingContext);
- } finally {
- stream.close();
- }
-
- String content = handler.toString();
- assertContains("Test content 1", content);
- assertContains("Test content 2", content);
- assertEquals("application/mbox", metadata.get(Metadata.CONTENT_TYPE));
-
- Map<Integer, Metadata> mailsMetadata = mboxParser.getTrackingMetadata();
- assertEquals("Nb. Of mails", 2, mailsMetadata.size());
-
- Metadata mail1 = mailsMetadata.get(0);
- assertEquals("message/rfc822", mail1.get(Metadata.CONTENT_TYPE));
- assertEquals("envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009", mail1.get("MboxParser-from"));
-
- Metadata mail2 = mailsMetadata.get(1);
- assertEquals("message/rfc822", mail2.get(Metadata.CONTENT_TYPE));
- assertEquals("envelope-sender-mailbox-name Mon Jun 01 11:00:00 2010", mail2.get("MboxParser-from"));
- }
-
- @Test
- public void testHeaders() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
- InputStream stream = getStream("/test-documents/headers.mbox");
-
- try {
- mboxParser.parse(stream, handler, metadata, recursingContext);
- } finally {
- stream.close();
- }
-
- assertContains("Test content", handler.toString());
- assertEquals("Nb. Of mails", 1, mboxParser.getTrackingMetadata().size());
-
- Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0);
-
- assertEquals("2009-06-10T03:58:45Z", mailMetadata.get(TikaCoreProperties.CREATED));
- assertEquals("<au...@domain.com>", mailMetadata.get(TikaCoreProperties.CREATOR));
- assertEquals("subject", mailMetadata.get(Metadata.SUBJECT));
- assertEquals("<au...@domain.com>", mailMetadata.get(Metadata.AUTHOR));
- assertEquals("message/rfc822", mailMetadata.get(Metadata.CONTENT_TYPE));
- assertEquals("author@domain.com", mailMetadata.get("Message-From"));
- assertEquals("<na...@domain.com>", mailMetadata.get("MboxParser-return-path"));
- }
-
- @Test
- public void testMultilineHeader() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
- InputStream stream = getStream("/test-documents/multiline.mbox");
-
- try {
- mboxParser.parse(stream, handler, metadata, recursingContext);
- } finally {
- stream.close();
- }
-
- assertEquals("Nb. Of mails", 1, mboxParser.getTrackingMetadata().size());
-
- Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0);
- assertEquals("from xxx by xxx with xxx; date", mailMetadata.get("MboxParser-received"));
- }
-
- @Test
- public void testQuoted() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
- InputStream stream = getStream("/test-documents/quoted.mbox");
-
- try {
- mboxParser.parse(stream, handler, metadata, recursingContext);
- } finally {
- stream.close();
- }
-
- assertContains("Test content", handler.toString());
- assertContains("> quoted stuff", handler.toString());
- }
-
- @Test
- public void testComplex() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
- InputStream stream = getStream("/test-documents/complex.mbox");
-
- try {
- mboxParser.parse(stream, handler, metadata, recursingContext);
- } finally {
- stream.close();
- }
-
- assertEquals("Nb. Of mails", 3, mboxParser.getTrackingMetadata().size());
-
- Metadata firstMail = mboxParser.getTrackingMetadata().get(0);
- assertEquals("Re: question about when shuffle/sort start working", firstMail.get(Metadata.SUBJECT));
- assertEquals("Re: question about when shuffle/sort start working", firstMail.get(TikaCoreProperties.TITLE));
- assertEquals("Jothi Padmanabhan <jo...@yahoo-inc.com>", firstMail.get(Metadata.AUTHOR));
- assertEquals("Jothi Padmanabhan <jo...@yahoo-inc.com>", firstMail.get(TikaCoreProperties.CREATOR));
- assertEquals("core-user@hadoop.apache.org", firstMail.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
-
- assertContains("When a Mapper completes", handler.toString());
- }
-
- private static InputStream getStream(String name) {
- return MboxParserTest.class.getClass().getResourceAsStream(name);
- }
+ protected ParseContext recursingContext;
+ private Parser autoDetectParser;
+ private TypeDetector typeDetector;
+ private MboxParser mboxParser;
+
+ private static InputStream getStream(String name) {
+ return MboxParserTest.class.getClass().getResourceAsStream(name);
+ }
+
+ @Before
+ public void setUp() throws Exception {
+ typeDetector = new TypeDetector();
+ autoDetectParser = new AutoDetectParser(typeDetector);
+ recursingContext = new ParseContext();
+ recursingContext.set(Parser.class, autoDetectParser);
+
+ mboxParser = new MboxParser();
+ mboxParser.setTracking(true);
+ }
+
+ @Test
+ public void testSimple() throws Exception {
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ InputStream stream = getStream("/test-documents/simple.mbox");
+
+ try {
+ mboxParser.parse(stream, handler, metadata, recursingContext);
+ } finally {
+ stream.close();
+ }
+
+ String content = handler.toString();
+ assertContains("Test content 1", content);
+ assertContains("Test content 2", content);
+ assertEquals("application/mbox", metadata.get(Metadata.CONTENT_TYPE));
+
+ Map<Integer, Metadata> mailsMetadata = mboxParser.getTrackingMetadata();
+ assertEquals("Nb. Of mails", 2, mailsMetadata.size());
+
+ Metadata mail1 = mailsMetadata.get(0);
+ assertEquals("message/rfc822", mail1.get(Metadata.CONTENT_TYPE));
+ assertEquals("envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009", mail1.get("MboxParser-from"));
+
+ Metadata mail2 = mailsMetadata.get(1);
+ assertEquals("message/rfc822", mail2.get(Metadata.CONTENT_TYPE));
+ assertEquals("envelope-sender-mailbox-name Mon Jun 01 11:00:00 2010", mail2.get("MboxParser-from"));
+ }
+
+ @Test
+ public void testHeaders() throws Exception {
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ InputStream stream = getStream("/test-documents/headers.mbox");
+
+ try {
+ mboxParser.parse(stream, handler, metadata, recursingContext);
+ } finally {
+ stream.close();
+ }
+
+ assertContains("Test content", handler.toString());
+ assertEquals("Nb. Of mails", 1, mboxParser.getTrackingMetadata().size());
+
+ Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0);
+
+ assertEquals("2009-06-10T03:58:45Z", mailMetadata.get(TikaCoreProperties.CREATED));
+ assertEquals("<au...@domain.com>", mailMetadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("subject", mailMetadata.get(Metadata.SUBJECT));
+ assertEquals("<au...@domain.com>", mailMetadata.get(Metadata.AUTHOR));
+ assertEquals("message/rfc822", mailMetadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("author@domain.com", mailMetadata.get("Message-From"));
+ assertEquals("<na...@domain.com>", mailMetadata.get("MboxParser-return-path"));
+ }
+
+ @Test
+ public void testMultilineHeader() throws Exception {
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ InputStream stream = getStream("/test-documents/multiline.mbox");
+
+ try {
+ mboxParser.parse(stream, handler, metadata, recursingContext);
+ } finally {
+ stream.close();
+ }
+
+ assertEquals("Nb. Of mails", 1, mboxParser.getTrackingMetadata().size());
+
+ Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0);
+ assertEquals("from xxx by xxx with xxx; date", mailMetadata.get("MboxParser-received"));
+ }
+
+ @Test
+ public void testQuoted() throws Exception {
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ InputStream stream = getStream("/test-documents/quoted.mbox");
+
+ try {
+ mboxParser.parse(stream, handler, metadata, recursingContext);
+ } finally {
+ stream.close();
+ }
+
+ assertContains("Test content", handler.toString());
+ assertContains("> quoted stuff", handler.toString());
+ }
+
+ @Test
+ public void testComplex() throws Exception {
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ InputStream stream = getStream("/test-documents/complex.mbox");
+
+ try {
+ mboxParser.parse(stream, handler, metadata, recursingContext);
+ } finally {
+ stream.close();
+ }
+
+ assertEquals("Nb. Of mails", 3, mboxParser.getTrackingMetadata().size());
+
+ Metadata firstMail = mboxParser.getTrackingMetadata().get(0);
+ assertEquals("Re: question about when shuffle/sort start working", firstMail.get(Metadata.SUBJECT));
+ assertEquals("Re: question about when shuffle/sort start working", firstMail.get(TikaCoreProperties.TITLE));
+ assertEquals("Jothi Padmanabhan <jo...@yahoo-inc.com>", firstMail.get(Metadata.AUTHOR));
+ assertEquals("Jothi Padmanabhan <jo...@yahoo-inc.com>", firstMail.get(TikaCoreProperties.CREATOR));
+ assertEquals("core-user@hadoop.apache.org", firstMail.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
+
+ assertContains("When a Mapper completes", handler.toString());
+ }
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java Fri May 29 14:36:21 2015
@@ -28,7 +28,7 @@ import org.apache.tika.mime.MediaType;
/**
* Parent class of tests that the various POI powered parsers are
- * able to extract their embedded contents.
+ * able to extract their embedded contents.
*/
public abstract class AbstractPOIContainerExtractionTest {
public static final MediaType TYPE_DOC = MediaType.application("msword");
@@ -38,16 +38,24 @@ public abstract class AbstractPOIContain
public static final MediaType TYPE_PPTX = MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation");
public static final MediaType TYPE_XLSX = MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet");
public static final MediaType TYPE_MSG = MediaType.application("vnd.ms-outlook");
-
+
public static final MediaType TYPE_TXT = MediaType.text("plain");
public static final MediaType TYPE_PDF = MediaType.application("pdf");
-
+
public static final MediaType TYPE_JPG = MediaType.image("jpeg");
public static final MediaType TYPE_GIF = MediaType.image("gif");
public static final MediaType TYPE_PNG = MediaType.image("png");
public static final MediaType TYPE_EMF = MediaType.application("x-emf");
public static final MediaType TYPE_WMF = MediaType.application("x-msmetafile");
+ protected static TikaInputStream getTestFile(String filename) throws Exception {
+ URL input = AbstractPOIContainerExtractionTest.class.getResource(
+ "/test-documents/" + filename);
+ assertNotNull(filename + " not found", input);
+
+ return TikaInputStream.get(input);
+ }
+
protected TrackingHandler process(String filename, ContainerExtractor extractor, boolean recurse) throws Exception {
TikaInputStream stream = getTestFile(filename);
try {
@@ -55,7 +63,7 @@ public abstract class AbstractPOIContain
// Process it
TrackingHandler handler = new TrackingHandler();
- if(recurse) {
+ if (recurse) {
extractor.extract(stream, extractor, handler);
} else {
extractor.extract(stream, null, handler);
@@ -67,12 +75,4 @@ public abstract class AbstractPOIContain
stream.close();
}
}
-
- protected static TikaInputStream getTestFile(String filename) throws Exception {
- URL input = AbstractPOIContainerExtractionTest.class.getResource(
- "/test-documents/" + filename);
- assertNotNull(filename + " not found", input);
-
- return TikaInputStream.get(input);
- }
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java Fri May 29 14:36:21 2015
@@ -5,9 +5,9 @@
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -60,15 +60,15 @@ public class ExcelParserTest {
assertEquals("Simple Excel document", metadata.get(TikaCoreProperties.TITLE));
assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
-
+
// Mon Oct 01 17:13:56 BST 2007
assertEquals("2007-10-01T16:13:56Z", metadata.get(TikaCoreProperties.CREATED));
assertEquals("2007-10-01T16:13:56Z", metadata.get(Metadata.CREATION_DATE));
-
+
// Mon Oct 01 17:31:43 BST 2007
assertEquals("2007-10-01T16:31:43Z", metadata.get(TikaCoreProperties.MODIFIED));
assertEquals("2007-10-01T16:31:43Z", metadata.get(Metadata.DATE));
-
+
String content = handler.toString();
assertContains("Sample Excel Worksheet", content);
assertContains("Numbers and their Squares", content);
@@ -115,7 +115,7 @@ public class ExcelParserTest {
// Percentage.
assertContains("2.50%", content);
// Excel rounds up to 3%, but that requires Java 1.6 or later
- if(System.getProperty("java.version").startsWith("1.5")) {
+ if (System.getProperty("java.version").startsWith("1.5")) {
assertContains("2%", content);
} else {
assertContains("3%", content);
@@ -130,31 +130,31 @@ public class ExcelParserTest {
// Date Format: m/d/yy
assertContains("10/3/09", content);
-
+
// Date/Time Format: m/d/yy h:mm
assertContains("1/19/08 4:35", content);
// Fraction (2.5): # ?/?
assertContains("2 1/2", content);
-
+
// Below assertions represent outstanding formatting issues to be addressed
// they are included to allow the issues to be progressed with the Apache POI
// team - See TIKA-103.
/*************************************************************************
- // Custom Number (0 "dollars and" .00 "cents")
- assertContains("19 dollars and .99 cents", content);
+ // Custom Number (0 "dollars and" .00 "cents")
+ assertContains("19 dollars and .99 cents", content);
- // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy)
- assertContains("At 4:20 AM on Thursday May 17, 2007", content);
- **************************************************************************/
+ // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy)
+ assertContains("At 4:20 AM on Thursday May 17, 2007", content);
+ **************************************************************************/
} finally {
input.close();
}
}
-
+
@Test
public void testExcelParserPassword() throws Exception {
InputStream input = ExcelParserTest.class.getResourceAsStream(
@@ -191,11 +191,11 @@ public class ExcelParserTest {
assertEquals(
"application/vnd.ms-excel",
metadata.get(Metadata.CONTENT_TYPE));
-
+
assertEquals(null, metadata.get(TikaCoreProperties.TITLE));
assertEquals("Antoni", metadata.get(TikaCoreProperties.CREATOR));
assertEquals("2011-11-25T09:52:48Z", metadata.get(TikaCoreProperties.CREATED));
-
+
String content = handler.toString();
assertContains("This is an Encrypted Excel spreadsheet", content);
assertNotContained("9.0", content);
@@ -210,24 +210,24 @@ public class ExcelParserTest {
@Test
public void testExcelParserCharts() throws Exception {
InputStream input = ExcelParserTest.class.getResourceAsStream(
- "/test-documents/testEXCEL-charts.xls");
+ "/test-documents/testEXCEL-charts.xls");
try {
Metadata metadata = new Metadata();
ParseContext context = new ParseContext();
context.set(Locale.class, Locale.US);
ContentHandler handler = new BodyContentHandler();
new OfficeParser().parse(input, handler, metadata, context);
-
+
assertEquals(
"application/vnd.ms-excel",
metadata.get(Metadata.CONTENT_TYPE));
-
+
String content = handler.toString();
-
+
// The first sheet has a pie chart
assertContains("charttabyodawg", content);
assertContains("WhamPuff", content);
-
+
// The second sheet has a bar chart and some text
assertContains("Sheet1", content);
assertContains("Test Excel Spreasheet", content);
@@ -236,7 +236,7 @@ public class ExcelParserTest {
assertContains("fizzlepuff", content);
assertContains("whyaxis", content);
assertContains("eksaxis", content);
-
+
// The third sheet has some text
assertContains("Sheet2", content);
assertContains("dingdong", content);
@@ -265,7 +265,7 @@ public class ExcelParserTest {
input.close();
}
}
-
+
@Test
public void testWorksSpreadsheet70() throws Exception {
InputStream input = ExcelParserTest.class.getResourceAsStream(
@@ -291,43 +291,43 @@ public class ExcelParserTest {
*/
@Test
public void testExcelXLSB() throws Exception {
- Detector detector = new DefaultDetector();
- AutoDetectParser parser = new AutoDetectParser();
-
- InputStream input = ExcelParserTest.class.getResourceAsStream(
- "/test-documents/testEXCEL.xlsb");
- Metadata m = new Metadata();
- m.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");
-
- // Should be detected correctly
- MediaType type = null;
- try {
- type = detector.detect(input, m);
- assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString());
- } finally {
- input.close();
- }
-
- // OfficeParser won't handle it
- assertEquals(false, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
-
- // OOXMLParser won't handle it
- assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
-
- // AutoDetectParser doesn't break on it
- input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb");
-
- try {
- ContentHandler handler = new BodyContentHandler(-1);
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- parser.parse(input, handler, m, context);
-
- String content = handler.toString();
- assertEquals("", content);
- } finally {
- input.close();
- }
+ Detector detector = new DefaultDetector();
+ AutoDetectParser parser = new AutoDetectParser();
+
+ InputStream input = ExcelParserTest.class.getResourceAsStream(
+ "/test-documents/testEXCEL.xlsb");
+ Metadata m = new Metadata();
+ m.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");
+
+ // Should be detected correctly
+ MediaType type = null;
+ try {
+ type = detector.detect(input, m);
+ assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString());
+ } finally {
+ input.close();
+ }
+
+ // OfficeParser won't handle it
+ assertEquals(false, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
+
+ // OOXMLParser won't handle it
+ assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
+
+ // AutoDetectParser doesn't break on it
+ input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb");
+
+ try {
+ ContentHandler handler = new BodyContentHandler(-1);
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+ parser.parse(input, handler, m, context);
+
+ String content = handler.toString();
+ assertEquals("", content);
+ } finally {
+ input.close();
+ }
}
/**
@@ -335,32 +335,32 @@ public class ExcelParserTest {
*/
@Test
public void testExcel95() throws Exception {
- Detector detector = new DefaultDetector();
- AutoDetectParser parser = new AutoDetectParser();
- InputStream input;
- MediaType type;
- Metadata m;
-
- // First try detection of Excel 5
- m = new Metadata();
- m.add(Metadata.RESOURCE_NAME_KEY, "excel_5.xls");
- input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_5.xls");
- try {
- type = detector.detect(input, m);
- assertEquals("application/vnd.ms-excel", type.toString());
- } finally {
- input.close();
- }
-
- // Now Excel 95
- m = new Metadata();
- m.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");
- input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls");
- try {
- type = detector.detect(input, m);
- assertEquals("application/vnd.ms-excel", type.toString());
+ Detector detector = new DefaultDetector();
+ AutoDetectParser parser = new AutoDetectParser();
+ InputStream input;
+ MediaType type;
+ Metadata m;
+
+ // First try detection of Excel 5
+ m = new Metadata();
+ m.add(Metadata.RESOURCE_NAME_KEY, "excel_5.xls");
+ input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_5.xls");
+ try {
+ type = detector.detect(input, m);
+ assertEquals("application/vnd.ms-excel", type.toString());
+ } finally {
+ input.close();
+ }
+
+ // Now Excel 95
+ m = new Metadata();
+ m.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");
+ input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls");
+ try {
+ type = detector.detect(input, m);
+ assertEquals("application/vnd.ms-excel", type.toString());
} finally {
- input.close();
+ input.close();
}
// OfficeParser can handle it
@@ -368,8 +368,8 @@ public class ExcelParserTest {
// OOXMLParser won't handle it
assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
-
-
+
+
// Parse the Excel 5 file
m = new Metadata();
input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_5.xls");
@@ -380,26 +380,26 @@ public class ExcelParserTest {
parser.parse(input, handler, m, context);
String content = handler.toString();
-
+
// Sheet names
assertContains("Feuil1", content);
assertContains("Feuil3", content);
-
+
// Text
assertContains("Sample Excel", content);
assertContains("Number", content);
-
+
// Numbers
assertContains("15", content);
assertContains("225", content);
-
+
// Metadata was also fetched
assertEquals("Simple Excel document", m.get(TikaCoreProperties.TITLE));
assertEquals("Keith Bennett", m.get(TikaCoreProperties.CREATOR));
} finally {
input.close();
}
-
+
// Parse the Excel 95 file
m = new Metadata();
input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls");
@@ -410,12 +410,12 @@ public class ExcelParserTest {
parser.parse(input, handler, m, context);
String content = handler.toString();
-
+
// Sheet name
assertContains("Foglio1", content);
-
+
// Very boring file, no actual text or numbers!
-
+
// Metadata was also fetched
assertEquals(null, m.get(TikaCoreProperties.TITLE));
assertEquals("Marco Quaranta", m.get(Office.LAST_AUTHOR));
@@ -423,35 +423,35 @@ public class ExcelParserTest {
input.close();
}
}
-
+
/**
* Ensures that custom OLE2 (HPSF) properties are extracted
*/
@Test
public void testCustomProperties() throws Exception {
- InputStream input = ExcelParserTest.class.getResourceAsStream(
- "/test-documents/testEXCEL_custom_props.xls");
- Metadata metadata = new Metadata();
-
- try {
- ContentHandler handler = new BodyContentHandler(-1);
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- new OfficeParser().parse(input, handler, metadata, context);
- } finally {
- input.close();
- }
-
- assertEquals("application/vnd.ms-excel", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("", metadata.get(TikaCoreProperties.MODIFIER));
- assertEquals("2011-08-22T13:45:54Z", metadata.get(TikaCoreProperties.MODIFIED));
- assertEquals("2006-09-12T15:06:44Z", metadata.get(TikaCoreProperties.CREATED));
- assertEquals("Microsoft Excel", metadata.get(OfficeOpenXMLExtended.APPLICATION));
- assertEquals("true", metadata.get("custom:myCustomBoolean"));
- assertEquals("3", metadata.get("custom:myCustomNumber"));
- assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
- assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
- assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
+ InputStream input = ExcelParserTest.class.getResourceAsStream(
+ "/test-documents/testEXCEL_custom_props.xls");
+ Metadata metadata = new Metadata();
+
+ try {
+ ContentHandler handler = new BodyContentHandler(-1);
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+ new OfficeParser().parse(input, handler, metadata, context);
+ } finally {
+ input.close();
+ }
+
+ assertEquals("application/vnd.ms-excel", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("", metadata.get(TikaCoreProperties.MODIFIER));
+ assertEquals("2011-08-22T13:45:54Z", metadata.get(TikaCoreProperties.MODIFIED));
+ assertEquals("2006-09-12T15:06:44Z", metadata.get(TikaCoreProperties.CREATED));
+ assertEquals("Microsoft Excel", metadata.get(OfficeOpenXMLExtended.APPLICATION));
+ assertEquals("true", metadata.get("custom:myCustomBoolean"));
+ assertEquals("3", metadata.get("custom:myCustomNumber"));
+ assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
+ assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
+ assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
}
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java Fri May 29 14:36:21 2015
@@ -28,20 +28,19 @@ import org.apache.tika.parser.microsoft.
import org.junit.Test;
-
public class OfficeParserTest extends TikaTest {
- @Test
- public void parseOfficeWord() throws Exception {
- Metadata metadata = new Metadata();
- Parser parser = new OfficeParser();
-
- String xml = getXML(getTestDocument("test.doc"), parser, metadata).xml;
-
- assertTrue(xml.contains("test"));
- }
-
- private InputStream getTestDocument(String name) {
- return TikaInputStream.get(OOXMLParserTest.class.getResourceAsStream("/test-documents/" + name));
-}
+ @Test
+ public void parseOfficeWord() throws Exception {
+ Metadata metadata = new Metadata();
+ Parser parser = new OfficeParser();
+
+ String xml = getXML(getTestDocument("test.doc"), parser, metadata).xml;
+
+ assertTrue(xml.contains("test"));
+ }
+
+ private InputStream getTestDocument(String name) {
+ return TikaInputStream.get(OOXMLParserTest.class.getResourceAsStream("/test-documents/" + name));
+ }
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OldExcelParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OldExcelParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OldExcelParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OldExcelParserTest.java Fri May 29 14:36:21 2015
@@ -65,12 +65,12 @@ public class OldExcelParserTest extends
// We can get the content type
assertEquals("application/vnd.ms-excel.sheet.4", metadata.get(Metadata.CONTENT_TYPE));
-
+
// But no other metadata
assertEquals(null, metadata.get(TikaCoreProperties.TITLE));
assertEquals(null, metadata.get(Metadata.SUBJECT));
}
-
+
/**
* Check we can get the plain text properly
*/
@@ -85,9 +85,9 @@ public class OldExcelParserTest extends
} finally {
stream.close();
}
-
+
String text = handler.toString();
-
+
// Check we find a few words we expect in there
assertContains("Size", text);
assertContains("Returns", text);
@@ -104,15 +104,15 @@ public class OldExcelParserTest extends
public void testHTML() throws Exception {
XMLResult result = getXML(file);
String xml = result.xml;
-
+
// Sheet name not found - only 5+ have sheet names
assertNotContained("<p>Sheet 1</p>", xml);
-
+
// String cells
assertContains("<p>Table 10 -", xml);
assertContains("<p>Tax</p>", xml);
assertContains("<p>N/A</p>", xml);
-
+
// Number cells
assertContains("<p>(1)</p>", xml);
assertContains("<p>5.0</p>", xml);
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java Fri May 29 14:36:21 2015
@@ -21,15 +21,14 @@ import static org.junit.Assert.assertEqu
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
-import java.io.InputStream;
-import java.io.StringWriter;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
import javax.xml.transform.OutputKeys;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -74,7 +73,7 @@ public class OutlookParserTest {
assertEquals(
"L'\u00C9quipe Microsoft Outlook Express",
metadata.get(Metadata.AUTHOR));
-
+
// Stored as Thu, 5 Apr 2007 09:26:06 -0700
assertEquals(
"2007-04-05T16:26:06Z",
@@ -118,7 +117,7 @@ public class OutlookParserTest {
}
/**
- * Test case for TIKA-395, to ensure parser works for new Outlook formats.
+ * Test case for TIKA-395, to ensure parser works for new Outlook formats.
*
* @see <a href="https://issues.apache.org/jira/browse/TIKA-395">TIKA-395</a>
*/
@@ -148,29 +147,29 @@ public class OutlookParserTest {
assertContains("Streamlined Mail Experience", content);
assertContains("Navigation Pane", content);
}
-
+
@Test
public void testOutlookHTMLVersion() throws Exception {
Parser parser = new AutoDetectParser();
Metadata metadata = new Metadata();
-
+
// Check the HTML version
StringWriter sw = new StringWriter();
SAXTransformerFactory factory = (SAXTransformerFactory)
- SAXTransformerFactory.newInstance();
+ SAXTransformerFactory.newInstance();
TransformerHandler handler = factory.newTransformerHandler();
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
handler.setResult(new StreamResult(sw));
InputStream stream = OutlookParserTest.class.getResourceAsStream(
- "/test-documents/testMSG_chinese.msg");
+ "/test-documents/testMSG_chinese.msg");
try {
- parser.parse(stream, handler, metadata, new ParseContext());
+ parser.parse(stream, handler, metadata, new ParseContext());
} finally {
- stream.close();
+ stream.close();
}
-
+
// As the HTML version should have been processed, ensure
// we got some of the links
String content = sw.toString();
@@ -178,7 +177,7 @@ public class OutlookParserTest {
assertContains("<p>Alfresco MSG format testing", content);
assertContains("<li>1", content);
assertContains("<li>2", content);
-
+
// Make sure we don't have nested html docs
assertEquals(2, content.split("<body>").length);
assertEquals(2, content.split("<\\/body>").length);
@@ -188,39 +187,39 @@ public class OutlookParserTest {
public void testOutlookForwarded() throws Exception {
Parser parser = new AutoDetectParser();
Metadata metadata = new Metadata();
-
+
// Check the HTML version
StringWriter sw = new StringWriter();
SAXTransformerFactory factory = (SAXTransformerFactory)
- SAXTransformerFactory.newInstance();
+ SAXTransformerFactory.newInstance();
TransformerHandler handler = factory.newTransformerHandler();
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
handler.setResult(new StreamResult(sw));
InputStream stream = OutlookParserTest.class.getResourceAsStream(
- "/test-documents/testMSG_forwarded.msg");
+ "/test-documents/testMSG_forwarded.msg");
try {
- parser.parse(stream, handler, metadata, new ParseContext());
+ parser.parse(stream, handler, metadata, new ParseContext());
} finally {
- stream.close();
+ stream.close();
}
-
+
// Make sure we don't have nested docs
String content = sw.toString();
assertEquals(2, content.split("<body>").length);
assertEquals(2, content.split("<\\/body>").length);
}
-
+
@Test
public void testOutlookHTMLfromRTF() throws Exception {
Parser parser = new AutoDetectParser();
Metadata metadata = new Metadata();
-
+
// Check the HTML version
StringWriter sw = new StringWriter();
SAXTransformerFactory factory = (SAXTransformerFactory)
- SAXTransformerFactory.newInstance();
+ SAXTransformerFactory.newInstance();
TransformerHandler handler = factory.newTransformerHandler();
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
@@ -229,24 +228,24 @@ public class OutlookParserTest {
InputStream stream = OutlookParserTest.class.getResourceAsStream(
"/test-documents/test-outlook2003.msg");
try {
- parser.parse(stream, handler, metadata, new ParseContext());
+ parser.parse(stream, handler, metadata, new ParseContext());
} finally {
- stream.close();
+ stream.close();
}
-
+
// As the HTML version should have been processed, ensure
// we got some of the links
- String content = sw.toString().replaceAll("<p>\\s+","<p>");
+ String content = sw.toString().replaceAll("<p>\\s+", "<p>");
assertContains("<dd>New Outlook User</dd>", content);
assertContains("designed <i>to help you", content);
assertContains("<p><a href=\"http://r.office.microsoft.com/r/rlidOutlookWelcomeMail10?clid=1033\">Cached Exchange Mode</a>", content);
-
+
// Link - check text around it, and the link itself
assertContains("sign up for a free subscription", content);
assertContains("Office Newsletter", content);
assertContains("newsletter will be sent to you", content);
assertContains("http://r.office.microsoft.com/r/rlidNewsletterSignUp?clid=1033", content);
-
+
// Make sure we don't have nested html docs
assertEquals(2, content.split("<body>").length);
assertEquals(2, content.split("<\\/body>").length);