You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2015/05/29 16:36:24 UTC
svn commit: r1682489 [13/14] - in /tika/trunk:
tika-parsers/src/main/java/org/apache/tika/parser/html/
tika-parsers/src/main/java/org/apache/tika/parser/image/
tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/
tika-parsers/src/main/java/org/...
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Fri May 29 14:36:21 2015
@@ -16,11 +16,13 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
import javax.xml.transform.OutputKeys;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
-
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.PrintStream;
@@ -49,9 +51,6 @@ import org.junit.Ignore;
import org.junit.Test;
import org.xml.sax.ContentHandler;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
public class OOXMLParserTest extends TikaTest {
private Parser parser = new AutoDetectParser();
@@ -63,7 +62,7 @@ public class OOXMLParserTest extends Tik
@Test
public void testExcel() throws Exception {
- Metadata metadata = new Metadata();
+ Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
ParseContext context = new ParseContext();
context.set(Locale.class, Locale.US);
@@ -78,7 +77,7 @@ public class OOXMLParserTest extends Tik
assertEquals("Simple Excel document", metadata.get(TikaCoreProperties.TITLE));
assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
-
+
String content = handler.toString();
assertContains("Sample Excel Worksheet", content);
assertContains("Numbers and their Squares", content);
@@ -125,7 +124,7 @@ public class OOXMLParserTest extends Tik
// Percentage
assertContains("2.50%", content);
// Excel rounds up to 3%, but that requires Java 1.6 or later
- if(System.getProperty("java.version").startsWith("1.5")) {
+ if (System.getProperty("java.version").startsWith("1.5")) {
assertContains("2%", content);
} else {
assertContains("3%", content);
@@ -144,24 +143,24 @@ public class OOXMLParserTest extends Tik
// Fraction (2.5): # ?/?
assertContains("2 1/2", content);
-
+
// Below assertions represent outstanding formatting issues to be addressed
// they are included to allow the issues to be progressed with the Apache POI
// team - See TIKA-103.
/*************************************************************************
- // Date Format: m/d/yy
- assertContains("03/10/2009", content);
+ // Date Format: m/d/yy
+ assertContains("03/10/2009", content);
- // Date/Time Format
- assertContains("19/01/2008 04:35", content);
+ // Date/Time Format
+ assertContains("19/01/2008 04:35", content);
- // Custom Number (0 "dollars and" .00 "cents")
- assertContains("19 dollars and .99 cents", content);
+ // Custom Number (0 "dollars and" .00 "cents")
+ assertContains("19 dollars and .99 cents", content);
- // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy)
- assertContains("At 4:20 AM on Thursday May 17, 2007", content);
- **************************************************************************/
+ // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy)
+ assertContains("At 4:20 AM on Thursday May 17, 2007", content);
+ **************************************************************************/
} finally {
input.close();
}
@@ -170,7 +169,7 @@ public class OOXMLParserTest extends Tik
@Test
@Ignore("OOXML-Strict not currently supported by POI, see #57699")
public void testExcelStrict() throws Exception {
- Metadata metadata = new Metadata();
+ Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
ParseContext context = new ParseContext();
context.set(Locale.class, Locale.US);
@@ -185,7 +184,7 @@ public class OOXMLParserTest extends Tik
assertEquals("Sample Spreadsheet", metadata.get(TikaCoreProperties.TITLE));
assertEquals("Nick Burch", metadata.get(TikaCoreProperties.CREATOR));
assertEquals("Spreadsheet for testing", metadata.get(TikaCoreProperties.DESCRIPTION));
-
+
String content = handler.toString();
assertContains("Test spreadsheet", content);
assertContains("This one is red", content);
@@ -201,17 +200,17 @@ public class OOXMLParserTest extends Tik
/**
* We have a number of different powerpoint files,
- * such as presentation, macro-enabled etc
+ * such as presentation, macro-enabled etc
*/
@Test
public void testPowerPoint() throws Exception {
- String[] extensions = new String[] {
- "pptx", "pptm", "ppsm", "ppsx", "potm"
- //"thmx", // TIKA-418: Will be supported in POI 3.7 beta 2
- //"xps" // TIKA-418: Not yet supported by POI
- };
+ String[] extensions = new String[]{
+ "pptx", "pptm", "ppsm", "ppsx", "potm"
+ //"thmx", // TIKA-418: Will be supported in POI 3.7 beta 2
+ //"xps" // TIKA-418: Not yet supported by POI
+ };
- String[] mimeTypes = new String[] {
+ String[] mimeTypes = new String[]{
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/vnd.ms-powerpoint.presentation.macroenabled.12",
"application/vnd.ms-powerpoint.slideshow.macroenabled.12",
@@ -219,7 +218,7 @@ public class OOXMLParserTest extends Tik
"application/vnd.ms-powerpoint.template.macroenabled.12"
};
- for (int i=0; i<extensions.length; i++) {
+ for (int i = 0; i < extensions.length; i++) {
String extension = extensions[i];
String filename = "testPPT." + extension;
@@ -227,11 +226,11 @@ public class OOXMLParserTest extends Tik
Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
ParseContext context = new ParseContext();
-
+
InputStream input = getTestDocument(filename);
try {
parser.parse(input, handler, metadata, context);
-
+
assertEquals(
"Mime-type checking for " + filename,
mimeTypes[i],
@@ -239,31 +238,31 @@ public class OOXMLParserTest extends Tik
assertEquals("Attachment Test", metadata.get(TikaCoreProperties.TITLE));
assertEquals("Rajiv", metadata.get(TikaCoreProperties.CREATOR));
assertEquals("Rajiv", metadata.get(Metadata.AUTHOR));
-
+
String content = handler.toString();
// Theme files don't have the text in them
- if(extension.equals("thmx")) {
+ if (extension.equals("thmx")) {
assertEquals("", content);
} else {
assertTrue(
- "Text missing for " + filename + "\n" + content,
- content.contains("Attachment Test")
+ "Text missing for " + filename + "\n" + content,
+ content.contains("Attachment Test")
);
assertTrue(
- "Text missing for " + filename + "\n" + content,
- content.contains("This is a test file data with the same content")
+ "Text missing for " + filename + "\n" + content,
+ content.contains("This is a test file data with the same content")
);
assertTrue(
- "Text missing for " + filename + "\n" + content,
- content.contains("content parsing")
+ "Text missing for " + filename + "\n" + content,
+ content.contains("content parsing")
);
assertTrue(
- "Text missing for " + filename + "\n" + content,
- content.contains("Different words to test against")
+ "Text missing for " + filename + "\n" + content,
+ content.contains("Different words to test against")
);
assertTrue(
- "Text missing for " + filename + "\n" + content,
- content.contains("Mystery")
+ "Text missing for " + filename + "\n" + content,
+ content.contains("Mystery")
);
}
} finally {
@@ -271,20 +270,20 @@ public class OOXMLParserTest extends Tik
}
}
}
-
+
/**
* Test that the metadata is already extracted when the body is processed.
* See TIKA-1109
*/
@Test
public void testPowerPointMetadataEarly() throws Exception {
- String[] extensions = new String[] {
- "pptx", "pptm", "ppsm", "ppsx", "potm"
- //"thmx", // TIKA-418: Will be supported in POI 3.7 beta 2
- //"xps" // TIKA-418: Not yet supported by POI
- };
+ String[] extensions = new String[]{
+ "pptx", "pptm", "ppsm", "ppsx", "potm"
+ //"thmx", // TIKA-418: Will be supported in POI 3.7 beta 2
+ //"xps" // TIKA-418: Not yet supported by POI
+ };
- final String[] mimeTypes = new String[] {
+ final String[] mimeTypes = new String[]{
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/vnd.ms-powerpoint.presentation.macroenabled.12",
"application/vnd.ms-powerpoint.slideshow.macroenabled.12",
@@ -292,32 +291,30 @@ public class OOXMLParserTest extends Tik
"application/vnd.ms-powerpoint.template.macroenabled.12"
};
- for (int i=0; i<extensions.length; i++) {
+ for (int i = 0; i < extensions.length; i++) {
String extension = extensions[i];
final String filename = "testPPT." + extension;
Parser parser = new AutoDetectParser();
final Metadata metadata = new Metadata();
- // Allow the value to be access from the inner class
- final int currentI = i;
- ContentHandler handler = new BodyContentHandler()
- {
- public void startDocument ()
- {
- assertEquals(
- "Mime-type checking for " + filename,
- mimeTypes[currentI],
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Attachment Test", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Rajiv", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Rajiv", metadata.get(Metadata.AUTHOR));
+ // Allow the value to be accessed from the inner class
+ final int currentI = i;
+ ContentHandler handler = new BodyContentHandler() {
+ public void startDocument() {
+ assertEquals(
+ "Mime-type checking for " + filename,
+ mimeTypes[currentI],
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Attachment Test", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Rajiv", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Rajiv", metadata.get(Metadata.AUTHOR));
- }
+ }
- };
+ };
ParseContext context = new ParseContext();
-
+
InputStream input = getTestDocument(filename);
try {
parser.parse(input, handler, metadata, context);
@@ -326,48 +323,49 @@ public class OOXMLParserTest extends Tik
}
}
}
-
+
/**
* For the PowerPoint formats we don't currently support, ensure that
- * we don't break either
+ * we don't break either
*/
@Test
public void testUnsupportedPowerPoint() throws Exception {
- String[] extensions = new String[] { "xps", "thmx" };
- String[] mimeTypes = new String[] {
- "application/vnd.ms-xpsdocument",
- "application/vnd.openxmlformats-officedocument" // Is this right?
- };
-
- for (int i=0; i<extensions.length; i++) {
- String extension = extensions[i];
- String filename = "testPPT." + extension;
-
- Parser parser = new AutoDetectParser();
- Metadata metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
- ContentHandler handler = new BodyContentHandler();
- ParseContext context = new ParseContext();
-
- InputStream input = getTestDocument(filename);
- try {
- parser.parse(input, handler, metadata, context);
-
- // Should get the metadata
- assertEquals(
- "Mime-type checking for " + filename,
- mimeTypes[i],
- metadata.get(Metadata.CONTENT_TYPE));
+ String[] extensions = new String[]{"xps", "thmx"};
+ String[] mimeTypes = new String[]{
+ "application/vnd.ms-xpsdocument",
+ "application/vnd.openxmlformats-officedocument" // Is this right?
+ };
+
+ for (int i = 0; i < extensions.length; i++) {
+ String extension = extensions[i];
+ String filename = "testPPT." + extension;
+
+ Parser parser = new AutoDetectParser();
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
+ ContentHandler handler = new BodyContentHandler();
+ ParseContext context = new ParseContext();
+
+ InputStream input = getTestDocument(filename);
+ try {
+ parser.parse(input, handler, metadata, context);
- // But that's about it
- } finally {
- input.close();
- }
- }
+ // Should get the metadata
+ assertEquals(
+ "Mime-type checking for " + filename,
+ mimeTypes[i],
+ metadata.get(Metadata.CONTENT_TYPE));
+
+ // But that's about it
+ } finally {
+ input.close();
+ }
+ }
}
-
+
/**
* Test the plain text output of the Word converter
+ *
* @throws Exception
*/
@Test
@@ -393,6 +391,7 @@ public class OOXMLParserTest extends Tik
/**
* Test the plain text output of the Word converter
+ *
* @throws Exception
*/
@Test
@@ -415,69 +414,69 @@ public class OOXMLParserTest extends Tik
/**
* Test that the word converter is able to generate the
- * correct HTML for the document
+ * correct HTML for the document
*/
@Test
public void testWordHTML() throws Exception {
- XMLResult result = getXML("testWORD.docx");
- String xml = result.xml;
- Metadata metadata = result.metadata;
- assertEquals(
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
- assertTrue(xml.contains("Sample Word Document"));
-
- // Check that custom headings came through
- assertTrue(xml.contains("<h1 class=\"title\">"));
- // Regular headings
- assertTrue(xml.contains("<h1>Heading Level 1</h1>"));
- assertTrue(xml.contains("<h2>Heading Level 2</h2>"));
- // Headings with anchor tags in them
- assertTrue(xml.contains("<h3><a name=\"OnLevel3\" />Heading Level 3</h3>"));
- // Bold and italic
- assertTrue(xml.contains("<b>BOLD</b>"));
- assertTrue(xml.contains("<i>ITALIC</i>"));
- // Table
- assertTrue(xml.contains("<table>"));
- assertTrue(xml.contains("<td>"));
- // Links
- assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));
- // Anchor links
- assertTrue(xml.contains("<a href=\"#OnMainHeading\">The Main Heading Bookmark</a>"));
- // Paragraphs with other styles
- assertTrue(xml.contains("<p class=\"signature\">This one"));
-
- result = getXML("testWORD_3imgs.docx");
- xml = result.xml;
-
- // Images 2-4 (there is no 1!)
- assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image2.png\" alt=\"A description...\" />"));
- assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image3.jpeg\" alt=\"A description...\" />"));
- assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image4.png\" alt=\"A description...\" />"));
-
- // Text too
- assertTrue(xml.contains("<p>The end!</p>"));
-
- // TIKA-692: test document containing multiple
- // character runs within a bold tag:
- xml = getXML("testWORD_bold_character_runs.docx").xml;
-
- // Make sure bold text arrived as single
- // contiguous string even though Word parser
- // handled this as 3 character runs
- assertTrue("Bold text wasn't contiguous: "+xml, xml.contains("F<b>oob</b>a<b>r</b>"));
-
- // TIKA-692: test document containing multiple
- // character runs within a bold tag:
- xml = getXML("testWORD_bold_character_runs2.docx").xml;
-
- // Make sure bold text arrived as single
- // contiguous string even though Word parser
- // handled this as 3 character runs
- assertTrue("Bold text wasn't contiguous: "+xml, xml.contains("F<b>oob</b>a<b>r</b>"));
+ XMLResult result = getXML("testWORD.docx");
+ String xml = result.xml;
+ Metadata metadata = result.metadata;
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+ assertTrue(xml.contains("Sample Word Document"));
+
+ // Check that custom headings came through
+ assertTrue(xml.contains("<h1 class=\"title\">"));
+ // Regular headings
+ assertTrue(xml.contains("<h1>Heading Level 1</h1>"));
+ assertTrue(xml.contains("<h2>Heading Level 2</h2>"));
+ // Headings with anchor tags in them
+ assertTrue(xml.contains("<h3><a name=\"OnLevel3\" />Heading Level 3</h3>"));
+ // Bold and italic
+ assertTrue(xml.contains("<b>BOLD</b>"));
+ assertTrue(xml.contains("<i>ITALIC</i>"));
+ // Table
+ assertTrue(xml.contains("<table>"));
+ assertTrue(xml.contains("<td>"));
+ // Links
+ assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));
+ // Anchor links
+ assertTrue(xml.contains("<a href=\"#OnMainHeading\">The Main Heading Bookmark</a>"));
+ // Paragraphs with other styles
+ assertTrue(xml.contains("<p class=\"signature\">This one"));
+
+ result = getXML("testWORD_3imgs.docx");
+ xml = result.xml;
+
+ // Images 2-4 (there is no 1!)
+ assertTrue("Image not found in:\n" + xml, xml.contains("<img src=\"embedded:image2.png\" alt=\"A description...\" />"));
+ assertTrue("Image not found in:\n" + xml, xml.contains("<img src=\"embedded:image3.jpeg\" alt=\"A description...\" />"));
+ assertTrue("Image not found in:\n" + xml, xml.contains("<img src=\"embedded:image4.png\" alt=\"A description...\" />"));
+
+ // Text too
+ assertTrue(xml.contains("<p>The end!</p>"));
+
+ // TIKA-692: test document containing multiple
+ // character runs within a bold tag:
+ xml = getXML("testWORD_bold_character_runs.docx").xml;
+
+ // Make sure bold text arrived as single
+ // contiguous string even though Word parser
+ // handled this as 3 character runs
+ assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
+
+ // TIKA-692: test document containing multiple
+ // character runs within a bold tag:
+ xml = getXML("testWORD_bold_character_runs2.docx").xml;
+
+ // Make sure bold text arrived as single
+ // contiguous string even though Word parser
+ // handled this as 3 character runs
+ assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
}
/**
@@ -490,7 +489,7 @@ public class OOXMLParserTest extends Tik
StringWriter sw = new StringWriter();
SAXTransformerFactory factory = (SAXTransformerFactory)
- SAXTransformerFactory.newInstance();
+ SAXTransformerFactory.newInstance();
TransformerHandler handler = factory.newTransformerHandler();
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
@@ -512,7 +511,7 @@ public class OOXMLParserTest extends Tik
}
/**
- * Documents with some sheets are protected, but not all.
+ * Documents where some sheets are protected, but not all.
* See TIKA-364.
*/
@Test
@@ -539,7 +538,7 @@ public class OOXMLParserTest extends Tik
}
/**
- * An excel document which is password protected.
+ * An excel document which is password protected.
* See TIKA-437.
*/
@Test
@@ -559,7 +558,7 @@ public class OOXMLParserTest extends Tik
metadata.get(Metadata.CONTENT_TYPE));
assertEquals("true", metadata.get(TikaMetadataKeys.PROTECTED));
-
+
String content = handler.toString();
assertContains("Office", content);
} finally {
@@ -750,7 +749,7 @@ public class OOXMLParserTest extends Tik
/**
* TIKA-712 Master Slide Text from PPT and PPTX files
- * should be extracted too
+ * should be extracted too
*/
@Test
public void testMasterText() throws Exception {
@@ -807,151 +806,151 @@ public class OOXMLParserTest extends Tik
*/
@Test
public void testExcelCustomProperties() throws Exception {
- InputStream input = OOXMLParserTest.class.getResourceAsStream(
- "/test-documents/testEXCEL_custom_props.xlsx");
- Metadata metadata = new Metadata();
-
- try {
- ContentHandler handler = new BodyContentHandler(-1);
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- new OOXMLParser().parse(input, handler, metadata, context);
- } finally {
- input.close();
- }
-
- assertEquals(
- "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals(null, metadata.get(TikaCoreProperties.CREATOR));
- assertEquals(null, metadata.get(TikaCoreProperties.MODIFIER));
- assertEquals("2006-09-12T15:06:44Z", metadata.get(TikaCoreProperties.CREATED));
- assertEquals("2006-09-12T15:06:44Z", metadata.get(Metadata.CREATION_DATE));
- assertEquals("2011-08-22T14:24:38Z", metadata.get(Metadata.LAST_MODIFIED));
- assertEquals("2011-08-22T14:24:38Z", metadata.get(TikaCoreProperties.MODIFIED));
- assertEquals("2011-08-22T14:24:38Z", metadata.get(Metadata.DATE));
- assertEquals("Microsoft Excel", metadata.get(Metadata.APPLICATION_NAME));
- assertEquals("Microsoft Excel", metadata.get(OfficeOpenXMLExtended.APPLICATION));
- assertEquals("true", metadata.get("custom:myCustomBoolean"));
- assertEquals("3", metadata.get("custom:myCustomNumber"));
- assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
- assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
- assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
+ InputStream input = OOXMLParserTest.class.getResourceAsStream(
+ "/test-documents/testEXCEL_custom_props.xlsx");
+ Metadata metadata = new Metadata();
+
+ try {
+ ContentHandler handler = new BodyContentHandler(-1);
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+ new OOXMLParser().parse(input, handler, metadata, context);
+ } finally {
+ input.close();
+ }
+
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals(null, metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals(null, metadata.get(TikaCoreProperties.MODIFIER));
+ assertEquals("2006-09-12T15:06:44Z", metadata.get(TikaCoreProperties.CREATED));
+ assertEquals("2006-09-12T15:06:44Z", metadata.get(Metadata.CREATION_DATE));
+ assertEquals("2011-08-22T14:24:38Z", metadata.get(Metadata.LAST_MODIFIED));
+ assertEquals("2011-08-22T14:24:38Z", metadata.get(TikaCoreProperties.MODIFIED));
+ assertEquals("2011-08-22T14:24:38Z", metadata.get(Metadata.DATE));
+ assertEquals("Microsoft Excel", metadata.get(Metadata.APPLICATION_NAME));
+ assertEquals("Microsoft Excel", metadata.get(OfficeOpenXMLExtended.APPLICATION));
+ assertEquals("true", metadata.get("custom:myCustomBoolean"));
+ assertEquals("3", metadata.get("custom:myCustomNumber"));
+ assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
+ assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
+ assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
}
-
+
@Test
public void testWordCustomProperties() throws Exception {
- InputStream input = OOXMLParserTest.class.getResourceAsStream(
- "/test-documents/testWORD_custom_props.docx");
- Metadata metadata = new Metadata();
-
- try {
- ContentHandler handler = new BodyContentHandler(-1);
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- new OOXMLParser().parse(input, handler, metadata, context);
- } finally {
- input.close();
- }
-
- assertEquals(
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("EJ04325S", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Etienne Jouvin", metadata.get(TikaCoreProperties.MODIFIER));
- assertEquals("Etienne Jouvin", metadata.get(Metadata.LAST_AUTHOR));
- assertEquals("2011-07-29T16:52:00Z", metadata.get(TikaCoreProperties.CREATED));
- assertEquals("2011-07-29T16:52:00Z", metadata.get(Metadata.CREATION_DATE));
- assertEquals("2012-01-03T22:14:00Z", metadata.get(TikaCoreProperties.MODIFIED));
- assertEquals("2012-01-03T22:14:00Z", metadata.get(Metadata.DATE));
- assertEquals("Microsoft Office Word",metadata.get(Metadata.APPLICATION_NAME));
- assertEquals("Microsoft Office Word",metadata.get(OfficeOpenXMLExtended.APPLICATION));
- assertEquals("1", metadata.get(Office.PAGE_COUNT));
- assertEquals("2", metadata.get(Office.WORD_COUNT));
- assertEquals("My Title", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("My Keyword", metadata.get(TikaCoreProperties.KEYWORDS));
- assertEquals("Normal.dotm", metadata.get(Metadata.TEMPLATE));
- assertEquals("Normal.dotm", metadata.get(OfficeOpenXMLExtended.TEMPLATE));
- // TODO: Remove subject in Tika 2.0
- assertEquals("My subject", metadata.get(Metadata.SUBJECT));
- assertEquals("My subject", metadata.get(OfficeOpenXMLCore.SUBJECT));
- assertEquals("EDF-DIT", metadata.get(TikaCoreProperties.PUBLISHER));
- assertEquals("true", metadata.get("custom:myCustomBoolean"));
- assertEquals("3", metadata.get("custom:myCustomNumber"));
- assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
- assertEquals("2010-12-30T23:00:00Z", metadata.get("custom:MyCustomDate"));
- assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
+ InputStream input = OOXMLParserTest.class.getResourceAsStream(
+ "/test-documents/testWORD_custom_props.docx");
+ Metadata metadata = new Metadata();
+
+ try {
+ ContentHandler handler = new BodyContentHandler(-1);
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+ new OOXMLParser().parse(input, handler, metadata, context);
+ } finally {
+ input.close();
+ }
+
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("EJ04325S", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Etienne Jouvin", metadata.get(TikaCoreProperties.MODIFIER));
+ assertEquals("Etienne Jouvin", metadata.get(Metadata.LAST_AUTHOR));
+ assertEquals("2011-07-29T16:52:00Z", metadata.get(TikaCoreProperties.CREATED));
+ assertEquals("2011-07-29T16:52:00Z", metadata.get(Metadata.CREATION_DATE));
+ assertEquals("2012-01-03T22:14:00Z", metadata.get(TikaCoreProperties.MODIFIED));
+ assertEquals("2012-01-03T22:14:00Z", metadata.get(Metadata.DATE));
+ assertEquals("Microsoft Office Word", metadata.get(Metadata.APPLICATION_NAME));
+ assertEquals("Microsoft Office Word", metadata.get(OfficeOpenXMLExtended.APPLICATION));
+ assertEquals("1", metadata.get(Office.PAGE_COUNT));
+ assertEquals("2", metadata.get(Office.WORD_COUNT));
+ assertEquals("My Title", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("My Keyword", metadata.get(TikaCoreProperties.KEYWORDS));
+ assertEquals("Normal.dotm", metadata.get(Metadata.TEMPLATE));
+ assertEquals("Normal.dotm", metadata.get(OfficeOpenXMLExtended.TEMPLATE));
+ // TODO: Remove subject in Tika 2.0
+ assertEquals("My subject", metadata.get(Metadata.SUBJECT));
+ assertEquals("My subject", metadata.get(OfficeOpenXMLCore.SUBJECT));
+ assertEquals("EDF-DIT", metadata.get(TikaCoreProperties.PUBLISHER));
+ assertEquals("true", metadata.get("custom:myCustomBoolean"));
+ assertEquals("3", metadata.get("custom:myCustomNumber"));
+ assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
+ assertEquals("2010-12-30T23:00:00Z", metadata.get("custom:MyCustomDate"));
+ assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
}
-
+
@Test
public void testPowerPointCustomProperties() throws Exception {
- InputStream input = OOXMLParserTest.class.getResourceAsStream(
- "/test-documents/testPPT_custom_props.pptx");
- Metadata metadata = new Metadata();
-
- try {
- ContentHandler handler = new BodyContentHandler(-1);
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- new OOXMLParser().parse(input, handler, metadata, context);
- } finally {
- input.close();
- }
-
- assertEquals(
- "application/vnd.openxmlformats-officedocument.presentationml.presentation",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("JOUVIN ETIENNE", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("EJ04325S", metadata.get(TikaCoreProperties.MODIFIER));
- assertEquals("EJ04325S", metadata.get(Metadata.LAST_AUTHOR));
- assertEquals("2011-08-22T13:30:53Z", metadata.get(TikaCoreProperties.CREATED));
- assertEquals("2011-08-22T13:30:53Z", metadata.get(Metadata.CREATION_DATE));
- assertEquals("2011-08-22T13:32:49Z", metadata.get(TikaCoreProperties.MODIFIED));
- assertEquals("2011-08-22T13:32:49Z", metadata.get(Metadata.DATE));
- assertEquals("1", metadata.get(Office.SLIDE_COUNT));
- assertEquals("3", metadata.get(Office.WORD_COUNT));
- assertEquals("Test extraction properties pptx", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("true", metadata.get("custom:myCustomBoolean"));
- assertEquals("3", metadata.get("custom:myCustomNumber"));
- assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
- assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
- assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
+ InputStream input = OOXMLParserTest.class.getResourceAsStream(
+ "/test-documents/testPPT_custom_props.pptx");
+ Metadata metadata = new Metadata();
+
+ try {
+ ContentHandler handler = new BodyContentHandler(-1);
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+ new OOXMLParser().parse(input, handler, metadata, context);
+ } finally {
+ input.close();
+ }
+
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("JOUVIN ETIENNE", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("EJ04325S", metadata.get(TikaCoreProperties.MODIFIER));
+ assertEquals("EJ04325S", metadata.get(Metadata.LAST_AUTHOR));
+ assertEquals("2011-08-22T13:30:53Z", metadata.get(TikaCoreProperties.CREATED));
+ assertEquals("2011-08-22T13:30:53Z", metadata.get(Metadata.CREATION_DATE));
+ assertEquals("2011-08-22T13:32:49Z", metadata.get(TikaCoreProperties.MODIFIED));
+ assertEquals("2011-08-22T13:32:49Z", metadata.get(Metadata.DATE));
+ assertEquals("1", metadata.get(Office.SLIDE_COUNT));
+ assertEquals("3", metadata.get(Office.WORD_COUNT));
+ assertEquals("Test extraction properties pptx", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("true", metadata.get("custom:myCustomBoolean"));
+ assertEquals("3", metadata.get("custom:myCustomNumber"));
+ assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
+ assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
+ assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
}
// TIKA-989:
@Test
public void testEmbeddedPDF() throws Exception {
- InputStream input = OOXMLParserTest.class.getResourceAsStream(
- "/test-documents/testWORD_embedded_pdf.docx");
- Metadata metadata = new Metadata();
- StringWriter sw = new StringWriter();
- SAXTransformerFactory factory = (SAXTransformerFactory)
+ InputStream input = OOXMLParserTest.class.getResourceAsStream(
+ "/test-documents/testWORD_embedded_pdf.docx");
+ Metadata metadata = new Metadata();
+ StringWriter sw = new StringWriter();
+ SAXTransformerFactory factory = (SAXTransformerFactory)
SAXTransformerFactory.newInstance();
- TransformerHandler handler = factory.newTransformerHandler();
- handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
- handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
- handler.setResult(new StreamResult(sw));
-
- try {
- new OOXMLParser().parse(input, handler, metadata, new ParseContext());
- } finally {
- input.close();
- }
- String xml = sw.toString();
- int i = xml.indexOf("Here is the pdf file:");
- int j = xml.indexOf("<div class=\"embedded\" id=\"rId5\"/>");
- int k = xml.indexOf("Bye Bye");
- int l = xml.indexOf("<div class=\"embedded\" id=\"rId6\"/>");
- int m = xml.indexOf("Bye for real.");
- assertTrue(i != -1);
- assertTrue(j != -1);
- assertTrue(k != -1);
- assertTrue(l != -1);
- assertTrue(m != -1);
- assertTrue(i < j);
- assertTrue(j < k);
- assertTrue(k < l);
- assertTrue(l < m);
+ TransformerHandler handler = factory.newTransformerHandler();
+ handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+ handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
+ handler.setResult(new StreamResult(sw));
+
+ try {
+ new OOXMLParser().parse(input, handler, metadata, new ParseContext());
+ } finally {
+ input.close();
+ }
+ String xml = sw.toString();
+ int i = xml.indexOf("Here is the pdf file:");
+ int j = xml.indexOf("<div class=\"embedded\" id=\"rId5\"/>");
+ int k = xml.indexOf("Bye Bye");
+ int l = xml.indexOf("<div class=\"embedded\" id=\"rId6\"/>");
+ int m = xml.indexOf("Bye for real.");
+ assertTrue(i != -1);
+ assertTrue(j != -1);
+ assertTrue(k != -1);
+ assertTrue(l != -1);
+ assertTrue(m != -1);
+ assertTrue(i < j);
+ assertTrue(j < k);
+ assertTrue(k < l);
+ assertTrue(l < m);
}
// TIKA-997:
@@ -970,35 +969,35 @@ public class OOXMLParserTest extends Tik
assertTrue(i < j);
assertTrue(j < k);
}
-
+
// TIKA-1006
@Test
public void testWordNullStyle() throws Exception {
- String xml = getXML("testWORD_null_style.docx").xml;
- assertContains("Test av styrt dokument", xml);
+ String xml = getXML("testWORD_null_style.docx").xml;
+ assertContains("Test av styrt dokument", xml);
}
/**
* TIKA-1044 - Handle word documents where parts of the
- * text have no formatting or styles applied to them
+ * text have no formatting or styles applied to them
*/
@Test
public void testNoFormat() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
- InputStream stream = WordParserTest.class.getResourceAsStream(
- "/test-documents/testWORD_no_format.docx");
- try {
- new OOXMLParser().parse(stream, handler, metadata, new ParseContext());
- } finally {
- stream.close();
- }
+ InputStream stream = WordParserTest.class.getResourceAsStream(
+ "/test-documents/testWORD_no_format.docx");
+ try {
+ new OOXMLParser().parse(stream, handler, metadata, new ParseContext());
+ } finally {
+ stream.close();
+ }
- String content = handler.toString();
- assertContains("This is a piece of text that causes an exception", content);
+ String content = handler.toString();
+ assertContains("This is a piece of text that causes an exception", content);
}
-
+
// TIKA-1005:
@Test
public void testTextInsideTextBox() throws Exception {
@@ -1013,12 +1012,12 @@ public class OOXMLParserTest extends Tik
@Test
public void testEmbeddedPPTXTwoSlides() throws Exception {
String xml = getXML("testPPT_embedded_two_slides.pptx").xml;
- assertContains("<div class=\"embedded\" id=\"slide1_rId7\" />" , xml);
- assertContains("<div class=\"embedded\" id=\"slide2_rId7\" />" , xml);
+ assertContains("<div class=\"embedded\" id=\"slide1_rId7\" />", xml);
+ assertContains("<div class=\"embedded\" id=\"slide2_rId7\" />", xml);
}
-
+
/**
- * Test for missing text described in
+ * Test for missing text described in
* <a href="https://issues.apache.org/jira/browse/TIKA-1130">TIKA-1130</a>.
* and TIKA-1317
*/
@@ -1045,37 +1044,37 @@ public class OOXMLParserTest extends Tik
//TIKA-1100:
@Test
public void testExcelTextBox() throws Exception {
- Metadata metadata = new Metadata();
+ Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
ParseContext context = new ParseContext();
InputStream input = getTestDocument("testEXCEL_textbox.xlsx");
parser.parse(input, handler, metadata, context);
String content = handler.toString();
- assertContains("some autoshape", content);
- }
+ assertContains("some autoshape", content);
+ }
//TIKA-792; with room for future missing bean tests
@Test
- public void testWordMissingOOXMLBeans() throws Exception{
+ public void testWordMissingOOXMLBeans() throws Exception {
//If a bean is missing, POI prints stack trace to stderr
String[] fileNames = new String[]{
- "testWORD_missing_ooxml_bean1.docx",//TIKA-792
+ "testWORD_missing_ooxml_bean1.docx",//TIKA-792
};
PrintStream origErr = System.err;
- for (String fileName : fileNames){
- Metadata metadata = new Metadata();
+ for (String fileName : fileNames) {
+ Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
ParseContext context = new ParseContext();
InputStream input = getTestDocument(fileName);
-
+
//grab stderr
ByteArrayOutputStream errContent = new ByteArrayOutputStream();
System.setErr(new PrintStream(errContent, true, IOUtils.UTF_8.name()));
parser.parse(input, handler, metadata, context);
-
+
//return stderr
System.setErr(origErr);
-
+
String err = errContent.toString(IOUtils.UTF_8.name());
assertTrue(err.length() == 0);
input.close();
@@ -1089,39 +1088,39 @@ public class OOXMLParserTest extends Tik
//not the auto-generated date.
XMLResult result = getXML("testPPT_autodate.pptx");
- assertContains("<p>Now</p>\n"+
- "<p>2011-12-19 10:20:04 AM</p>\n", result.xml);
-
+ assertContains("<p>Now</p>\n" +
+ "<p>2011-12-19 10:20:04 AM</p>\n", result.xml);
+
}
-
+
@Test
public void testDOCXThumbnail() throws Exception {
String xml = getXML("testDOCX_Thumbnail.docx").xml;
int a = xml.indexOf("This file contains a thumbnail");
int b = xml.indexOf("<div class=\"embedded\" id=\"/docProps/thumbnail.emf\" />");
-
+
assertTrue(a != -1);
assertTrue(b != -1);
assertTrue(a < b);
}
-
+
@Test
public void testXLSXThumbnail() throws Exception {
String xml = getXML("testXLSX_Thumbnail.xlsx").xml;
int a = xml.indexOf("This file contains an embedded thumbnail by default");
int b = xml.indexOf("<div class=\"embedded\" id=\"/docProps/thumbnail.wmf\" />");
-
+
assertTrue(a != -1);
assertTrue(b != -1);
assertTrue(a < b);
}
-
+
@Test
public void testPPTXThumbnail() throws Exception {
String xml = getXML("testPPTX_Thumbnail.pptx").xml;
int a = xml.indexOf("<body><p>This file contains an embedded thumbnail</p>");
int b = xml.indexOf("<div class=\"embedded\" id=\"/docProps/thumbnail.jpeg\" />");
-
+
assertTrue(a != -1);
assertTrue(b != -1);
assertTrue(a < b);
@@ -1171,7 +1170,7 @@ public class OOXMLParserTest extends Tik
parser.parse(is, handler, m, context);
} catch (EncryptedDocumentException ex) {
exc = true;
- } finally {
+ } finally {
is.close();
}
assertTrue(exc);
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java Fri May 29 14:36:21 2015
@@ -28,7 +28,7 @@ import org.junit.Test;
public class AccessCheckerTest {
@Test
- public void testLegacy() throws AccessPermissionException{
+ public void testLegacy() throws AccessPermissionException {
Metadata m = getMetadata(false, false);
//legacy behavior; don't bother checking
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Fri May 29 14:36:21 2015
@@ -59,12 +59,13 @@ import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import org.xml.sax.ContentHandler;
+
/**
* Test case for parsing pdf files.
*/
public class PDFParserTest extends TikaTest {
- public static final MediaType TYPE_TEXT = MediaType.TEXT_PLAIN;
+ public static final MediaType TYPE_TEXT = MediaType.TEXT_PLAIN;
public static final MediaType TYPE_EMF = MediaType.application("x-emf");
public static final MediaType TYPE_PDF = MediaType.application("pdf");
public static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
@@ -84,6 +85,21 @@ public class PDFParserTest extends TikaT
Logger.getLogger("org.apache.pdfbox").setLevel(PDFBOX_LOG_LEVEL);
}
+ private static int substringCount(String needle, String haystack) {
+ int upto = -1;
+ int count = 0;
+ while (true) {
+ final int next = haystack.indexOf(needle, upto);
+ if (next == -1) {
+ break;
+ }
+ count++;
+ upto = next + 1;
+ }
+
+ return count;
+ }
+
@Test
public void testPdfParsing() throws Exception {
Parser parser = new AutoDetectParser(); // Should auto-detect!
@@ -99,8 +115,8 @@ public class PDFParserTest extends TikaT
assertEquals("Bertrand Delacr\u00e9taz", metadata.get(Metadata.AUTHOR));
assertEquals("Firefox", metadata.get(TikaCoreProperties.CREATOR_TOOL));
assertEquals("Apache Tika - Apache Tika", metadata.get(TikaCoreProperties.TITLE));
-
- // Can't reliably test dates yet - see TIKA-451
+
+ // Can't reliably test dates yet - see TIKA-451
// assertEquals("Sat Sep 15 10:02:31 BST 2007", metadata.get(Metadata.CREATION_DATE));
// assertEquals("Sat Sep 15 10:02:31 BST 2007", metadata.get(Metadata.LAST_MODIFIED));
@@ -109,12 +125,12 @@ public class PDFParserTest extends TikaT
assertContains("incubator", content);
assertContains("Apache Software Foundation", content);
// testing how the end of one paragraph is separated from start of the next one
- assertTrue("should have word boundary after headline",
+ assertTrue("should have word boundary after headline",
!content.contains("ToolkitApache"));
- assertTrue("should have word boundary between paragraphs",
+ assertTrue("should have word boundary between paragraphs",
!content.contains("libraries.Apache"));
}
-
+
@Test
public void testPdfParsingMetadataOnly() throws Exception {
Parser parser = new AutoDetectParser(); // Should auto-detect!
@@ -149,80 +165,80 @@ public class PDFParserTest extends TikaT
assertEquals("Document author", metadata.get(TikaCoreProperties.CREATOR));
assertEquals("Document author", metadata.get(Metadata.AUTHOR));
assertEquals("Document title", metadata.get(TikaCoreProperties.TITLE));
-
+
assertEquals("Custom Value", metadata.get("Custom Property"));
-
+
assertEquals("Array Entry 1", metadata.get("Custom Array"));
assertEquals(2, metadata.getValues("Custom Array").length);
assertEquals("Array Entry 1", metadata.getValues("Custom Array")[0]);
assertEquals("Array Entry 2", metadata.getValues("Custom Array")[1]);
-
+
assertContains("Hello World!", content);
}
-
+
/**
* PDFs can be "protected" with the default password. This means
- * they're encrypted (potentially both text and metadata),
- * but we can decrypt them easily.
+ * they're encrypted (potentially both text and metadata),
+ * but we can decrypt them easily.
*/
@Test
public void testProtectedPDF() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ ParseContext context = new ParseContext();
- InputStream stream = PDFParserTest.class.getResourceAsStream(
- "/test-documents/testPDF_protected.pdf");
- try {
- parser.parse(stream, handler, metadata, context);
- } finally {
- stream.close();
- }
-
- assertEquals("true", metadata.get("pdf:encrypted"));
- assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("The Bank of England", metadata.get(Metadata.AUTHOR));
- assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT));
- assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT));
- assertEquals("Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", metadata.get(TikaCoreProperties.TITLE));
-
- String content = handler.toString();
- assertContains("RETHINKING THE FINANCIAL NETWORK", content);
- assertContains("On 16 November 2002", content);
- assertContains("In many important respects", content);
-
-
- // Try again with an explicit empty password
- handler = new BodyContentHandler();
- metadata = new Metadata();
-
- context = new ParseContext();
- context.set(PasswordProvider.class, new PasswordProvider() {
- public String getPassword(Metadata metadata) {
- return "";
- }
- });
-
- stream = PDFParserTest.class.getResourceAsStream(
- "/test-documents/testPDF_protected.pdf");
- try {
- parser.parse(stream, handler, metadata, context);
- } finally {
- stream.close();
- }
- assertEquals("true", metadata.get("pdf:encrypted"));
-
- assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT));
- assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT));
- assertEquals("Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", metadata.get(TikaCoreProperties.TITLE));
-
- assertContains("RETHINKING THE FINANCIAL NETWORK", content);
- assertContains("On 16 November 2002", content);
- assertContains("In many important respects", content);
+ InputStream stream = PDFParserTest.class.getResourceAsStream(
+ "/test-documents/testPDF_protected.pdf");
+ try {
+ parser.parse(stream, handler, metadata, context);
+ } finally {
+ stream.close();
+ }
+
+ assertEquals("true", metadata.get("pdf:encrypted"));
+ assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("The Bank of England", metadata.get(Metadata.AUTHOR));
+ assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT));
+ assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT));
+ assertEquals("Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", metadata.get(TikaCoreProperties.TITLE));
+
+ String content = handler.toString();
+ assertContains("RETHINKING THE FINANCIAL NETWORK", content);
+ assertContains("On 16 November 2002", content);
+ assertContains("In many important respects", content);
+
+
+ // Try again with an explicit empty password
+ handler = new BodyContentHandler();
+ metadata = new Metadata();
+
+ context = new ParseContext();
+ context.set(PasswordProvider.class, new PasswordProvider() {
+ public String getPassword(Metadata metadata) {
+ return "";
+ }
+ });
+
+ stream = PDFParserTest.class.getResourceAsStream(
+ "/test-documents/testPDF_protected.pdf");
+ try {
+ parser.parse(stream, handler, metadata, context);
+ } finally {
+ stream.close();
+ }
+ assertEquals("true", metadata.get("pdf:encrypted"));
+
+ assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT));
+ assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT));
+ assertEquals("Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", metadata.get(TikaCoreProperties.TITLE));
+
+ assertContains("RETHINKING THE FINANCIAL NETWORK", content);
+ assertContains("On 16 November 2002", content);
+ assertContains("In many important respects", content);
//now test wrong password
handler = new BodyContentHandler();
@@ -292,7 +308,7 @@ public class PDFParserTest extends TikaT
InputStream stream = PDFParserTest.class.getResourceAsStream(
"/test-documents/testPDFTwoTextBoxes.pdf");
String content = getText(stream, parser);
- content = content.replaceAll("\\s+"," ");
+ content = content.replaceAll("\\s+", " ");
assertContains("Left column line 1 Left column line 2 Right column line 1 Right column line 2", content);
}
@@ -367,7 +383,7 @@ public class PDFParserTest extends TikaT
Parser parser = new AutoDetectParser(); // Should auto-detect!
InputStream stream = getResourceAsStream("/test-documents/testAnnotations.pdf");
String content = getText(stream, parser);
- content = content.replaceAll("[\\s\u00a0]+"," ");
+ content = content.replaceAll("[\\s\u00a0]+", " ");
assertContains("Here is some text", content);
assertContains("Here is a comment", content);
@@ -376,7 +392,7 @@ public class PDFParserTest extends TikaT
pdfParser.getPDFParserConfig().setExtractAnnotationText(false);
stream = getResourceAsStream("/test-documents/testAnnotations.pdf");
content = getText(stream, pdfParser);
- content = content.replaceAll("[\\s\u00a0]+"," ");
+ content = content.replaceAll("[\\s\u00a0]+", " ");
assertContains("Here is some text", content);
assertEquals(-1, content.indexOf("Here is a comment"));
@@ -387,15 +403,15 @@ public class PDFParserTest extends TikaT
context.set(PDFParserConfig.class, config);
stream = getResourceAsStream("/test-documents/testAnnotations.pdf");
content = getText(stream, parser, context);
- content = content.replaceAll("[\\s\u00a0]+"," ");
+ content = content.replaceAll("[\\s\u00a0]+", " ");
assertContains("Here is some text", content);
assertEquals(-1, content.indexOf("Here is a comment"));
-
-
+
+
// TIKA-738: make sure no extra </p> tags
String xml = getXML("testAnnotations.pdf").xml;
assertEquals(substringCount("<p>", xml),
- substringCount("</p>", xml));
+ substringCount("</p>", xml));
}
// TIKA-981
@@ -415,35 +431,20 @@ public class PDFParserTest extends TikaT
assertContains("PDF2", xml);
}
- private static int substringCount(String needle, String haystack) {
- int upto = -1;
- int count = 0;
- while(true) {
- final int next = haystack.indexOf(needle, upto);
- if (next == -1) {
- break;
- }
- count++;
- upto = next+1;
- }
-
- return count;
- }
-
@Test
public void testPageNumber() throws Exception {
final XMLResult result = getXML("testPageNumber.pdf");
- final String content = result.xml.replaceAll("\\s+","");
+ final String content = result.xml.replaceAll("\\s+", "");
assertContains("<p>1</p>", content);
}
/**
* Test to ensure that Links are extracted from the text
- *
+ * <p/>
* Note - the PDF contains the text "This is a hyperlink" which
- * a hyperlink annotation, linking to the tika site, on it. This
- * test will need updating when we're able to apply the annotation
- * to the text itself, rather than following on afterwards as now
+ * a hyperlink annotation, linking to the tika site, on it. This
+ * test will need updating when we're able to apply the annotation
+ * to the text itself, rather than following on afterwards as now
*/
@Test
public void testLinks() throws Exception {
@@ -457,19 +458,19 @@ public class PDFParserTest extends TikaT
parser.getPDFParserConfig().setEnableAutoSpace(false);
InputStream stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
String content = getText(stream, parser);
- content = content.replaceAll("[\\s\u00a0]+"," ");
+ content = content.replaceAll("[\\s\u00a0]+", " ");
// Text is correct when autoSpace is off:
assertContains("Here is some formatted text", content);
parser.getPDFParserConfig().setEnableAutoSpace(true);
stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
content = getText(stream, parser);
- content = content.replaceAll("[\\s\u00a0]+"," ");
+ content = content.replaceAll("[\\s\u00a0]+", " ");
// Text is correct when autoSpace is off:
// Text has extra spaces when autoSpace is on
assertEquals(-1, content.indexOf("Here is some formatted text"));
-
+
//now try with autodetect
Parser autoParser = new AutoDetectParser();
ParseContext context = new ParseContext();
@@ -478,18 +479,18 @@ public class PDFParserTest extends TikaT
//default is true
stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
content = getText(stream, autoParser, context);
- content = content.replaceAll("[\\s\u00a0]+"," ");
+ content = content.replaceAll("[\\s\u00a0]+", " ");
// Text has extra spaces when autoSpace is on
assertEquals(-1, content.indexOf("Here is some formatted text"));
config.setEnableAutoSpace(false);
-
+
stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
content = getText(stream, parser, context);
- content = content.replaceAll("[\\s\u00a0]+"," ");
+ content = content.replaceAll("[\\s\u00a0]+", " ");
// Text is correct when autoSpace is off:
assertContains("Here is some formatted text", content);
-
+
}
@Test
@@ -505,7 +506,7 @@ public class PDFParserTest extends TikaT
content = getText(stream, parser);
// "Text the first" was dedup'd:
assertContains("Text the first timesecond time", content);
-
+
//now try with autodetect
Parser autoParser = new AutoDetectParser();
ParseContext context = new ParseContext();
@@ -540,7 +541,7 @@ public class PDFParserTest extends TikaT
content = content.replaceAll("\\s+", " ");
// Column text is now interleaved:
assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", content);
-
+
//now try setting autodetect via parsecontext
AutoDetectParser autoParser = new AutoDetectParser();
ParseContext context = new ParseContext();
@@ -551,7 +552,7 @@ public class PDFParserTest extends TikaT
content = getText(stream, autoParser, context);
content = content.replaceAll("\\s+", " ");
assertContains("Left column line 1 Left column line 2 Right column line 1 Right column line 2", content);
-
+
config.setSortByPosition(true);
context.set(PDFParserConfig.class, config);
stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
@@ -559,7 +560,7 @@ public class PDFParserTest extends TikaT
content = content.replaceAll("\\s+", " ");
// Column text is now interleaved:
assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", content);
-
+
}
// TIKA-1035
@@ -572,7 +573,7 @@ public class PDFParserTest extends TikaT
assertTrue(j != -1);
assertTrue(i < j);
}
-
+
//TIKA-1124
@Test
public void testEmbeddedPDFEmbeddingAnotherDocument() throws Exception {
@@ -580,57 +581,57 @@ public class PDFParserTest extends TikaT
docx/
pdf/
docx
- */
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
- String content = "";
- InputStream stream = null;
- try {
- context.set(org.apache.tika.parser.Parser.class, parser);
- stream = getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx");
- parser.parse(stream, handler, metadata, context);
- content = handler.toString();
- } finally {
- stream.close();
- }
- int outerHaystack = content.indexOf("Outer_haystack");
- int pdfHaystack = content.indexOf("pdf_haystack");
- int needle = content.indexOf("Needle");
- assertTrue(outerHaystack > -1);
- assertTrue(pdfHaystack > -1);
- assertTrue(needle > -1);
- assertTrue(needle > pdfHaystack && pdfHaystack > outerHaystack);
-
- TrackingHandler tracker = new TrackingHandler();
- TikaInputStream tis;
- ContainerExtractor ex = new ParserContainerExtractor();
- try{
- tis= TikaInputStream.get(getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx"));
- ex.extract(tis, ex, tracker);
- } finally {
- stream.close();
- }
- assertEquals(true, ex.isSupported(tis));
- assertEquals(3, tracker.filenames.size());
- assertEquals(3, tracker.mediaTypes.size());
- assertEquals("image1.emf", tracker.filenames.get(0));
- assertNull(tracker.filenames.get(1));
- assertEquals("Test.docx", tracker.filenames.get(2));
- assertEquals(TYPE_EMF, tracker.mediaTypes.get(0));
- assertEquals(TYPE_PDF, tracker.mediaTypes.get(1));
- assertEquals(TYPE_DOCX, tracker.mediaTypes.get(2));
- }
+ */
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ ParseContext context = new ParseContext();
+ String content = "";
+ InputStream stream = null;
+ try {
+ context.set(org.apache.tika.parser.Parser.class, parser);
+ stream = getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx");
+ parser.parse(stream, handler, metadata, context);
+ content = handler.toString();
+ } finally {
+ stream.close();
+ }
+ int outerHaystack = content.indexOf("Outer_haystack");
+ int pdfHaystack = content.indexOf("pdf_haystack");
+ int needle = content.indexOf("Needle");
+ assertTrue(outerHaystack > -1);
+ assertTrue(pdfHaystack > -1);
+ assertTrue(needle > -1);
+ assertTrue(needle > pdfHaystack && pdfHaystack > outerHaystack);
+
+ TrackingHandler tracker = new TrackingHandler();
+ TikaInputStream tis;
+ ContainerExtractor ex = new ParserContainerExtractor();
+ try {
+ tis = TikaInputStream.get(getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx"));
+ ex.extract(tis, ex, tracker);
+ } finally {
+ stream.close();
+ }
+ assertEquals(true, ex.isSupported(tis));
+ assertEquals(3, tracker.filenames.size());
+ assertEquals(3, tracker.mediaTypes.size());
+ assertEquals("image1.emf", tracker.filenames.get(0));
+ assertNull(tracker.filenames.get(1));
+ assertEquals("Test.docx", tracker.filenames.get(2));
+ assertEquals(TYPE_EMF, tracker.mediaTypes.get(0));
+ assertEquals(TYPE_PDF, tracker.mediaTypes.get(1));
+ assertEquals(TYPE_DOCX, tracker.mediaTypes.get(2));
+ }
/**
* tests for equality between traditional sequential parser
* and newer nonsequential parser.
- *
+ * <p/>
* TODO: more testing
*/
@Test
- public void testSequentialParser() throws Exception{
+ public void testSequentialParser() throws Exception {
Parser sequentialParser = new AutoDetectParser();
Parser nonSequentialParser = new AutoDetectParser();
@@ -659,14 +660,14 @@ public class PDFParserTest extends TikaT
Set<String> knownContentDiffs = new HashSet<String>();
for (File f : testDocs.listFiles()) {
- if (! f.getName().toLowerCase(Locale.ROOT).endsWith(".pdf")) {
+ if (!f.getName().toLowerCase(Locale.ROOT).endsWith(".pdf")) {
continue;
}
String sequentialContent = null;
Metadata sequentialMetadata = new Metadata();
try {
- sequentialContent = getText(new FileInputStream(f),
+ sequentialContent = getText(new FileInputStream(f),
sequentialParser, seqContext, sequentialMetadata);
} catch (EncryptedDocumentException e) {
//silently skip a file that requires a user password
@@ -680,8 +681,8 @@ public class PDFParserTest extends TikaT
String nonSequentialContent = null;
Metadata nonSequentialMetadata = new Metadata();
try {
- nonSequentialContent = getText(new FileInputStream(f),
- nonSequentialParser, nonSeqContext, nonSequentialMetadata);
+ nonSequentialContent = getText(new FileInputStream(f),
+ nonSequentialParser, nonSeqContext, nonSequentialMetadata);
} catch (Exception e) {
throw new TikaException("Non-Sequential Parser failed on test file " + f, e);
}
@@ -782,7 +783,7 @@ public class PDFParserTest extends TikaT
context.set(org.apache.tika.parser.Parser.class, p);
try {
- tis= TikaInputStream.get(
+ tis = TikaInputStream.get(
getResourceAsStream("/test-documents/testPDF_childAttachments.pdf"));
p.parse(tis, new BodyContentHandler(-1), new Metadata(), context);
} finally {
@@ -821,13 +822,13 @@ public class PDFParserTest extends TikaT
Metadata m = new Metadata();
ParseContext c = new ParseContext();
ContentHandler h = new EventCountingHandler();
- p.parse(is, h, m, c);
- assertEquals(1, ((EventCountingHandler)h).getEndDocument());
+ p.parse(is, h, m, c);
+ assertEquals(1, ((EventCountingHandler) h).getEndDocument());
}
@Test
public void testVersions() throws Exception {
-
+
Map<String, String> dcFormat = new HashMap<String, String>();
dcFormat.put("4.x", "application/pdf; version=1.3");
dcFormat.put("5.x", "application/pdf; version=1.4");
@@ -847,7 +848,7 @@ public class PDFParserTest extends TikaT
pdfVersions.put("9.x", "1.7");
pdfVersions.put("10.x", "1.7");
pdfVersions.put("11.x.PDFA-1b", "1.7");
-
+
Map<String, String> pdfExtensionVersions = new HashMap<String, String>();
pdfExtensionVersions.put("9.x", "1.7 Adobe Extension Level 3");
pdfExtensionVersions.put("10.x", "1.7 Adobe Extension Level 8");
@@ -855,9 +856,9 @@ public class PDFParserTest extends TikaT
Parser p = new AutoDetectParser();
for (Map.Entry<String, String> e : dcFormat.entrySet()) {
- String fName = "testPDF_Version."+e.getKey()+".pdf";
+ String fName = "testPDF_Version." + e.getKey() + ".pdf";
InputStream is = PDFParserTest.class.getResourceAsStream(
- "/test-documents/"+fName);
+ "/test-documents/" + fName);
Metadata m = new Metadata();
ContentHandler h = new BodyContentHandler();
ParseContext c = new ParseContext();
@@ -873,8 +874,8 @@ public class PDFParserTest extends TikaT
assertTrue("dc:format ::" + e.getValue(), foundDC);
String extensionVersionTruth = pdfExtensionVersions.get(e.getKey());
if (extensionVersionTruth != null) {
- assertEquals("pdf:PDFExtensionVersion :: "+extensionVersionTruth,
- extensionVersionTruth,
+ assertEquals("pdf:PDFExtensionVersion :: " + extensionVersionTruth,
+ extensionVersionTruth,
m.get("pdf:PDFExtensionVersion"));
}
assertEquals("pdf:PDFVersion", pdfVersions.get(e.getKey()),
@@ -883,7 +884,7 @@ public class PDFParserTest extends TikaT
//now test full 11.x
String fName = "testPDF_Version.11.x.PDFA-1b.pdf";
InputStream is = PDFParserTest.class.getResourceAsStream(
- "/test-documents/"+fName);
+ "/test-documents/" + fName);
Metadata m = new Metadata();
ParseContext c = new ParseContext();
ContentHandler h = new BodyContentHandler();
@@ -893,14 +894,14 @@ public class PDFParserTest extends TikaT
for (String fmt : m.getValues("dc:format")) {
versions.add(fmt);
}
-
- for (String hit : new String[]{ "application/pdf; version=1.7",
- "application/pdf; version=\"A-1b\"",
- "application/pdf; version=\"1.7 Adobe Extension Level 8\""
+
+ for (String hit : new String[]{"application/pdf; version=1.7",
+ "application/pdf; version=\"A-1b\"",
+ "application/pdf; version=\"1.7 Adobe Extension Level 8\""
}) {
assertTrue(hit, versions.contains(hit));
}
-
+
assertEquals("pdfaid:conformance", m.get("pdfaid:conformance"), "B");
assertEquals("pdfaid:part", m.get("pdfaid:part"), "1");
}
@@ -909,15 +910,15 @@ public class PDFParserTest extends TikaT
public void testMultipleAuthors() throws Exception {
String fName = "testPDF_twoAuthors.pdf";
InputStream is = PDFParserTest.class.getResourceAsStream(
- "/test-documents/"+fName);
+ "/test-documents/" + fName);
Parser p = new AutoDetectParser();
Metadata m = new Metadata();
ParseContext c = new ParseContext();
ContentHandler h = new BodyContentHandler();
p.parse(is, h, m, c);
is.close();
-
- String[] keys = new String[] {
+
+ String[] keys = new String[]{
"dc:creator",
"meta:author",
"creator",
@@ -926,7 +927,7 @@ public class PDFParserTest extends TikaT
for (String k : keys) {
String[] vals = m.getValues(k);
- assertEquals("number of authors == 2 for key: "+ k, 2, vals.length);
+ assertEquals("number of authors == 2 for key: " + k, 2, vals.length);
Set<String> set = new HashSet<String>();
set.add(vals[0]);
set.add(vals[1]);
@@ -955,7 +956,7 @@ public class PDFParserTest extends TikaT
@Test
public void testInlineSelector() throws Exception {
-
+
PDFParserConfig config = new PDFParserConfig();
config.setExtractInlineImages(true);
config.setExtractUniqueInlineImagesOnly(false);
@@ -980,9 +981,9 @@ public class PDFParserTest extends TikaT
for (Metadata m : metadatas) {
String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
if (v != null) {
- if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())){
+ if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())) {
inline++;
- } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())){
+ } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())) {
attach++;
}
}
@@ -1007,9 +1008,9 @@ public class PDFParserTest extends TikaT
for (Metadata m : metadatas) {
String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
if (v != null) {
- if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())){
+ if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())) {
inline++;
- } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())){
+ } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())) {
attach++;
}
}
@@ -1022,7 +1023,7 @@ public class PDFParserTest extends TikaT
@Test
public void testInlineConfig() throws Exception {
-
+
Parser defaultParser = new AutoDetectParser();
RecursiveParserWrapper p = new RecursiveParserWrapper(defaultParser,
new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
@@ -1041,9 +1042,9 @@ public class PDFParserTest extends TikaT
for (Metadata m : metadatas) {
String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
if (v != null) {
- if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())){
+ if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())) {
inline++;
- } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())){
+ } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())) {
attach++;
}
}
@@ -1071,9 +1072,9 @@ public class PDFParserTest extends TikaT
for (Metadata m : metadatas) {
String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
if (v != null) {
- if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())){
+ if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())) {
inline++;
- } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())){
+ } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())) {
attach++;
}
}
@@ -1125,7 +1126,7 @@ public class PDFParserTest extends TikaT
assertContains("dos embedded", metadatas.get(3).get(RecursiveParserWrapper.TIKA_CONTENT));
assertEquals("file name", "TestUnix.txt", metadatas.get(4).get(Metadata.RESOURCE_NAME_KEY));
assertContains("unix embedded", metadatas.get(4).get(RecursiveParserWrapper.TIKA_CONTENT));
-
+
}
@Test //TIKA-1427
@@ -1167,7 +1168,7 @@ public class PDFParserTest extends TikaT
@Test
public void testLegacyAccessChecking() throws Exception {
//test that default behavior doesn't throw AccessPermissionException
- for (String file : new String[] {
+ for (String file : new String[]{
"testPDF_no_extract_no_accessibility_owner_empty.pdf",
"testPDF_no_extract_yes_accessibility_owner_empty.pdf",
}) {
@@ -1187,13 +1188,13 @@ public class PDFParserTest extends TikaT
context.set(PasswordProvider.class, provider);
Parser parser = new AutoDetectParser();
- for (String path : new String[] {
+ for (String path : new String[]{
"testPDF_no_extract_no_accessibility_owner_user.pdf",
"testPDF_no_extract_yes_accessibility_owner_user.pdf",
}) {
InputStream stream = null;
try {
- stream = TikaInputStream.get(this.getClass().getResource("/test-documents/"+path));
+ stream = TikaInputStream.get(this.getClass().getResource("/test-documents/" + path));
String text = getText(stream, parser, context);
assertContains("Hello World", text);
} finally {
@@ -1213,11 +1214,11 @@ public class PDFParserTest extends TikaT
context.set(PDFParserConfig.class, config);
//test exception for empty password
- for (String path : new String[] {
+ for (String path : new String[]{
"testPDF_no_extract_no_accessibility_owner_empty.pdf",
"testPDF_no_extract_yes_accessibility_owner_empty.pdf",
}) {
- assertException("/test-documents/"+path, parser, context, AccessPermissionException.class);
+ assertException("/test-documents/" + path, parser, context, AccessPermissionException.class);
}
config.setAccessChecker(new AccessChecker(true));
@@ -1226,7 +1227,7 @@ public class PDFParserTest extends TikaT
InputStream is = null;
try {
- is = getResourceAsStream("/test-documents/"+ "testPDF_no_extract_yes_accessibility_owner_empty.pdf");
+ is = getResourceAsStream("/test-documents/" + "testPDF_no_extract_yes_accessibility_owner_empty.pdf");
assertContains("Hello World", getText(is, parser, context));
} finally {
IOUtils.closeQuietly(is);
@@ -1253,41 +1254,41 @@ public class PDFParserTest extends TikaT
Parser parser = new AutoDetectParser();
//test bad passwords
- for (String path : new String[] {
+ for (String path : new String[]{
"testPDF_no_extract_no_accessibility_owner_empty.pdf",
"testPDF_no_extract_yes_accessibility_owner_empty.pdf",
}) {
- assertException("/test-documents/"+path, parser, context, EncryptedDocumentException.class);
+ assertException("/test-documents/" + path, parser, context, EncryptedDocumentException.class);
}
//bad password is still a bad password
config.setAccessChecker(new AccessChecker(true));
- for (String path : new String[] {
+ for (String path : new String[]{
"testPDF_no_extract_no_accessibility_owner_empty.pdf",
"testPDF_no_extract_yes_accessibility_owner_empty.pdf",
}) {
- assertException("/test-documents/"+path, parser, context, EncryptedDocumentException.class);
+ assertException("/test-documents/" + path, parser, context, EncryptedDocumentException.class);
}
//now test documents that require this "user" password
- assertException("/test-documents/"+"testPDF_no_extract_no_accessibility_owner_user.pdf",
+ assertException("/test-documents/" + "testPDF_no_extract_no_accessibility_owner_user.pdf",
parser, context, AccessPermissionException.class);
InputStream is = null;
try {
- is = getResourceAsStream("/test-documents/"+ "testPDF_no_extract_yes_accessibility_owner_user.pdf");
+ is = getResourceAsStream("/test-documents/" + "testPDF_no_extract_yes_accessibility_owner_user.pdf");
assertContains("Hello World", getText(is, parser, context));
} finally {
IOUtils.closeQuietly(is);
}
config.setAccessChecker(new AccessChecker(false));
- for (String path : new String[] {
+ for (String path : new String[]{
"testPDF_no_extract_no_accessibility_owner_user.pdf",
"testPDF_no_extract_yes_accessibility_owner_user.pdf",
}) {
- assertException("/test-documents/"+path, parser, context, AccessPermissionException.class);
+ assertException("/test-documents/" + path, parser, context, AccessPermissionException.class);
}
}
@@ -1310,7 +1311,7 @@ public class PDFParserTest extends TikaT
Parser parser = new AutoDetectParser();
//with owner's password, text can be extracted, no matter the AccessibilityChecker's settings
- for (String path : new String[] {
+ for (String path : new String[]{
"testPDF_no_extract_no_accessibility_owner_user.pdf",
"testPDF_no_extract_yes_accessibility_owner_user.pdf",
"testPDF_no_extract_no_accessibility_owner_empty.pdf",
@@ -1328,7 +1329,7 @@ public class PDFParserTest extends TikaT
//really, with owner's password, all extraction is allowed
config.setAccessChecker(new AccessChecker(false));
- for (String path : new String[] {
+ for (String path : new String[]{
"testPDF_no_extract_no_accessibility_owner_user.pdf",
"testPDF_no_extract_yes_accessibility_owner_user.pdf",
"testPDF_no_extract_no_accessibility_owner_empty.pdf",
@@ -1352,26 +1353,25 @@ public class PDFParserTest extends TikaT
String text = getText(is, parser, context);
noEx = true;
} catch (Exception e) {
- assertEquals("Not the right exception: "+path, expected, e.getClass());
+ assertEquals("Not the right exception: " + path, expected, e.getClass());
} finally {
IOUtils.closeQuietly(is);
}
assertFalse(path + " should have thrown exception", noEx);
}
+
/**
- *
* Simple class to count end of document events. If functionality is useful,
* move to org.apache.tika in src/test
- *
*/
private class EventCountingHandler extends ContentHandlerDecorator {
private int endDocument = 0;
-
+
@Override
public void endDocument() {
endDocument++;
}
-
+
public int getEndDocument() {
return endDocument;
}
@@ -1382,7 +1382,7 @@ public class PDFParserTest extends TikaT
@Override
public boolean select(Metadata metadata) {
String v = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
- if (v != null && v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())){
+ if (v != null && v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())) {
return false;
}
return true;