You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bd...@apache.org on 2007/09/15 11:33:08 UTC
svn commit: r575896 - in /incubator/tika/trunk: ./
src/main/java/org/apache/tika/config/ src/main/java/org/apache/tika/parser/
src/main/java/org/apache/tika/utils/ src/test/java/org/apache/tika/
src/test/java/org/apache/tika/utils/ src/test/resources/ ...
Author: bdelacretaz
Date: Sat Sep 15 02:33:07 2007
New Revision: 575896
URL: http://svn.apache.org/viewvc?rev=575896&view=rev
Log:
TIKA-19: fix org.apache.tika.TestParsers, test more file types and improve exception handling in LiusConfig and ParserFactory. Includes fixes from TIKA-16 and TIKA-14 which were contributed by Keith R. Bennett, thanks!
Added:
incubator/tika/trunk/src/test/resources/test-documents/
incubator/tika/trunk/src/test/resources/test-documents/testEXCEL.xls (with props)
incubator/tika/trunk/src/test/resources/test-documents/testHTML.html
- copied unchanged from r575888, incubator/tika/trunk/src/test/resources/testHTML.html
incubator/tika/trunk/src/test/resources/test-documents/testOpenOffice2.odt (with props)
incubator/tika/trunk/src/test/resources/test-documents/testPDF.pdf (with props)
incubator/tika/trunk/src/test/resources/test-documents/testPPT.ppt (with props)
incubator/tika/trunk/src/test/resources/test-documents/testRTF.rtf
- copied unchanged from r575888, incubator/tika/trunk/src/test/resources/testRTF.rtf
incubator/tika/trunk/src/test/resources/test-documents/testTXT.txt
- copied unchanged from r575888, incubator/tika/trunk/src/test/resources/testTXT.txt
incubator/tika/trunk/src/test/resources/test-documents/testWORD.doc (with props)
incubator/tika/trunk/src/test/resources/test-documents/testXML.xml
- copied, changed from r575888, incubator/tika/trunk/src/test/resources/testXML.xml
Removed:
incubator/tika/trunk/src/test/resources/testHTML.html
incubator/tika/trunk/src/test/resources/testRTF.rtf
incubator/tika/trunk/src/test/resources/testTXT.txt
incubator/tika/trunk/src/test/resources/testXML.xml
Modified:
incubator/tika/trunk/ (props changed)
incubator/tika/trunk/src/main/java/org/apache/tika/config/LiusConfig.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java
incubator/tika/trunk/src/main/java/org/apache/tika/utils/MimeTypesUtils.java
incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java
incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
incubator/tika/trunk/src/test/java/org/apache/tika/utils/MimeTypesUtilsTest.java
Propchange: incubator/tika/trunk/
------------------------------------------------------------------------------
--- svn:ignore (original)
+++ svn:ignore Sat Sep 15 02:33:07 2007
@@ -2,3 +2,4 @@
.project
.settings
.classpath
+lius.log
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/config/LiusConfig.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/config/LiusConfig.java?rev=575896&r1=575895&r2=575896&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/config/LiusConfig.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/config/LiusConfig.java Sat Sep 15 02:33:07 2007
@@ -49,7 +49,7 @@
private static String currentFile;
- public static LiusConfig getInstance(String configFile) {
+ public static LiusConfig getInstance(String configFile) throws JDOMException,IOException {
if (configsCache.containsKey(configFile)) {
return (LiusConfig) configsCache.get(configFile);
@@ -86,15 +86,17 @@
return pc;
}
- private static Document parse(String file) {
+ private static Document parse(String file) throws JDOMException,IOException {
org.jdom.Document xmlDoc = new org.jdom.Document();
try {
SAXBuilder builder = new SAXBuilder();
xmlDoc = builder.build(new File(file));
- } catch (JDOMException e) {
- logger.error(e.getMessage());
- } catch (IOException e) {
- logger.error(e.getMessage());
+ } catch (JDOMException jde) {
+ logger.error(jde.getMessage(),jde);
+ throw jde;
+ } catch(IOException ioe) {
+ logger.error(ioe.getMessage(),ioe);
+ throw ioe;
}
return xmlDoc;
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java?rev=575896&r1=575895&r2=575896&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java Sat Sep 15 02:33:07 2007
@@ -26,6 +26,7 @@
import org.apache.tika.utils.MimeTypesUtils;
import org.apache.log4j.Logger;
+import org.jdom.JDOMException;
/**
* Factory class. Build parser from xml config file.
@@ -41,15 +42,27 @@
*/
public static Parser getParser(File file, LiusConfig tc)
throws IOException, LiusException {
+ if(!file.canRead()) {
+ throw new IOException("Cannot read input file " + file.getAbsoluteFile());
+ }
String mimeType = MimeTypesUtils.getMimeType(file);
ParserConfig pc = tc.getParserConfig(mimeType);
+ if(pc==null) {
+ throw new LiusException(
+ "No ParserConfig available for mime-type '" + mimeType + "'"
+ + " for file " + file.getName()
+ );
+ }
String className = pc.getParserClass();
Parser parser = null;
Class<?> parserClass = null;
if (className != null) {
try {
- logger.info("Loading parser class = " + className
- + " MimeType = " + mimeType);
+ logger.debug(
+ "Loading parser class = " + className
+ + " MimeType = " + mimeType
+ + " for file " + file.getName()
+ );
parserClass = Class.forName(className);
parser = (Parser) parserClass.newInstance();
@@ -83,7 +96,7 @@
* Build parser from string file path and Lius config file path
*/
public static Parser getParser(String str, String tcPath)
- throws IOException, LiusException {
+ throws IOException, LiusException, JDOMException {
LiusConfig tc = LiusConfig.getInstance(tcPath);
return getParser(new File(str), tc);
}
@@ -92,7 +105,7 @@
* Build parser from file and Lius config file path
*/
public static Parser getParser(File file, String tcPath)
- throws IOException, LiusException {
+ throws IOException, LiusException, JDOMException {
LiusConfig tc = LiusConfig.getInstance(tcPath);
return getParser(file, tc);
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/utils/MimeTypesUtils.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/MimeTypesUtils.java?rev=575896&r1=575895&r2=575896&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/MimeTypesUtils.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/MimeTypesUtils.java Sat Sep 15 02:33:07 2007
@@ -54,7 +54,11 @@
return "application/vnd.ms-excel";
} else if (name.endsWith(".zip")) {
return "application/zip";
- } else {
+ } else if (name.endsWith(".rtf")) {
+ return "application/rtf";
+ } else if (name.endsWith(".odt")) {
+ return "application/vnd.oasis.opendocument.text";
+ } else {
return "application/octet-stream";
}
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java?rev=575896&r1=575895&r2=575896&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java Sat Sep 15 02:33:07 2007
@@ -24,6 +24,10 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.io.Writer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
@@ -31,9 +35,8 @@
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
-import org.apache.tika.config.Content;
-
import org.apache.log4j.Logger;
+import org.apache.tika.config.Content;
import org.jdom.Document;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
@@ -50,26 +53,36 @@
static Logger logger = Logger.getRootLogger();
+ public static String toString(Collection<Content> structuredContent) {
+ final StringWriter sw = new StringWriter();
+ print(structuredContent,sw);
+ return sw.toString();
+ }
+
public static void print(Collection<Content> structuredContent) {
+ print(structuredContent,new OutputStreamWriter(System.out));
+ }
+
+ public static void print(Collection<Content> structuredContent,Writer outputWriter) {
+ final PrintWriter output = new PrintWriter(outputWriter,true);
for (Iterator<Content> iter = structuredContent.iterator(); iter
.hasNext();) {
Content ct = iter.next();
if (ct.getValue() != null) {
- System.out.print(ct.getName() + ": ");
- System.out.println(ct.getValue());
+ output.print(ct.getName() + ": ");
+ output.println(ct.getValue());
} else if (ct.getValues() != null) {
- System.out.print(ct.getName() + ": ");
+ output.print(ct.getName() + ": ");
for (int j = 0; j < ct.getValues().length; j++) {
if (j == 0)
- System.out.println(ct.getValues()[j]);
+ output.println(ct.getValues()[j]);
else {
- System.out.println("\t" + ct.getValues()[j]);
+ output.println("\t" + ct.getValues()[j]);
}
}
}
}
-
}
public static Document parse(InputStream is) {
Modified: incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java?rev=575896&r1=575895&r2=575896&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java Sat Sep 15 02:33:07 2007
@@ -22,13 +22,14 @@
import java.util.StringTokenizer;
import junit.framework.TestCase;
+
import org.apache.tika.config.Content;
import org.apache.tika.config.LiusConfig;
-import org.apache.tika.exception.LiusException;
import org.apache.tika.log.LiusLogger;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserFactory;
import org.apache.tika.utils.Utils;
+import org.jdom.JDOMException;
/**
* Junit test class
@@ -37,12 +38,13 @@
public class TestParsers extends TestCase {
private LiusConfig tc;
+ private File testFilesBaseDir;
- private File classDir;
-
- private String config;
-
- public void setUp() {
+ public void setUp() throws JDOMException, IOException {
+ /* FIXME the old mechanism does not work anymore when running the tests
+ * with Maven - need a resource-based one, but this means more
+ * changes to classes which rely on filenames.
+ *
String sep = File.separator;
StringTokenizer st = new StringTokenizer(System.getProperty(
"java.class.path"), File.pathSeparator);
@@ -53,13 +55,19 @@
String log4j = classDir.getParent() + sep + "Config" + sep + "log4j"
+ sep + "log4j.properties";
+ */
- tc = LiusConfig.getInstance(config);
+ // FIXME for now, fix filenames according to Maven testing layout
+ final String liusConfigFilename = "target/classes/config.xml";
+ final String log4jPropertiesFilename = "target/classes/log4j/log4j.properties";
+ testFilesBaseDir = new File("src/test/resources/test-documents");
+
+ tc = LiusConfig.getInstance(liusConfigFilename);
- LiusLogger.setLoggerConfigFile(log4j);
+ LiusLogger.setLoggerConfigFile(log4jPropertiesFilename);
}
-
+
/*
* public void testConfig(){ TikaConfig tc =
* TikaConfig.getInstance("C:\\tika\\config\\tikaConfig2.xml"); ParserConfig
@@ -67,149 +75,50 @@
* pc.getName()); }
*/
- public void testPDFExtraction() {
- Parser parser = null;
- File testFile = new File(classDir.getParent() + File.separator
- + "testFiles" + File.separator + "testPDF.PDF");
- try {
- parser = ParserFactory.getParser(testFile, tc);
- } catch (IOException e) {
- e.printStackTrace();
- } catch (LiusException e) {
- e.printStackTrace();
- }
-
- }
-
- public void testTXTExtraction() {
- Parser parser = null;
- File testFile = new File(classDir.getParent() + File.separator
- + "testFiles" + File.separator + "testTXT.txt");
- try {
- parser = ParserFactory.getParser(testFile, tc);
- } catch (IOException e) {
- e.printStackTrace();
- } catch (LiusException e) {
- e.printStackTrace();
- }
-
- }
-
- public void testRTFExtraction() {
- Parser parser = null;
- File testFile = new File(classDir.getParent() + File.separator
- + "testFiles" + File.separator + "testRTF.rtf");
- try {
- parser = ParserFactory.getParser(testFile, tc);
- } catch (IOException e) {
- e.printStackTrace();
- } catch (LiusException e) {
- e.printStackTrace();
- }
-
- }
-
- public void testXMLExtraction() {
- Parser parser = null;
- File testFile = new File(classDir.getParent() + File.separator
- + "testFiles" + File.separator + "testXML.xml");
- try {
- parser = ParserFactory.getParser(testFile, tc);
- } catch (IOException e) {
- e.printStackTrace();
- } catch (LiusException e) {
- e.printStackTrace();
- }
-
- }
-
- public void testPPTExtraction() {
- Parser parser = null;
- File testFile = new File(classDir.getParent() + File.separator
- + "testFiles" + File.separator + "testPPT.ppt");
- try {
- parser = ParserFactory.getParser(testFile, tc);
- System.out.println(parser.getStrContent());
- } catch (IOException e) {
- e.printStackTrace();
- } catch (LiusException e) {
- e.printStackTrace();
- }
-
- }
-
- public void testWORDxtraction() {
- Parser parser = null;
- File testFile = new File(classDir.getParent() + File.separator
- + "testFiles" + File.separator + "testWORD.doc");
- try {
- parser = ParserFactory.getParser(testFile, tc);
- System.out.println(parser.getStrContent());
- } catch (IOException e) {
- e.printStackTrace();
- } catch (LiusException e) {
- e.printStackTrace();
- }
-
- }
-
- public void testEXCELExtraction() {
- Parser parser = null;
- File testFile = new File(classDir.getParent() + File.separator
- + "testFiles" + File.separator + "testEXCEL.xls");
- try {
- parser = ParserFactory.getParser(testFile, tc);
- // System.out.println(parser.getStrContent());
- printContentsInfo(parser);
- } catch (IOException e) {
- e.printStackTrace();
- } catch (LiusException e) {
- e.printStackTrace();
- }
-
- }
-
- public void testOOExtraction() {
- Parser parser = null;
- File testFile = new File(classDir.getParent() + File.separator
- + "testFiles" + File.separator + "testOO2.odt");
- try {
- parser = ParserFactory.getParser(testFile, tc);
- // System.out.println(parser.getStrContent());
- printContentsInfo(parser);
- } catch (IOException e) {
- e.printStackTrace();
- } catch (LiusException e) {
- e.printStackTrace();
- }
-
+ public void testPDFExtraction() throws Exception {
+ ParserFactory.getParser(getTestFile("testPDF.pdf"), tc);
+ }
+
+ public void testTXTExtraction() throws Exception {
+ ParserFactory.getParser(getTestFile("testTXT.txt"), tc);
+ }
+
+ public void testRTFExtraction() throws Exception {
+ ParserFactory.getParser(getTestFile("testRTF.rtf"), tc);
}
- public void testHTMLExtraction() {
- Parser parser = null;
- File testFile = new File(classDir.getParent() + File.separator
- + "testFiles" + File.separator + "testHTML.html");
- try {
- parser = ParserFactory.getParser(testFile, tc);
- assertEquals("Title : Test Indexation Html", (parser.getContent("title")).getValue());
- // System.out.println(parser.getStrContent());
- printContentsInfo(parser);
- } catch (IOException e) {
- e.printStackTrace();
- } catch (LiusException e) {
- e.printStackTrace();
- }
+ public void testXMLExtraction() throws Exception {
+ ParserFactory.getParser(getTestFile("testXML.xml"), tc);
+ }
+
+ public void testPPTExtraction() throws Exception {
+ ParserFactory.getParser(getTestFile("testPPT.ppt"), tc);
+ }
+
+ public void testWORDxtraction() throws Exception {
+ ParserFactory.getParser(getTestFile("testWORD.doc"), tc);
+ }
+ public void testEXCELExtraction() throws Exception {
+ ParserFactory.getParser(getTestFile("testEXCEL.xls"), tc);
+ }
+
+ public void testOOExtraction() throws Exception {
+ ParserFactory.getParser(getTestFile("testOpenOffice2.odt"), tc);
+ }
+
+ public void testHTMLExtraction() throws Exception {
+ Parser parser = ParserFactory.getParser(getTestFile("testHTML.html"), tc);
+ assertEquals("Title : Test Indexation Html", (parser.getContent("title")).getValue());
+ assertEquals("text/html",parser.getMimeType());
+ final String text = Utils.toString(parser.getContents());
+
+ final String expected = "Test Indexation Html";
+ assertTrue("text contains '" + expected + "'",text.indexOf(expected) >= 0);
}
- private void printContentsInfo(Parser parser) {
- String mimeType = parser.getMimeType();
- System.out.println("Mime : " + mimeType);
- String strContent = parser.getStrContent();
- Collection<Content> structuredContent = parser.getContents();
- Utils.print(structuredContent);
- System.out.println("==============");
- // Content title = parser.getContent("title");
+ private File getTestFile(String filename) {
+ return new File(testFilesBaseDir,filename);
}
}
Modified: incubator/tika/trunk/src/test/java/org/apache/tika/utils/MimeTypesUtilsTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/utils/MimeTypesUtilsTest.java?rev=575896&r1=575895&r2=575896&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/utils/MimeTypesUtilsTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/utils/MimeTypesUtilsTest.java Sat Sep 15 02:33:07 2007
@@ -25,13 +25,22 @@
public class MimeTypesUtilsTest extends TestCase {
public void test() throws MalformedURLException {
- String s = "x.pdf";
URL u = new URL("http://mydomain.com/x.pdf?x=y");
- File f = new File( "/a/b/c/x.pdf");
+ File f = new File("/a/b/c/x.pdf");
- assertEquals("application/pdf", MimeTypesUtils.getMimeType(s));
- assertEquals("application/pdf", MimeTypesUtils.getMimeType(u));
- assertEquals("application/pdf", MimeTypesUtils.getMimeType(f));
+ assertEquals("application/pdf",MimeTypesUtils.getMimeType("x.pdf"));
+ assertEquals("application/pdf",MimeTypesUtils.getMimeType(u));
+ assertEquals("application/pdf",MimeTypesUtils.getMimeType(f));
+ assertEquals("text/plain",MimeTypesUtils.getMimeType("x.txt"));
+ assertEquals("text/html",MimeTypesUtils.getMimeType("x.htm"));
+ assertEquals("text/html",MimeTypesUtils.getMimeType("x.html"));
+ assertEquals("application/xhtml+xml",MimeTypesUtils.getMimeType("x.xhtml"));
+ assertEquals("application/xml",MimeTypesUtils.getMimeType("x.xml"));
+ assertEquals("application/msword",MimeTypesUtils.getMimeType("x.doc"));
+ assertEquals("application/vnd.ms-powerpoint",MimeTypesUtils.getMimeType("x.ppt"));
+ assertEquals("application/vnd.ms-excel",MimeTypesUtils.getMimeType("x.xls"));
+ assertEquals("application/zip",MimeTypesUtils.getMimeType("x.zip"));
+ assertEquals("application/vnd.oasis.opendocument.text",MimeTypesUtils.getMimeType("x.odt"));
+ assertEquals("application/octet-stream",MimeTypesUtils.getMimeType("x.xyz"));
}
-
}
Added: incubator/tika/trunk/src/test/resources/test-documents/testEXCEL.xls
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testEXCEL.xls?rev=575896&view=auto
==============================================================================
Binary file - no diff available.
Propchange: incubator/tika/trunk/src/test/resources/test-documents/testEXCEL.xls
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: incubator/tika/trunk/src/test/resources/test-documents/testOpenOffice2.odt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testOpenOffice2.odt?rev=575896&view=auto
==============================================================================
Binary file - no diff available.
Propchange: incubator/tika/trunk/src/test/resources/test-documents/testOpenOffice2.odt
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: incubator/tika/trunk/src/test/resources/test-documents/testPDF.pdf
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testPDF.pdf?rev=575896&view=auto
==============================================================================
Binary file - no diff available.
Propchange: incubator/tika/trunk/src/test/resources/test-documents/testPDF.pdf
------------------------------------------------------------------------------
svn:mime-type = application/pdf
Added: incubator/tika/trunk/src/test/resources/test-documents/testPPT.ppt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testPPT.ppt?rev=575896&view=auto
==============================================================================
Binary file - no diff available.
Propchange: incubator/tika/trunk/src/test/resources/test-documents/testPPT.ppt
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: incubator/tika/trunk/src/test/resources/test-documents/testWORD.doc
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testWORD.doc?rev=575896&view=auto
==============================================================================
Binary file - no diff available.
Propchange: incubator/tika/trunk/src/test/resources/test-documents/testWORD.doc
------------------------------------------------------------------------------
svn:mime-type = application/msword
Copied: incubator/tika/trunk/src/test/resources/test-documents/testXML.xml (from r575888, incubator/tika/trunk/src/test/resources/testXML.xml)
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testXML.xml?p2=incubator/tika/trunk/src/test/resources/test-documents/testXML.xml&p1=incubator/tika/trunk/src/test/resources/testXML.xml&r1=575888&r2=575896&rev=575896&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/resources/testXML.xml (original)
+++ incubator/tika/trunk/src/test/resources/test-documents/testXML.xml Sat Sep 15 02:33:07 2007
@@ -1,3 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
<oaidc:dc xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:oaidc="http://www.openarchives.org/OAI/2.0/oai_dc/">
<dc:title>Archimède et Lius</dc:title>